import inspect
import math
import os
from typing import Optional, Tuple, Union

import torch
import torch.nn as nn
from torch.nn import CrossEntropyLoss
from torch.nn.parameter import Parameter
import torch.nn.functional as F
from transformers.activations import ACT2FN
from transformers.configuration_utils import PretrainedConfig
from transformers.modeling_outputs import (
    BaseModelOutputWithPastAndCrossAttentions,
    BaseModelOutputWithPoolingAndCrossAttentions,
    MaskedLMOutput,
    SequenceClassifierOutput,
)
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
from transformers.modeling_utils import PreTrainedModel
from transformers.generation import GenerationMixin
from dataclasses import dataclass
from transformers.utils import ModelOutput
from contextlib import nullcontext

try:
    from transformers.cache_utils import Cache, DynamicCache, EncoderDecoderCache
except ImportError:  # pragma: no cover - compatibility fallback for older Transformers
    class Cache:  # type: ignore[no-redef]
        pass

    class DynamicCache(Cache):  # type: ignore[no-redef]
        def __init__(self, *args, **kwargs):
            super().__init__()

        def get_seq_length(self):
            return 0

    class EncoderDecoderCache(Cache):  # type: ignore[no-redef]
        def __init__(self, self_attention_cache=None, cross_attention_cache=None):
            super().__init__()
            self.self_attention_cache = self_attention_cache
            self.cross_attention_cache = cross_attention_cache
            self.is_updated = {}

        @classmethod
        def from_legacy_cache(cls, past_key_values):
            cache = cls()
            cache.legacy_cache = past_key_values
            return cache

        def get_seq_length(self):
            return 0

try:
    from transformers.modeling_layers import GradientCheckpointingLayer
except ImportError:  # pragma: no cover - compatibility fallback for older Transformers
    class GradientCheckpointingLayer(nn.Module):  # type: ignore[no-redef]
        gradient_checkpointing = False

        def __init__(self, *args, **kwargs):
            super().__init__()

try:
    from transformers.utils import auto_docstring, logging
except ImportError:  # pragma: no cover - compatibility fallback
    from transformers.utils import logging  # type: ignore

    def auto_docstring(*args, **kwargs):
        if args and callable(args[0]) and len(args) == 1 and not kwargs:
            return args[0]

        def _decorator(obj):
            return obj

        return _decorator

try:
    from transformers.utils.deprecation import deprecate_kwarg
except ImportError:  # pragma: no cover - compatibility fallback
    def deprecate_kwarg(*args, **kwargs):
        def _decorator(fn):
            return fn

        return _decorator

try:
    from transformers.utils.hub import cached_file
except ImportError:  # pragma: no cover - compatibility fallback
    from transformers.utils import cached_file  # type: ignore


logger = logging.get_logger(__name__)
_HF_LOAD_KWARGS = {
    "cache_dir", "force_download", "local_files_only",
    "token", "revision", "subfolder", "use_safetensors",
}


_HF_CONFIG_LOAD_KWARGS = {
    "cache_dir",
    "force_download",
    "local_files_only",
    "token",
    "revision",
    "subfolder",
    "proxies",
}

_HF_NON_MODEL_INIT_KWARGS = {
    "trust_remote_code",
    "_from_auto",
    "adapter_kwargs",
}



def l2_norm(input, axis=1, epsilon=1e-12):
    norm = torch.norm(input, 2, axis, True)
    norm = torch.clamp(norm, min=epsilon)  # Avoid zero division
    output = torch.div(input, norm)
    return output
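
# Illustrative sketch (assumption: not part of the original module): l2_norm rescales each
# row of a (batch, features) tensor to unit L2 length, i.e. output = input / max(||input||_2, eps).
def _example_l2_norm():  # pragma: no cover - documentation only
    x = torch.tensor([[3.0, 4.0]])
    return l2_norm(x)  # tensor([[0.6000, 0.8000]]) -- the row norm along axis 1 is 5.0
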

def initialize_linear_kaiming(layer: nn.Linear):
    if isinstance(layer, nn.Linear):
        nn.init.kaiming_uniform_(layer.weight, nonlinearity='linear')
        if layer.bias is not None:
            nn.init.zeros_(layer.bias)


def get_classifier_dropout(config) -> float:
    classifier_dropout = getattr(config, "classifier_dropout", None)
    if classifier_dropout is None:
        classifier_dropout = getattr(config, "hidden_dropout_prob", 0.0)
    return float(classifier_dropout)


def normalize_pooling_attention_mask(
    attention_mask: Optional[torch.Tensor],
) -> Optional[torch.Tensor]:
    """
    Return a boolean keep-mask of shape (batch_size, seq_length).
    Supports:
      - (B, L) masks with 1/0 or bool
      - (B, 1, L)
      - (B, 1, 1, L)
      - additive masks with 0 for keep and negative values for masked positions
    """
    if attention_mask is None:
        return None

    if attention_mask.dim() == 4:
        if attention_mask.size(1) == 1 and attention_mask.size(2) == 1:
            attention_mask = attention_mask[:, 0, 0, :]
        else:
            raise ValueError(f"Unexpected 4D attention_mask shape: {tuple(attention_mask.shape)}")
    elif attention_mask.dim() == 3:
        if attention_mask.size(1) == 1:
            attention_mask = attention_mask[:, 0, :]
        else:
            raise ValueError(f"Unexpected 3D attention_mask shape: {tuple(attention_mask.shape)}")
    elif attention_mask.dim() != 2:
        raise ValueError(f"Unexpected attention_mask shape: {tuple(attention_mask.shape)}")

    if attention_mask.dtype == torch.bool:
        return attention_mask

    if torch.is_floating_point(attention_mask) and (attention_mask < 0).any():
        # HF additive masks: 0 means keep, negative means masked
        return attention_mask == 0

    return attention_mask != 0


def masked_attention_pool(
    sequence_output: torch.Tensor,
    token_scores: torch.Tensor,
    attention_mask: Optional[torch.Tensor] = None,
) -> torch.Tensor:
    keep_mask = normalize_pooling_attention_mask(attention_mask)

    if keep_mask is not None:
        empty_rows = keep_mask.sum(dim=1) == 0
        if empty_rows.any():
            keep_mask = keep_mask.clone()
            keep_mask[empty_rows, 0] = True

        token_scores = token_scores.masked_fill(~keep_mask.unsqueeze(-1), float("-inf"))

    weights = torch.softmax(token_scores.float(), dim=1).to(dtype=sequence_output.dtype)
    pooled_output = torch.sum(weights * sequence_output, dim=1)
    return pooled_output
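
# Illustrative sketch (assumption: not part of the original module): masked_attention_pool
# turns per-token scores into a softmax over the non-padding positions and returns the
# weighted sum of hidden states; normalize_pooling_attention_mask handles the mask-shape variants.
def _example_masked_attention_pool():  # pragma: no cover - documentation only
    hidden = torch.randn(2, 4, 8)                      # (batch, seq, hidden)
    scores = torch.randn(2, 4, 1)                      # per-token attention logits
    mask = torch.tensor([[1, 1, 1, 0], [1, 1, 0, 0]])  # 1 = keep, 0 = padding
    pooled = masked_attention_pool(hidden, scores, mask)
    return pooled.shape                                # torch.Size([2, 8])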


def apply_chunking_to_forward(forward_fn, chunk_size: int, chunk_dim: int, *input_tensors) -> torch.Tensor:
    """Local copy of the HF utility to reduce cross-version import fragility."""
    if len(input_tensors) == 0:
        raise ValueError(f"{input_tensors} has to be a tuple/list of tensors")

    num_args_in_forward_chunk_fn = len(inspect.signature(forward_fn).parameters)
    if num_args_in_forward_chunk_fn != len(input_tensors):
        raise ValueError(
            f"forward_chunk_fn expects {num_args_in_forward_chunk_fn} arguments, but only {len(input_tensors)} input tensors are given"
        )

    if chunk_size > 0:
        tensor_shape = input_tensors[0].shape[chunk_dim]
        for input_tensor in input_tensors:
            if input_tensor.shape[chunk_dim] != tensor_shape:
                raise ValueError(
                    f"All input tenors have to be of the same shape: {tensor_shape}, found shape {input_tensor.shape[chunk_dim]}"
                )
        if input_tensors[0].shape[chunk_dim] % chunk_size != 0:
            raise ValueError(
                f"The dimension to be chunked {input_tensors[0].shape[chunk_dim]} has to be a multiple of the chunk size {chunk_size}"
            )
        num_chunks = input_tensors[0].shape[chunk_dim] // chunk_size
        input_tensors_chunks = tuple(input_tensor.chunk(num_chunks, dim=chunk_dim) for input_tensor in input_tensors)
        output_chunks = tuple(forward_fn(*input_tensors_chunk) for input_tensors_chunk in zip(*input_tensors_chunks))
        return torch.cat(output_chunks, dim=chunk_dim)

    return forward_fn(*input_tensors)
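
# Illustrative sketch (assumption: not part of the original module): apply_chunking_to_forward
# splits the chunked dimension, runs the forward function on each chunk, and concatenates the
# results, trading a little extra dispatch overhead for lower peak memory.
def _example_apply_chunking():  # pragma: no cover - documentation only
    feed_forward = nn.Linear(8, 8)
    hidden = torch.randn(2, 6, 8)  # sequence length 6 is a multiple of chunk_size=3
    out = apply_chunking_to_forward(lambda h: feed_forward(h), 3, 1, hidden)
    return out.shape               # torch.Size([2, 6, 8])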


def prune_linear_layer(layer: nn.Linear, index: torch.LongTensor, dim: int = 0) -> nn.Linear:
    """Local copy of the HF utility to reduce cross-version import fragility."""
    index = index.to(layer.weight.device)
    weight = layer.weight.index_select(dim, index).detach().clone()
    if layer.bias is not None:
        if dim == 1:
            bias = layer.bias.detach().clone()
        else:
            bias = layer.bias[index].detach().clone()
    new_size = list(layer.weight.size())
    new_size[dim] = len(index)
    new_layer = nn.Linear(new_size[1], new_size[0], bias=layer.bias is not None).to(layer.weight.device)
    new_layer.weight.requires_grad = False
    new_layer.weight.copy_(weight.contiguous())
    new_layer.weight.requires_grad = True
    if layer.bias is not None:
        new_layer.bias.requires_grad = False
        new_layer.bias.copy_(bias.contiguous())
        new_layer.bias.requires_grad = True
    return new_layer


def find_pruneable_heads_and_indices(
    heads: list[int], n_heads: int, head_size: int, already_pruned_heads: set[int]
) -> tuple[set[int], torch.LongTensor]:
    """Local copy of the HF utility that was removed from newer Transformers."""
    mask = torch.ones(n_heads, head_size)
    heads = set(heads) - already_pruned_heads
    for head in heads:
        head = head - sum(1 if h < head else 0 for h in already_pruned_heads)
        mask[head] = 0
    mask = mask.view(-1).contiguous().eq(1)
    index = torch.arange(len(mask))[mask].long()
    return heads, index
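
# Illustrative sketch (assumption: not part of the original module): pruning attention head 1
# of 4 keeps 3 * head_size rows of a query/key/value projection.
def _example_prune_head():  # pragma: no cover - documentation only
    n_heads, head_size = 4, 16
    query = nn.Linear(64, n_heads * head_size)
    pruned_heads, index = find_pruneable_heads_and_indices([1], n_heads, head_size, set())
    pruned_query = prune_linear_layer(query, index)
    return pruned_query.weight.shape  # torch.Size([48, 64])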


def load_tf_weights_in_megatron_bert(model, config, tf_checkpoint_path):
    """Load tf checkpoints in a pytorch model."""
    try:
        import re

        import numpy as np
        import tensorflow as tf
    except ImportError:
        logger.error(
            "Loading a TensorFlow model in PyTorch, requires TensorFlow to be installed. Please see "
            "https://www.tensorflow.org/install/ for installation instructions."
        )
        raise
    tf_path = os.path.abspath(tf_checkpoint_path)
    logger.info(f"Converting TensorFlow checkpoint from {tf_path}")
    # Load weights from TF model
    init_vars = tf.train.list_variables(tf_path)
    names = []
    arrays = []
    for name, shape in init_vars:
        logger.info(f"Loading TF weight {name} with shape {shape}")
        array = tf.train.load_variable(tf_path, name)
        names.append(name)
        arrays.append(array)

    for name, array in zip(names, arrays):
        name = name.split("/")
        # adam_v and adam_m are variables used in AdamWeightDecayOptimizer to calculate m and v
        # which are not required for using pretrained model
        if any(
            n in ["adam_v", "adam_m", "AdamWeightDecayOptimizer", "AdamWeightDecayOptimizer_1", "global_step"]
            for n in name
        ):
            logger.info(f"Skipping {'/'.join(name)}")
            continue
        pointer = model
        for m_name in name:
            if re.fullmatch(r"[A-Za-z]+_\d+", m_name):
                scope_names = re.split(r"_(\d+)", m_name)
            else:
                scope_names = [m_name]
            if scope_names[0] == "kernel" or scope_names[0] == "gamma":
                pointer = getattr(pointer, "weight")
            elif scope_names[0] == "output_bias" or scope_names[0] == "beta":
                pointer = getattr(pointer, "bias")
            elif scope_names[0] == "output_weights":
                pointer = getattr(pointer, "weight")
            elif scope_names[0] == "squad":
                pointer = getattr(pointer, "classifier")
            else:
                try:
                    pointer = getattr(pointer, scope_names[0])
                except AttributeError:
                    logger.info(f"Skipping {'/'.join(name)}")
                    continue
            if len(scope_names) >= 2:
                num = int(scope_names[1])
                pointer = pointer[num]
        if m_name[-11:] == "_embeddings":
            pointer = getattr(pointer, "weight")
        elif m_name == "kernel":
            array = np.transpose(array)
        if pointer.shape != array.shape:
            raise ValueError(f"Pointer shape {pointer.shape} and array shape {array.shape} mismatched")
        logger.info(f"Initialize PyTorch weight {name}")
        pointer.data = torch.from_numpy(array)
    return model


def _extract_base_model_state_dict(
    state_dict: dict[str, torch.Tensor],
    base_prefix: str = "bert",
) -> dict[str, torch.Tensor]:
    prefix = f"{base_prefix}."
    if any(k.startswith(prefix) for k in state_dict.keys()):
        return {k[len(prefix):]: v for k, v in state_dict.items() if k.startswith(prefix)}
    return state_dict


def _split_pretrained_kwargs(kwargs):
    """
    Split kwargs into:
      - config/hub loading kwargs
      - weight file preference kwargs
      - state-dict reading kwargs
      - remaining kwargs (config overrides or model __init__ kwargs)
    """
    kwargs = dict(kwargs)

    for k in _HF_NON_MODEL_INIT_KWARGS:
        kwargs.pop(k, None)

    config_load_kwargs = {
        k: kwargs.pop(k) for k in list(kwargs) if k in _HF_CONFIG_LOAD_KWARGS
    }

    use_safetensors = kwargs.pop("use_safetensors", None)
    weights_only = kwargs.pop("weights_only", True)

    return config_load_kwargs, use_safetensors, weights_only, kwargs
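
# For example (illustrative, not part of the original module):
#     _split_pretrained_kwargs({"revision": "main", "num_labels": 3})
# returns ({"revision": "main"}, None, True, {"num_labels": 3}) -- the hub/config loading
# kwargs, the use_safetensors preference, the weights_only flag, and the remaining
# config-override / model __init__ kwargs.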

def _resolve_weights_file(
    pretrained_model_name_or_path,
    use_safetensors=None,
    **load_kwargs,
) -> str:
    """
    Resolve a single weight file path from either a local directory or the Hub.

    use_safetensors:
      - True  -> require model.safetensors
      - False -> require pytorch_model.bin
      - None  -> prefer safetensors, then fall back to bin
    """
    pretrained_model_name_or_path = os.fspath(pretrained_model_name_or_path)

    if use_safetensors is True:
        candidates = ("model.safetensors",)
    elif use_safetensors is False:
        candidates = ("pytorch_model.bin",)
    else:
        candidates = ("model.safetensors", "pytorch_model.bin")

    subfolder = load_kwargs.get("subfolder")

    if os.path.isdir(pretrained_model_name_or_path):
        base_dir = (
            os.path.join(pretrained_model_name_or_path, subfolder)
            if subfolder
            else pretrained_model_name_or_path
        )
        for name in candidates:
            path = os.path.join(base_dir, name)
            if os.path.exists(path):
                return path

    for name in candidates:
        try:
            path = cached_file(pretrained_model_name_or_path, name, **load_kwargs)
            if path is not None:
                return path
        except Exception:
            pass

    raise FileNotFoundError(
        f"No checkpoint file found in {pretrained_model_name_or_path!r} "
        f"(candidates: {', '.join(candidates)})"
    )


def _read_state_dict(weights_path, weights_only: bool = True) -> dict[str, torch.Tensor]:
    weights_path = os.fspath(weights_path)

    if weights_path.endswith(".safetensors"):
        from safetensors.torch import load_file as safe_load_file
        return safe_load_file(weights_path, device="cpu")

    try:
        return torch.load(weights_path, map_location="cpu", weights_only=weights_only)
    except TypeError:
        # Older torch versions do not support weights_only
        return torch.load(weights_path, map_location="cpu")

def _autocast_disabled(device_type: str):
    try:
        return torch.amp.autocast(device_type=device_type, enabled=False)
    except (AttributeError, TypeError):
        # older torch fallback
        if device_type == "cuda":
            return torch.cuda.amp.autocast(enabled=False)
        if device_type == "cpu" and hasattr(torch, "cpu") and hasattr(torch.cpu, "amp"):
            return torch.cpu.amp.autocast(enabled=False)
        return nullcontext()
    

class _SafeFromPretrainedMixin:
    """
    Simplified custom-model loader that preserves the useful HF behavior:

      - if config is None or a path/string:
          kwargs matching config fields update the config via
          config_class.from_pretrained(..., return_unused_kwargs=True)

      - remaining kwargs are passed to model __init__

      - supports:
          output_loading_info
          state_dict
          ignore_mismatched_sizes
          use_safetensors
          weights_only

    This is still intentionally much simpler than the full HF loader:
      - no sharded checkpoints
      - no device_map / offload / low_cpu_mem_usage
      - no quantized loaders
      - no tensor parallel / dispatch logic
    """

    @classmethod
    def _adapt_state_dict(cls, state_dict):
        """
        Hook for subclasses that need to rewrite checkpoint keys before loading.
        Example: stripping a leading 'bert.' prefix for base-model-only loads.
        """
        return state_dict
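
    # Example override (illustrative sketch, not part of the original class): a
    # base-model-only subclass could strip the task-model prefix with
    #
    #     @classmethod
    #     def _adapt_state_dict(cls, state_dict):
    #         return _extract_base_model_state_dict(state_dict, base_prefix="bert")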

    @staticmethod
    def _filter_keys_with_patterns(keys, patterns):
        if not patterns:
            return list(keys)

        import re

        compiled = [re.compile(p) if isinstance(p, str) else p for p in patterns]
        return [k for k in keys if not any(p.search(k) for p in compiled)]

    @classmethod
    def _resolve_config_and_init_kwargs(
        cls,
        pretrained_model_name_or_path,
        config,
        config_load_kwargs,
        other_kwargs,
    ):
        """
        Mirror HF behavior:
          - config instance: use it directly, pass remaining kwargs to __init__
          - config path / no config: load config and split overrides via return_unused_kwargs=True
        """
        if isinstance(config, PretrainedConfig):
            return config, other_kwargs

        if config is None:
            config_source = pretrained_model_name_or_path
        elif isinstance(config, (str, os.PathLike)):
            config_source = config
        else:
            raise TypeError(
                "`config` must be None, a path-like object, or an instance of PretrainedConfig"
            )

        if config_source is None:
            raise ValueError(
                "You must provide either `pretrained_model_name_or_path` or `config` "
                "to load a configuration."
            )

        config, init_kwargs = cls.config_class.from_pretrained(
            config_source,
            return_unused_kwargs=True,
            **config_load_kwargs,
            **other_kwargs,
        )
        return config, init_kwargs

    @staticmethod
    def _remove_mismatched_keys(model, state_dict):
        """
        Remove keys whose tensor shapes do not match the current model.
        Returns:
          filtered_state_dict, mismatched_keys
        where mismatched_keys is a list of:
          (key, checkpoint_shape, model_shape)
        """
        state_dict = dict(state_dict)
        model_state = model.state_dict()
        mismatched_keys = []

        for key in list(state_dict.keys()):
            if key not in model_state:
                continue

            loaded_value = state_dict[key]
            model_value = model_state[key]

            if not isinstance(loaded_value, torch.Tensor):
                continue
            if not isinstance(model_value, torch.Tensor):
                continue

            if tuple(loaded_value.shape) != tuple(model_value.shape):
                mismatched_keys.append(
                    (key, tuple(loaded_value.shape), tuple(model_value.shape))
                )
                state_dict.pop(key)

        return state_dict, mismatched_keys

    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
        output_loading_info = kwargs.pop("output_loading_info", False)
        state_dict = kwargs.pop("state_dict", None)
        config = kwargs.pop("config", None)
        ignore_mismatched_sizes = kwargs.pop("ignore_mismatched_sizes", False)
        strict = kwargs.pop("strict", False)

        config_load_kwargs, use_safetensors, weights_only, other_kwargs = _split_pretrained_kwargs(kwargs)

        # 1) Resolve config and route config overrides correctly
        config, init_kwargs = cls._resolve_config_and_init_kwargs(
            pretrained_model_name_or_path=pretrained_model_name_or_path,
            config=config,
            config_load_kwargs=config_load_kwargs,
            other_kwargs=other_kwargs,
        )

        # 2) Build model
        model = cls(config, *model_args, **init_kwargs)

        # 3) Read checkpoint if state_dict was not supplied explicitly
        if state_dict is None:
            if pretrained_model_name_or_path is None:
                raise ValueError(
                    "`pretrained_model_name_or_path` cannot be None when `state_dict` is not provided."
                )

            weights_path = _resolve_weights_file(
                pretrained_model_name_or_path,
                use_safetensors=use_safetensors,
                **config_load_kwargs,
            )
            state_dict = _read_state_dict(
                weights_path,
                weights_only=True if weights_only is None else bool(weights_only),
            )

        if not isinstance(state_dict, dict):
            raise TypeError(
                f"`state_dict` must be a dict-like mapping of parameter names to tensors, got {type(state_dict).__name__}"
            )

        # 4) Allow subclasses to rewrite checkpoint keys
        state_dict = cls._adapt_state_dict(dict(state_dict))

        # 5) Optionally drop shape-mismatched tensors
        mismatched_keys = []
        if ignore_mismatched_sizes:
            state_dict, mismatched_keys = cls._remove_mismatched_keys(model, state_dict)

        # 6) Load
        incompatible = model.load_state_dict(state_dict, strict=strict)

        # 7) Re-tie if the model defines tied weights
        if hasattr(model, "tie_weights"):
            model.tie_weights()

        if hasattr(model, "assert_mlm_head_is_valid"):
            model.assert_mlm_head_is_valid()

        model.eval()

        missing_keys = list(incompatible.missing_keys)
        unexpected_keys = list(incompatible.unexpected_keys)

        # Honor standard HF ignore patterns if the class defines them
        missing_keys = cls._filter_keys_with_patterns(
            missing_keys,
            getattr(model, "_keys_to_ignore_on_load_missing", None),
        )
        unexpected_keys = cls._filter_keys_with_patterns(
            unexpected_keys,
            getattr(model, "_keys_to_ignore_on_load_unexpected", None),
        )

        info = {
            "missing_keys": missing_keys,
            "unexpected_keys": unexpected_keys,
            "mismatched_keys": mismatched_keys,
            "error_msgs": [],
        }

        return (model, info) if output_loading_info else model
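
    # Illustrative usage sketch (not part of the original class). A model class that mixes in
    # _SafeFromPretrainedMixin can be loaded much like a regular HF model, e.g.:
    #
    #     model, info = SomeMegatronBertSubclass.from_pretrained(
    #         "path/or/repo-id",
    #         num_labels=3,                  # routed to the config via return_unused_kwargs
    #         ignore_mismatched_sizes=True,  # drop shape-mismatched tensors before loading
    #         output_loading_info=True,
    #     )
    #
    # `SomeMegatronBertSubclass` is a placeholder for whichever class in this module
    # inherits from the mixin.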



class MegatronBertConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`MegatronBertModel`]. It is used to instantiate a
    MEGATRON_BERT model according to the specified arguments, defining the model architecture. Instantiating a
    configuration with the defaults will yield a similar configuration to that of the MEGATRON_BERT
    [nvidia/megatron-bert-uncased-345m](https://huggingface.co/nvidia/megatron-bert-uncased-345m) architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.


    Args:
        vocab_size (`int`, *optional*, defaults to 29056):
            Vocabulary size of the MEGATRON_BERT model. Defines the number of different tokens that can be represented
            by the `inputs_ids` passed when calling [`MegatronBertModel`].
        hidden_size (`int`, *optional*, defaults to 1024):
            Dimensionality of the encoder layers and the pooler layer.
        num_hidden_layers (`int`, *optional*, defaults to 24):
            Number of hidden layers in the Transformer encoder.
        num_attention_heads (`int`, *optional*, defaults to 16):
            Number of attention heads for each attention layer in the Transformer encoder.
        intermediate_size (`int`, *optional*, defaults to 4096):
            Dimensionality of the "intermediate" (often named feed-forward) layer in the Transformer encoder.
        hidden_act (`str` or `Callable`, *optional*, defaults to `"gelu"`):
            The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
            `"relu"`, `"silu"` and `"gelu_new"` are supported.
        hidden_dropout_prob (`float`, *optional*, defaults to 0.1):
            The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
        attention_probs_dropout_prob (`float`, *optional*, defaults to 0.1):
            The dropout ratio for the attention probabilities.
        max_position_embeddings (`int`, *optional*, defaults to 512):
            The maximum sequence length that this model might ever be used with. Typically set this to something large
            just in case (e.g., 512 or 1024 or 2048).
        type_vocab_size (`int`, *optional*, defaults to 2):
            The vocabulary size of the `token_type_ids` passed when calling [`MegatronBertModel`].
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        layer_norm_eps (`float`, *optional*, defaults to 1e-12):
            The epsilon used by the layer normalization layers.
        position_embedding_type (`str`, *optional*, defaults to `"absolute"`):
            Type of position embedding. Choose one of `"absolute"`, `"relative_key"`, `"relative_key_query"`. For
            positional embeddings use `"absolute"`. For more information on `"relative_key"`, please refer to
            [Self-Attention with Relative Position Representations (Shaw et al.)](https://huggingface.co/papers/1803.02155).
            For more information on `"relative_key_query"`, please refer to *Method 4* in [Improve Transformer Models
            with Better Relative Position Embeddings (Huang et al.)](https://huggingface.co/papers/2009.13658).
        is_decoder (`bool`, *optional*, defaults to `False`):
            Whether the model is used as a decoder or not. If `False`, the model is used as an encoder.
        use_cache (`bool`, *optional*, defaults to `True`):
            Whether or not the model should return the last key/values attentions (not used by all models). Only
            relevant if `config.is_decoder=True`.

    Examples:

    ```python
    >>> from transformers import MegatronBertConfig, MegatronBertModel

    >>> # Initializing a MEGATRON_BERT google-bert/bert-base-uncased style configuration
    >>> configuration = MegatronBertConfig()

    >>> # Initializing a model (with random weights) from the google-bert/bert-base-uncased style configuration
    >>> model = MegatronBertModel(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```"""

    model_type = "megatron-bert"

    def __init__(
        self,
        vocab_size=29056,
        hidden_size=1024,
        num_hidden_layers=24,
        num_attention_heads=16,
        intermediate_size=4096,
        hidden_act="gelu",
        hidden_dropout_prob=0.1,
        attention_probs_dropout_prob=0.1,
        max_position_embeddings=512,
        type_vocab_size=2,
        initializer_range=0.02,
        layer_norm_eps=1e-12,
        pad_token_id=0,
        position_embedding_type="absolute",
        use_cache=True,
        is_decoder=False,
        add_cross_attention=False,
        **kwargs,
    ):
        super().__init__(pad_token_id=pad_token_id, **kwargs)

        self.vocab_size = vocab_size
        self.hidden_size = hidden_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.hidden_act = hidden_act
        self.intermediate_size = intermediate_size
        self.hidden_dropout_prob = hidden_dropout_prob
        self.attention_probs_dropout_prob = attention_probs_dropout_prob
        self.max_position_embeddings = max_position_embeddings
        self.type_vocab_size = type_vocab_size
        self.initializer_range = initializer_range
        self.layer_norm_eps = layer_norm_eps
        self.position_embedding_type = position_embedding_type
        self.use_cache = use_cache
        self.is_decoder = is_decoder
        self.add_cross_attention = add_cross_attention



class MegatronBertEmbeddings(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.word_embeddings = nn.Embedding(
            config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id
        )
        self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size)
        self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.position_embedding_type = getattr(config, "position_embedding_type", "absolute")

    @staticmethod
    def _make_position_ids(seq_length: int, device: torch.device, past_key_values_length: int = 0):
        return torch.arange(
            past_key_values_length,
            past_key_values_length + seq_length,
            dtype=torch.long,
            device=device,
        ).unsqueeze(0)

    def forward(
        self,
        input_ids=None,
        token_type_ids=None,
        position_ids=None,
        inputs_embeds=None,
        past_key_values_length: int = 0,
    ):
        if input_ids is not None:
            input_shape = input_ids.size()
            device = input_ids.device
        else:
            input_shape = inputs_embeds.size()[:-1]
            device = inputs_embeds.device

        seq_length = input_shape[1]

        if position_ids is None and self.position_embedding_type == "absolute":
            position_ids = self._make_position_ids(
                seq_length, device, past_key_values_length
            )

        if token_type_ids is None:
            token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device)

        if inputs_embeds is None:
            inputs_embeds = self.word_embeddings(input_ids)

        embeddings = inputs_embeds + self.token_type_embeddings(token_type_ids)

        if self.position_embedding_type == "absolute":
            embeddings = embeddings + self.position_embeddings(position_ids)

        return self.dropout(embeddings)
    
# Copied from transformers.models.bert.modeling_bert.BertSelfAttention with Bert->MegatronBert

class MegatronBertSelfAttention(nn.Module):
    def __init__(self, config, position_embedding_type=None, layer_idx=None):
        super().__init__()
        if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"):
            raise ValueError(
                f"The hidden size ({config.hidden_size}) is not a multiple of the number of attention "
                f"heads ({config.num_attention_heads})"
            )

        self.num_attention_heads = config.num_attention_heads
        self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
        self.all_head_size = self.num_attention_heads * self.attention_head_size

        self.query = nn.Linear(config.hidden_size, self.all_head_size)
        self.key = nn.Linear(config.hidden_size, self.all_head_size)
        self.value = nn.Linear(config.hidden_size, self.all_head_size)

        self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
        self.position_embedding_type = position_embedding_type or getattr(
            config, "position_embedding_type", "absolute"
        )
        if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query":
            self.max_position_embeddings = config.max_position_embeddings
            self.distance_embedding = nn.Embedding(2 * config.max_position_embeddings - 1, self.attention_head_size)

        self.is_decoder = config.is_decoder
        self.layer_idx = layer_idx

    @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.FloatTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        encoder_hidden_states: Optional[torch.FloatTensor] = None,
        past_key_values: Optional[Cache] = None,
        output_attentions: Optional[bool] = False,
        cache_position: Optional[torch.Tensor] = None,
    ) -> tuple[torch.Tensor]:
        batch_size, seq_length, _ = hidden_states.shape
        query_layer = self.query(hidden_states)
        query_layer = query_layer.view(batch_size, -1, self.num_attention_heads, self.attention_head_size).transpose(
            1, 2
        )

        is_updated = False
        is_cross_attention = encoder_hidden_states is not None
        if past_key_values is not None:
            if isinstance(past_key_values, EncoderDecoderCache):
                is_updated = past_key_values.is_updated.get(self.layer_idx)
                if is_cross_attention:
                    # after the first generated id, we can subsequently re-use all key/value_layer from cache
                    curr_past_key_value = past_key_values.cross_attention_cache
                else:
                    curr_past_key_value = past_key_values.self_attention_cache
            else:
                curr_past_key_value = past_key_values

        current_states = encoder_hidden_states if is_cross_attention else hidden_states
        if is_cross_attention and past_key_values is not None and is_updated:
            # reuse k,v, cross_attentions
            key_layer = curr_past_key_value.layers[self.layer_idx].keys
            value_layer = curr_past_key_value.layers[self.layer_idx].values
        else:
            key_layer = self.key(current_states)
            key_layer = key_layer.view(batch_size, -1, self.num_attention_heads, self.attention_head_size).transpose(
                1, 2
            )
            value_layer = self.value(current_states)
            value_layer = value_layer.view(
                batch_size, -1, self.num_attention_heads, self.attention_head_size
            ).transpose(1, 2)

            if past_key_values is not None:
                # save all key/value_layer to cache to be re-used for fast auto-regressive generation
                cache_position = cache_position if not is_cross_attention else None
                key_layer, value_layer = curr_past_key_value.update(
                    key_layer, value_layer, self.layer_idx, {"cache_position": cache_position}
                )
                # set flag that curr layer for cross-attn is already updated so we can re-use in subsequent calls
                if is_cross_attention and isinstance(past_key_values, EncoderDecoderCache):
                    past_key_values.is_updated[self.layer_idx] = True

        # Take the dot product between "query" and "key" to get the raw attention scores.
        attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))

        if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query":
            query_length, key_length = query_layer.shape[2], key_layer.shape[2]
            if past_key_values is not None:
                position_ids_l = torch.tensor(key_length - 1, dtype=torch.long, device=hidden_states.device).view(
                    -1, 1
                )
            else:
                position_ids_l = torch.arange(query_length, dtype=torch.long, device=hidden_states.device).view(-1, 1)
            position_ids_r = torch.arange(key_length, dtype=torch.long, device=hidden_states.device).view(1, -1)
            distance = position_ids_l - position_ids_r

            positional_embedding = self.distance_embedding(distance + self.max_position_embeddings - 1)
            positional_embedding = positional_embedding.to(dtype=query_layer.dtype)  # fp16 compatibility

            if self.position_embedding_type == "relative_key":
                relative_position_scores = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding)
                attention_scores = attention_scores + relative_position_scores
            elif self.position_embedding_type == "relative_key_query":
                relative_position_scores_query = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding)
                relative_position_scores_key = torch.einsum("bhrd,lrd->bhlr", key_layer, positional_embedding)
                attention_scores = attention_scores + relative_position_scores_query + relative_position_scores_key

        attention_scores = attention_scores / math.sqrt(self.attention_head_size)
        if attention_mask is not None:
            # Apply the attention mask (precomputed for all layers in the MegatronBertModel forward() function)
            attention_scores = attention_scores + attention_mask

        # Normalize the attention scores to probabilities.
        attention_probs = nn.functional.softmax(attention_scores, dim=-1)

        # This is actually dropping out entire tokens to attend to, which might
        # seem a bit unusual, but is taken from the original Transformer paper.
        attention_probs = self.dropout(attention_probs)

        # Mask heads if we want to
        if head_mask is not None:
            attention_probs = attention_probs * head_mask

        context_layer = torch.matmul(attention_probs, value_layer)

        context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
        new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
        context_layer = context_layer.view(new_context_layer_shape)

        return context_layer, attention_probs
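
# Recap (added for clarity; standard scaled dot-product attention as implemented above), with
# Q, K, V shaped (batch, heads, seq, head_dim):
#     scores = (Q @ K^T + optional relative-position terms) / sqrt(head_dim) + attention_mask
#     probs  = dropout(softmax(scores, dim=-1))
#     out    = probs @ V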


# Based on transformers.models.bert.modeling_bert.BertSelfOutput. Moved LayerNorm to MegatronBertAttention below.

class MegatronBertSelfOutput(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, hidden_states: torch.Tensor, residual: torch.Tensor) -> torch.Tensor:
        hidden_states = self.dense(hidden_states)
        hidden_states = self.dropout(hidden_states)
        return residual + hidden_states


# Based on transformers.models.bert.modeling_bert.BertAttention. Added LayerNorm.

class MegatronBertAttention(nn.Module):
    def __init__(self, config, layer_idx=None):
        super().__init__()
        self.ln = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.self = MegatronBertSelfAttention(config, layer_idx=layer_idx)
        self.output = MegatronBertSelfOutput(config)
        self.pruned_heads = set()

    def prune_heads(self, heads):
        if len(heads) == 0:
            return
        heads, index = find_pruneable_heads_and_indices(
            heads, self.self.num_attention_heads, self.self.attention_head_size, self.pruned_heads
        )

        # Prune linear layers
        self.self.query = prune_linear_layer(self.self.query, index)
        self.self.key = prune_linear_layer(self.self.key, index)
        self.self.value = prune_linear_layer(self.self.value, index)
        self.output.dense = prune_linear_layer(self.output.dense, index, dim=1)

        # Update hyper params and store pruned heads
        self.self.num_attention_heads = self.self.num_attention_heads - len(heads)
        self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads
        self.pruned_heads = self.pruned_heads.union(heads)

    @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.FloatTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        encoder_hidden_states: Optional[torch.FloatTensor] = None,
        past_key_values: Optional[Cache] = None,
        output_attentions: Optional[bool] = False,
        cache_position: Optional[torch.Tensor] = None,
    ) -> tuple[torch.Tensor]:
        ln_outputs = self.ln(hidden_states)
        self_outputs = self.self(
            ln_outputs,
            attention_mask=attention_mask,
            head_mask=head_mask,
            encoder_hidden_states=encoder_hidden_states,
            past_key_values=past_key_values,
            output_attentions=output_attentions,
            cache_position=cache_position,
        )
        attention_output = self.output(self_outputs[0], hidden_states)
        outputs = (attention_output,) + self_outputs[1:]  # add attentions if we output them
        return outputs


# Copied from transformers.models.bert.modeling_bert.BertIntermediate with Bert->MegatronBert

class MegatronBertIntermediate(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.intermediate_size)
        if isinstance(config.hidden_act, str):
            self.intermediate_act_fn = ACT2FN[config.hidden_act]
        else:
            self.intermediate_act_fn = config.hidden_act

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        hidden_states = self.dense(hidden_states)
        hidden_states = self.intermediate_act_fn(hidden_states)
        return hidden_states


# Based on transformers.models.bert.modeling_bert.BertOutput. Moved LayerNorm to MegatronBertLayer below.
class MegatronBertOutput(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.intermediate_size, config.hidden_size)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
        hidden_states = self.dense(hidden_states)
        hidden_states = self.dropout(hidden_states)
        return input_tensor + hidden_states


# Based on transformers.models.bert.modeling_bert.BertLayer. Added LayerNorm.
class MegatronBertLayer(GradientCheckpointingLayer):
    def __init__(self, config, layer_idx=None):
        super().__init__()
        self.chunk_size_feed_forward = config.chunk_size_feed_forward
        self.seq_len_dim = 1
        self.attention = MegatronBertAttention(config, layer_idx=layer_idx)
        self.is_decoder = config.is_decoder
        self.add_cross_attention = config.add_cross_attention
        if self.add_cross_attention:
            if not self.is_decoder:
                raise TypeError(f"{self} should be used as a decoder model if cross attention is added")
            self.crossattention = MegatronBertAttention(config, layer_idx=layer_idx)
        self.ln = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.intermediate = MegatronBertIntermediate(config)
        self.output = MegatronBertOutput(config)

    @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.FloatTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        encoder_hidden_states: Optional[torch.FloatTensor] = None,
        encoder_attention_mask: Optional[torch.FloatTensor] = None,
        past_key_values: Optional[Cache] = None,
        output_attentions: Optional[bool] = False,
        cache_position: Optional[torch.Tensor] = None,
    ) -> tuple[torch.Tensor]:
        # Self-attention (uni-directional when the model is used as a decoder); cached
        # key/value states are handled through the `past_key_values` Cache object.
        self_attention_outputs = self.attention(
            hidden_states,
            attention_mask=attention_mask,
            head_mask=head_mask,
            output_attentions=output_attentions,
            past_key_values=past_key_values,
            cache_position=cache_position,
        )
        attention_output = self_attention_outputs[0]
        outputs = self_attention_outputs[1:]  # add self attentions if we output attention weights

        if self.is_decoder and encoder_hidden_states is not None:
            if not hasattr(self, "crossattention"):
                raise AttributeError(
                    f"If `encoder_hidden_states` are passed, {self} has to be instantiated with cross-attention layers"
                    " by setting `config.add_cross_attention=True`"
                )

            cross_attention_outputs = self.crossattention(
                attention_output,
                attention_mask=encoder_attention_mask,
                head_mask=head_mask,
                encoder_hidden_states=encoder_hidden_states,
                past_key_values=past_key_values,
                output_attentions=output_attentions,
                cache_position=cache_position,
            )
            attention_output = cross_attention_outputs[0]
            outputs = outputs + cross_attention_outputs[1:]  # add cross attentions if we output attention weights

        layer_output = apply_chunking_to_forward(
            self.feed_forward_chunk, self.chunk_size_feed_forward, self.seq_len_dim, attention_output
        )
        return (layer_output,) + outputs

    def feed_forward_chunk(self, attention_output):
        ln_output = self.ln(attention_output)
        intermediate_output = self.intermediate(ln_output)
        layer_output = self.output(intermediate_output, attention_output)
        return layer_output
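    # Note on chunking: `apply_chunking_to_forward` splits its inputs along `seq_len_dim=1`
    # into chunks of `chunk_size_feed_forward` tokens, applies `feed_forward_chunk` to each
    # chunk, and concatenates the results. With the default chunk size of 0 the whole sequence
    # is processed at once, so chunking trades memory for compute without changing the output.
    # Illustrative sketch (not executed):
    #
    #     full = layer.feed_forward_chunk(attention_output)
    #     chunked = apply_chunking_to_forward(layer.feed_forward_chunk, 128, 1, attention_output)
    #     torch.testing.assert_close(full, chunked)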

class MegatronBertEncoder(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.config = config
        self.layer = nn.ModuleList([MegatronBertLayer(config, layer_idx=i) for i in range(config.num_hidden_layers)])

        # The final layer norm. The first LN was removed, an LN was moved into each hidden
        # layer, and this one is simply the final LN (Transformers' BERT attaches it to each
        # hidden layer instead).
        self.ln = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.gradient_checkpointing = False

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.FloatTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        encoder_hidden_states: Optional[torch.FloatTensor] = None,
        encoder_attention_mask: Optional[torch.FloatTensor] = None,
        past_key_values: Optional[Cache] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = False,
        output_hidden_states: Optional[bool] = False,
        return_dict: Optional[bool] = True,
        cache_position: Optional[torch.Tensor] = None,
    ) -> Union[tuple, BaseModelOutputWithPastAndCrossAttentions]:
        if self.gradient_checkpointing and self.training:
            if use_cache:
                logger.warning_once(
                    "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
                )
                use_cache = False
        if use_cache and past_key_values is None:
            past_key_values = EncoderDecoderCache(DynamicCache(config=self.config), DynamicCache(config=self.config))
        if use_cache and isinstance(past_key_values, tuple):
            logger.warning_once(
                "Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.58.0. "
                "You should pass an instance of `EncoderDecoderCache` instead, e.g. "
                "`past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`."
            )
            past_key_values = EncoderDecoderCache.from_legacy_cache(past_key_values)

        all_hidden_states = () if output_hidden_states else None
        all_self_attentions = () if output_attentions else None
        all_cross_attentions = () if output_attentions and self.config.add_cross_attention else None

        for i, layer_module in enumerate(self.layer):
            if output_hidden_states:
                all_hidden_states = all_hidden_states + (hidden_states,)

            layer_head_mask = head_mask[i] if head_mask is not None else None

            layer_outputs = layer_module(
                hidden_states,
                attention_mask,
                layer_head_mask,
                encoder_hidden_states,
                encoder_attention_mask,
                past_key_values,
                output_attentions,
                cache_position,
            )

            # Because the layer norm was moved to the end of each hidden layer, the data here
            # is not yet normalized. If normalized hidden states are needed, an LN must be
            # applied to match Transformers' BERT.

            hidden_states = layer_outputs[0]
            if output_attentions:
                all_self_attentions = all_self_attentions + (layer_outputs[1],)
                if self.config.add_cross_attention:
                    all_cross_attentions = all_cross_attentions + (layer_outputs[2],)

        # Finalize the hidden states.
        hidden_states = self.ln(hidden_states)

        if output_hidden_states:
            all_hidden_states = all_hidden_states + (hidden_states,)

        if not return_dict:
            return tuple(
                v
                for v in [
                    hidden_states,
                    past_key_values,
                    all_hidden_states,
                    all_self_attentions,
                    all_cross_attentions,
                ]
                if v is not None
            )
        return BaseModelOutputWithPastAndCrossAttentions(
            last_hidden_state=hidden_states,
            past_key_values=past_key_values,
            hidden_states=all_hidden_states,
            attentions=all_self_attentions,
            cross_attentions=all_cross_attentions,
        )


# Copied from transformers.models.bert.modeling_bert.BertPooler with Bert->MegatronBert
class MegatronBertPooler(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        self.activation = nn.Tanh()

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        # We "pool" the model by simply taking the hidden state corresponding
        # to the first token.
        first_token_tensor = hidden_states[:, 0]
        pooled_output = self.dense(first_token_tensor)
        pooled_output = self.activation(pooled_output)
        return pooled_output


# Copied from transformers.models.bert.modeling_bert.BertPredictionHeadTransform with Bert->MegatronBert
class MegatronBertPredictionHeadTransform(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        if isinstance(config.hidden_act, str):
            self.transform_act_fn = ACT2FN[config.hidden_act]
        else:
            self.transform_act_fn = config.hidden_act
        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        hidden_states = self.dense(hidden_states)
        hidden_states = self.transform_act_fn(hidden_states)
        hidden_states = self.LayerNorm(hidden_states)
        return hidden_states


# Copied from transformers.models.bert.modeling_bert.BertLMPredictionHead with Bert->MegatronBert
class MegatronBertLMPredictionHead(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.transform = MegatronBertPredictionHeadTransform(config)

        # The output weights are the same as the input embeddings, but there is
        # an output-only bias for each token.
        self.decoder = nn.Linear(config.hidden_size, config.vocab_size, bias=False)

        self.bias = nn.Parameter(torch.zeros(config.vocab_size))

        # Need a link between the two variables so that the bias is correctly resized with `resize_token_embeddings`
        self.decoder.bias = self.bias

    def _tie_weights(self):
        self.decoder.bias = self.bias

    def forward(self, hidden_states):
        hidden_states = self.transform(hidden_states)
        hidden_states = self.decoder(hidden_states)
        return hidden_states


# Copied from transformers.models.bert.modeling_bert.BertOnlyMLMHead with Bert->MegatronBert
class MegatronBertOnlyMLMHead(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.predictions = MegatronBertLMPredictionHead(config)

    def forward(self, sequence_output: torch.Tensor) -> torch.Tensor:
        prediction_scores = self.predictions(sequence_output)
        return prediction_scores

#@auto_docstring
class MegatronBertPreTrainedModel(PreTrainedModel):
    config_class = MegatronBertConfig
    load_tf_weights = load_tf_weights_in_megatron_bert
    base_model_prefix = "bert"
    supports_gradient_checkpointing = True

    def _init_weights(self, module):
        """Initialize the weights"""
        if isinstance(module, (nn.Linear, nn.Embedding)):
            # Slightly different from the TF version which uses truncated_normal for initialization
            # cf https://github.com/pytorch/pytorch/pull/5617
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if hasattr(module, "bias") and module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)
        elif isinstance(module, MegatronBertLMPredictionHead):
            module.bias.data.zero_()

#@auto_docstring

class MegatronBertModel(MegatronBertPreTrainedModel):
    _no_split_modules = ["MegatronBertEmbeddings", "MegatronBertLayer"]

    def __init__(self, config, add_pooling_layer=False):
        super().__init__(config)
        self.config = config
        self.gradient_checkpointing = False
        self.embeddings = MegatronBertEmbeddings(config)
        self.encoder = MegatronBertEncoder(config)
        self.pooler = MegatronBertPooler(config) if add_pooling_layer else None
        self.post_init()

    def get_input_embeddings(self):
        return self.embeddings.word_embeddings

    def set_input_embeddings(self, value):
        self.embeddings.word_embeddings = value

    def _prune_heads(self, heads_to_prune):
        """
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
        class PreTrainedModel
        """
        for layer, heads in heads_to_prune.items():
            self.encoder.layer[layer].attention.prune_heads(heads)

    #@auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        token_type_ids: Optional[torch.LongTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        encoder_hidden_states: Optional[torch.FloatTensor] = None,
        encoder_attention_mask: Optional[torch.FloatTensor] = None,
        past_key_values: Optional[Cache] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        cache_position: Optional[torch.Tensor] = None,
    ) -> Union[tuple, BaseModelOutputWithPoolingAndCrossAttentions]:
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.return_dict

        if self.config.is_decoder:
            use_cache = use_cache if use_cache is not None else self.config.use_cache
        else:
            use_cache = False

        if input_ids is not None and inputs_embeds is not None:
            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
        elif input_ids is not None:
            self.warn_if_padding_and_no_attention_mask(input_ids, attention_mask)
            input_shape = input_ids.size()
        elif inputs_embeds is not None:
            input_shape = inputs_embeds.size()[:-1]
        else:
            raise ValueError("You have to specify either input_ids or inputs_embeds")

        batch_size, seq_length = input_shape
        device = input_ids.device if input_ids is not None else inputs_embeds.device

        past_key_values_length = 0
        if past_key_values is not None:
            past_key_values_length = (
                past_key_values[0][0].shape[-2]
                if not isinstance(past_key_values, Cache)
                else past_key_values.get_seq_length()
            )

        if attention_mask is None:
            attention_mask = torch.ones((batch_size, seq_length + past_key_values_length), device=device)
        if token_type_ids is None:
            token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device)

        # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length]
        # ourselves in which case we just need to make it broadcastable to all heads.
        extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(attention_mask, input_shape)

        # If a 2D or 3D attention mask is provided for the cross-attention,
        # we need to make it broadcastable to [batch_size, num_heads, seq_length, seq_length]
        if self.config.is_decoder and encoder_hidden_states is not None:
            encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.size()
            encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length)
            if encoder_attention_mask is None:
                encoder_attention_mask = torch.ones(encoder_hidden_shape, device=device)
            encoder_extended_attention_mask = self.invert_attention_mask(encoder_attention_mask)
        else:
            encoder_extended_attention_mask = None

        # Head-mask preparation is currently disabled: `head_mask` is accepted for API
        # compatibility but is not forwarded to the encoder below. If re-enabled, a mask of
        # shape [num_heads] or [num_hidden_layers x num_heads] (1.0 = keep the head) would be
        # expanded to [num_hidden_layers x batch x num_heads x seq_length x seq_length]:
        # head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers)

        embedding_output = self.embeddings(
            input_ids=input_ids,
            position_ids=position_ids,
            token_type_ids=token_type_ids,
            inputs_embeds=inputs_embeds,
            past_key_values_length=past_key_values_length,
        )
        encoder_outputs = self.encoder(
            embedding_output,
            attention_mask=extended_attention_mask,
            # head_mask=head_mask,
            encoder_hidden_states=encoder_hidden_states,
            encoder_attention_mask=encoder_extended_attention_mask,
            past_key_values=past_key_values,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            cache_position=cache_position,
        )
        sequence_output = encoder_outputs[0]
        pooled_output = self.pooler(sequence_output) if self.pooler is not None else None

        if not return_dict:
            return (sequence_output, pooled_output) + encoder_outputs[1:]

        return BaseModelOutputWithPoolingAndCrossAttentions(
            last_hidden_state=sequence_output,
            pooler_output=pooled_output,
            past_key_values=encoder_outputs.past_key_values,
            hidden_states=encoder_outputs.hidden_states,
            attentions=encoder_outputs.attentions,
            cross_attentions=encoder_outputs.cross_attentions,
        )
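    # Illustrative sketch (not executed) of a bare encoder pass with a freshly initialized,
    # hypothetical small config; real ProkBERT checkpoints are loaded via the ProkBert*
    # classes further below:
    #
    #     config = MegatronBertConfig(vocab_size=4200, hidden_size=384, num_hidden_layers=6,
    #                                 num_attention_heads=6, intermediate_size=1536)
    #     model = MegatronBertModel(config, add_pooling_layer=False)
    #     input_ids = torch.randint(0, config.vocab_size, (2, 128))
    #     out = model(input_ids=input_ids, attention_mask=torch.ones_like(input_ids))
    #     out.last_hidden_state.shape  # torch.Size([2, 128, 384])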


@auto_docstring(
    custom_intro="""
    MegatronBert Model with a `masked language modeling` head on top.
    """
)
class MegatronBertForMaskedLM(MegatronBertPreTrainedModel, GenerationMixin):
    _tied_weights_keys = {
        "cls.predictions.decoder.weight": "bert.embeddings.word_embeddings.weight",
        "cls.predictions.decoder.bias": "cls.predictions.bias",
    }

    def __init__(self, config):
        super().__init__(config)

        if config.is_decoder:
            logger.warning(
                "If you want to use `MegatronBertForMaskedLM` make sure "
                "`config.is_decoder=False` for bi-directional self-attention."
            )

        self.bert = MegatronBertModel(config, add_pooling_layer=False)
        self.cls = MegatronBertOnlyMLMHead(config)

        # Initialize weights and apply final processing
        self.post_init()
        self._force_tie_mlm_head()

    def get_input_embeddings(self):
        return self.bert.get_input_embeddings()

    def set_input_embeddings(self, value):
        self.bert.set_input_embeddings(value)
        self._force_tie_mlm_head()

    def _force_tie_mlm_head(self):
        self.cls.predictions.decoder.weight = self.bert.embeddings.word_embeddings.weight
        self.cls.predictions._tie_weights()

    def tie_weights(self, missing_keys: Optional[set[str]] = None, recompute_mapping: bool = True, **kwargs):
        # Transformers v5 calls tie_weights(recompute_mapping=False) during post_init/init_weights.
        # Keep the signature compatible with both v4 and v5, but force the exact tying behavior we need.
        self._force_tie_mlm_head()

    def assert_mlm_head_is_valid(self):
        in_w = self.bert.embeddings.word_embeddings.weight
        out_w = self.cls.predictions.decoder.weight
        out_b = self.cls.predictions.decoder.bias
        ref_b = self.cls.predictions.bias

        if in_w.data_ptr() != out_w.data_ptr():
            raise RuntimeError("MLM decoder.weight is not tied to input embeddings.")
        if out_b is None or out_b.data_ptr() != ref_b.data_ptr():
            raise RuntimeError("MLM decoder.bias is not tied to cls.predictions.bias.")

    def get_output_embeddings(self):
        return self.cls.predictions.decoder

    def set_output_embeddings(self, new_embeddings):
        self.cls.predictions.decoder = new_embeddings
        self.cls.predictions.bias = new_embeddings.bias

    #@auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        token_type_ids: Optional[torch.LongTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        encoder_hidden_states: Optional[torch.FloatTensor] = None,
        encoder_attention_mask: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[tuple, MaskedLMOutput]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
            config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
            loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
        """

        return_dict = return_dict if return_dict is not None else self.config.return_dict

        outputs = self.bert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            encoder_hidden_states=encoder_hidden_states,
            encoder_attention_mask=encoder_attention_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        sequence_output = outputs[0]
        prediction_scores = self.cls(sequence_output)

        masked_lm_loss = None
        if labels is not None:
            loss_fct = CrossEntropyLoss()  # -100 index = padding token
            masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1))

        if not return_dict:
            output = (prediction_scores,) + outputs[2:]
            return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output

        return MaskedLMOutput(
            loss=masked_lm_loss,
            logits=prediction_scores,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )

    def prepare_inputs_for_generation(self, input_ids, attention_mask=None, **model_kwargs):
        input_shape = input_ids.shape
        effective_batch_size = input_shape[0]

        #  add a dummy token
        if self.config.pad_token_id is None:
            raise ValueError("The PAD token should be defined for generation")
        attention_mask = torch.cat([attention_mask, attention_mask.new_zeros((attention_mask.shape[0], 1))], dim=-1)
        dummy_token = torch.full(
            (effective_batch_size, 1), self.config.pad_token_id, dtype=torch.long, device=input_ids.device
        )
        input_ids = torch.cat([input_ids, dummy_token], dim=1)

        return {"input_ids": input_ids, "attention_mask": attention_mask}


# Code below is adapted from the previous ProkBERT implementation.

class ProkBertConfig(MegatronBertConfig):
    model_type = "prokbert"

    attribute_map = {
        "num_class_labels": "num_labels",
        "curricular_num_labels": "num_labels",
        "classification_dropout_rate": "classifier_dropout",
        "curriculum_hidden_size": "curricular_embedding_size",
        "curricular_face_m": "curricular_margin",
        "curricular_face_s": "curricular_scale",
    }

    def __init__(
        self,
        kmer: int = 6,
        shift: int = 1,
        num_labels: int = 2,
        problem_type: str | None = None,
        classifier_dropout: float | None = None,
        classifier_pooling: str = "attention",
        classifier_mlp_hidden_size: int | None = None,
        classifier_head_type: str = "linear",
        curricular_margin: float = 0.5,
        curricular_scale: float = 64.0,
        curricular_embedding_size: int | None = None,
        **kwargs,
    ):
        legacy_num_class_labels = kwargs.pop("num_class_labels", None)
        legacy_curricular_num_labels = kwargs.pop("curricular_num_labels", None)
        legacy_dropout = kwargs.pop("classification_dropout_rate", None)
        legacy_proj = kwargs.pop("curriculum_hidden_size", None)
        legacy_margin = kwargs.pop("curricular_face_m", None)
        legacy_scale = kwargs.pop("curricular_face_s", None)
        kwargs.pop("bert_base_model", None)

        if legacy_num_class_labels is not None:
            num_labels = legacy_num_class_labels
        if legacy_curricular_num_labels is not None:
            num_labels = legacy_curricular_num_labels            
        loaded_id2label = kwargs.get("id2label")
        if loaded_id2label is not None:
            num_labels = len(loaded_id2label)                        
        if classifier_dropout is None and legacy_dropout is not None:
            classifier_dropout = legacy_dropout
        if curricular_embedding_size is None and legacy_proj not in (None, -1):
            curricular_embedding_size = legacy_proj
        if legacy_margin is not None:
            curricular_margin = legacy_margin
        if legacy_scale is not None:
            curricular_scale = legacy_scale

        super().__init__(num_labels=num_labels, problem_type=problem_type, **kwargs)

        self.kmer = kmer
        self.shift = shift

        self.classifier_dropout = classifier_dropout
        self.classifier_pooling = classifier_pooling
        self.classifier_mlp_hidden_size = classifier_mlp_hidden_size
        self.classifier_head_type = classifier_head_type

        self.curricular_margin = curricular_margin
        self.curricular_scale = curricular_scale
        self.curricular_embedding_size = curricular_embedding_size

        if self.classifier_pooling not in {"cls", "mean", "attention"}:
            raise ValueError(f"Unsupported classifier_pooling={self.classifier_pooling}")
        if self.classifier_head_type not in {"linear", "mlp", "curricular"}:
            raise ValueError(f"Unsupported classifier_head_type={self.classifier_head_type}")

        
class ProkBertPreTrainedModel(MegatronBertPreTrainedModel):
    """
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models.
    """

    config_class = ProkBertConfig
    base_model_prefix = "bert"
    supports_gradient_checkpointing = True

    def _init_weights(self, module):
        """Initialize the weights"""
        if isinstance(module, (nn.Linear, nn.Embedding)):
            # Slightly different from the TF version which uses truncated_normal for initialization
            # cf https://github.com/pytorch/pytorch/pull/5617
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)
        if isinstance(module, nn.Linear) and module.bias is not None:
            module.bias.data.zero_()


class ProkBertModel(_SafeFromPretrainedMixin, MegatronBertModel):
    config_class = ProkBertConfig

    def __init__(self, config: ProkBertConfig, **kwargs):
        if not isinstance(config, ProkBertConfig):
            raise ValueError(
                f"Expected `ProkBertConfig`, got {config.__class__.__module__}.{config.__class__.__name__}"
            )
        super().__init__(config, **kwargs)
        self.config = config


    @classmethod
    def _adapt_state_dict(cls, state_dict):
        return _extract_base_model_state_dict(state_dict, base_prefix="bert")

    @classmethod
    def test_from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
        config = kwargs.pop("config", None)
        add_pooling_layer = kwargs.pop("add_pooling_layer", False)

        # ignored here on purpose; this loader bypasses HF v5 from_pretrained internals
        kwargs.pop("output_loading_info", None)
        kwargs.pop("ignore_mismatched_sizes", None)
        kwargs.pop("state_dict", None)

        if config is None:
            config = cls.config_class.from_pretrained(pretrained_model_name_or_path, **kwargs)

        model = cls(config, add_pooling_layer=add_pooling_layer)

        weights_path = _resolve_weights_file(pretrained_model_name_or_path)
        raw_state_dict = _read_state_dict(weights_path)

        # ProkBERT checkpoint is MLM-style; encoder lives under `bert.`
        state_dict = _extract_base_model_state_dict(raw_state_dict, base_prefix="bert")

        missing, unexpected = model.load_state_dict(state_dict, strict=False)

        allowed_missing = set()
        if add_pooling_layer:
            allowed_missing.update({"pooler.dense.weight", "pooler.dense.bias"})

        bad_missing = [k for k in missing if k not in allowed_missing]

        if bad_missing or unexpected:
            raise RuntimeError(
                f"Checkpoint mismatch.\nMissing: {bad_missing}\nUnexpected: {unexpected}"
            )

        model.eval()
        return model
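    # Illustrative sketch (not executed), assuming `path/to/prokbert-checkpoint` contains a
    # config plus an MLM-style state dict; this loader deliberately bypasses the HF v5
    # `from_pretrained` internals:
    #
    #     model = ProkBertModel.test_from_pretrained("path/to/prokbert-checkpoint")
    #     model.config.kmer, model.config.shift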


class ProkBertForMaskedLM(_SafeFromPretrainedMixin, MegatronBertForMaskedLM):
    config_class = ProkBertConfig

    def __init__(self, config: ProkBertConfig, **kwargs):
        if not isinstance(config, ProkBertConfig):
            raise ValueError(
                f"Expected `ProkBertConfig`, got "
                f"{config.__class__.__module__}.{config.__class__.__name__}"
            )

        super().__init__(config, **kwargs)
        self.config = config
        # A config that is not a proper ProkBertConfig is rejected above; crafting one from a
        # plain base config could be supported instead.

    @classmethod
    def _adapt_state_dict(cls, state_dict):
        state_dict = dict(state_dict)

        emb_w = "bert.embeddings.word_embeddings.weight"
        dec_w = "cls.predictions.decoder.weight"
        mlm_b = "cls.predictions.bias"
        dec_b = "cls.predictions.decoder.bias"

        if dec_w not in state_dict and emb_w in state_dict:
            state_dict[dec_w] = state_dict[emb_w]
        if emb_w not in state_dict and dec_w in state_dict:
            state_dict[emb_w] = state_dict[dec_w]

        if dec_b not in state_dict and mlm_b in state_dict:
            state_dict[dec_b] = state_dict[mlm_b]
        if mlm_b not in state_dict and dec_b in state_dict:
            state_dict[mlm_b] = state_dict[dec_b]

        return state_dict
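    # Illustrative sketch (not executed): if a checkpoint stores only the embedding weight and
    # `cls.predictions.bias`, the decoder keys are mirrored from them so `load_state_dict`
    # sees a complete, tied MLM head (`w` and `b` are placeholder tensors):
    #
    #     sd = {"bert.embeddings.word_embeddings.weight": w, "cls.predictions.bias": b}
    #     sd = ProkBertForMaskedLM._adapt_state_dict(sd)
    #     assert sd["cls.predictions.decoder.weight"] is w
    #     assert sd["cls.predictions.decoder.bias"] is b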


class ProkBertForSequenceClassification(_SafeFromPretrainedMixin, ProkBertPreTrainedModel):
    """
    Default ProkBERT sequence classifier:
      - padding-safe masked attention pooling
      - neutral pooling init (uniform over non-masked tokens at step 0)
      - simple dropout + linear classifier head
    """
    config_class = ProkBertConfig
    base_model_prefix = "bert"

    def __init__(self, config: ProkBertConfig):
        super().__init__(config)
        self.config = config
        self.num_labels = int(config.num_labels)

        self.bert = ProkBertModel(config, add_pooling_layer=False)

        # Keep the old module name for checkpoint compatibility.
        self.weighting_layer = nn.Linear(self.config.hidden_size, 1)
        self.dropout = nn.Dropout(get_classifier_dropout(self.config))
        self.classifier = nn.Linear(self.config.hidden_size, self.num_labels)

        self.post_init()

        # Neutral pooling init: uniform over valid tokens at the beginning of training.
        with torch.no_grad():
            nn.init.zeros_(self.weighting_layer.weight)
            if self.weighting_layer.bias is not None:
                nn.init.zeros_(self.weighting_layer.bias)

    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        token_type_ids: Optional[torch.LongTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        labels: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, SequenceClassifierOutput]:
        return_dict = return_dict if return_dict is not None else self.config.return_dict

        outputs = self.bert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        sequence_output = outputs[0]  # (B, L, H)

        token_scores = self.weighting_layer(sequence_output)  # (B, L, 1)
        pooled_output = masked_attention_pool(
            sequence_output=sequence_output,
            token_scores=token_scores,
            attention_mask=attention_mask,
        )

        pooled_output = self.dropout(pooled_output)
        logits = self.classifier(pooled_output)

        loss = None
        if labels is not None:
            if self.config.problem_type is None:
                if self.num_labels == 1:
                    self.config.problem_type = "regression"
                elif labels.dtype in (
                    torch.int8,
                    torch.int16,
                    torch.int32,
                    torch.int64,
                    torch.long,
                    torch.uint8,
                ):
                    self.config.problem_type = "single_label_classification"
                else:
                    self.config.problem_type = "multi_label_classification"

            if self.config.problem_type == "regression":
                loss_fct = nn.MSELoss()
                if self.num_labels == 1:
                    loss = loss_fct(logits.squeeze(), labels.squeeze())
                else:
                    loss = loss_fct(logits, labels)
            elif self.config.problem_type == "single_label_classification":
                loss_fct = nn.CrossEntropyLoss()
                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
            elif self.config.problem_type == "multi_label_classification":
                loss_fct = nn.BCEWithLogitsLoss()
                loss = loss_fct(logits, labels.float())
            else:
                raise ValueError(f"Unsupported problem_type: {self.config.problem_type}")

        if not return_dict:
            output = (logits,) + outputs[2:]
            return ((loss,) + output) if loss is not None else output

        return SequenceClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=getattr(outputs, "hidden_states", None),
            attentions=getattr(outputs, "attentions", None),
        )
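    # Illustrative sketch (not executed), assuming tokenized k-mer batches `input_ids`,
    # `attention_mask` and integer `labels` of shapes (B, L), (B, L) and (B,):
    #
    #     model = ProkBertForSequenceClassification(ProkBertConfig(num_labels=2))
    #     out = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
    #     out.loss, out.logits.shape   # scalar cross-entropy, (B, 2)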
    
@dataclass
class CurricularSequenceClassifierOutput(ModelOutput):
    loss: Optional[torch.FloatTensor] = None
    logits: Optional[torch.FloatTensor] = None
    embeddings: Optional[torch.FloatTensor] = None
    hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
    attentions: Optional[Tuple[torch.FloatTensor, ...]] = None


class CurricularFace(nn.Module):
    def __init__(self, in_features, out_features, m=0.5, s=64.0, ema_alpha=0.01):
        super().__init__()
        self.in_features = in_features
        self.out_features = out_features
        self.m = float(m)
        self.s = float(s)
        self.ema_alpha = float(ema_alpha)

        self.cos_m = math.cos(self.m)
        self.sin_m = math.sin(self.m)
        self.threshold = math.cos(math.pi - self.m)
        self.mm = math.sin(math.pi - self.m) * self.m

        # keep checkpoint compatibility: same shape as before
        self.kernel = Parameter(torch.empty(in_features, out_features))
        self.register_buffer("t", torch.zeros(1, dtype=torch.float32))
        self.reset_parameters()

    def reset_parameters(self):
        nn.init.xavier_uniform_(self.kernel)
        self.t.zero_()

    def cosine(self, embeddings: torch.Tensor) -> torch.Tensor:
        # entire angular-margin block starts from fp32 cosine similarities
        # one cast at the entrance; do not keep re-casting inside
        with _autocast_disabled(embeddings.device.type):
            x = F.normalize(embeddings.float(), p=2.0, dim=1, eps=1e-12)
            w = F.normalize(self.kernel.float(), p=2.0, dim=0, eps=1e-12)
            cos_theta = F.linear(x, w.t()).clamp(-1.0, 1.0)
        return cos_theta  # fp32

    def inference_logits(self, embeddings: torch.Tensor) -> torch.Tensor:
        return self.cosine(embeddings) * self.s

    def margin_logits_from_cosine(
        self,
        cos_theta: torch.Tensor,
        labels: torch.LongTensor,
        update_t: bool = False,
    ) -> torch.Tensor:
        labels = labels.reshape(-1).long()

        # (B, 1)
        target = cos_theta.gather(1, labels.unsqueeze(1))

        sin_theta = torch.sqrt((1.0 - target.square()).clamp(min=0.0))
        cos_theta_m = target * self.cos_m - sin_theta * self.sin_m

        hard_mask = cos_theta > cos_theta_m
        final_target = torch.where(
            target > self.threshold,
            cos_theta_m,
            target - self.mm,
        )

        # update running t only in training
        if update_t:
            with torch.no_grad():
                target_mean = target.mean().to(dtype=self.t.dtype).view_as(self.t)
                self.t.lerp_(target_mean, self.ema_alpha)

        # keep everything in one dtype; no masked indexed assignment
        t = self.t.to(device=cos_theta.device, dtype=cos_theta.dtype)
        adjusted = torch.where(hard_mask, cos_theta * (t + cos_theta), cos_theta)
        adjusted = adjusted.scatter(1, labels.unsqueeze(1), final_target)

        return adjusted * self.s

    def training_logits(
        self,
        embeddings: torch.Tensor,
        labels: torch.LongTensor,
        update_t: bool = False,
    ) -> torch.Tensor:
        cos_theta = self.cosine(embeddings)
        return self.margin_logits_from_cosine(cos_theta, labels, update_t=update_t)
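    # Margin math used above: the target-class logit becomes
    # cos(theta + m) = cos(theta) * cos(m) - sin(theta) * sin(m) while cos(theta) > cos(pi - m),
    # falling back to cos(theta) - m * sin(pi - m) otherwise; non-target logits larger than the
    # margined target ("hard" classes) are re-scaled to cos_theta * (t + cos_theta), where `t`
    # is an EMA of the mean target cosine. Illustrative sketch (not executed):
    #
    #     face = CurricularFace(in_features=16, out_features=5, m=0.5, s=64.0)
    #     emb = torch.randn(8, 16)
    #     labels = torch.randint(0, 5, (8,))
    #     eval_logits = face.inference_logits(emb)                         # no margin
    #     train_logits = face.training_logits(emb, labels, update_t=True)  # margin applied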
    
    

class ProkBertForCurricularClassification(_SafeFromPretrainedMixin, ProkBertPreTrainedModel):
    config_class = ProkBertConfig
    base_model_prefix = "bert"

    def __init__(self, config: ProkBertConfig):
        super().__init__(config)
        self.config = config
        self.num_labels = int(config.num_labels)

        self.bert = ProkBertModel(config, add_pooling_layer=False)
        self.weighting_layer = nn.Linear(self.config.hidden_size, 1)
        self.dropout = nn.Dropout(get_classifier_dropout(self.config))

        use_projection = self.config.curricular_embedding_size not in (None, -1)
        proj_dim = self.config.hidden_size if not use_projection else int(self.config.curricular_embedding_size)
        self.linear = nn.Linear(self.config.hidden_size, proj_dim) if use_projection else nn.Identity()

        self.curricular_face = CurricularFace(
            in_features=proj_dim,
            out_features=self.num_labels,
            m=float(self.config.curricular_margin),
            s=float(self.config.curricular_scale),
        )
        self.loss_fct = nn.CrossEntropyLoss()

        self.post_init()

        with torch.no_grad():
            nn.init.zeros_(self.weighting_layer.weight)
            if self.weighting_layer.bias is not None:
                nn.init.zeros_(self.weighting_layer.bias)
            if isinstance(self.linear, nn.Linear):
                initialize_linear_kaiming(self.linear)

    def _pool_sequence_output(
        self,
        sequence_output: torch.Tensor,
        attention_mask: Optional[torch.Tensor],
    ) -> torch.Tensor:
        pooling = self.config.classifier_pooling

        if pooling == "cls":
            return sequence_output[:, 0]

        if pooling == "mean":
            keep_mask = normalize_pooling_attention_mask(attention_mask)
            if keep_mask is None:
                return sequence_output.mean(dim=1)

            empty_rows = keep_mask.sum(dim=1) == 0
            if empty_rows.any():
                keep_mask = keep_mask.clone()
                keep_mask[empty_rows, 0] = True

            mask = keep_mask.unsqueeze(-1).to(dtype=sequence_output.dtype)
            return (sequence_output * mask).sum(dim=1) / mask.sum(dim=1).clamp(min=1.0)

        if pooling == "attention":
            token_scores = self.weighting_layer(sequence_output)
            return masked_attention_pool(
                sequence_output=sequence_output,
                token_scores=token_scores,
                attention_mask=attention_mask,
            )

        raise ValueError(f"Unsupported classifier_pooling={pooling!r}")

    def _compute_embeddings(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        token_type_ids: Optional[torch.LongTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        apply_dropout: bool = True,
    ) -> tuple[torch.Tensor, BaseModelOutputWithPoolingAndCrossAttentions]:
        outputs = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=True,
        )

        pooled_output = self._pool_sequence_output(
            outputs.last_hidden_state,
            attention_mask,
        )

        if apply_dropout:
            pooled_output = self.dropout(pooled_output)

        embeddings = self.linear(pooled_output)
        return embeddings, outputs

    def encode(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        token_type_ids: Optional[torch.LongTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        normalize: bool = True,
    ) -> torch.Tensor:
        # deterministic embedding extraction: no dropout
        embeddings, _ = self._compute_embeddings(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            inputs_embeds=inputs_embeds,
            apply_dropout=False,
        )
        return l2_norm(embeddings, axis=1) if normalize else embeddings
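    # Illustrative sketch (not executed): `encode` is the deterministic path for retrieval /
    # similarity use, assuming already-tokenized `input_ids` / `attention_mask` batches:
    #
    #     with torch.no_grad():
    #         z = model.encode(input_ids=input_ids, attention_mask=attention_mask)
    #     sims = z @ z.T   # cosine similarities, since rows are L2-normalized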

    def deprecated_curricular_inference_logits(self, embeddings: torch.Tensor) -> torch.Tensor:
        embeddings = l2_norm(embeddings, axis=1)
        kernel_norm = l2_norm(self.curricular_face.kernel, axis=0)
        cos_theta = torch.mm(embeddings, kernel_norm).clamp(-1.0, 1.0)
        return cos_theta * self.curricular_face.s
    

    def _curricular_inference_logits(self, embeddings: torch.Tensor) -> torch.Tensor:
        return self.curricular_face.inference_logits(embeddings)


    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        token_type_ids: Optional[torch.LongTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,  # kept for compatibility
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        return_embeddings: bool = False,
        normalize_embeddings: bool = True,
    ) -> Union[Tuple, CurricularSequenceClassifierOutput]:
        return_dict = return_dict if return_dict is not None else self.config.return_dict

        embeddings, outputs = self._compute_embeddings(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            apply_dropout=self.training,
        )

        exported_embeddings = None
        if return_embeddings:
            exported_embeddings = (
                l2_norm(embeddings, axis=1) if normalize_embeddings else embeddings
            )

        # compute cosine once in fp32
        cos_theta = self.curricular_face.cosine(embeddings)

        # always return label-free prediction logits
        logits = cos_theta * self.curricular_face.s

        loss = None
        if labels is not None:
            labels = labels.view(-1).long()
            train_logits = self.curricular_face.margin_logits_from_cosine(
                cos_theta,
                labels,
                update_t=self.training,  # do not mutate t in eval
            )
            loss = self.loss_fct(train_logits, labels)

        if not return_dict:
            out = (logits,)
            if return_embeddings:
                out = out + (exported_embeddings,)
            if output_hidden_states:
                out = out + (outputs.hidden_states,)
            if output_attentions:
                out = out + (outputs.attentions,)
            return ((loss,) + out) if loss is not None else out

        return CurricularSequenceClassifierOutput(
            loss=loss,
            logits=logits,
            embeddings=exported_embeddings,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )
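    # Illustrative sketch (not executed), assuming a tokenized batch with integer labels:
    #
    #     out = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
    #     out.loss     # CE over the margin-adjusted logits (training-time view)
    #     out.logits   # label-free s * cos(theta) logits, safe for prediction and metrics
    #     pred = out.logits.argmax(dim=-1)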


class ProkBertForSequenceClassificationExt(_SafeFromPretrainedMixin, ProkBertPreTrainedModel):
    """
    Extensions vs. baseline ProkBertForSequenceClassification:
      - Fixes attention-pooling bug by masking PAD positions using attention_mask
      - Neutral pooling init: weighting_layer starts at zero => uniform pooling over non-masked tokens
      - LN + MLP head on pooled embedding
      - Temperature-controlled attention pooling with learnable temperature (scalar)
    """
    config_class = ProkBertConfig
    base_model_prefix = "bert"

    def __init__(self, config):
        super().__init__(config)
        self.config = config

        self.bert = ProkBertModel(config)

        # Attention pooling (token-wise scalar score)
        self.weighting_layer = nn.Linear(self.config.hidden_size, 1)

        # Learnable temperature for pooling: temperature = exp(log_temperature), clamped
        self.log_temperature = nn.Parameter(torch.zeros(()))  # scalar, starts at 0 => temperature=1
        self.temperature_min = float(getattr(config, "pool_temperature_min", 0.1))
        self.temperature_max = float(getattr(config, "pool_temperature_max", 10.0))

        # MLP head on pooled embedding
        eps = float(getattr(config, "layer_norm_eps", 1e-12))
        drop_p = get_classifier_dropout(config)  # handles a `None` classifier_dropout safely
        hidden_size = int(self.config.hidden_size)
        mlp_hidden = getattr(config, "classifier_mlp_hidden_size", None)
        mlp_hidden = int(mlp_hidden) if mlp_hidden else max(1, hidden_size // 2)

        self.mlp_ln = nn.LayerNorm(hidden_size, eps=eps)
        self.mlp_dropout = nn.Dropout(drop_p)
        self.mlp_fc1 = nn.Linear(hidden_size, mlp_hidden)
        self.mlp_act = nn.GELU()
        self.mlp_fc2 = nn.Linear(mlp_hidden, int(self.config.num_class_labels))

        # Loss
        if int(self.config.num_class_labels) == 1:
            self.loss_fct = nn.MSELoss()
        else:
            self.loss_fct = nn.CrossEntropyLoss()

        self.post_init()

        # --- Custom init for "neutral" pooling + slightly conservative output layer ---
        self._init_ext_head()

    def _init_ext_head(self):
        # Make pooling start neutral: scores = 0 => uniform softmax over non-masked tokens
        with torch.no_grad():
            nn.init.zeros_(self.weighting_layer.weight)
            nn.init.zeros_(self.weighting_layer.bias)

        # Optional: make final classifier layer a bit smaller (reduces early overconfidence)
        init_range = float(getattr(self.config, "initializer_range", 0.02))
        with torch.no_grad():
            nn.init.normal_(self.mlp_fc2.weight, mean=0.0, std=init_range * 0.1)
            nn.init.zeros_(self.mlp_fc2.bias)

    def _get_temperature(self, device: torch.device) -> torch.Tensor:
        # Keep temperature positive and within a reasonable range
        t = torch.exp(self.log_temperature.to(device=device))
        return torch.clamp(t, min=self.temperature_min, max=self.temperature_max)
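    # Illustrative sketch (not executed): the temperature rescales token scores before the
    # softmax, so values > 1 flatten the pooling weights and values < 1 sharpen them:
    #
    #     scores = torch.tensor([[2.0, 0.0, -2.0]])
    #     torch.softmax(scores / 4.0, dim=-1)    # close to uniform
    #     torch.softmax(scores / 0.25, dim=-1)   # close to one-hot on the first token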

    @staticmethod
    def _normalize_attention_mask(attention_mask: Optional[torch.Tensor]) -> Optional[torch.Tensor]:
        """
        Convert `attention_mask` to a (B, L) boolean mask where True means "keep token".
        Handles common shapes: (B, L), (B, 1, 1, L), (B, 1, L); returns None if no mask is given.
        """
        if attention_mask is None:
            return None

        mask = attention_mask
        # Common HF forms
        if mask.dim() == 4:
            # (B, 1, 1, L) -> (B, L)
            mask = mask.squeeze(1).squeeze(1)
        elif mask.dim() == 3:
            # (B, 1, L) -> (B, L)
            mask = mask.squeeze(1)

        # Convert to bool: treat >0 as keep
        mask = mask > 0
        return mask
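    # Illustrative sketch (not executed): (B, L) and broadcast-style HF masks collapse to the
    # same boolean keep-mask:
    #
    #     m2d = torch.tensor([[1, 1, 0]])
    #     m4d = m2d[:, None, None, :]
    #     ProkBertForSequenceClassificationExt._normalize_attention_mask(m2d)
    #     ProkBertForSequenceClassificationExt._normalize_attention_mask(m4d)
    #     # both -> tensor([[ True,  True, False]])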

    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        token_type_ids: Optional[torch.LongTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        labels: Optional[torch.LongTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, SequenceClassifierOutput]:

        return_dict = return_dict if return_dict is not None else self.config.return_dict

        outputs = self.bert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        sequence_output = outputs[0]  # (B, L, H)

        # --- Temperature-controlled attention pooling with PAD-masking ---
        scores = self.weighting_layer(sequence_output)  # (B, L, 1)

        # Apply temperature (smooth if temperature > 1, sharper if < 1)
        temperature = self._get_temperature(device=scores.device)
        scores = scores / temperature

        # Mask out padding tokens (pooling bug fix)
        keep_mask = self._normalize_attention_mask(attention_mask)  # (B, L) bool or None
        if keep_mask is not None:
            # Guard: if an example is fully masked (shouldn't happen), keep first token to avoid NaNs
            if (keep_mask.sum(dim=1) == 0).any():
                keep_mask = keep_mask.clone()
                keep_mask[(keep_mask.sum(dim=1) == 0), 0] = True

            scores = scores.masked_fill(~keep_mask.unsqueeze(-1), float("-inf"))

        # Softmax in fp32 for stability, then cast back
        weights = torch.softmax(scores.float(), dim=1).to(dtype=sequence_output.dtype)  # (B, L, 1)

        pooled_output = torch.sum(weights * sequence_output, dim=1)  # (B, H)

        # --- LN + MLP head ---
        x = self.mlp_ln(pooled_output)
        x = self.mlp_dropout(x)
        x = self.mlp_fc1(x)
        x = self.mlp_act(x)
        x = self.mlp_dropout(x)
        logits = self.mlp_fc2(x)

        loss = None
        if labels is not None:
            if int(self.config.num_class_labels) == 1:
                loss = self.loss_fct(logits.view(-1), labels.view(-1).float())
            else:
                loss = self.loss_fct(logits.view(-1, int(self.config.num_class_labels)), labels.view(-1))

        if not return_dict:
            # outputs: (last_hidden_state, pooled_output, hidden_states, attentions) in most BERT-like models
            out = (logits,) + outputs[2:]
            return ((loss,) + out) if loss is not None else out

        return SequenceClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=getattr(outputs, "hidden_states", None),
            attentions=getattr(outputs, "attentions", None),
        )