emarro committed on
Commit
3b6966d
·
verified ·
1 Parent(s): d443eaa

Upload CaduceusForMaskedLM

Browse files
config.json ADDED
@@ -0,0 +1,55 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_attn_implementation_autoset": true,
3
+ "_name_or_path": "hf-compo-cad2-l24-ms-v-chtk-c12k-1t-v2-b2-lr4e4-pHntE6-ep1-ba320185/",
4
+ "architectures": [
5
+ "CaduceusForMaskedLM"
6
+ ],
7
+ "auto_map": {
8
+ "AutoConfig": "configuration_caduceus.CaduceusConfig",
9
+ "AutoModel": "modeling_caduceus.Caduceus",
10
+ "AutoModelForMaskedLM": "modeling_caduceus.CaduceusForMaskedLM",
11
+ "AutoModelForSequenceClassification": "modeling_caduceus.CaduceusForSequenceClassification"
12
+ },
13
+ "bidirectional": true,
14
+ "bidirectional_strategy": "add",
15
+ "bidirectional_weight_tie": true,
16
+ "complement_map": {
17
+ "0": 0,
18
+ "1": 1,
19
+ "2": 2,
20
+ "3": 6,
21
+ "4": 5,
22
+ "5": 4,
23
+ "6": 3,
24
+ "7": 7
25
+ },
26
+ "d_intermediate": 0,
27
+ "d_model": 768,
28
+ "fused_add_norm": true,
29
+ "initializer_cfg": {
30
+ "initializer_range": 0.02,
31
+ "n_residuals_per_layer": 1,
32
+ "rescale_prenorm_residual": true
33
+ },
34
+ "model_type": "caduceus",
35
+ "n_layer": 24,
36
+ "norm_epsilon": 1e-05,
37
+ "pad_token_id": -100,
38
+ "pad_vocab_size_multiple": 8,
39
+ "rcps": true,
40
+ "residual_in_fp32": false,
41
+ "rms_norm": true,
42
+ "ssm_cfg": {
43
+ "bias": false,
44
+ "conv_bias": true,
45
+ "d_conv": 4,
46
+ "d_state": 64,
47
+ "dt_init_floor": 0.0001,
48
+ "dt_max": 0.1,
49
+ "dt_min": 0.001,
50
+ "expand": 2
51
+ },
52
+ "torch_dtype": "float32",
53
+ "transformers_version": "4.36.1",
54
+ "vocab_size": 8
55
+ }
configuration_caduceus.py ADDED
@@ -0,0 +1,55 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Caduceus config for Hugging Face.
2
+
3
+ """
4
+
5
+ from typing import Optional, Union
6
+
7
+ from transformers import PretrainedConfig
8
+
9
+
10
class CaduceusConfig(PretrainedConfig):
    """Config that extends the original MambaConfig with params relevant to bi-directionality and RC equivariance.

    Defaults mirror the original MambaConfig; Caduceus-specific fields control bi-directional
    processing and reverse-complement parameter sharing (RCPS).
    """
    model_type = "caduceus"

    def __init__(
        self,
        # From original MambaConfig
        d_model: int = 2560,
        n_layer: int = 64,
        vocab_size: int = 50277,
        ssm_cfg: Optional[dict] = None,
        rms_norm: bool = True,
        residual_in_fp32: bool = True,
        fused_add_norm: bool = True,
        pad_vocab_size_multiple: int = 8,

        # Not in original MambaConfig, but default arg in create_block in mamba_ssm repo; used in layer norm
        norm_epsilon: float = 1e-5,

        # Used in init_weights
        initializer_cfg: Optional[dict] = None,

        # Caduceus-specific params
        bidirectional: bool = True,
        bidirectional_strategy: Union[str, None] = "add",
        bidirectional_weight_tie: bool = True,
        rcps: bool = False,
        complement_map: Optional[dict] = None,  # used for RCPSEmbedding / RCPSLMHead
        # Fix: the modeling code reads `config.d_intermediate` when building blocks, but the
        # config never declared it — a config constructed without that kwarg raised
        # AttributeError. Declared explicitly with the shipped default (0 = no per-block MLP).
        d_intermediate: int = 0,
        **kwargs,
    ):
        super().__init__(**kwargs)
        self.d_model = d_model
        self.n_layer = n_layer
        self.vocab_size = vocab_size
        self.ssm_cfg = ssm_cfg
        self.rms_norm = rms_norm
        self.residual_in_fp32 = residual_in_fp32
        self.fused_add_norm = fused_add_norm
        self.pad_vocab_size_multiple = pad_vocab_size_multiple
        self.norm_epsilon = norm_epsilon
        self.initializer_cfg = initializer_cfg
        self.bidirectional = bidirectional
        self.bidirectional_strategy = bidirectional_strategy
        self.bidirectional_weight_tie = bidirectional_weight_tie
        self.rcps = rcps
        self.complement_map = complement_map
        self.d_intermediate = d_intermediate
model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:62402395ca103398c1e78ee8749934e436acba51b28c027cf23512dc71933f11
3
+ size 353000744
modeling_caduceus.py ADDED
@@ -0,0 +1,723 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Caduceus model for Hugging Face.
2
+
3
+ """
4
+ import copy
5
+ import math
6
+ from functools import partial
7
+ from typing import Optional, Tuple, Union
8
+
9
+ import torch
10
+ # from mamba_ssm.modules.mamba_simple import Mamba
11
+ # #from mamba_ssm.modules.mamba2_simple import Mamba2Simple as Mamba2
12
+ # from mamba_ssm import Mamba2
13
+ # from mamba_ssm.modules.block import Block
14
+
15
+
16
+ from mamba_ssm.models.config_mamba import MambaConfig
17
+ from mamba_ssm.modules.mamba_simple import Mamba
18
+ from mamba_ssm.modules.mamba2 import Mamba2
19
+ from mamba_ssm.modules.mha import MHA
20
+ from mamba_ssm.modules.mlp import GatedMLP
21
+ from mamba_ssm.modules.block import Block
22
+ from mamba_ssm.utils.generation import GenerationMixin
23
+ from torch import nn
24
+ from torch.nn import functional as F
25
+ from transformers import PreTrainedModel
26
+ from transformers.modeling_outputs import BaseModelOutputWithNoAttention, MaskedLMOutput, SequenceClassifierOutput
27
+
28
+ #try:
29
+ from mamba_ssm.ops.triton.layer_norm import RMSNorm, layer_norm_fn, rms_norm_fn
30
+ #except ImportError:
31
+ # RMSNorm, layer_norm_fn, rms_norm_fn = None, None, None
32
+
33
+ from .configuration_caduceus import CaduceusConfig
34
+ from .modeling_rcps import RCPSAddNormWrapper, RCPSEmbedding, RCPSLMHead, RCPSMambaBlock
35
+
36
+
37
+ # def create_block(
38
+ # d_model,
39
+ # d_intermediate,
40
+ # ssm_cfg=None,
41
+ # norm_epsilon=1e-5,
42
+ # rms_norm=False,
43
+ # residual_in_fp32=False,
44
+ # fused_add_norm=False,
45
+ # layer_idx=None,
46
+ # bidirectional=True,
47
+ # bidirectional_strategy="add",
48
+ # bidirectional_weight_tie=True,
49
+ # rcps=False,
50
+ # device=None,
51
+ # dtype=None,
52
+ # ):
53
+ # """Create Caduceus block.
54
+
55
+ # Adapted from: https://github.com/state-spaces/mamba/blob/main/mamba_ssm/models/mixer_seq_simple.py
56
+ # """
57
+ # if ssm_cfg is None:
58
+ # ssm_cfg = {}
59
+ # factory_kwargs = {"device": device, "dtype": dtype}
60
+ # bidirectional_kwargs = {
61
+ # "bidirectional": bidirectional,
62
+ # "bidirectional_strategy": bidirectional_strategy,
63
+ # "bidirectional_weight_tie": bidirectional_weight_tie,
64
+ # }
65
+ # mixer_cls = partial(BiMambaWrapper, layer_idx=layer_idx, **ssm_cfg, **bidirectional_kwargs, **factory_kwargs)
66
+ # norm_cls = partial(
67
+ # nn.LayerNorm if not rms_norm else RMSNorm, eps=norm_epsilon, **factory_kwargs
68
+ # )
69
+ # block_cls = RCPSMambaBlock if rcps else Block
70
+ # if d_intermediate == 0:
71
+ # mlp_cls = nn.Identity
72
+ # else:
73
+ # mlp_cls = partial(
74
+ # GatedMLP, hidden_features=d_intermediate, out_features=d_model, **factory_kwargs
75
+ # )
76
+ # block = block_cls(
77
+ # d_model,
78
+ # mixer_cls,
79
+ # mlp_cls,
80
+ # norm_cls=norm_cls,
81
+ # fused_add_norm=fused_add_norm,
82
+ # residual_in_fp32=residual_in_fp32,
83
+ # )
84
+ # block.layer_idx = layer_idx
85
+ # return block
86
+
87
def create_block(
    d_model,
    d_intermediate,
    ssm_cfg=None,
    attn_layer_idx=None,
    attn_cfg=None,
    norm_epsilon=1e-5,
    rms_norm=False,
    residual_in_fp32=False,
    fused_add_norm=False,
    layer_idx=None,
    device=None,
    dtype=None,
    bidirectional=True,
    bidirectional_strategy="add",
    bidirectional_weight_tie=True,
    rcps=False,
):
    """Create a single Caduceus block: mixer + optional gated MLP + pre-norm.

    Adapted from mamba_ssm's `mixer_seq_simple.create_block`. Layers whose index appears in
    `attn_layer_idx` get a multi-head-attention mixer; all other layers get a bi-directional
    Mamba mixer (`BiMambaWrapper`). `d_intermediate == 0` disables the per-block MLP.
    Returns the constructed block with `block.layer_idx` set.
    """
    if ssm_cfg is None:
        ssm_cfg = {}
    if attn_layer_idx is None:
        attn_layer_idx = []
    if attn_cfg is None:
        attn_cfg = {}
    factory_kwargs = {"device": device, "dtype": dtype}
    bidirectional_kwargs = {
        "bidirectional": bidirectional,
        "bidirectional_strategy": bidirectional_strategy,
        "bidirectional_weight_tie": bidirectional_weight_tie,
    }
    if layer_idx not in attn_layer_idx:
        # Create a copy of the config to modify (pop below must not mutate the shared dict)
        ssm_cfg = copy.deepcopy(ssm_cfg) if ssm_cfg is not None else {}
        ssm_layer = ssm_cfg.pop("ssm_layer", "Mamba1")
        if ssm_layer not in ["Mamba1", "Mamba2"]:
            raise ValueError(f"Invalid ssm_layer: {ssm_layer}, only support Mamba1 and Mamba2")
        # NOTE(review): `ssm_layer` is validated and popped here but never forwarded to
        # BiMambaWrapper, whose own `ssm_layer` default ("Mamba2") therefore always wins.
        # The accepted names also differ ("Mamba1"/"Mamba2" here vs "Mamba"/"Mamba2" in
        # BiMambaWrapper) — confirm intended behavior before wiring it through.
        mixer_cls = partial(BiMambaWrapper, layer_idx=layer_idx, **ssm_cfg, **bidirectional_kwargs, **factory_kwargs)
    else:
        # TODO: add bidirectional support for attention layers
        mixer_cls = partial(MHA, layer_idx=layer_idx, **attn_cfg, **factory_kwargs)
    norm_cls = partial(
        nn.LayerNorm if not rms_norm else RMSNorm, eps=norm_epsilon, **factory_kwargs
    )
    if d_intermediate == 0:
        mlp_cls = nn.Identity  # no per-block MLP
    else:
        mlp_cls = partial(
            GatedMLP, hidden_features=d_intermediate, out_features=d_model, **factory_kwargs
        )
    # RCPSMambaBlock wraps the block so parameters are shared between strands.
    block_cls = RCPSMambaBlock if rcps else Block

    block = block_cls(
        d_model,
        mixer_cls,
        mlp_cls,
        norm_cls=norm_cls,
        fused_add_norm=fused_add_norm,
        residual_in_fp32=residual_in_fp32,
    )
    block.layer_idx = layer_idx
    return block
155
+
156
+
157
class BiMambaWrapper(nn.Module):
    """Thin wrapper around Mamba to support bi-directionality.

    Runs a forward-direction Mamba/Mamba2 block and, when `bidirectional` is set, a second
    block over the sequence-reversed input, combining the two outputs according to
    `bidirectional_strategy` ("add" or "ew_multiply").

    Args:
        d_model: Channel dimension passed to the underlying block(s).
        bidirectional: Whether to also process the reversed sequence.
        bidirectional_strategy: How to combine forward/reverse outputs; None maps to "add".
        bidirectional_weight_tie: Tie in/out projection weights between the two directions.
        ssm_layer: "Mamba" or "Mamba2", selecting the mixer implementation.
        **mamba_kwargs: Forwarded verbatim to the block constructor.
    """

    def __init__(
        self,
        d_model: int,
        bidirectional: bool = True,
        bidirectional_strategy: Optional[str] = "add",
        bidirectional_weight_tie: bool = True,
        ssm_layer="Mamba2",
        **mamba_kwargs,
    ):
        super().__init__()
        # Fix: the original assert / else-branch messages referenced an undefined name
        # (`block_name`), so an invalid `ssm_layer` raised NameError instead of a useful error.
        if ssm_layer not in ("Mamba", "Mamba2"):
            raise ValueError(f"Invalid ssm_layer: {ssm_layer!r}, only support Mamba and Mamba2")
        if bidirectional and bidirectional_strategy is None:
            bidirectional_strategy = "add"  # Default strategy: `add`
        if bidirectional and bidirectional_strategy not in ["add", "ew_multiply"]:
            raise NotImplementedError(f"`{bidirectional_strategy}` strategy for bi-directionality is not implemented!")
        self.bidirectional = bidirectional
        self.bidirectional_strategy = bidirectional_strategy
        # Select the block class once instead of duplicating the constructor call per branch.
        block_cls = Mamba if ssm_layer == "Mamba" else Mamba2
        self.mamba_fwd = block_cls(
            d_model=d_model,
            **mamba_kwargs
        )
        if bidirectional:
            self.mamba_rev = block_cls(
                d_model=d_model,
                **mamba_kwargs
            )
            if bidirectional_weight_tie:  # Tie in and out projections (where most of param count lies)
                self.mamba_rev.in_proj.weight = self.mamba_fwd.in_proj.weight
                self.mamba_rev.in_proj.bias = self.mamba_fwd.in_proj.bias
                self.mamba_rev.out_proj.weight = self.mamba_fwd.out_proj.weight
                self.mamba_rev.out_proj.bias = self.mamba_fwd.out_proj.bias
        else:
            self.mamba_rev = None

    def forward(self, hidden_states, inference_params=None):
        """Bidirectional-enabled forward pass.

        Args:
            hidden_states: (B, L, D) input activations.
            inference_params: Optional cache object passed through to the block(s).

        Returns:
            Tensor of the same shape as `hidden_states`.
        """
        out = self.mamba_fwd(hidden_states.contiguous(), inference_params=inference_params)
        if self.bidirectional:
            out_rev = self.mamba_rev(
                hidden_states.flip(dims=(1,)),  # Flip along the sequence length dimension
                inference_params=inference_params
            ).flip(dims=(1,))  # Flip back for combining with forward hidden states
            if self.bidirectional_strategy == "add":
                out = out + out_rev
            elif self.bidirectional_strategy == "ew_multiply":
                out = out * out_rev
            else:
                raise NotImplementedError(f"`{self.bidirectional_strategy}` for bi-directionality not implemented!")
        return out.contiguous()
223
+
224
+
225
class CaduceusEmbeddings(nn.Module):
    """Token embedding layer that dispatches to an RC-equivariant embedding when `config.rcps` is set."""

    def __init__(self, config: CaduceusConfig, device=None, dtype=None):
        super().__init__()
        factory_kwargs = {"device": device, "dtype": dtype}
        if config.rcps:
            embedding = RCPSEmbedding(
                config.vocab_size, config.d_model, config.complement_map, **factory_kwargs
            )
        else:
            embedding = nn.Embedding(config.vocab_size, config.d_model, **factory_kwargs)
        self.word_embeddings = embedding

    def forward(self, input_ids):
        """Embed `input_ids` of shape (batch, seqlen)."""
        return self.word_embeddings(input_ids)
246
+
247
+
248
class CaduceusMixerModel(nn.Module):
    """Backbone: embeddings followed by a stack of (optionally RC-equivariant) bi-directional Mamba blocks.

    Produces the final (post-norm) hidden states and, optionally, the hidden states
    collected before each layer and after the final norm.
    """
    def __init__(
        self,
        config: CaduceusConfig,
        device=None,
        dtype=None,
    ) -> None:
        super().__init__()
        factory_kwargs = {"device": device, "dtype": dtype}

        self.fused_add_norm = config.fused_add_norm
        self.rcps = config.rcps
        self.residual_in_fp32 = config.residual_in_fp32

        self.embeddings = CaduceusEmbeddings(config, **factory_kwargs)

        # Mamba changes the order of residual and layer norm:
        # Instead of LN -> Attn / MLP -> Add, we do:
        # Add -> LN -> Attn / MLP / Mixer, returning both the residual branch (output of Add) and
        # the main branch (output of MLP / Mixer). The model definition is unchanged.
        # This is for performance reason: we can fuse add + layer_norm.
        if config.fused_add_norm:
            if layer_norm_fn is None or rms_norm_fn is None:
                raise ImportError("Failed to import Triton LayerNorm / RMSNorm kernels")

        self.layers = nn.ModuleList(
            [
                create_block(
                    config.d_model,
                    d_intermediate=config.d_intermediate,
                    ssm_cfg=config.ssm_cfg,
                    norm_epsilon=config.norm_epsilon,
                    rms_norm=config.rms_norm,
                    residual_in_fp32=config.residual_in_fp32,
                    fused_add_norm=config.fused_add_norm,
                    layer_idx=i,
                    bidirectional=config.bidirectional,
                    bidirectional_strategy=config.bidirectional_strategy,
                    bidirectional_weight_tie=config.bidirectional_weight_tie,
                    rcps=config.rcps,
                    **factory_kwargs,
                )
                for i in range(config.n_layer)
            ]
        )

        norm_f = (nn.LayerNorm if not config.rms_norm else RMSNorm)(
            config.d_model, eps=config.norm_epsilon, **factory_kwargs
        )
        # With fused add+norm (or without RCPS) the plain norm module is applied directly in
        # forward(); otherwise wrap it so the final norm is applied RC-equivariantly.
        self.norm_f = norm_f if (config.fused_add_norm or not config.rcps) else RCPSAddNormWrapper(norm_f)

    def forward(self, input_ids, inputs_embeds=None, output_hidden_states=False):
        """Mixer forward.

        Args:
            input_ids: (batch, seqlen) token ids; ignored when `inputs_embeds` is given.
            inputs_embeds: optional precomputed embeddings.
            output_hidden_states: if True, collect hidden states before each layer and
                after the final norm.

        Returns:
            Tuple `(hidden_states, all_hidden_states)`; the list is empty unless
            `output_hidden_states` is True.
        """
        all_hidden_states = []
        if inputs_embeds is not None:
            hidden_states = inputs_embeds
        else:
            hidden_states = self.embeddings(input_ids)

        residual = None
        for layer in self.layers:
            if output_hidden_states:
                all_hidden_states.append(hidden_states)
            # TODO: Add support for gradient checkpointing
            hidden_states, residual = layer(
                hidden_states, residual, inference_params=None
            )

        if not self.fused_add_norm:
            if self.rcps:
                # Set prenorm=False here since we don't need the residual
                hidden_states = self.norm_f(hidden_states, residual=residual, prenorm=False)
            else:
                residual = (hidden_states + residual) if residual is not None else hidden_states
                hidden_states = self.norm_f(residual.to(dtype=self.norm_f.weight.dtype))
        else:
            fused_add_norm_fn = rms_norm_fn if isinstance(self.norm_f, RMSNorm) else layer_norm_fn
            if self.rcps:
                # For RCPS, the channel dim holds [forward | reverse-complement] halves.
                # Normalize each half separately, flipping the RC half along (seq, channel)
                # so both passes see the same layout, then flip back and re-concatenate.
                # Set prenorm=False here since we don't need the residual
                hidden_states_fwd = fused_add_norm_fn(
                    hidden_states[..., :hidden_states.shape[-1] // 2],
                    self.norm_f.weight,
                    self.norm_f.bias,
                    eps=self.norm_f.eps,
                    residual=residual[..., :hidden_states.shape[-1] // 2],
                    prenorm=False,
                    residual_in_fp32=self.residual_in_fp32,
                )
                hidden_states_rc = fused_add_norm_fn(
                    hidden_states[..., hidden_states.shape[-1] // 2:].flip(dims=[-2, -1]),
                    self.norm_f.weight,
                    self.norm_f.bias,
                    eps=self.norm_f.eps,
                    residual=residual[..., hidden_states.shape[-1] // 2:].flip(dims=[-2, -1]),
                    prenorm=False,
                    residual_in_fp32=self.residual_in_fp32,
                )
                hidden_states = torch.cat([hidden_states_fwd, hidden_states_rc.flip(dims=[-2, -1])], dim=-1)
            else:
                # Set prenorm=False here since we don't need the residual
                hidden_states = fused_add_norm_fn(
                    hidden_states,
                    self.norm_f.weight,
                    self.norm_f.bias,
                    eps=self.norm_f.eps,
                    residual=residual,
                    prenorm=False,
                    residual_in_fp32=self.residual_in_fp32,
                )
        if output_hidden_states:
            all_hidden_states.append(hidden_states)
        return hidden_states, all_hidden_states
360
+
361
+
362
def cross_entropy(logits, y, ignore_index=-100):
    """Cross entropy loss over all token positions, flattened to (batch * seqlen).

    Args:
        logits: (..., vocab) unnormalized scores.
        y: integer targets matching the leading dims of `logits`.
        ignore_index: target value excluded from the loss.
    """
    vocab = logits.shape[-1]
    flat_logits = logits.view(-1, vocab)
    flat_targets = y.view(-1)
    return F.cross_entropy(flat_logits, flat_targets, ignore_index=ignore_index)
367
+
368
+
369
def weighted_cross_entropy(logits, y, loss_weights, ignore_index=-100):
    """Weighted cross entropy loss (discounts certain tokens, e.g., repeated base pairs in genome).

    Args:
        logits: (..., vocab) unnormalized scores.
        y: integer targets matching the leading dims of `logits`.
        loss_weights: per-token weights, same number of elements as `y`.
        ignore_index: target value excluded from the loss (its weight is zeroed).
    """
    logits = logits.view(-1, logits.shape[-1])
    y = y.view(-1)
    ce = F.cross_entropy(logits, y, ignore_index=ignore_index, reduction="none")
    # Fix: the original assigned through the flattened view (`loss_weights[y == ignore_index] = 0.0`),
    # silently mutating the caller's tensor. `masked_fill` is out-of-place.
    loss_weights = loss_weights.view(-1).masked_fill(y == ignore_index, 0.0)
    # TODO: Follows GPN implementation, but should we remove weight normalization?
    return (ce * (loss_weights / loss_weights.sum())).sum()
378
+
379
+
380
class CaduceusPreTrainedModel(PreTrainedModel):
    """PreTrainedModel wrapper for Caduceus backbone."""
    config_class = CaduceusConfig
    base_model_prefix = "caduceus"
    supports_gradient_checkpointing = False
    _no_split_modules = ["BiMambaWrapper"]

    def _init_weights(
        self,
        module,
        initializer_range=0.02,  # Now only used for embedding layer.
        **kwargs,
    ):
        """Adapted from: https://github.com/state-spaces/mamba/blob/main/mamba_ssm/models/mixer_seq_simple.py

        Values in `config.initializer_cfg` override the defaults below.
        """
        n_layer = self.config.n_layer
        initialized_cfg = self.config.initializer_cfg if self.config.initializer_cfg is not None else {}
        rescale_prenorm_residual = initialized_cfg.get("rescale_prenorm_residual", True)
        initializer_range = initialized_cfg.get("initializer_range", initializer_range)
        n_residuals_per_layer = initialized_cfg.get("n_residuals_per_layer", 1)

        if isinstance(module, nn.Linear):
            # Zero linear biases unless explicitly marked to skip re-init.
            if module.bias is not None:
                if not getattr(module.bias, "_no_reinit", False):
                    nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Embedding):
            nn.init.normal_(module.weight, std=initializer_range)

        if rescale_prenorm_residual:
            # Reinitialize selected weights subject to the OpenAI GPT-2 Paper Scheme:
            # > A modified initialization which accounts for the accumulation on the residual path with model depth.
            # > Scale the weights of residual layers at initialization by a factor of 1/√N where N is the # of
            # residual layers.
            # > -- GPT-2 :: https://openai.com/blog/better-language-models/
            #
            # Reference (Megatron-LM): https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/model/gpt_model.py
            for name, p in module.named_parameters():
                if name in ["out_proj.weight", "fc2.weight"]:
                    # Special Scaled Initialization --> There are 2 Layer Norms per Transformer Block
                    # Following Pytorch init, except scale by 1/sqrt(2 * n_layer)
                    # We need to reinit p since this code could be called multiple times
                    # Having just p *= scale would repeatedly scale it down
                    nn.init.kaiming_uniform_(p, a=math.sqrt(5))
                    with torch.no_grad():
                        p /= math.sqrt(n_residuals_per_layer * n_layer)
425
+
426
+
427
class Caduceus(CaduceusPreTrainedModel):
    """Caduceus model that can be instantiated using HF patterns."""
    def __init__(self, config: CaduceusConfig, device=None, dtype=None, **kwargs):
        super().__init__(config)

        if config.rcps:
            assert config.complement_map is not None, "Complement map must be provided for RCPS."

        # Adjust vocab size and complement maps if vocab padding is set.
        # NOTE(review): this mutates the passed-in config object in place.
        if config.vocab_size % config.pad_vocab_size_multiple != 0:
            config.vocab_size += config.pad_vocab_size_multiple - (config.vocab_size % config.pad_vocab_size_multiple)
        if config.complement_map is not None and config.vocab_size > len(config.complement_map):
            # Padding tokens are mapped to themselves (their own complement).
            for i in range(len(config.complement_map), config.vocab_size):
                config.complement_map[i] = i

        self.config = config
        factory_kwargs = {"device": device, "dtype": dtype}
        self.backbone = CaduceusMixerModel(config, **factory_kwargs, **kwargs)

    def forward(
        self,
        input_ids: torch.LongTensor = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[torch.Tensor, Tuple, BaseModelOutputWithNoAttention]:
        """HF-compatible forward method.

        Returns a BaseModelOutputWithNoAttention when `return_dict` is truthy; otherwise a
        `(hidden_states, all_hidden_states)` tuple when `output_hidden_states` is set, and a
        bare hidden-states tensor otherwise.
        """
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        hidden_states, all_hidden_states = self.backbone(
            input_ids,
            inputs_embeds=inputs_embeds,
            output_hidden_states=output_hidden_states
        )
        if return_dict:
            return BaseModelOutputWithNoAttention(
                last_hidden_state=hidden_states,
                hidden_states=all_hidden_states if output_hidden_states else None
            )
        elif output_hidden_states:
            return hidden_states, all_hidden_states
        else:
            return hidden_states
473
+
474
+
475
class CaduceusForMaskedLM(CaduceusPreTrainedModel):
    """HF-compatible Caduceus model for masked language modeling."""

    def __init__(self, config: CaduceusConfig, device=None, dtype=None, **kwargs):
        super().__init__(config, **kwargs)
        factory_kwargs = {"device": device, "dtype": dtype}
        self.caduceus = Caduceus(config, **factory_kwargs, **kwargs)
        # RCPS models need an RC-equivariant LM head; otherwise a plain linear decoder is used.
        if config.rcps:
            self.lm_head = RCPSLMHead(
                complement_map=self.config.complement_map,  # Use caduceus config as it might have been updated
                vocab_size=self.config.vocab_size,  # Use caduceus config as it might have been updated
                true_dim=config.d_model,
                dtype=dtype
            )
        else:
            self.lm_head = nn.Linear(
                config.d_model,
                self.config.vocab_size,  # Use caduceus config as it might have been updated
                bias=False,
                **factory_kwargs
            )

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        """Return the backbone token embedding module."""
        return self.caduceus.backbone.embeddings.word_embeddings

    def set_input_embeddings(self, value):
        """Replace the backbone token embedding module (unsupported for RCPS models)."""
        if self.config.rcps:
            raise NotImplementedError("Setting input embeddings for RCPS LM is not supported.")
        self.caduceus.backbone.embeddings.word_embeddings = value

    def get_output_embeddings(self):
        """Return the LM head."""
        return self.lm_head

    def set_output_embeddings(self, new_embeddings):
        """Overrides output embeddings."""
        if self.config.rcps:
            raise NotImplementedError("Setting output embeddings for RCPS LM is not supported.")
        self.lm_head = new_embeddings

    def tie_weights(self):
        """Tie weights, accounting for RCPS."""
        if self.config.rcps:
            self.lm_head.set_weight(self.get_input_embeddings().weight)
        else:
            super().tie_weights()

    def get_decoder(self):
        """Get decoder (backbone) for the model."""
        return self.caduceus

    def set_decoder(self, decoder):
        """Set decoder (backbone) for the model."""
        self.caduceus = decoder

    def forward(
        self,
        input_ids: torch.LongTensor = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        loss_weights: Optional[torch.FloatTensor] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, MaskedLMOutput]:
        """HF-compatible forward method.

        Args:
            input_ids: (batch, seqlen) token ids.
            inputs_embeds: optional precomputed embeddings (bypasses `input_ids`).
            labels: MLM targets; positions equal to `config.pad_token_id` are ignored.
            loss_weights: optional per-token loss weights (see `weighted_cross_entropy`).
            output_hidden_states / return_dict: standard HF output switches.
        """
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
        outputs = self.caduceus(
            input_ids=input_ids,
            inputs_embeds=inputs_embeds,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        # NOTE(review): when return_dict=False and output_hidden_states=False, self.caduceus
        # returns a bare tensor, so `outputs[0]` would index the batch dimension — confirm.
        hidden_states = outputs[0]
        logits = self.lm_head(hidden_states)
        logits = logits.float()

        loss = None
        if labels is not None:
            # config.pad_token_id doubles as the cross-entropy ignore_index here.
            if loss_weights is not None:
                loss = weighted_cross_entropy(logits, labels, loss_weights, ignore_index=self.config.pad_token_id)
            else:
                loss = cross_entropy(logits, labels, ignore_index=self.config.pad_token_id)

        if not return_dict:
            output = (logits,) + outputs[1:]
            return (loss,) + output if loss is not None else output

        return MaskedLMOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
        )
576
+
577
+
578
class CaduceusForSequenceClassification(CaduceusPreTrainedModel):
    """HF-compatible Caduceus model for sequence-level classification / regression.

    Pools backbone hidden states along the sequence dimension and applies a linear `score`
    head. Supports RCPS models (forward / reverse-complement channel halves) and optional
    conjoining of the two strands at train and/or eval time.
    """

    def __init__(
            self,
            config: CaduceusConfig,
            pooling_strategy: str = "mean",
            conjoin_train: bool = False,
            conjoin_eval: bool = False,
            device=None,
            dtype=None,
            **kwargs):
        super().__init__(config, **kwargs)
        if pooling_strategy not in ["mean", "max", "first", "last"]:
            raise NotImplementedError(f"Pooling strategy `{pooling_strategy}` not implemented.")
        self.pooling_strategy = pooling_strategy
        factory_kwargs = {"device": device, "dtype": dtype}
        self.num_labels = kwargs.get("num_labels", config.num_labels)
        self.caduceus = Caduceus(config, **factory_kwargs, **kwargs)
        self.score = nn.Linear(config.d_model, self.num_labels, bias=False)

        self.conjoin_train = conjoin_train
        self.conjoin_eval = conjoin_eval

        # Initialize weights and apply final processing
        self.post_init()
        self.init_scorer()

    def init_scorer(self, initializer_range=0.02):
        """(Re-)initialize the classification head; `config.initializer_cfg` overrides the default std."""
        initializer_range = self.config.initializer_cfg.get("initializer_range", initializer_range) \
            if self.config.initializer_cfg is not None else initializer_range
        self.score.weight.data.normal_(std=initializer_range)

    def get_input_embeddings(self):
        """Return the backbone token embedding module."""
        return self.caduceus.backbone.embeddings.word_embeddings

    def set_input_embeddings(self, value):
        """Replace the backbone token embedding module (unsupported for RCPS models)."""
        if self.config.rcps:
            raise NotImplementedError("Setting input embeddings for RCPS LM is not supported.")
        self.caduceus.backbone.embeddings.word_embeddings = value

    def pool_hidden_states(self, hidden_states, sequence_length_dim=1):
        """Pools hidden states along sequence length dimension."""
        if self.pooling_strategy == "mean":  # Mean pooling along sequence length dimension
            return hidden_states.mean(dim=sequence_length_dim)
        if self.pooling_strategy == "max":  # Max pooling along sequence length dimension
            return hidden_states.max(dim=sequence_length_dim).values
        # Fix: the original called `hidden_states.moveaxis(hidden_states, dim, 0)`, passing the
        # tensor itself as the `source` axis argument — a TypeError at runtime for the
        # "first"/"last" strategies. Tensor.moveaxis takes (source, destination) only.
        if self.pooling_strategy == "last":  # Use embedding of last token in the sequence
            return hidden_states.moveaxis(sequence_length_dim, 0)[-1, ...]
        if self.pooling_strategy == "first":  # Use embedding of first token in the sequence
            return hidden_states.moveaxis(sequence_length_dim, 0)[0, ...]
        # Unreachable for instances built through __init__, which validates pooling_strategy;
        # guards against returning None if the attribute is mutated later.
        raise NotImplementedError(f"Pooling strategy `{self.pooling_strategy}` not implemented.")

    def forward(
            self,
            input_ids: torch.LongTensor = None,
            inputs_embeds: Optional[torch.FloatTensor] = None,
            labels: Optional[torch.LongTensor] = None,
            output_hidden_states: Optional[bool] = None,
            return_dict: Optional[bool] = None,
    ) -> Union[Tuple, SequenceClassifierOutput]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        # Get hidden representations from the backbone
        if self.config.rcps:  # Hidden states have 2 * d_model channels for RCPS
            transformer_outputs = self.caduceus(
                input_ids,
                inputs_embeds=inputs_embeds,
                output_hidden_states=output_hidden_states,
                return_dict=return_dict,
            )
            # Split the [forward | reverse-complement] channel halves and stack them on a new
            # trailing dim, flipping the RC half back into forward orientation.
            hidden_states = torch.stack(
                [
                    transformer_outputs[0][..., :self.config.d_model],
                    torch.flip(transformer_outputs[0][..., self.config.d_model:], dims=[1, 2])
                ],
                dim=-1
            )
        elif self.conjoin_train or (self.conjoin_eval and not self.training):  # For conjoining / post-hoc conjoining
            assert input_ids is not None, "`input_ids` must be provided for conjoining."
            assert input_ids.ndim == 3, "`input_ids` must be 3D tensor: channels corresponds to forward and rc strands."
            transformer_outputs = self.caduceus(
                input_ids[..., 0],
                inputs_embeds=None,
                output_hidden_states=output_hidden_states,
                return_dict=return_dict,
            )
            transformer_outputs_rc = self.caduceus(
                input_ids[..., 1],
                inputs_embeds=None,
                output_hidden_states=output_hidden_states,
                return_dict=return_dict,
            )
            # Stack along channel dimension (dim=-1)
            hidden_states = torch.stack([transformer_outputs[0], transformer_outputs_rc[0]], dim=-1)
        else:
            transformer_outputs = self.caduceus(
                input_ids,
                inputs_embeds=None,
                output_hidden_states=output_hidden_states,
                return_dict=return_dict,
            )
            hidden_states = transformer_outputs[0]

        # Pool and get logits
        pooled_hidden_states = self.pool_hidden_states(hidden_states)
        # Potentially run `score` twice (with parameters shared) for conjoining
        if hidden_states.ndim == 4:  # bsz, seq_len, hidden_dim, 2 where last channel has the stacked fwd and rc reps
            logits_fwd = self.score(pooled_hidden_states[..., 0])
            logits_rc = self.score(pooled_hidden_states[..., 1])
            logits = (logits_fwd + logits_rc) / 2
        else:
            logits = self.score(pooled_hidden_states)

        loss = None
        if labels is not None:
            labels = labels.to(logits.device)
            # Infer the problem type once (standard HF convention) if not set in the config.
            if self.config.problem_type is None:
                if self.num_labels == 1:
                    self.config.problem_type = "regression"
                elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
                    self.config.problem_type = "single_label_classification"
                else:
                    self.config.problem_type = "multi_label_classification"

            if self.config.problem_type == "regression":
                if self.num_labels == 1:
                    loss = F.mse_loss(logits.squeeze(), labels.squeeze())
                else:
                    loss = F.mse_loss(logits, labels)
            elif self.config.problem_type == "single_label_classification":
                loss = F.cross_entropy(logits.view(-1, self.num_labels), labels.view(-1))
            elif self.config.problem_type == "multi_label_classification":
                loss = F.binary_cross_entropy_with_logits(logits, labels)
        if not return_dict:
            output = (logits,) + transformer_outputs[1:]
            return ((loss,) + output) if loss is not None else output

        return SequenceClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=transformer_outputs.hidden_states,
        )
modeling_rcps.py ADDED
@@ -0,0 +1,269 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Reverse-complement equivariant modules.
2
+
3
+ """
4
+ from collections import OrderedDict
5
+ from typing import Optional
6
+
7
+ import torch
8
+ from torch import Tensor
9
+ from torch import nn
10
+ from torch.nn import functional as F
11
+
12
+ # try:
13
+ # from mamba_ssm.ops.triton.layernorm import RMSNorm, layer_norm_fn, rms_norm_fn
14
+ # except ImportError:
15
+ # RMSNorm, layer_norm_fn, rms_norm_fn = None, None, None
16
+ from mamba_ssm.ops.triton.layer_norm import RMSNorm, layer_norm_fn, rms_norm_fn
17
+
18
+
19
class RCPSEmbedding(nn.Module):
    """Embedding layer that supports reverse-complement equivariance.

    Produces a doubled-width embedding: the first half of the channels embeds the
    sequence as given, the second half embeds its reverse complement (flipped back
    into alignment along both length and channel dimensions).
    """
    def __init__(self, vocab_size: int, d_model: int, complement_map: dict, **factory_kwargs):
        """
        Args:
            vocab_size: Size of vocabulary.
            d_model: Dimensionality of embedding (actual output dim is 2 * d_model).
            complement_map: Dictionary mapping each token id to its complement.
        """
        super().__init__()
        complement_ids = list(OrderedDict(complement_map).values())
        self.register_buffer(
            "complement_map",
            torch.tensor(complement_ids, dtype=torch.long),
        )
        self.embedding = nn.Embedding(vocab_size, d_model, **factory_kwargs)

    @property
    def weight(self):
        """Embedding weights."""
        return self.embedding.weight

    def set_weight(self, value):
        """Set embedding weights."""
        self.embedding.weight = value

    def rc(self, x):
        """Reverse-complement a batch of input_ids.

        Flips each sequence along its length dimension, then maps every token id
        to its complement via a batched gather on `complement_map`.
        """
        reversed_ids = torch.flip(x, dims=[-1])
        lookup = self.complement_map.unsqueeze(0).expand(x.shape[0], -1)
        return torch.gather(lookup, dim=1, index=reversed_ids)

    def forward(self, input_ids):
        """Reverse-complement equivariant forward pass.

        This embedding module doubles the output dimensionality to support
        reverse-complement equivariance.

        Args:
            input_ids: Input tensor of shape (batch_size, seq_len)
        Returns:
            Embedding tensor of shape (batch_size, seq_len, d_model * 2)
        """
        embedded_fwd = self.embedding(input_ids)
        # Embed the RC sequence, then flip along length AND channel so it is
        # positionally aligned with the forward stream.
        embedded_rc = torch.flip(self.embedding(self.rc(input_ids)), dims=[-2, -1])
        return torch.cat((embedded_fwd, embedded_rc), dim=-1)
66
+
67
+
68
class RCPSWrapper(nn.Module):
    """Wrapper to convert arbitrary nn.Module into a reverse-complement equivariant module.

    See ref. "Towards a Better Understanding of Reverse-Complement Equivariance for Deep Learning Models in Regulatory
    Genomics", Zhou et al. (2022), https://proceedings.mlr.press/v165/zhou22a.html for more details.
    """
    def __init__(self, submodule: nn.Module):
        super().__init__()
        self.submodule = submodule

    @staticmethod
    def rc(x):
        """Reverse-complement a tensor by flipping the length (dim=-2) and channel (dim=-1) dimensions."""
        return torch.flip(x, dims=[-2, -1])

    def forward(self, x, **kwargs):
        """Reverse-complement equivariant forward pass.

        Runs the (shared-weight) submodule once on the forward half of the
        channels and once on the reverse-complemented rc half, then flips the
        rc result back before concatenating.

        Args:
            x: Input tensor of shape (batch_size, seq_len, channels)
        Returns:
            Output tensor of shape (batch_size, seq_len, channels * 2)
        """
        half = x.shape[-1] // 2
        fwd_half = x[..., :half]
        rc_half = x[..., half:]
        fwd_out = self.submodule(fwd_half, **kwargs)
        rc_out = self.submodule(self.rc(rc_half), **kwargs)
        return torch.cat((fwd_out, self.rc(rc_out)), dim=-1)
98
+
99
+
100
class RCPSAddNormWrapper(RCPSWrapper):
    """RC equivariant AddNorm layer.

    Applies the wrapped norm `submodule` independently to the forward half and
    to the reverse-complemented rc half of the channels, optionally after
    adding a residual stream.
    """
    def __init__(self, submodule: nn.Module):
        super().__init__(submodule)

    def forward(self, x, residual=None, prenorm=False):
        """
        Args:
            x: Input tensor of shape (batch_size, seq_len, channels)
            residual: Residual tensor of shape (batch_size, seq_len, channels) or None.
            prenorm: Whether to return residual.
        """
        half = x.shape[-1] // 2
        norm_dtype = self.submodule.weight.dtype
        if residual is None:
            # First layer: the input itself becomes the residual stream.
            residual = x
            fwd_normed = self.submodule(x[..., :half].to(dtype=norm_dtype))
            rc_normed = self.submodule(self.rc(x[..., half:]).to(dtype=norm_dtype))
            out = torch.cat((fwd_normed, self.rc(rc_normed)), dim=-1)
        else:
            # Add residual per half (rc half is flipped into fwd frame first),
            # then norm each half with the shared weights.
            residual_fwd = x[..., :half] + residual[..., :half]
            residual_rc = self.rc(x[..., half:]) + self.rc(residual[..., half:])
            fwd_normed = self.submodule(residual_fwd.to(dtype=norm_dtype))
            rc_normed = self.submodule(residual_rc.to(dtype=norm_dtype))
            residual = torch.cat((residual_fwd, self.rc(residual_rc)), dim=-1)
            out = torch.cat((fwd_normed, self.rc(rc_normed)), dim=-1)

        return (out, residual) if prenorm else out
129
+
130
+
131
class RCPSMambaBlock(nn.Module):
    """RCPS version of simple block wrapping a mixer class with LayerNorm/RMSNorm and residual connection.

    Adapted from: https://github.com/state-spaces/mamba/blob/main/mamba_ssm/modules/mamba_simple.py
    """
    def __init__(
        self,
        dim,
        mixer_cls,
        mlp_cls,
        norm_cls=nn.LayerNorm,
        fused_add_norm=False,
        residual_in_fp32=False,
        device=None,  # Keep for consistency with original Mamba Block
        dtype=None,  # Keep for consistency with original Mamba Block
    ):
        """
        Args:
            dim: Channel dimensionality passed to the mixer/norm/mlp factories.
            mixer_cls: Factory for the sequence mixer; its instance is wrapped in RCPSWrapper.
            mlp_cls: Factory for an optional MLP branch; pass nn.Identity to disable it.
            norm_cls: Normalization class (nn.LayerNorm or RMSNorm when fused_add_norm is used).
            fused_add_norm: Use the fused Triton add+norm kernels instead of RCPSAddNormWrapper.
            residual_in_fp32: Keep the residual stream in float32.
            device, dtype: Unused; kept only for signature compatibility with the original Mamba Block.
        """
        super().__init__()
        self.residual_in_fp32 = residual_in_fp32
        self.fused_add_norm = fused_add_norm
        self.mixer = RCPSWrapper(mixer_cls(dim))
        norm_f = norm_cls(dim)
        # When fused_add_norm is set, forward() splits the channels and calls the
        # fused kernel per half itself; otherwise the wrapper does the RC handling.
        self.norm = norm_f if fused_add_norm else RCPSAddNormWrapper(norm_f)
        if mlp_cls is not nn.Identity:
            self.norm2 = norm_cls(dim)
            self.mlp = mlp_cls(dim)
        else:
            self.mlp = None
        if self.fused_add_norm:
            assert RMSNorm is not None, "RMSNorm import fails"
            assert isinstance(
                self.norm, (nn.LayerNorm, RMSNorm)
            ), "Only LayerNorm and RMSNorm are supported for fused_add_norm"

    def forward(
        self, hidden_states: Tensor, residual: Optional[Tensor] = None, inference_params=None
    ):
        r"""Pass the input through the encoder layer.

        Args:
            hidden_states: the sequence to the encoder layer (required).
            residual: hidden_states = Mixer(LN(residual)).
            inference_params: inference parameters for mixer.
        Returns:
            Tuple of (hidden_states, residual) for the next block.
        """
        if not self.fused_add_norm:
            hidden_states, residual = self.norm(hidden_states, residual=residual, prenorm=True)
            if self.residual_in_fp32:
                residual = residual.to(torch.float32)
        else:
            fused_add_norm_fn = rms_norm_fn if isinstance(self.norm, RMSNorm) else layer_norm_fn

            # NOTE(review): the un-flipped slice below is taken from the SECOND channel
            # half, while RCPSAddNormWrapper (non-fused path) treats the FIRST half as
            # the forward stream — confirm this ordering is intentional.
            hidden_states_fwd, residual_fwd = fused_add_norm_fn(
                hidden_states[..., hidden_states.shape[-1] // 2:],
                self.norm.weight,
                self.norm.bias,
                residual=residual[..., hidden_states.shape[-1] // 2:] if residual is not None else None,
                prenorm=True,
                residual_in_fp32=self.residual_in_fp32,
                eps=self.norm.eps,
            )

            # RC stream: flip length and channel dims into the forward frame, norm
            # with the shared weights, then flip back before concatenating.
            hidden_states_rc, residual_rc = fused_add_norm_fn(
                hidden_states[..., :hidden_states.shape[-1] // 2].flip(dims=[-2, -1]),
                self.norm.weight,
                self.norm.bias,
                residual=residual[..., :hidden_states.shape[-1] // 2].flip(dims=[-2, -1]) if residual is not None else None,
                prenorm=True,
                residual_in_fp32=self.residual_in_fp32,
                eps=self.norm.eps,
            )
            hidden_states = torch.cat([hidden_states_fwd, hidden_states_rc.flip(dims=[-2, -1])], dim=-1)
            residual = torch.cat([residual_fwd, residual_rc.flip(dims=[-2, -1])], dim=-1)
        hidden_states = self.mixer(hidden_states, inference_params=inference_params)

        if self.mlp is not None:
            # NOTE(review): unlike norm/mixer above, this branch applies norm2 and the
            # MLP to the full (doubled) channel width without RC handling — verify
            # against configs that actually enable the MLP (d_intermediate > 0).
            if not self.fused_add_norm:
                residual = hidden_states + residual
                # BUGFIX: the normed value must go to hidden_states (input of the MLP),
                # not overwrite the residual stream — this now mirrors the fused branch
                # below and the upstream Mamba Block.
                hidden_states = self.norm2(residual.to(dtype=self.norm2.weight.dtype))
                if self.residual_in_fp32:
                    residual = residual.to(torch.float32)
            else:
                hidden_states, residual = layer_norm_fn(
                    hidden_states,
                    self.norm2.weight,
                    self.norm2.bias,
                    residual=residual,
                    prenorm=True,
                    residual_in_fp32=self.residual_in_fp32,
                    eps=self.norm2.eps,
                    is_rms_norm=isinstance(self.norm2, RMSNorm)
                )
            hidden_states = self.mlp(hidden_states)
        return hidden_states, residual

    def allocate_inference_cache(self, batch_size, max_seqlen, dtype=None, **kwargs):
        """Allocate inference cache for mixer.

        Keep for compatibility with original Mamba Block.

        BUGFIX: `self.mixer` is an RCPSWrapper, which does not expose
        `allocate_inference_cache`; delegate to the wrapped mixer instead
        (previously raised AttributeError).
        """
        return self.mixer.submodule.allocate_inference_cache(batch_size, max_seqlen, dtype=dtype, **kwargs)
230
+
231
+
232
class RCPSLMHead(nn.Module):
    """LM Head for reverse-complement equivariant inputs, which have dim * 2 relative to standard inputs."""
    def __init__(self, true_dim: int, vocab_size: int, complement_map: dict, **factory_kwargs):
        """
        `true_dim` corresponds to the actual dimensionality of the input were it not reverse-complement
        equivariant, i.e. 0.5 times the actual input dim.
        """
        super().__init__()
        complement_ids = list(OrderedDict(complement_map).values())
        self.register_buffer(
            "complement_map",
            torch.tensor(complement_ids, dtype=torch.long),
        )
        self.true_dim = true_dim
        self.lm_head = nn.Linear(true_dim, vocab_size, bias=False, **factory_kwargs)

    @property
    def weight(self):
        """LM head weights."""
        return self.lm_head.weight

    def set_weight(self, value):
        """Set LM head weights."""
        self.lm_head.weight = value

    def forward(self, x):
        """Project RC-equivariant hidden states to vocabulary logits.

        Args:
            x: Input tensor of shape (batch_size, seq_len, dim), where dim = 2 * true_dim.
        """
        n_channels = x.shape[-1]
        assert n_channels == 2 * self.true_dim, "Input must have 2 * true_dim channels."
        half = n_channels // 2
        # Forward stream: ordinary projection with the shared weights.
        fwd_logits = F.linear(x[..., :half], self.weight, bias=self.lm_head.bias)
        # RC stream: flip the channel dim only, then project with the rows of the
        # weight matrix permuted to their complement tokens.
        rc_stream = torch.flip(x[..., half:], dims=[-1])
        complemented_weight = self.weight[self.complement_map, :]
        rc_logits = F.linear(rc_stream, complemented_weight, bias=self.lm_head.bias)
        return fwd_logits + rc_logits