add: model file
- config.json +49 -0
- plantbimoe/configuration_plantbimoe.py +56 -0
- plantbimoe/modeling_plantbimoe.py +725 -0
- plantbimoe/tokenization_plantbimoe.py +113 -0
- pytorch_model.bin +3 -0
config.json
ADDED
@@ -0,0 +1,49 @@
+{
+  "_name_or_path": "checkpoint-pretrain",
+  "architectures": [
+    "PlantbimoeForMaskedLM"
+  ],
+  "auto_map": {
+    "AutoConfig": "configuration_plantbimoe.PlantbimoeConfig",
+    "AutoModel": "modeling_plantbimoe.Plantbimoe",
+    "AutoModelForMaskedLM": "modeling_plantbimoe.PlantbimoeForMaskedLM",
+    "AutoModelForSequenceClassification": "modeling_plantbimoe.PlantbimoeForSequenceClassification"
+  },
+  "bidirectional": true,
+  "bidirectional_strategy": "add",
+  "bidirectional_weight_tie": true,
+  "d_model": 512,
+  "fused_add_norm": true,
+  "initializer_cfg": {
+    "initializer_range": 0.02,
+    "n_residuals_per_layer": 1,
+    "rescale_prenorm_residual": true
+  },
+  "intermediate_size": 1408,
+  "model_type": "plantbimoe",
+  "n_layer": 16,
+  "norm_epsilon": 1e-05,
+  "num_experts": 4,
+  "num_experts_per_tok": 1,
+  "pad_token_id": 4,
+  "pad_vocab_size_multiple": 6,
+  "residual_in_fp32": false,
+  "rms_norm": true,
+  "ssm_cfg": {
+    "bias": false,
+    "conv_bias": true,
+    "d_conv": 4,
+    "d_state": 16,
+    "dt_init": "random",
+    "dt_init_floor": 0.0001,
+    "dt_max": 0.1,
+    "dt_min": 0.001,
+    "dt_rank": "auto",
+    "dt_scale": 1.0,
+    "expand": 2,
+    "use_fast_path": true
+  },
+  "torch_dtype": "float32",
+  "transformers_version": "4.38.1",
+  "vocab_size": 12
+}
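The `auto_map` block above registers the custom classes with the transformers `Auto*` loaders, so the checkpoint is meant to be loaded with `trust_remote_code=True`. A minimal loading sketch (the repo path below is a placeholder, not part of this commit):

import torch
from transformers import AutoModelForMaskedLM

# Placeholder path: point this at a local clone of the repo or its Hub id.
model = AutoModelForMaskedLM.from_pretrained(
    "path/to/plantbimoe-checkpoint",
    trust_remote_code=True,  # lets auto_map resolve the custom Plantbimoe classes
)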
plantbimoe/configuration_plantbimoe.py
ADDED
@@ -0,0 +1,56 @@
+"""Plantbimoe config for Hugging Face.
+
+"""
+
+from typing import Optional, Union
+
+from transformers import PretrainedConfig
+
+
+class PlantbimoeConfig(PretrainedConfig):
+    """Config that extends the original MambaConfig with params relevant to bi-directionality and RC equivariance."""
+    model_type = "plantbimoe"
+
+    def __init__(
+        self,
+        d_model: int = 2560,
+        n_layer: int = 64,
+        vocab_size: int = 50277,
+        ssm_cfg: Optional[dict] = None,
+        rms_norm: bool = True,
+        residual_in_fp32: bool = True,
+        fused_add_norm: bool = True,
+        pad_vocab_size_multiple: int = 8,
+
+        # Not in original MambaConfig, but default arg in create_block in mamba_ssm repo; used in layer norm
+        norm_epsilon: float = 1e-5,
+
+        # Used in init_weights
+        initializer_cfg: Optional[dict] = None,
+
+        # BiMoPlant-specific params
+        bidirectional: bool = True,
+        bidirectional_strategy: Union[str, None] = "add",
+        bidirectional_weight_tie: bool = True,
+        intermediate_size: int = 3840,
+        num_experts: int = 16,
+        num_experts_per_tok: int = 2,
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+        self.d_model = d_model
+        self.n_layer = n_layer
+        self.vocab_size = vocab_size
+        self.ssm_cfg = ssm_cfg
+        self.rms_norm = rms_norm
+        self.residual_in_fp32 = residual_in_fp32
+        self.fused_add_norm = fused_add_norm
+        self.pad_vocab_size_multiple = pad_vocab_size_multiple
+        self.norm_epsilon = norm_epsilon
+        self.initializer_cfg = initializer_cfg
+        self.bidirectional = bidirectional
+        self.bidirectional_strategy = bidirectional_strategy
+        self.bidirectional_weight_tie = bidirectional_weight_tie
+        self.intermediate_size = intermediate_size
+        self.num_experts = num_experts
+        self.num_experts_per_tok = num_experts_per_tok
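The large defaults above (d_model=2560, n_layer=64, vocab_size=50277) are the original MambaConfig scale; the shipped checkpoint overrides them through config.json. As a sketch, the equivalent in-code construction with this commit's hyperparameters would be:

from plantbimoe.configuration_plantbimoe import PlantbimoeConfig

# Values mirror the config.json added in this commit.
config = PlantbimoeConfig(
    d_model=512,
    n_layer=16,
    vocab_size=12,
    intermediate_size=1408,
    num_experts=4,
    num_experts_per_tok=1,
    pad_vocab_size_multiple=6,
    pad_token_id=4,
    residual_in_fp32=False,
)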
plantbimoe/modeling_plantbimoe.py
ADDED
@@ -0,0 +1,725 @@
+"""BiMoPlant model for Hugging Face.
+
+"""
+
+import inspect
+import math
+from functools import partial
+from typing import Optional, Tuple, Union
+
+import torch
+from torch import Tensor
+from mamba_ssm.modules.mamba_simple import Mamba
+try:
+    from mamba_ssm.modules.mamba_simple import Block  # Legacy mambav1 file structure
+except ImportError:
+    from mamba_ssm.modules.block import Block  # mambav2 file structure
+from torch import nn
+from torch.nn import functional as F
+from transformers import PreTrainedModel
+from transformers.modeling_outputs import BaseModelOutputWithNoAttention, MaskedLMOutput, SequenceClassifierOutput
+
+try:
+    from mamba_ssm.ops.triton.layernorm import RMSNorm, layer_norm_fn, rms_norm_fn  # Legacy mambav1 file structure
+except ImportError:
+    try:
+        from mamba_ssm.ops.triton.layer_norm import RMSNorm, layer_norm_fn, rms_norm_fn  # mambav2 file structure
+    except ImportError:
+        RMSNorm, layer_norm_fn, rms_norm_fn = None, None, None
+
+from .configuration_plantbimoe import PlantbimoeConfig
+
+def create_block(
+    d_model,
+    ssm_cfg=None,
+    norm_epsilon=1e-5,
+    rms_norm=False,
+    residual_in_fp32=False,
+    fused_add_norm=False,
+    layer_idx=None,
+    bidirectional=True,
+    bidirectional_strategy="add",
+    bidirectional_weight_tie=True,
+    device=None,
+    dtype=None,
+):
+    """Create Plantbimoe block.
+
+    Adapted from: https://github.com/state-spaces/mamba/blob/main/mamba_ssm/models/mixer_seq_simple.py
+    """
+    if ssm_cfg is None:
+        ssm_cfg = {}
+    factory_kwargs = {"device": device, "dtype": dtype}
+    bidirectional_kwargs = {
+        "bidirectional": bidirectional,
+        "bidirectional_strategy": bidirectional_strategy,
+        "bidirectional_weight_tie": bidirectional_weight_tie,
+    }
+    mixer_cls = partial(BiMambaWrapper, layer_idx=layer_idx, **ssm_cfg, **bidirectional_kwargs, **factory_kwargs)
+    norm_cls = partial(
+        nn.LayerNorm if not rms_norm else RMSNorm, eps=norm_epsilon, **factory_kwargs
+    )
+    block_cls = Block
+    # mambav2 compatibility
+    if "mlp_cls" in inspect.signature(block_cls.__init__).parameters:
+        block = block_cls(
+            d_model,
+            mixer_cls,
+            mlp_cls=nn.Identity,
+            norm_cls=norm_cls,
+            fused_add_norm=fused_add_norm,
+            residual_in_fp32=residual_in_fp32,
+        )
+    else:
+        block = block_cls(
+            d_model,
+            mixer_cls,
+            norm_cls=norm_cls,
+            fused_add_norm=fused_add_norm,
+            residual_in_fp32=residual_in_fp32,
+        )
+    block.layer_idx = layer_idx
+    return block
+
+class MambaBlock(nn.Module):
+    def __init__(self, config, layer_idx, norm_cls=nn.LayerNorm, fused_add_norm=False, residual_in_fp32=False, moe=False, device=None, dtype=None):
+
+        """
+        Simple block wrapping a mixer class with LayerNorm/RMSNorm and residual connection.
+
+        This Block has a slightly different structure compared to a regular
+        prenorm Transformer block.
+        The standard block is: LN -> MHA/MLP -> Add.
+        [Ref: https://arxiv.org/abs/2002.04745]
+        Here we have: Add -> LN -> Mixer, returning both
+        the hidden_states (output of the mixer) and the residual.
+        This is purely for performance reasons, as we can fuse add and LayerNorm.
+        The residual needs to be provided (except for the very first block).
+        """
+        super().__init__()
+        factory_kwargs = {"device": device, "dtype": dtype}
+
+        self.mixer = create_block(
+            config.d_model,
+            ssm_cfg=config.ssm_cfg,
+            norm_epsilon=config.norm_epsilon,
+            rms_norm=config.rms_norm,
+            residual_in_fp32=config.residual_in_fp32,
+            fused_add_norm=config.fused_add_norm,
+            layer_idx=layer_idx,
+            bidirectional=config.bidirectional,
+            bidirectional_strategy=config.bidirectional_strategy,
+            bidirectional_weight_tie=config.bidirectional_weight_tie,
+            **factory_kwargs,
+        )
+        ffn_layer_class = PlantbimoeSparseMoeBlock if moe else PlantbimoeMlp
+        self.feed_forward = ffn_layer_class(config)
+        self.residual_in_fp32 = residual_in_fp32
+        self.fused_add_norm = fused_add_norm
+        self.norm = norm_cls(config.d_model)
+
+        if self.fused_add_norm:
+            assert RMSNorm is not None, "RMSNorm import fails"
+            assert isinstance(
+                self.norm, (nn.LayerNorm, RMSNorm)
+            ), "Only LayerNorm and RMSNorm are supported for fused_add_norm"
+
+    def forward(
+        self, hidden_states: Tensor, residual: Optional[Tensor] = None, inference_params=None
+    ):
+        r"""Pass the input through the encoder layer.
+
+        Args:
+            hidden_states: the sequence to the encoder layer (required).
+            residual: hidden_states = Mixer(LN(residual))
+        """
+        hidden_states, residual = self.mixer(hidden_states, residual, inference_params=inference_params)
+
+        if not self.fused_add_norm:
+            residual = (hidden_states + residual) if residual is not None else hidden_states
+            hidden_states = self.norm(residual.to(dtype=self.norm.weight.dtype))
+            if self.residual_in_fp32:
+                residual = residual.to(torch.float32)
+        else:
+            fused_add_norm_fn = rms_norm_fn if isinstance(self.norm, RMSNorm) else layer_norm_fn
+            hidden_states, residual = fused_add_norm_fn(
+                hidden_states,
+                self.norm.weight,
+                self.norm.bias,
+                residual=residual,
+                prenorm=True,
+                residual_in_fp32=self.residual_in_fp32,
+                eps=self.norm.eps,
+            )
+
+        hidden_states = self.feed_forward(hidden_states)
+
+        return hidden_states, residual
+
+# Adapted from transformers.models.mixtral.modeling_mixtral.MixtralSparseMoeBlock
+class PlantbimoeSparseMoeBlock(nn.Module):
+    """
+    This implementation is
+    strictly equivalent to standard MoE with full capacity (no
+    dropped tokens). It's faster since it formulates MoE operations
+    in terms of block-sparse operations to accommodate imbalanced
+    assignments of tokens to experts, whereas standard MoE either
+    (1) drops tokens at the cost of reduced performance or (2) sets the
+    capacity factor to the number of experts and thus wastes computation
+    and memory on padding.
+    """
+
+    def __init__(self, config: PlantbimoeConfig):
+        super().__init__()
+        self.hidden_dim = config.d_model
+        # self.ffn_dim = config.intermediate_size
+        self.num_experts = config.num_experts
+        self.top_k = config.num_experts_per_tok
+
+        self.router = nn.Linear(self.hidden_dim, self.num_experts, bias=False)
+        self.experts = nn.ModuleList([PlantbimoeMlp(config) for _ in range(self.num_experts)])
+
+    def forward(self, hidden_states: torch.Tensor):
+        """Route each token to its top-k experts and combine the weighted expert outputs."""
+        batch_size, sequence_length, hidden_dim = hidden_states.shape
+
+        hidden_states = hidden_states.view(-1, hidden_dim)
+        # router_logits: (batch * sequence_length, n_experts)
+        router_logits = self.router(hidden_states)
+        routing_weights = F.softmax(router_logits, dim=1, dtype=torch.float)
+        routing_weights, selected_experts = torch.topk(routing_weights, self.top_k, dim=-1)
+        # we cast back to the input dtype
+        routing_weights = routing_weights.to(hidden_states.dtype)
+
+        final_hidden_states = torch.zeros(
+            (batch_size * sequence_length, hidden_dim), dtype=hidden_states.dtype, device=hidden_states.device
+        )
+
+        # One hot encode the selected experts to create an expert mask
+        # this will be used to easily index which expert is going to be solicited
+        expert_mask = torch.nn.functional.one_hot(selected_experts, num_classes=self.num_experts).permute(2, 1, 0)
+
+        # Loop over all available experts in the model and perform the computation on each expert
+        for expert_idx in range(self.num_experts):
+            expert_layer = self.experts[expert_idx]
+            idx, top_x = torch.where(expert_mask[expert_idx])
+
+            if top_x.shape[0] == 0:
+                continue
+
+            # Index the correct hidden states and compute the expert hidden state for
+            # the current expert. We need to make sure to multiply the output hidden
+            # states by `routing_weights` on the corresponding tokens (top-1 and top-2)
+            current_state = hidden_states[None, top_x].reshape(-1, hidden_dim)
+            current_hidden_states = expert_layer(current_state) * routing_weights[top_x, idx, None]
+
+            # However `index_add_` only support torch tensors for indexing so we'll use
+            # the `top_x` tensor here.
+            final_hidden_states.index_add_(0, top_x, current_hidden_states.to(hidden_states.dtype))
+        final_hidden_states = final_hidden_states.reshape(batch_size, sequence_length, hidden_dim)
+        return final_hidden_states
+
+# Adapted from transformers.models.mixtral.modeling_mixtral
+class PlantbimoeMlp(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+        self.config = config
+        self.hidden_size = config.d_model
+        self.intermediate_size = config.intermediate_size
+        self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
+        self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
+        self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False)
+        self.act_fn = nn.SiLU()
+
+    def forward(self, x):
+
+        return self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x))
+
+class BiMambaWrapper(nn.Module):
+    """Thin wrapper around Mamba to support bi-directionality."""
+
+    def __init__(
+        self,
+        d_model: int,
+        bidirectional: bool = True,
+        bidirectional_strategy: Optional[str] = "add",
+        bidirectional_weight_tie: bool = True,
+        **mamba_kwargs,
+    ):
+        super().__init__()
+        if bidirectional and bidirectional_strategy is None:
+            bidirectional_strategy = "add"  # Default strategy: `add`
+        if bidirectional and bidirectional_strategy not in ["add", "ew_multiply"]:
+            raise NotImplementedError(f"`{bidirectional_strategy}` strategy for bi-directionality is not implemented!")
+        self.bidirectional = bidirectional
+        self.bidirectional_strategy = bidirectional_strategy
+        self.mamba_fwd = Mamba(
+            d_model=d_model,
+            **mamba_kwargs
+        )
+        if bidirectional:
+            self.mamba_rev = Mamba(
+                d_model=d_model,
+                **mamba_kwargs
+            )
+            if bidirectional_weight_tie:  # Tie in and out projections (where most of param count lies)
+                self.mamba_rev.in_proj.weight = self.mamba_fwd.in_proj.weight
+                self.mamba_rev.in_proj.bias = self.mamba_fwd.in_proj.bias
+                self.mamba_rev.out_proj.weight = self.mamba_fwd.out_proj.weight
+                self.mamba_rev.out_proj.bias = self.mamba_fwd.out_proj.bias
+        else:
+            self.mamba_rev = None
+
+    def forward(self, hidden_states, inference_params=None):
+        """Bidirectional-enabled forward pass.
+
+        hidden_states: (B, L, D)
+        Returns: same shape as hidden_states
+        """
+        out = self.mamba_fwd(hidden_states, inference_params=inference_params)
+        if self.bidirectional:
+            out_rev = self.mamba_rev(
+                hidden_states.flip(dims=(1,)),  # Flip along the sequence length dimension
+                inference_params=inference_params
+            ).flip(dims=(1,))  # Flip back for combining with forward hidden states
+            if self.bidirectional_strategy == "add":
+                out = out + out_rev
+            elif self.bidirectional_strategy == "ew_multiply":
+                out = out * out_rev
+            else:
+                raise NotImplementedError(f"`{self.bidirectional_strategy}` for bi-directionality not implemented!")
+        return out
+
+
+class PlantbimoeEmbeddings(nn.Module):
+    def __init__(
+        self,
+        config: PlantbimoeConfig,
+        device=None,
+        dtype=None,
+    ):
+        super().__init__()
+        factory_kwargs = {"device": device, "dtype": dtype}
+        self.word_embeddings = nn.Embedding(config.vocab_size, config.d_model, **factory_kwargs)
+
+    def forward(self, input_ids):
+        """
+        input_ids: (batch, seqlen)
+        """
+        return self.word_embeddings(input_ids)
+
+
+class PlantbimoeMixerModel(nn.Module):
+    def __init__(
+        self,
+        config: PlantbimoeConfig,
+        device=None,
+        dtype=None,
+    ) -> None:
+        super().__init__()
+        factory_kwargs = {"device": device, "dtype": dtype}
+
+        self.fused_add_norm = config.fused_add_norm
+        self.residual_in_fp32 = config.residual_in_fp32
+
+        self.embeddings = PlantbimoeEmbeddings(config, **factory_kwargs)
+
+        # Mamba changes the order of residual and layer norm:
+        # Instead of LN -> Attn / MLP -> Add, we do:
+        # Add -> LN -> Attn / MLP / Mixer, returning both the residual branch (output of Add) and
+        # the main branch (output of MLP / Mixer). The model definition is unchanged.
+        # This is for performance reason: we can fuse add + layer_norm.
+        if config.fused_add_norm:
+            if layer_norm_fn is None or rms_norm_fn is None:
+                raise ImportError("Failed to import Triton LayerNorm / RMSNorm kernels")
+
+        norm_cls = partial(
+            nn.LayerNorm if not config.rms_norm else RMSNorm, eps=config.norm_epsilon, **factory_kwargs
+        )
+
+        self.layers = nn.ModuleList()
+        for i in range(config.n_layer):
+            moe = (i + 1) % 2 == 0
+            self.layers.append(
+                MambaBlock(
+                    config=config,
+                    layer_idx=i,
+                    norm_cls=norm_cls,
+                    fused_add_norm=config.fused_add_norm,
+                    residual_in_fp32=config.residual_in_fp32,
+                    moe=moe
+                )
+            )
+
+        norm_f = (nn.LayerNorm if not config.rms_norm else RMSNorm)(
+            config.d_model, eps=config.norm_epsilon, **factory_kwargs
+        )
+        self.norm_f = norm_f
+
+    def forward(self, input_ids, inputs_embeds=None, output_hidden_states=False):
+        """Mixer forward."""
+        all_hidden_states = []
+        if inputs_embeds is not None:
+            hidden_states = inputs_embeds
+        else:
+            hidden_states = self.embeddings(input_ids)
+
+        residual = None
+        for layer in self.layers:
+            if output_hidden_states:
+                all_hidden_states.append(hidden_states)
+            # TODO: Add support for gradient checkpointing
+            hidden_states, residual = layer(
+                hidden_states, residual, inference_params=None
+            )
+
+        if not self.fused_add_norm:
+            residual = (hidden_states + residual) if residual is not None else hidden_states
+            hidden_states = self.norm_f(residual.to(dtype=self.norm_f.weight.dtype))
+        else:
+            fused_add_norm_fn = rms_norm_fn if isinstance(self.norm_f, RMSNorm) else layer_norm_fn
+            # Set prenorm=False here since we don't need the residual
+            hidden_states = fused_add_norm_fn(
+                hidden_states,
+                self.norm_f.weight,
+                self.norm_f.bias,
+                eps=self.norm_f.eps,
+                residual=residual,
+                prenorm=False,
+                residual_in_fp32=self.residual_in_fp32,
+            )
+        if output_hidden_states:
+            all_hidden_states.append(hidden_states)
+        return hidden_states, all_hidden_states
+
+
+def cross_entropy(logits, y, ignore_index=-100):
+    """Cross entropy loss."""
+    logits = logits.view(-1, logits.shape[-1])
+    y = y.view(-1)
+    return F.cross_entropy(logits, y, ignore_index=ignore_index)
+
+
+def weighted_cross_entropy(logits, y, loss_weights, ignore_index=-100):
+    """Weighted cross entropy loss (discounts certain tokens, e.g., repeated base pairs in genome)."""
+    logits = logits.view(-1, logits.shape[-1])
+    y = y.view(-1)
+    ce = F.cross_entropy(logits, y, ignore_index=ignore_index, reduction="none")
+    loss_weights = loss_weights.view(-1)
+    loss_weights[y == ignore_index] = 0.0
+    # TODO: Follows GPN implementation, but should we remove weight normalization?
+    return (ce * (loss_weights / loss_weights.sum())).sum()
+
+
+class PlantbimoePreTrainedModel(PreTrainedModel):
+    """PreTrainedModel wrapper for Plantbimoe backbone."""
+    config_class = PlantbimoeConfig
+    # base_model_prefix = "plantbimoe"
+    supports_gradient_checkpointing = False
+    _no_split_modules = ["MambaBlock"]
+
+    def _init_weights(
+        self,
+        module,
+        initializer_range=0.02,  # Now only used for embedding layer.
+        **kwargs,
+    ):
+        """Adapted from: https://github.com/state-spaces/mamba/blob/main/mamba_ssm/models/mixer_seq_simple.py"""
+
+        n_layer = self.config.n_layer
+        initialized_cfg = self.config.initializer_cfg if self.config.initializer_cfg is not None else {}
+        rescale_prenorm_residual = initialized_cfg.get("rescale_prenorm_residual", True)
+        initializer_range = initialized_cfg.get("initializer_range", initializer_range)
+        n_residuals_per_layer = initialized_cfg.get("n_residuals_per_layer", 1)
+
+        if isinstance(module, nn.Linear):
+            if module.bias is not None:
+                if not getattr(module.bias, "_no_reinit", False):
+                    nn.init.zeros_(module.bias)
+        elif isinstance(module, nn.Embedding):
+            nn.init.normal_(module.weight, std=initializer_range)
+
+        if rescale_prenorm_residual:
+            # Reinitialize selected weights subject to the OpenAI GPT-2 Paper Scheme:
+            # > A modified initialization which accounts for the accumulation on the residual path with model depth.
+            # > Scale the weights of residual layers at initialization by a factor of 1/√N where N is the # of
+            # residual layers.
+            # > -- GPT-2 :: https://openai.com/blog/better-language-models/
+            #
+            # Reference (Megatron-LM): https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/model/gpt_model.py
+            for name, p in module.named_parameters():
+                if name in ["out_proj.weight", "fc2.weight"]:
+                    # Special Scaled Initialization --> There are 2 Layer Norms per Transformer Block
+                    # Following Pytorch init, except scale by 1/sqrt(2 * n_layer)
+                    # We need to reinit p since this code could be called multiple times
+                    # Having just p *= scale would repeatedly scale it down
+                    nn.init.kaiming_uniform_(p, a=math.sqrt(5))
+                    with torch.no_grad():
+                        p /= math.sqrt(n_residuals_per_layer * n_layer)
+
+
+class Plantbimoe(PlantbimoePreTrainedModel):
+    """Plantbimoe model that can be instantiated using HF patterns."""
+    def __init__(self, config: PlantbimoeConfig, device=None, dtype=None, **kwargs):
+        super().__init__(config)
+
+        # Adjust vocab size if vocab padding is set.
+        if config.vocab_size % config.pad_vocab_size_multiple != 0:
+            config.vocab_size += config.pad_vocab_size_multiple - (config.vocab_size % config.pad_vocab_size_multiple)
+
+        self.config = config
+        factory_kwargs = {"device": device, "dtype": dtype}
+        self.backbone = PlantbimoeMixerModel(config, **factory_kwargs, **kwargs)
+
+    def forward(
+        self,
+        input_ids: torch.LongTensor = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[torch.Tensor, Tuple, BaseModelOutputWithNoAttention]:
+        """HF-compatible forward method."""
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        hidden_states, all_hidden_states = self.backbone(
+            input_ids,
+            inputs_embeds=inputs_embeds,
+            output_hidden_states=output_hidden_states
+        )
+        if return_dict:
+            return BaseModelOutputWithNoAttention(
+                last_hidden_state=hidden_states,
+                hidden_states=all_hidden_states if output_hidden_states else None
+            )
+        elif output_hidden_states:
+            return hidden_states, all_hidden_states
+        else:
+            return (hidden_states,)
+
+
+class PlantbimoeForMaskedLM(PlantbimoePreTrainedModel):
+    """HF-compatible Plantbimoe model for masked language modeling."""
+
+    def __init__(self, config: PlantbimoeConfig, device=None, dtype=None, **kwargs):
+        super().__init__(config, **kwargs)
+        factory_kwargs = {"device": device, "dtype": dtype}
+        self.plantbimoe = Plantbimoe(config, **factory_kwargs, **kwargs)
+        self.lm_head = nn.Linear(
+            config.d_model,
+            self.config.vocab_size,  # Use plantbimoe config as it might have been updated
+            bias=False,
+            **factory_kwargs
+        )
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def get_input_embeddings(self):
+        return self.plantbimoe.backbone.embeddings.word_embeddings
+
+    def set_input_embeddings(self, value):
+        self.plantbimoe.backbone.embeddings.word_embeddings = value
+
+    def get_output_embeddings(self):
+        return self.lm_head
+
+    def set_output_embeddings(self, new_embeddings):
+        """Overrides output embeddings."""
+        self.lm_head = new_embeddings
+
+    def tie_weights(self):
+        """Tie weights."""
+        super().tie_weights()
+
+    def get_decoder(self):
+        """Get decoder (backbone) for the model."""
+        return self.plantbimoe
+
+    def set_decoder(self, decoder):
+        """Set decoder (backbone) for the model."""
+        self.plantbimoe = decoder
+
+    def forward(
+        self,
+        input_ids: torch.LongTensor = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        labels: Optional[torch.LongTensor] = None,
+        loss_weights: Optional[torch.FloatTensor] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+        **kwargs,
+    ) -> Union[Tuple, MaskedLMOutput]:
+        """HF-compatible forward method."""
+
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
+        outputs = self.plantbimoe(
+            input_ids=input_ids,
+            inputs_embeds=inputs_embeds,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        hidden_states = outputs[0]
+        logits = self.lm_head(hidden_states)
+        logits = logits.float()
+
+        loss = None
+        if labels is not None:
+            if loss_weights is not None:
+                loss = weighted_cross_entropy(logits, labels, loss_weights, ignore_index=self.config.pad_token_id)
+            else:
+                loss = cross_entropy(logits, labels, ignore_index=self.config.pad_token_id)
+
+        if not return_dict:
+            output = (logits,) + outputs[1:]
+            return (loss,) + output if loss is not None else output
+
+        return MaskedLMOutput(
+            loss=loss,
+            logits=logits,
+            hidden_states=outputs.hidden_states,
+        )
+
+
+class PlantbimoeForSequenceClassification(PlantbimoePreTrainedModel):
+    def __init__(
+        self,
+        config: PlantbimoeConfig,
+        pooling_strategy: str = "mean",
+        conjoin_train: bool = False,
+        conjoin_eval: bool = False,
+        device=None,
+        dtype=None,
+        **kwargs):
+        super().__init__(config, **kwargs)
+        if pooling_strategy not in ["mean", "max", "first", "last", "all"]:
+            raise NotImplementedError(f"Pooling strategy `{pooling_strategy}` not implemented.")
+        self.pooling_strategy = pooling_strategy
+        factory_kwargs = {"device": device, "dtype": dtype}
+        self.num_labels = kwargs.get("num_labels", config.num_labels)
+        self.plantbimoe = Plantbimoe(config, **factory_kwargs, **kwargs)
+        self.score = nn.Linear(config.d_model, self.num_labels, bias=False)
+
+        self.conjoin_train = conjoin_train
+        self.conjoin_eval = conjoin_eval
+
+        # Initialize weights and apply final processing
+        self.post_init()
+        self.init_scorer()
+
+    def init_scorer(self, initializer_range=0.02):
+        initializer_range = self.config.initializer_cfg.get("initializer_range", initializer_range) \
+            if self.config.initializer_cfg is not None else initializer_range
+        self.score.weight.data.normal_(std=initializer_range)
+
+    def get_input_embeddings(self):
+        return self.plantbimoe.backbone.embeddings.word_embeddings
+
+    def set_input_embeddings(self, value):
+        self.plantbimoe.backbone.embeddings.word_embeddings = value
+
+    def pool_hidden_states(self, hidden_states, sequence_length_dim=1):
+        """Pools hidden states along sequence length dimension."""
+        if self.pooling_strategy == "mean":  # Mean pooling along sequence length dimension
+            return hidden_states.mean(dim=sequence_length_dim)
+        if self.pooling_strategy == "max":  # Max pooling along sequence length dimension
+            return hidden_states.max(dim=sequence_length_dim).values
+        if self.pooling_strategy == "last":  # Use embedding of last token in the sequence
+            return hidden_states.moveaxis(sequence_length_dim, 0)[-1, ...]
+        if self.pooling_strategy == "first":  # Use embedding of first token in the sequence
+            return hidden_states.moveaxis(sequence_length_dim, 0)[0, ...]
+        if self.pooling_strategy == "all":  # Segmentation pooling: keep all tokens except the special tokens
+            return hidden_states[:, 1:-1, :]
+
+    def forward(
+        self,
+        input_ids: torch.LongTensor = None,
+        attention_mask=None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        labels: Optional[torch.LongTensor] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[Tuple, SequenceClassifierOutput]:
+        r"""
+        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
+            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
+            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
+        """
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        # Get hidden representations from the backbone
+        if self.conjoin_train or (self.conjoin_eval and not self.training):  # For conjoining / post-hoc conjoining
+            assert input_ids is not None, "`input_ids` must be provided for conjoining."
+            assert input_ids.ndim == 3, "`input_ids` must be a 3D tensor: channels correspond to forward and rc strands."
+            transformer_outputs = self.plantbimoe(
+                input_ids[..., 0],
+                inputs_embeds=None,
+                output_hidden_states=output_hidden_states,
+                return_dict=return_dict,
+            )
+            transformer_outputs_rc = self.plantbimoe(
+                input_ids[..., 1],
+                inputs_embeds=None,
+                output_hidden_states=output_hidden_states,
+                return_dict=return_dict,
+            )
+            # Stack along channel dimension (dim=-1)
+            hidden_states = torch.stack([transformer_outputs[0], transformer_outputs_rc[0]], dim=-1)
+        else:
+            transformer_outputs = self.plantbimoe(
+                input_ids,
+                inputs_embeds=None,
+                output_hidden_states=output_hidden_states,
+                return_dict=return_dict,
+            )
+            hidden_states = transformer_outputs[0]
+
+        # Pool and get logits
+        pooled_hidden_states = self.pool_hidden_states(hidden_states)
+        # Potentially run `score` twice (with parameters shared) for conjoining
+        if hidden_states.ndim == 4:  # bsz, seq_len, hidden_dim, 2 where last channel has the stacked fwd and rc reps
+            logits_fwd = self.score(pooled_hidden_states[..., 0])
+            logits_rc = self.score(pooled_hidden_states[..., 1])
+            logits = (logits_fwd + logits_rc) / 2
+        else:
+            logits = self.score(pooled_hidden_states)
+
+        loss = None
+        if labels is not None:
+            labels = labels.to(logits.device)
+            if self.config.problem_type is None:
+                if self.num_labels == 1:
+                    self.config.problem_type = "regression"
+                elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
+                    self.config.problem_type = "single_label_classification"
+                else:
+                    self.config.problem_type = "multi_label_classification"
+
+
+            if self.config.problem_type == "regression":
+                if self.num_labels == 1:
+                    loss = F.mse_loss(logits.squeeze(), labels.squeeze())
+                else:
+                    loss = F.mse_loss(logits, labels)
+            elif self.config.problem_type == "single_label_classification":
+                loss = F.cross_entropy(logits.view(-1, self.num_labels), labels.view(-1).squeeze(-1))
+            elif self.config.problem_type == "multi_label_classification":
+                loss = F.binary_cross_entropy_with_logits(logits, labels.float())
+        if not return_dict:
+            output = (logits,) + transformer_outputs[1:]
+            return ((loss,) + output) if loss is not None else output
+
+        return SequenceClassifierOutput(
+            loss=loss,
+            logits=logits,
+            hidden_states=transformer_outputs.hidden_states,
+        )
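Two details of the modeling code above are easy to miss. First, `PlantbimoeMixerModel` alternates feed-forward types: `moe = (i + 1) % 2 == 0`, so every second `MambaBlock` uses the sparse MoE and the rest use the dense `PlantbimoeMlp`. Second, the expert-mask indexing in `PlantbimoeSparseMoeBlock.forward` is compact; the standalone sketch below (toy shapes, independent of the checkpoint) traces what `torch.where(expert_mask[expert_idx])` returns:

import torch
import torch.nn.functional as F

tokens, num_experts, top_k = 6, 4, 1  # toy sizes; the checkpoint uses top-1 of 4 experts
router_logits = torch.randn(tokens, num_experts)
routing_weights = F.softmax(router_logits, dim=1, dtype=torch.float)
routing_weights, selected_experts = torch.topk(routing_weights, top_k, dim=-1)

# (tokens, top_k, num_experts) -> (num_experts, top_k, tokens)
expert_mask = F.one_hot(selected_experts, num_classes=num_experts).permute(2, 1, 0)
for expert_idx in range(num_experts):
    # `top_x`: flattened token positions routed to this expert;
    # `idx`: which of each token's top-k slots picked it, so
    # `routing_weights[top_x, idx]` recovers the gate values.
    idx, top_x = torch.where(expert_mask[expert_idx])
    print(expert_idx, top_x.tolist(), routing_weights[top_x, idx].tolist())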
plantbimoe/tokenization_plantbimoe.py
ADDED
@@ -0,0 +1,113 @@
+from transformers import PreTrainedTokenizer
+from typing import List, Optional, Dict, Tuple
+
+# Copied from HyenaDNATokenizer
+class PlantbimoeTokenizer(PreTrainedTokenizer):
+    model_input_names = ["input_ids"]
+
+    def __init__(self,
+                 model_max_length: int,
+                 bos_token="[BOS]",
+                 eos_token="[SEP]",
+                 sep_token="[SEP]",
+                 cls_token="[CLS]",
+                 pad_token="[PAD]",
+                 mask_token="[MASK]",
+                 unk_token="[UNK]",
+                 **kwargs):
+        """Character tokenizer for Hugging Face transformers.
+
+        The character vocabulary is fixed to ('A', 'C', 'G', 'T', 'N'); any
+        character outside this set is replaced by the special token [UNK]
+        (id=6). The special tokens and their corresponding ids are:
+            "[CLS]": 0
+            "[SEP]": 1
+            "[BOS]": 2
+            "[MASK]": 3
+            "[PAD]": 4
+            "[RESERVED]": 5
+            "[UNK]": 6
+        Character ids start at 7 ('A'=7, 'C'=8, 'G'=9, 'T'=10, 'N'=11).
+        Args:
+            model_max_length (int): Model maximum sequence length.
+        """
+        self.characters = ('A', 'C', 'G', 'T', 'N')
+        self.model_max_length = model_max_length
+
+        self._vocab_str_to_int = {
+            "[CLS]": 0,
+            "[SEP]": 1,
+            "[BOS]": 2,
+            "[MASK]": 3,
+            "[PAD]": 4,
+            "[RESERVED]": 5,
+            "[UNK]": 6,
+            **{ch: i + 7 for i, ch in enumerate(self.characters)},
+        }
+        self._vocab_int_to_str = {v: k for k, v in self._vocab_str_to_int.items()}
+        add_prefix_space = kwargs.pop("add_prefix_space", False)
+        padding_side = kwargs.pop("padding_side", "left")
+
+        super().__init__(
+            bos_token=bos_token,
+            eos_token=eos_token,
+            sep_token=sep_token,
+            cls_token=cls_token,
+            pad_token=pad_token,
+            mask_token=mask_token,
+            unk_token=unk_token,
+            add_prefix_space=add_prefix_space,
+            model_max_length=model_max_length,
+            padding_side=padding_side,
+            **kwargs,
+        )
+
+    @property
+    def vocab_size(self) -> int:
+        return len(self._vocab_str_to_int)
+
+    def _tokenize(self, text: str) -> List[str]:
+        return list(text)
+
+    def _convert_token_to_id(self, token: str) -> int:
+        return self._vocab_str_to_int.get(token, self._vocab_str_to_int["[UNK]"])
+
+    def _convert_id_to_token(self, index: int) -> str:
+        return self._vocab_int_to_str[index]
+
+    def convert_tokens_to_string(self, tokens):
+        return "".join(tokens)
+
+    def get_special_tokens_mask(
+        self,
+        token_ids_0: List[int],
+        token_ids_1: Optional[List[int]] = None,
+        already_has_special_tokens: bool = False,
+    ) -> List[int]:
+        if already_has_special_tokens:
+            return super().get_special_tokens_mask(
+                token_ids_0=token_ids_0,
+                token_ids_1=token_ids_1,
+                already_has_special_tokens=True,
+            )
+
+        result = [1] + ([0] * len(token_ids_0)) + [1]  # [CLS] + tokens + [SEP]
+        if token_ids_1 is not None:
+            result += ([0] * len(token_ids_1)) + [1]
+        return result
+
+    def build_inputs_with_special_tokens(
+        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
+    ) -> List[int]:
+        cls = [self.cls_token_id]
+        sep = [self.sep_token_id]
+        result = cls + token_ids_0 + sep
+        if token_ids_1 is not None:
+            result += token_ids_1 + sep
+        return result
+
+    def get_vocab(self) -> Dict[str, int]:
+        return self._vocab_str_to_int
+
+    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple:
+        return ()
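The tokenizer is character-level, so the id layout is fully determined by the table above: 7 special tokens followed by the 5 nucleotide characters, giving the vocab_size of 12 seen in config.json (already a multiple of pad_vocab_size_multiple=6, so no padding is added at model init). A quick sanity check:

from plantbimoe.tokenization_plantbimoe import PlantbimoeTokenizer

tok = PlantbimoeTokenizer(model_max_length=512)
enc = tok("ACGTN")
# build_inputs_with_special_tokens wraps the sequence in [CLS] ... [SEP],
# and A/C/G/T/N map to ids 7..11:
print(enc["input_ids"])  # [0, 7, 8, 9, 10, 11, 1]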
pytorch_model.bin
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:945126f2ae0fdd3cd04f9708fa289c1273515fc40afa6ff624d368177fb59a95
+size 462598584
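Putting the pieces together, a masked-nucleotide inference sketch (the local path, sequence, and 512-token limit are illustrative; nothing below is prescribed by the commit itself):

import torch
from transformers import AutoModelForMaskedLM
from plantbimoe.tokenization_plantbimoe import PlantbimoeTokenizer

tokenizer = PlantbimoeTokenizer(model_max_length=512)
model = AutoModelForMaskedLM.from_pretrained(".", trust_remote_code=True)  # local clone of this repo
model.eval()

# "[MASK]" is a registered special token, so it is split out before the
# character-level _tokenize step rather than being tokenized per character.
inputs = tokenizer("ACGT[MASK]ACGT", return_tensors="pt")
with torch.no_grad():
    logits = model(input_ids=inputs["input_ids"]).logits

mask_pos = (inputs["input_ids"][0] == tokenizer.mask_token_id).nonzero().item()
pred_id = logits[0, mask_pos].argmax().item()
print(tokenizer.convert_ids_to_tokens([pred_id])[0])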