Add NVFP4 quantized checkpoint

Browse files

Files changed (13) hide show

README.md +23 -0
config.json +143 -0
configuration_bolmo.py +235 -0
generation_config.json +7 -0
model-00001-of-00002.safetensors +3 -0
model-00002-of-00002.safetensors +3 -0
model.safetensors.index.json +0 -0
modeling_bolmo.py +1351 -0
recipe.yaml +7 -0
special_tokens_map.json +5 -0
tokenization_bolmo.py +378 -0
tokenizer_config.json +34 -0
utils_bolmo.py +127 -0

README.md ADDED Viewed

	@@ -0,0 +1,23 @@

+---
+datasets:
+- HuggingFaceH4/ultrachat_200k
+base_model:
+- allenai/Bolmo-7B
+---
+# Bolmo-7B-nvfp4
+**Format:** NVFP4 — weights & activations quantized to FP4 with dual scaling.
+**Base model:** `allenai/Bolmo-7B`
+**How it was made:** One-shot calibration with LLM Compressor (NVFP4 recipe), long-seq calibration with HuggingFaceH4/ultrachat_200k.
+> Notes: Keep `lm_head` in high precision; calibrate on long, domain-relevant sequences.
+Check the original model card for information about this model.
+# Running the model with VLLM in Docker
+```sh
+sudo docker run --runtime nvidia --gpus all -p 8000:8000 --ipc=host vllm/vllm-openai:nightly --model Firworks/Bolmo-7B-nvfp4 --dtype auto --max-model-len 32768
+```
+This was tested on an RTX Pro 6000 Blackwell cloud instance.
+If there are other models you're interested in seeing quantized to NVFP4 for use on the DGX Spark, or other modern Blackwell (or newer) cards let me know. I'm trying to make more NVFP4 models available to allow more people to try them out.

config.json ADDED Viewed

	@@ -0,0 +1,143 @@

+{
+  "add_expanded_embeddings": true,
+  "architectures": [
+    "BolmoForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "auto_map": {
+    "AutoConfig": "configuration_bolmo.BolmoConfig",
+    "AutoModelForCausalLM": "modeling_bolmo.BolmoForCausalLM"
+  },
+  "bos_token_id": 1,
+  "boundary_predictor_lookahead": 1,
+  "boundary_threshold": "sample:0",
+  "dtype": "float32",
+  "eos_token_id": 1,
+  "hidden_act": "silu",
+  "hidden_size": 4096,
+  "initializer_range": 0.02,
+  "intermediate_size": 11008,
+  "layer_types": [
+    "sliding_attention",
+    "sliding_attention",
+    "sliding_attention",
+    "full_attention",
+    "sliding_attention",
+    "sliding_attention",
+    "sliding_attention",
+    "full_attention",
+    "sliding_attention",
+    "sliding_attention",
+    "sliding_attention",
+    "full_attention",
+    "sliding_attention",
+    "sliding_attention",
+    "sliding_attention",
+    "full_attention",
+    "sliding_attention",
+    "sliding_attention",
+    "sliding_attention",
+    "full_attention",
+    "sliding_attention",
+    "sliding_attention",
+    "sliding_attention",
+    "full_attention",
+    "sliding_attention",
+    "sliding_attention",
+    "sliding_attention",
+    "full_attention",
+    "sliding_attention",
+    "sliding_attention",
+    "sliding_attention",
+    "full_attention"
+  ],
+  "local_intermediate_size": 5504,
+  "local_rms_norm_eps": 1e-05,
+  "max_position_embeddings": 65536,
+  "model_type": "bolmo",
+  "num_attention_heads": 32,
+  "num_hidden_layers": 32,
+  "num_key_value_heads": 32,
+  "num_local_decoder_layers": 4,
+  "num_local_encoder_layers": 1,
+  "num_local_heads": 16,
+  "pad_token_id": 0,
+  "quantization_config": {
+    "config_groups": {
+      "group_0": {
+        "format": "nvfp4-pack-quantized",
+        "input_activations": {
+          "actorder": null,
+          "block_structure": null,
+          "dynamic": "local",
+          "group_size": 16,
+          "num_bits": 4,
+          "observer": "minmax",
+          "observer_kwargs": {},
+          "strategy": "tensor_group",
+          "symmetric": true,
+          "type": "float"
+        },
+        "output_activations": null,
+        "targets": [
+          "Linear"
+        ],
+        "weights": {
+          "actorder": null,
+          "block_structure": null,
+          "dynamic": false,
+          "group_size": 16,
+          "num_bits": 4,
+          "observer": "minmax",
+          "observer_kwargs": {},
+          "strategy": "tensor_group",
+          "symmetric": true,
+          "type": "float"
+        }
+      }
+    },
+    "format": "nvfp4-pack-quantized",
+    "global_compression_ratio": null,
+    "ignore": [
+      "lm_head"
+    ],
+    "kv_cache_scheme": null,
+    "quant_method": "compressed-tensors",
+    "quantization_status": "compressed",
+    "sparsity_config": {},
+    "transform_config": {},
+    "version": "0.12.2"
+  },
+  "rms_norm_eps": 1e-06,
+  "rope_scaling": {
+    "attention_factor": 1.2079441541679836,
+    "beta_fast": 32,
+    "beta_slow": 1,
+    "factor": 8.0,
+    "original_max_position_embeddings": 8192,
+    "rope_type": "yarn"
+  },
+  "rope_theta": 500000,
+  "sliding_window": 4096,
+  "subword_vocab_size": 100278,
+  "tie_word_embeddings": false,
+  "tokenizer_config": {
+    "bos_token_id": 1,
+    "bpe_token_end_id": 3,
+    "eos_token_id": 1,
+    "original_identifier": "allenai/dolma2-tokenizer",
+    "pad_token_id": 0,
+    "special_tokens": [
+      "<pad>",
+      "<bos>",
+      "<eos>",
+      "<bpe_token_end>"
+    ],
+    "special_tokens_first": true,
+    "vocab_size": 520
+  },
+  "transformers_version": "4.57.3",
+  "use_cache": true,
+  "vocab_size": 520
+}

configuration_bolmo.py ADDED Viewed

	@@ -0,0 +1,235 @@

+from dataclasses import asdict
+from typing import Any
+from transformers.configuration_utils import PretrainedConfig, layer_type_validation
+from transformers.modeling_rope_utils import rope_config_validation
+from .tokenization_bolmo import BolmoTokenizerConfig
+class BolmoConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`Olmo3Model`]. It is used to instantiate an OLMo3
+    model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
+    defaults will yield a similar configuration to that of the [allenai/OLMo-3-0725-1B](https://huggingface.co/allenai/OLMo-3-0725-1B).
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+    Args:
+        vocab_size (`int`, *optional*, defaults to 50304):
+            Vocabulary size of the Olmo3 model. Defines the number of different tokens that can be represented by the
+            `inputs_ids` passed when calling [`Olmo3Model`]
+        hidden_size (`int`, *optional*, defaults to 4096):
+            Dimension of the hidden representations.
+        intermediate_size (`int`, *optional*, defaults to 11008):
+            Dimension of the MLP representations.
+        num_hidden_layers (`int`, *optional*, defaults to 32):
+            Number of hidden layers in the Transformer decoder.
+        num_attention_heads (`int`, *optional*, defaults to 32):
+            Number of attention heads for each attention layer in the Transformer decoder.
+        num_key_value_heads (`int`, *optional*):
+            This is the number of key_value heads that should be used to implement Grouped Query Attention. If
+            `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
+            `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
+            converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
+            by meanpooling all the original heads within that group. For more details, check out [this
+            paper](https://huggingface.co/papers/2305.13245). If it is not specified, will default to
+            `num_attention_heads`.
+        hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
+            The non-linear activation function (function or string) in the decoder.
+        max_position_embeddings (`int`, *optional*, defaults to 2048):
+            The maximum sequence length that this model might ever be used with.
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        use_cache (`bool`, *optional*, defaults to `True`):
+            Whether or not the model should return the last key/values attentions (not used by all models). Only
+            relevant if `config.is_decoder=True`.
+        pad_token_id (`int`, *optional*, defaults to 1):
+            Padding token id.
+        bos_token_id (`int`, *optional*):
+            Beginning of stream token id.
+        eos_token_id (`int`, *optional*, defaults to 50279):
+            End of stream token id.
+        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
+            Whether to tie weight embeddings
+        rope_theta (`float`, *optional*, defaults to 10000.0):
+            The base period of the RoPE embeddings.
+        rope_scaling (`Dict`, *optional*):
+            Dictionary containing the scaling configuration for the RoPE embeddings. NOTE: if you apply new rope type
+            and you expect the model to work on longer `max_position_embeddings`, we recommend you to update this value
+            accordingly.
+            Expected contents:
+                `rope_type` (`str`):
+                    The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope',
+                    'llama3'], with 'default' being the original RoPE implementation.
+                `factor` (`float`, *optional*):
+                    Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings. In
+                    most scaling types, a `factor` of x will enable the model to handle sequences of length x *
+                    original maximum pre-trained length.
+                `original_max_position_embeddings` (`int`, *optional*):
+                    Used with 'dynamic', 'longrope' and 'llama3'. The original max position embeddings used during
+                    pretraining.
+                `attention_factor` (`float`, *optional*):
+                    Used with 'yarn' and 'longrope'. The scaling factor to be applied on the attention
+                    computation. If unspecified, it defaults to value recommended by the implementation, using the
+                    `factor` field to infer the suggested value.
+                `beta_fast` (`float`, *optional*):
+                    Only used with 'yarn'. Parameter to set the boundary for extrapolation (only) in the linear
+                    ramp function. If unspecified, it defaults to 32.
+                `beta_slow` (`float`, *optional*):
+                    Only used with 'yarn'. Parameter to set the boundary for interpolation (only) in the linear
+                    ramp function. If unspecified, it defaults to 1.
+                `short_factor` (`list[float]`, *optional*):
+                    Only used with 'longrope'. The scaling factor to be applied to short contexts (<
+                    `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
+                    size divided by the number of attention heads divided by 2
+                `long_factor` (`list[float]`, *optional*):
+                    Only used with 'longrope'. The scaling factor to be applied to long contexts (<
+                    `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
+                    size divided by the number of attention heads divided by 2
+                `low_freq_factor` (`float`, *optional*):
+                    Only used with 'llama3'. Scaling factor applied to low frequency components of the RoPE
+                `high_freq_factor` (`float`, *optional*):
+                    Only used with 'llama3'. Scaling factor applied to high frequency components of the RoPE
+        attention_bias (`bool`, defaults to `False`, *optional*, defaults to `False`):
+            Whether to use a bias in the query, key, value and output projection layers during self-attention.
+        attention_dropout (`float`, *optional*, defaults to 0.0):
+            The dropout ratio for the attention probabilities.
+        rms_norm_eps (`float`, *optional*, defaults to 1e-05):
+            The epsilon used by the rms normalization layers.
+        sliding_window (`int`, *optional*, defaults to 4096):
+            Size of the sliding window for sliding window attention.
+        layer_types (`list`, *optional*):
+            Attention pattern for each layer. Defaults to sliding window attention
+            for 3 out of 4 layers, and full attention for every 4th layer.
+    ```python
+    >>> from transformers import Olmo3Model, Olmo3Config
+    >>> # Initializing a Olmo3 7B style configuration
+    >>> configuration = Olmo3Config()
+    >>> # Initializing a model from the Olmo3 7B style configuration
+    >>> model = Olmo3Model(configuration)
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```
+    """
+    model_type = "bolmo"
+    keys_to_ignore_at_inference = ["past_key_values"]
+    base_model_tp_plan = {
+        "layers.*.self_attn.q_proj": "colwise_rep",  # we need to replicate here due to the added norm on q and k
+        "layers.*.self_attn.k_proj": "colwise_rep",  # we need to replicate here due to the added norm on q and k
+        "layers.*.self_attn.v_proj": "colwise_rep",  # we need to replicate here due to the added norm on q and k
+        "layers.*.self_attn.o_proj": "rowwise_rep",  # we need to replicate here due to the added norm on q and k
+        "layers.*.mlp.gate_proj": "colwise",
+        "layers.*.mlp.up_proj": "colwise",
+        "layers.*.mlp.down_proj": "rowwise",
+    }
+    base_model_pp_plan = {
+        "embed_tokens": (["input_ids"], ["inputs_embeds"]),
+        "layers": (["hidden_states", "attention_mask"], ["hidden_states"]),
+        "norm": (["hidden_states"], ["hidden_states"]),
+    }
+    def __init__(
+        self,
+        vocab_size=520,
+        hidden_size=4096,
+        intermediate_size=11008,
+        num_hidden_layers=32,
+        num_attention_heads=32,
+        num_key_value_heads=None,
+        hidden_act="silu",
+        max_position_embeddings=2048,
+        initializer_range=0.02,
+        use_cache=True,
+        pad_token_id=1,
+        bos_token_id=None,
+        eos_token_id=50279,
+        tie_word_embeddings=False,
+        rope_theta=10000.0,
+        rope_scaling=None,
+        attention_bias=False,
+        attention_dropout=0.0,
+        rms_norm_eps=1e-5,
+        sliding_window=4096,
+        layer_types=None,
+        # bolmo config
+        add_expanded_embeddings: bool = True,
+        boundary_predictor_lookahead: int = 1,
+        boundary_threshold: str = "sample:0",
+        num_local_encoder_layers: int = 1,
+        num_local_decoder_layers: int = 4,
+        num_local_heads: int = 16,
+        local_intermediate_size: int = 5504,
+        local_rms_norm_eps=1e-5,
+        subword_vocab_size: int = 100278, # dolma2_tokenizer subword vocab size
+        tokenizer_config: BolmoTokenizerConfig | dict[str, Any] | None = None,
+        **kwargs,
+    ):
+        super().__init__(
+            pad_token_id=pad_token_id,
+            bos_token_id=bos_token_id,
+            eos_token_id=eos_token_id,
+            tie_word_embeddings=tie_word_embeddings,
+            **kwargs,
+        )
+        self.vocab_size = vocab_size
+        self.max_position_embeddings = max_position_embeddings
+        self.hidden_size = hidden_size
+        self.intermediate_size = intermediate_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        # for backward compatibility
+        if num_key_value_heads is None:
+            num_key_value_heads = num_attention_heads
+        self.num_key_value_heads = num_key_value_heads
+        self.hidden_act = hidden_act
+        self.initializer_range = initializer_range
+        self.use_cache = use_cache
+        self.rope_theta = rope_theta
+        self.rope_scaling = rope_scaling
+        self._rope_scaling_validation()
+        self.attention_bias = attention_bias
+        self.attention_dropout = attention_dropout
+        self.rms_norm_eps = rms_norm_eps
+        self.sliding_window = sliding_window
+        self.layer_types = layer_types
+        if self.layer_types is None:
+            self.layer_types = [
+                "sliding_attention" if (i + 1) % 4 != 0 else "full_attention" for i in range(self.num_hidden_layers)
+            ]
+        layer_type_validation(self.layer_types)
+        # bolmo configuration
+        self.add_expanded_embeddings = add_expanded_embeddings
+        self.boundary_predictor_lookahead = boundary_predictor_lookahead
+        self.boundary_threshold = boundary_threshold
+        self.num_local_encoder_layers = num_local_encoder_layers
+        self.num_local_decoder_layers = num_local_decoder_layers
+        self.num_local_heads = num_local_heads
+        self.local_intermediate_size = local_intermediate_size
+        self.local_rms_norm_eps = local_rms_norm_eps
+        self.subword_vocab_size = subword_vocab_size
+        if tokenizer_config is None:
+            self.tokenizer_config = asdict(BolmoTokenizerConfig.bolmo())
+        elif isinstance(tokenizer_config, BolmoTokenizerConfig):
+            self.tokenizer_config = asdict(tokenizer_config)
+        else:
+            self.tokenizer_config = tokenizer_config
+    def _rope_scaling_validation(self):
+        """
+        Validate the `rope_scaling` configuration.
+        """
+        rope_config_validation(self)
+__all__ = ["BolmoConfig"]

generation_config.json ADDED Viewed

	@@ -0,0 +1,7 @@

+{
+  "_from_model_config": true,
+  "do_sample": true,
+  "eos_token_id": 50279,
+  "pad_token_id": 1,
+  "transformers_version": "4.57.3"
+}

model-00001-of-00002.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:99b1a3015951f3907501cc07ed492c6e60e5ade94b0fe62faf09f15d8ef58c87
+size 4979678176

model-00002-of-00002.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:ff6107aa16c14af67f8169fb536c4c94b70e8d0a48b11bb67f6d6e2ce5d829a2
+size 742727128

model.safetensors.index.json ADDED Viewed

The diff for this file is too large to render. See raw diff

modeling_bolmo.py ADDED Viewed

	@@ -0,0 +1,1351 @@

+import copy
+from typing import Callable, Optional, Union, cast
+import math
+import torch
+import torch.nn as nn
+from torch.nn import functional as F
+from transformers.utils.generic import TransformersKwargs
+from transformers.activations import ACT2FN
+from transformers.cache_utils import Cache, DynamicCache
+from transformers.generation import GenerationMixin, GenerationConfig, LogitsProcessorList, StoppingCriteriaList
+from transformers.generation.utils import GenerateOutput
+from transformers.integrations import use_kernel_forward_from_hub
+from transformers.masking_utils import create_causal_mask, create_sliding_window_causal_mask
+from transformers.modeling_layers import GradientCheckpointingLayer
+from transformers.modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast
+from transformers.modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update
+from transformers.modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
+from transformers.processing_utils import Unpack
+from transformers.utils import can_return_tuple
+from transformers.utils.deprecation import deprecate_kwarg
+from transformers.utils.generic import check_model_inputs
+from .configuration_bolmo import BolmoConfig
+from .tokenization_bolmo import BolmoTokenizerConfig
+from .utils_bolmo import compute_boundary_mask, pad_right, pad_left, MaskState
+try:
+    from xlstm.xlstm_large.model import mLSTMLayer, mLSTMLayerConfig, mLSTMLayerStateType, soft_cap, mLSTMBackendConfig
+except ImportError:
+    raise ImportError("The `xlstm` package is required to use Bolmo. Please install it via `pip install xlstm`.")
+@use_kernel_forward_from_hub("RMSNorm")
+class BolmoRMSNorm(nn.Module):
+    def __init__(self, hidden_size, eps=1e-6):
+        """
+        BolmoRMSNorm is equivalent to T5LayerNorm
+        """
+        super().__init__()
+        self.weight = nn.Parameter(torch.ones(hidden_size))
+        self.variance_epsilon = eps
+    def forward(self, hidden_states):
+        input_dtype = hidden_states.dtype
+        hidden_states = hidden_states.to(torch.float32)
+        variance = hidden_states.pow(2).mean(-1, keepdim=True)
+        hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
+        return (self.weight * hidden_states).to(input_dtype)
+    def extra_repr(self):
+        return f"{tuple(self.weight.shape)}, eps={self.variance_epsilon}"
+def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
+    """
+    This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
+    num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
+    """
+    batch, num_key_value_heads, slen, head_dim = hidden_states.shape
+    if n_rep == 1:
+        return hidden_states
+    hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim)
+    return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)
+def eager_attention_forward(
+    module: nn.Module,
+    query: torch.Tensor,
+    key: torch.Tensor,
+    value: torch.Tensor,
+    attention_mask: Optional[torch.Tensor],
+    scaling: float,
+    dropout: float = 0.0,
+    **kwargs: Unpack[TransformersKwargs],
+):
+    key_states = repeat_kv(key, module.num_key_value_groups)
+    value_states = repeat_kv(value, module.num_key_value_groups)
+    attn_weights = torch.matmul(query, key_states.transpose(2, 3)) * scaling
+    if attention_mask is not None:
+        causal_mask = attention_mask[:, :, :, : key_states.shape[-2]]
+        attn_weights = attn_weights + causal_mask
+    attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query.dtype)
+    attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training)
+    attn_output = torch.matmul(attn_weights, value_states)
+    attn_output = attn_output.transpose(1, 2).contiguous()
+    return attn_output, attn_weights
+def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1):
+    """Applies Rotary Position Embedding to the query and key tensors.
+    Args:
+        q (`torch.Tensor`): The query tensor.
+        k (`torch.Tensor`): The key tensor.
+        cos (`torch.Tensor`): The cosine part of the rotary embedding.
+        sin (`torch.Tensor`): The sine part of the rotary embedding.
+        position_ids (`torch.Tensor`, *optional*):
+            Deprecated and unused.
+        unsqueeze_dim (`int`, *optional*, defaults to 1):
+            The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
+            sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
+            that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
+            k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
+            cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
+            the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
+    Returns:
+        `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
+    """
+    q_type, k_type = q.dtype, k.dtype
+    cos = cos.unsqueeze(unsqueeze_dim)
+    sin = sin.unsqueeze(unsqueeze_dim)
+    q_embed = (q * cos) + (rotate_half(q) * sin)
+    k_embed = (k * cos) + (rotate_half(k) * sin)
+    return q_embed.to(q_type), k_embed.to(k_type)
+def rotate_half(x):
+    """Rotates half the hidden dims of the input."""
+    x1 = x[..., : x.shape[-1] // 2]
+    x2 = x[..., x.shape[-1] // 2 :]
+    return torch.cat((-x2, x1), dim=-1)
+class BolmoAttention(nn.Module):
+    """Multi-headed attention from 'Attention Is All You Need' paper"""
+    def __init__(self, config: BolmoConfig, layer_idx: int):
+        super().__init__()
+        self.config = config
+        self.layer_idx = layer_idx
+        self.head_dim = getattr(config, "head_dim", config.hidden_size // config.num_attention_heads)
+        self.num_key_value_groups = config.num_attention_heads // config.num_key_value_heads
+        self.scaling = self.head_dim**-0.5
+        self.attention_dropout = config.attention_dropout
+        self.is_causal = True
+        self.q_proj = nn.Linear(
+            config.hidden_size, config.num_attention_heads * self.head_dim, bias=config.attention_bias
+        )
+        self.k_proj = nn.Linear(
+            config.hidden_size, config.num_key_value_heads * self.head_dim, bias=config.attention_bias
+        )
+        self.v_proj = nn.Linear(
+            config.hidden_size, config.num_key_value_heads * self.head_dim, bias=config.attention_bias
+        )
+        self.o_proj = nn.Linear(
+            config.num_attention_heads * self.head_dim, config.hidden_size, bias=config.attention_bias
+        )
+        self.q_norm = BolmoRMSNorm(config.num_attention_heads * self.head_dim, config.rms_norm_eps)
+        self.k_norm = BolmoRMSNorm(config.num_key_value_heads * self.head_dim, config.rms_norm_eps)
+        assert config.layer_types is not None
+        self.attention_type = config.layer_types[layer_idx]
+        self.sliding_window = config.sliding_window if self.attention_type == "sliding_attention" else None
+    @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        position_embeddings: tuple[torch.Tensor, torch.Tensor],
+        attention_mask: Optional[torch.Tensor],
+        past_key_values: Optional[Cache] = None,
+        cache_position: Optional[torch.Tensor] = None,
+        **kwargs: Unpack[TransformersKwargs],
+    ) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
+        input_shape = hidden_states.shape[:-1]
+        hidden_shape = (*input_shape, -1, self.head_dim)
+        query_states = self.q_norm(self.q_proj(hidden_states))
+        key_states = self.k_norm(self.k_proj(hidden_states))
+        value_states = self.v_proj(hidden_states)
+        query_states = query_states.view(hidden_shape).transpose(1, 2)
+        key_states = key_states.view(hidden_shape).transpose(1, 2)
+        value_states = value_states.view(hidden_shape).transpose(1, 2)
+        cos, sin = position_embeddings
+        query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)
+        if past_key_values is not None:
+            # sin and cos are specific to RoPE models; cache_position needed for the static cache
+            cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
+            key_states, value_states = past_key_values.update(key_states, value_states, self.layer_idx, cache_kwargs)
+        attention_interface: Callable = eager_attention_forward
+        if self.config._attn_implementation != "eager":
+            attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
+        attn_output, attn_weights = attention_interface(
+            self,
+            query_states,
+            key_states,
+            value_states,
+            attention_mask,
+            dropout=0.0 if not self.training else self.attention_dropout,
+            scaling=self.scaling,
+            sliding_window=self.sliding_window,
+            **kwargs,
+        )
+        attn_output = attn_output.reshape(*input_shape, -1).contiguous()
+        attn_output = self.o_proj(attn_output)
+        return attn_output, attn_weights
+class BolmoMLP(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+        self.config = config
+        self.hidden_size = config.hidden_size
+        self.intermediate_size = config.intermediate_size
+        self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
+        self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
+        self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False)
+        self.act_fn = ACT2FN[config.hidden_act]
+    def forward(self, x):
+        down_proj = self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x))
+        return down_proj
+class BolmoDecoderLayer(GradientCheckpointingLayer):
+    def __init__(self, config: BolmoConfig, layer_idx: int):
+        super().__init__()
+        self.hidden_size = config.hidden_size
+        self.self_attn = BolmoAttention(config=config, layer_idx=layer_idx)
+        self.mlp = BolmoMLP(config)
+        self.post_attention_layernorm = BolmoRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.post_feedforward_layernorm = BolmoRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+    @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.Tensor] = None,
+        past_key_values: Optional[Cache] = None,
+        use_cache: Optional[bool] = False,
+        cache_position: Optional[torch.Tensor] = None,
+        position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None,  # necessary, but kept here for BC
+        **kwargs: Unpack[TransformersKwargs],
+    ) -> torch.Tensor:
+        residual = hidden_states
+        attn_out, _ = self.self_attn(
+            hidden_states=hidden_states,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            past_key_values=past_key_values,
+            use_cache=use_cache,
+            cache_position=cache_position,
+            position_embeddings=position_embeddings,
+            **kwargs,
+        )
+        hidden_states = self.post_attention_layernorm(attn_out)
+        hidden_states = residual + hidden_states
+        # Fully Connected
+        residual = hidden_states
+        mlp_out = self.mlp(hidden_states)
+        hidden_states = self.post_feedforward_layernorm(mlp_out)
+        hidden_states = residual + hidden_states
+        return hidden_states
+class BolmoBoundaryPredictor(nn.Module):
+    def __init__(self, config: BolmoConfig):
+        super().__init__()
+        self.d_model = config.hidden_size
+        self.boundary_threshold = config.boundary_threshold
+        self.boundary_predictor_lookahead = config.boundary_predictor_lookahead
+        self.q_proj_layer = nn.Linear(self.d_model, self.d_model, bias=False)
+        self.k_proj_layer = nn.Linear(self.d_model, self.d_model, bias=False)
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        sequence_start_indices: Optional[torch.Tensor] = None,
+        epsilon: float = 1e-3,
+    ) -> tuple[torch.Tensor, torch.Tensor]:
+        if self.boundary_predictor_lookahead == 0:
+            # do not use the same rep for k and v, use current and one before as in H-Net + pad with negative to the left
+            cos_sim = torch.cat([
+                torch.ones((hidden_states.shape[0], 1), device=hidden_states.device, dtype=hidden_states.dtype) * -1,
+                torch.einsum(
+                    "b l d, b l d -> b l",
+                    F.normalize(self.q_proj_layer(hidden_states[:, :-1]), dim=-1),
+                    F.normalize(self.k_proj_layer(hidden_states[:, 1:]), dim=-1),
+                )
+            ], dim=1)
+        else:
+            cos_sim = torch.einsum(
+                "b l d, b l d -> b l",
+                F.normalize(self.q_proj_layer(hidden_states[:, :-self.boundary_predictor_lookahead]), dim=-1),
+                F.normalize(self.k_proj_layer(hidden_states[:, self.boundary_predictor_lookahead:]), dim=-1),
+            )
+        boundary_logprobs = torch.log1p(-cos_sim.float().clip(max=1.0 - epsilon)) - math.log(2)
+        POSITIVE_LOGPROB = 0.0
+        NEGATIVE_LOGPROB = -100_000
+        if sequence_start_indices is None:
+            boundary_logprobs[:, 0] = POSITIVE_LOGPROB
+        else:
+            pad_mask = torch.arange(boundary_logprobs.shape[1], device=boundary_logprobs.device)[None, :] < sequence_start_indices[:, None]
+            boundary_logprobs = boundary_logprobs.masked_fill(pad_mask, NEGATIVE_LOGPROB)
+            boundary_logprobs[torch.arange(len(boundary_logprobs), device=boundary_logprobs.device), sequence_start_indices] = POSITIVE_LOGPROB
+        boundary_logprobs = F.pad(boundary_logprobs, (0, self.boundary_predictor_lookahead), "constant", NEGATIVE_LOGPROB)
+        boundary_mask = compute_boundary_mask(boundary_logprobs, self.boundary_threshold)
+        return boundary_logprobs, boundary_mask
+class BolmoXLSTMLayer(mLSTMLayer):
+    def __init__(self, config: BolmoConfig):
+        super().__init__(mLSTMLayerConfig(
+            embedding_dim=config.hidden_size,
+            num_heads=config.num_local_heads,
+            mlstm_backend=mLSTMBackendConfig(
+                chunkwise_kernel="chunkwise--triton_limit_chunk",
+                sequence_kernel="native_sequence__triton",
+                step_kernel="triton",
+                mode="train",
+                return_last_states=True,
+                autocast_kernel_dtype="float32",
+            )
+        ))
+    # original forward adapted to support sequence_start_indices
+    # i.e. set the forget gate to zero at the start of sequence
+    def _original_forward(
+        self, x: torch.Tensor,
+        state: mLSTMLayerStateType | None = None,
+        sequence_start_indices: Optional[torch.Tensor] = None,
+    ) -> tuple[torch.Tensor, mLSTMLayerStateType | None]:
+        assert x.ndim == 3, f"Input must have shape [B, S, D], got {x.shape}"
+        B, S, _ = x.shape
+        if self.config.weight_mode == "single":
+            q = self.q(x)
+            k = self.k(x)
+            v = self.v(x)
+            o_preact = self.ogate_preact(x)
+            i_preact = soft_cap(
+                self.igate_preact(x), cap_value=self.config.gate_soft_cap
+            )
+            f_preact = soft_cap(
+                self.fgate_preact(x), cap_value=self.config.gate_soft_cap
+            )
+        elif self.config.weight_mode == "fused":
+            qkv_opreact = self.qkv_opreact(x)
+            q, k, v, o_preact = torch.tensor_split(
+                qkv_opreact,
+                (
+                    self.qk_dim,
+                    2 * self.qk_dim,
+                    2 * self.qk_dim + self.v_dim,
+                ),
+                dim=-1,
+            )
+            if_preact = soft_cap(
+                self.ifgate_preact(x), cap_value=self.config.gate_soft_cap
+            )
+            i_preact, f_preact = torch.tensor_split(
+                if_preact, (self.config.num_heads,), dim=-1
+            )
+        else:
+            raise ValueError(f"Unknown weight_mode: {self.config.weight_mode}")
+        q = q.reshape(B, S, self.config.num_heads, -1).transpose(1, 2)
+        k = k.reshape(B, S, self.config.num_heads, -1).transpose(1, 2)
+        v = v.reshape(B, S, self.config.num_heads, -1).transpose(1, 2)
+        if sequence_start_indices is not None:
+            f_preact[torch.arange(B, device=f_preact.device), sequence_start_indices] = -100_000
+        i_preact = i_preact.transpose(1, 2)
+        f_preact = f_preact.transpose(1, 2)
+        if state is None:
+            c_initial, n_initial, m_initial = None, None, None
+        else:
+            c_initial, n_initial, m_initial = state
+        h, state = self.mlstm_backend(
+            q=q,
+            k=k,
+            v=v,
+            i=i_preact,
+            f=f_preact,
+            c_initial=c_initial,
+            n_initial=n_initial,
+            m_initial=m_initial,
+        )
+        expected_h_shape = (
+            B,
+            self.config.num_heads,
+            S,
+            self.v_dim // self.config.num_heads,
+        )
+        assert (
+            h.shape == expected_h_shape
+        ), f"Got {h.shape}, expected {expected_h_shape}"
+        h = h.transpose(1, 2)
+        h_norm = self.multihead_norm(h)
+        h_norm = h_norm.reshape(B, S, -1)
+        h_out = self.ogate_act_fn(o_preact) * h_norm
+        y = self.out_proj(h_out)
+        return y, state
+    def forward(  # type: ignore
+        self,
+        x: torch.Tensor,
+        past_key_values: Optional[dict] = None,
+        use_cache: bool = False,
+        sequence_start_indices: Optional[torch.Tensor] = None,
+        cache_mask: Optional[MaskState] = None
+    ):
+        if self.training:
+            self.mlstm_backend.config.mode = "train"
+        else:
+            self.mlstm_backend.config.mode = "inference"
+        if use_cache:
+            assert past_key_values is not None
+            prev_mode = self.mlstm_backend.config.mode
+            state = past_key_values.get("state", None)
+            if cache_mask is not None:
+                state_for_model = cast(mLSTMLayerStateType, tuple(cache_mask.selective_get(x, inv=True) for x in state) if state is not None else None)
+            else:
+                state_for_model = state
+            h, new_state = self._original_forward(
+                x,
+                state=state_for_model,
+                sequence_start_indices=sequence_start_indices
+            )
+            assert new_state is not None
+            if state is None or cache_mask is None:
+                state = new_state
+            else:
+                if cache_mask is not None:
+                    for i in range(len(state)):
+                        cache_mask.selective_put(new_state[i], state[i], inv=True)
+            past_key_values["state"] = state
+            self.mlstm_backend.config.mode = prev_mode
+            return h
+        else:
+            h, _ = super().forward(x)
+            return h
+class BolmoLocalLayer(nn.Module):
+    def __init__(self, config: BolmoConfig):
+        super().__init__()
+        self.config = config
+        self.hidden_size = config.hidden_size
+        self.act_fn = ACT2FN[config.hidden_act]
+        self.xlstm = BolmoXLSTMLayer(config)
+        local_mlp_config = copy.deepcopy(config)
+        local_mlp_config.intermediate_size = config.local_intermediate_size
+        self.mlp = BolmoMLP(local_mlp_config)
+        self.pre_xlstm_layernorm = BolmoRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.pre_feedforward_layernorm = BolmoRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        sequence_start_indices: Optional[torch.Tensor] = None,
+        past_key_values: Optional[dict] = None,
+        use_cache: Optional[bool] = False,
+        cache_mask: Optional[MaskState] = None,
+    ) -> torch.Tensor:
+        residual = hidden_states
+        xlstm_out = self.xlstm(self.pre_xlstm_layernorm(hidden_states), sequence_start_indices=sequence_start_indices, past_key_values=past_key_values["xlstm"] if past_key_values is not None else None, use_cache=use_cache, cache_mask=cache_mask)
+        hidden_states = residual + xlstm_out
+        # Fully Connected
+        residual = hidden_states
+        ffn_out = self.mlp(self.pre_feedforward_layernorm(hidden_states))
+        hidden_states = residual + ffn_out
+        return hidden_states
+class BolmoLocalEncoder(nn.Module):
+    def __init__(self, config: BolmoConfig):
+        super().__init__()
+        self.config = config
+        self.hidden_size = config.hidden_size
+        self.add_expanded_embeddings = config.add_expanded_embeddings
+        self.byte_embedding = nn.Embedding(
+            config.vocab_size,
+            self.hidden_size,
+        )
+        if self.add_expanded_embeddings:
+            self.subword_embedding = nn.Embedding(
+                config.subword_vocab_size,
+                self.hidden_size,
+            )
+        else:
+            self.subword_embedding = None
+        self.layers = nn.ModuleList(
+            [BolmoLocalLayer(config) for _ in range(config.num_local_encoder_layers)]
+        )
+        self.post_last_block_norm = BolmoRMSNorm(
+            self.hidden_size,
+            config.local_rms_norm_eps,
+        )
+        self.out_projection = nn.Linear(
+            self.hidden_size,
+            self.hidden_size,
+            bias=True,
+        )
+        self.boundary_predictor_module = BolmoBoundaryPredictor(config)
+        self.has_cache = False
+    def prepare_inference_cache(self, batch_size: int):
+        device = next(self.parameters()).device
+        self.has_cache = True
+        self.cache_seqlens = 0
+        self.last_h = torch.zeros((batch_size, self.hidden_size), dtype=self.out_projection.weight.dtype, device=device)
+        self.layer_states = [{"xlstm": {}} for _ in range(len(self.layers))]
+    def free_inference_cache(self):
+        self.has_cache = False
+        if hasattr(self, "cache_seqlens"):
+            del self.cache_seqlens
+        if hasattr(self, "last_h"):
+            del self.last_h
+        if hasattr(self, "layer_states"):
+            del self.layer_states
+    def _embed(self, tokens, expanded_input_ids: Optional[torch.Tensor] = None):
+        embeddings = self.byte_embedding(tokens)
+        if self.add_expanded_embeddings:
+            assert expanded_input_ids is not None and self.subword_embedding is not None
+            embeddings = embeddings + self.subword_embedding(expanded_input_ids)
+        return embeddings
+    def _pool(
+        self,
+        h: torch.Tensor,
+        boundary_mask: torch.Tensor | None,
+        n_patches: int,
+        boundary_state: Optional[MaskState] = None,
+    ):
+        if self.has_cache and self.cache_seqlens > 0:
+            assert boundary_state is not None
+            if boundary_state.all():
+                assert h.shape[1] == 1
+                reduced_h = h
+            else:
+                reduced_h = h[[], :, :]
+        else:
+            assert boundary_mask is not None
+            L = h.shape[1]
+            token_idx = (
+                torch.arange(L, device=h.device)[None, :] + (~boundary_mask).long() * L  # type: ignore
+            )
+            seq_sorted_indices = torch.argsort(token_idx, dim=1)
+            index = seq_sorted_indices[:, :n_patches, None].expand(
+                -1, -1, h.shape[-1]
+            )
+            reduced_h = torch.gather(
+                h,
+                dim=1,
+                index=index,
+            )
+        return reduced_h
+    def forward(
+        self,
+        input_ids,
+        true_boundary_mask: Optional[torch.Tensor] = None,
+        boundary_state: Optional[MaskState] = None,
+        pad_state: Optional[MaskState] = None,
+        expanded_input_ids: Optional[torch.Tensor] = None,
+        sequence_start_indices: Optional[torch.Tensor] = None,
+    ):
+        embeddings = self._embed(input_ids, expanded_input_ids)
+        # pass through encoder layers
+        if self.has_cache and self.cache_seqlens > 0:
+            assert pad_state is not None
+            # step those batch positions which are not currently idle (i.e. at a boundary position)
+            # if all batch positions are idle, skip the step entirely
+            # all positions being idle only happens if fuse_boundaries=False. In this case, the step where we
+            # obtain a new representation from the global model will have all positions for the local encoder being idle.
+            if not pad_state.all():
+                h = pad_state.selective_get(embeddings, inv=True)
+                for i, block in enumerate(self.layers):
+                    h = block(h, past_key_values=self.layer_states[i], use_cache=True, cache_mask=pad_state)
+                if self.post_last_block_norm is not None:
+                    h = self.post_last_block_norm(h)
+                pad_state.selective_put(h[:, -1, :], self.last_h, inv=True)
+            h = self.last_h.unsqueeze(1)
+        else:
+            h = embeddings
+            for i, block in enumerate(self.layers):
+                if self.has_cache:
+                    use_cache = True
+                    past_key_values = self.layer_states[i]
+                else:
+                    use_cache = False
+                    past_key_values = None
+                h = block(h, past_key_values=past_key_values, use_cache=use_cache, sequence_start_indices=sequence_start_indices)
+            if self.post_last_block_norm is not None:
+                h = self.post_last_block_norm(h)
+            if self.has_cache:
+                self.last_h.copy_(h[:, -1, :])
+        if not self.has_cache or self.cache_seqlens == 0: # only used for prefill
+            boundary_logprobs, boundary_mask = self.boundary_predictor_module(
+                h,
+                sequence_start_indices=sequence_start_indices,
+            )
+            if boundary_state is not None:
+                # can't predict through encoder - must be through prev local decoder step
+                boundary_mask[:, -1] = boundary_state.mask
+        else:
+            boundary_logprobs = boundary_mask = None
+        # overwrite with true boundaries
+        if true_boundary_mask is not None:
+            boundary_mask = true_boundary_mask
+        patch_embeddings = self._pool(
+            h=h,
+            boundary_mask=boundary_mask,
+            n_patches=int(cast(torch.Tensor, boundary_mask).sum(-1).max().item()) if boundary_mask is not None else 1,
+            boundary_state=boundary_state,
+        )
+        patch_embeddings = self.out_projection(patch_embeddings)
+        if self.has_cache:
+            self.cache_seqlens += input_ids.shape[1]
+        return h, patch_embeddings, boundary_logprobs, boundary_mask
+class BolmoLocalDecoder(nn.Module):
+    def __init__(self, config: BolmoConfig):
+        super().__init__()
+        self.config = config
+        self.hidden_size = config.hidden_size
+        self.initial_norm = BolmoRMSNorm(
+            self.hidden_size,
+            eps=config.local_rms_norm_eps,
+        )
+        self.in_projection = nn.Linear(
+            self.hidden_size,
+            self.hidden_size,
+            bias=True,
+        )
+        self.layers = nn.ModuleList(
+            [BolmoLocalLayer(config) for _ in range(config.num_local_decoder_layers)]
+        )
+        self.has_cache = False
+    def prepare_inference_cache(self, batch_size: int):
+        device = next(self.parameters()).device
+        self.has_cache = True
+        self.cache_seqlens = 0
+        self.last_value = torch.zeros((batch_size, self.hidden_size), dtype=self.in_projection.weight.dtype, device=device)
+        self.layer_states = [{"xlstm": {}} for _ in range(len(self.layers))]
+    def free_inference_cache(self):
+        self.has_cache = False
+        if hasattr(self, "cache_seqlens"):
+            del self.cache_seqlens
+        if hasattr(self, "last_value"):
+            del self.last_value
+        if hasattr(self, "layer_states"):
+            del self.layer_states
+    def _depool(
+        self,
+        embeds: torch.Tensor,
+        patch_embeds: torch.Tensor,
+        boundary_mask: Optional[torch.Tensor],
+        boundary_state: Optional[MaskState] = None,
+        sequence_start_indices: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        if self.has_cache and self.cache_seqlens > 0:
+            assert boundary_state is not None
+            if patch_embeds.numel() > 0:
+                # we got a new value from the global model, so must be at boundary position
+                h_patch = patch_embeds[:, -1:, :]
+                h = embeds + h_patch
+                self.last_value.copy_(h_patch[:, -1])
+            else:
+                h = embeds + self.last_value.unsqueeze(1)
+            # skip pad positions until we get a new value from the global model
+            if patch_embeds.numel() == 0:
+                h = boundary_state.selective_get(h, inv=True)
+            else:
+                boundary_state = None
+            if h.shape[0] > 0:
+                for i, layer in enumerate(self.layers):
+                    h = layer(h, past_key_values=self.layer_states[i], use_cache=True, cache_mask=boundary_state)
+            self.cache_seqlens += h.shape[1]
+            return h
+        else:
+            assert boundary_mask is not None
+            h_patch = patch_embeds
+            prepool_out = h_patch
+            # TODO(benjaminm): clipping is problematic if it happens too much; track clip %.
+            plug_back_idx = (torch.cumsum(boundary_mask, dim=1) - 1).clip(min=0, max=prepool_out.shape[1] - 1)
+            depool_out = torch.gather(
+                prepool_out,
+                dim=1,
+                index=plug_back_idx.unsqueeze(-1).expand(-1, -1, self.hidden_size),
+            )
+            depool_out_modulated = depool_out
+            h = depool_out_modulated + embeds
+            for i, layer in enumerate(self.layers):
+                if self.has_cache:
+                    use_cache = True
+                    past_key_values = self.layer_states[i]
+                else:
+                    use_cache = False
+                    past_key_values = None
+                h = layer(h, past_key_values=past_key_values, use_cache=use_cache, sequence_start_indices=sequence_start_indices)
+            if self.has_cache:
+                self.last_value.copy_(prepool_out[:, -1])
+                self.cache_seqlens += h.shape[1]
+            return h
+    def forward(
+        self,
+        embeds: torch.Tensor,
+        patch_embeds: torch.Tensor,
+        boundary_state: Optional[MaskState],
+        boundary_mask: torch.Tensor | None,
+        sequence_start_indices: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        h = self.in_projection(embeds)
+        h_patch = self.initial_norm(patch_embeds)
+        return self._depool(
+            embeds=h,
+            patch_embeds=h_patch,
+            boundary_mask=boundary_mask,
+            boundary_state=boundary_state,
+            sequence_start_indices=sequence_start_indices,
+        )
+class BolmoRotaryEmbedding(nn.Module):
+    inv_freq: torch.Tensor  # fix linting for `register_buffer`
+    def __init__(self, config: BolmoConfig, device=None, rope_type: Optional[str] = None):
+        super().__init__()
+        if rope_type is not None:
+            self.rope_type = rope_type
+        elif hasattr(config, "rope_scaling") and isinstance(config.rope_scaling, dict):
+            # BC: "rope_type" was originally "type"
+            self.rope_type = config.rope_scaling.get("rope_type", config.rope_scaling.get("type"))
+        else:
+            self.rope_type = "default"
+        assert self.rope_type is not None
+        self.max_seq_len_cached = config.max_position_embeddings
+        self.original_max_seq_len = config.max_position_embeddings
+        self.config = config
+        self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type]
+        inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device)
+        self.register_buffer("inv_freq", inv_freq, persistent=False)
+        self.original_inv_freq = self.inv_freq
+    @torch.no_grad()
+    @dynamic_rope_update  # power user: used with advanced RoPE types (e.g. dynamic rope)
+    def forward(self, x, position_ids):
+        inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1).to(x.device)
+        position_ids_expanded = position_ids[:, None, :].float()
+        device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu"
+        with torch.autocast(device_type=device_type, enabled=False):  # Force float32
+            freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
+            emb = torch.cat((freqs, freqs), dim=-1)
+            cos = emb.cos() * self.attention_scaling
+            sin = emb.sin() * self.attention_scaling
+            return cos, sin
+class BolmoPreTrainedModel(PreTrainedModel):
+    config: BolmoConfig
+    base_model_prefix = "model"
+    supports_gradient_checkpointing = True
+    _no_split_modules = ["BolmoDecoderLayer"]
+    _skip_keys_device_placement = ["past_key_values"]
+    _supports_flash_attn = True
+    _supports_sdpa = True
+    _supports_flex_attn = True
+    _can_compile_fullgraph = True
+    _supports_attention_backend = True
+    _can_record_outputs = {
+        "hidden_states": BolmoDecoderLayer,
+        "attentions": BolmoAttention,
+    }
+class BolmoModel(BolmoPreTrainedModel):
+    def __init__(self, config: BolmoConfig):
+        super().__init__(config)
+        self.padding_idx = config.pad_token_id
+        self.vocab_size = config.vocab_size
+        self.local_encoder = BolmoLocalEncoder(config)
+        self.local_decoder = BolmoLocalDecoder(config)
+        self.layers = nn.ModuleList(
+            [BolmoDecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
+        )
+        self.norm = BolmoRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.gradient_checkpointing = False
+        self.rotary_embs = nn.ModuleDict(
+            {
+                "sliding_attention": BolmoRotaryEmbedding(config=config, rope_type="default"),
+                "full_attention": BolmoRotaryEmbedding(config=config),
+            }
+        )
+        self.tokenizer_config = BolmoTokenizerConfig(**config.tokenizer_config)
+        self._tokenizer = None
+        # Initialize weights and apply final processing
+        self.post_init()
+    def get_input_embeddings(self):
+        return self.local_encoder.byte_embedding
+    def set_input_embeddings(self, value: nn.Embedding):  # type: ignore
+        self.local_encoder.byte_embedding = value
+    @property
+    def tokenizer(self):
+        if self._tokenizer is None:
+            self._tokenizer = self.tokenizer_config.build()
+        return self._tokenizer
+    def prefill_boundary_prediction_forward(
+        self,
+        input_ids: torch.Tensor,
+        expanded_input_ids: Optional[torch.Tensor] = None,
+        sequence_start_indices: Optional[torch.Tensor] = None,
+        last_token_is_boundary: bool = False,
+        **kwargs,
+    ) -> torch.Tensor:
+        _, _, _, boundary_mask = self.local_encoder.forward(  # type: ignore
+            input_ids,
+            expanded_input_ids=expanded_input_ids,
+            boundary_state=MaskState(torch.full((input_ids.shape[0],), fill_value=last_token_is_boundary, device=input_ids.device, dtype=torch.bool)),
+            pad_state=MaskState(torch.zeros((input_ids.shape[0],), device=input_ids.device, dtype=torch.bool)),
+            sequence_start_indices=sequence_start_indices,
+        )
+        return cast(torch.Tensor, boundary_mask)
+    @check_model_inputs()
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        expanded_input_ids: Optional[torch.Tensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.Tensor] = None,
+        past_key_values: Optional[Cache] = None,
+        cache_position: Optional[torch.Tensor] = None,
+        use_cache: Optional[bool] = None,
+        boundary_mask: Optional[torch.Tensor] = None,
+        boundary_state: Optional[MaskState] = None,
+        pad_state: Optional[MaskState] = None,
+        sequence_start_indices: Optional[torch.Tensor] = None,
+        **kwargs: Unpack[TransformersKwargs],
+    ) -> BaseModelOutputWithPast:
+        batch_size = input_ids.shape[0]
+        device = input_ids.device
+        if self.local_encoder.add_expanded_embeddings and expanded_input_ids is None and input_ids is not None:
+            # not optimized
+            expanded_input_ids_list: list[torch.Tensor] = []
+            for example_idx in range(batch_size):
+                expanded_input_ids_list.append(torch.tensor(self.tokenizer.expand_byte_ids(input_ids[example_idx].tolist()), dtype=torch.long, device=device))
+            expanded_input_ids = pad_right(expanded_input_ids_list, value=self.tokenizer.pad_token_id, multiple_of=1)  # type: ignore
+        h_byte, h_patch, _, boundary_mask = self.local_encoder(
+            input_ids=input_ids,
+            expanded_input_ids=expanded_input_ids,
+            true_boundary_mask=boundary_mask,
+            boundary_state=boundary_state,
+            pad_state=pad_state,
+        )
+        if use_cache and past_key_values is None:
+            past_key_values = DynamicCache(config=self.config)
+        if cache_position is None:
+            past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
+            cache_position: torch.Tensor = torch.arange(
+                past_seen_tokens, past_seen_tokens + h_patch.shape[1], device=device
+            )
+        if position_ids is None:
+            position_ids = cache_position.unsqueeze(0)  # type: ignore
+        # It may already have been prepared by e.g. `generate`
+        if not isinstance(causal_mask_mapping := attention_mask, dict):
+            # Prepare mask arguments
+            mask_kwargs = {
+                "config": self.config,
+                "input_embeds": h_patch,
+                "attention_mask": attention_mask,
+                "cache_position": cache_position,
+                "past_key_values": past_key_values,
+                "position_ids": position_ids,
+            }
+            # Create the masks
+            causal_mask_mapping = {
+                "full_attention": create_causal_mask(**mask_kwargs),
+                "sliding_attention": create_sliding_window_causal_mask(**mask_kwargs),
+            }
+        position_embeddings_mapping = {
+            "sliding_attention": self.rotary_embs["sliding_attention"](h_byte, position_ids),
+            "full_attention": self.rotary_embs["full_attention"](h_byte, position_ids),
+        }
+        if h_patch.numel() > 0:
+            # we need to convert from right-pad to left-pad and back for prefill
+            # since flash attention expects left-pad and local/enc dec expect right-pad global tokens
+            # should add better left-pad support but this only affects prefill so OK for now
+            # although super inefficient!
+            if boundary_mask is not None: # prefill
+                n_boundaries = boundary_mask.sum(-1)
+                for i, current_n_boundaries in enumerate(n_boundaries):
+                    h_patch[i, -current_n_boundaries:] = h_patch[i, :current_n_boundaries].clone()
+            h_patch_after_global = h_patch
+            for decoder_layer in self.layers[: self.config.num_hidden_layers]:
+                h_patch_after_global = decoder_layer(
+                    h_patch_after_global,
+                    attention_mask=causal_mask_mapping[decoder_layer.self_attn.attention_type],
+                    position_ids=position_ids,
+                    past_key_values=past_key_values,
+                    cache_position=cache_position,
+                    position_embeddings=position_embeddings_mapping[decoder_layer.self_attn.attention_type],
+                    **kwargs,
+                )
+            if boundary_mask is not None: # prefill
+                n_boundaries = boundary_mask.sum(-1)
+                for i, current_n_boundaries in enumerate(n_boundaries):
+                    h_patch_after_global[i, :current_n_boundaries] = h_patch_after_global[i, -current_n_boundaries:].clone()
+        else:
+            h_patch_after_global = h_patch
+        h_out = self.local_decoder.forward(  # type: ignore
+            embeds=h_byte,
+            patch_embeds=h_patch_after_global,
+            boundary_mask=boundary_mask,
+            boundary_state=boundary_state,
+            sequence_start_indices=sequence_start_indices,
+        )
+        h_out = self.norm(h_out)
+        return BaseModelOutputWithPast(
+            last_hidden_state=h_out,
+            past_key_values=past_key_values,
+        )
+class BolmoForCausalLM(BolmoPreTrainedModel, GenerationMixin):
+    _tied_weights_keys = ["lm_head.weight"]
+    _tp_plan = {"lm_head": "colwise_rep"}
+    _pp_plan = {"lm_head": (["hidden_states"], ["logits"])}
+    def __init__(self, config):
+        super().__init__(config)
+        self.model = BolmoModel(config)
+        self.vocab_size = config.vocab_size
+        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
+        # Initialize weights and apply final processing
+        self.post_init()
+    def get_output_embeddings(self):
+        return self.lm_head
+    def set_output_embeddings(self, new_embeddings: nn.Linear):
+        self.lm_head = new_embeddings
+    @can_return_tuple
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        expanded_input_ids: Optional[torch.Tensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.Tensor] = None,
+        past_key_values: Optional[Cache] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        cache_position: Optional[torch.Tensor] = None,
+        use_cache: Optional[bool] = None,
+        boundary_mask: Optional[torch.Tensor] = None,
+        boundary_state: Optional[MaskState] = None,
+        pad_state: Optional[MaskState] = None,
+        sequence_start_indices: Optional[torch.Tensor] = None,
+        logits_to_keep: Union[int, torch.Tensor] = 0,
+        **kwargs: Unpack[TransformersKwargs],
+    ) -> CausalLMOutputWithPast:
+        r"""
+        Example:
+        ```python
+        >>> from transformers import AutoTokenizer, BolmoForCausalLM
+        >>> model = BolmoForCausalLM.from_pretrained("meta-olmo3/Bolmo-2-7b-hf")
+        >>> tokenizer = AutoTokenizer.from_pretrained("meta-olmo3/Bolmo-2-7b-hf")
+        >>> prompt = "Hey, are you conscious? Can you talk to me?"
+        >>> inputs = tokenizer(prompt, return_tensors="pt")
+        >>> # Generate
+        >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
+        >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
+        "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
+        ```"""
+        outputs: BaseModelOutputWithPast = self.model(
+            input_ids=input_ids,
+            expanded_input_ids=expanded_input_ids,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            past_key_values=past_key_values,
+            inputs_embeds=inputs_embeds,
+            cache_position=cache_position,
+            use_cache=use_cache,
+            boundary_mask=boundary_mask,
+            boundary_state=boundary_state,
+            pad_state=pad_state,
+            sequence_start_indices=sequence_start_indices,
+            **kwargs,
+        )
+        hidden_states = cast(torch.Tensor, outputs.last_hidden_state)
+        # Only compute necessary logits, and do not upcast them to float if we are not computing the loss
+        slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep
+        logits = self.lm_head(hidden_states[:, slice_indices, :])
+        return CausalLMOutputWithPast(
+            logits=logits,
+            past_key_values=outputs.past_key_values,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
+    @torch.no_grad()
+    def generate(  # type: ignore
+        self,
+        inputs: torch.Tensor,
+        generation_config: Optional[GenerationConfig] = None,
+        logits_processor: Optional[LogitsProcessorList] = None,
+        stopping_criteria: Optional[StoppingCriteriaList] = None,
+        use_model_defaults: Optional[bool] = None,
+        **kwargs,
+    ) -> Union[GenerateOutput, torch.Tensor]:
+        # generic preprocessing
+        generation_config, model_kwargs = self._prepare_generation_config(
+            generation_config, use_model_defaults, **kwargs
+        )
+        self._prepare_special_tokens(generation_config, device=self.model.device)
+        logits_processor = logits_processor if logits_processor is not None else LogitsProcessorList()
+        stopping_criteria = stopping_criteria if stopping_criteria is not None else StoppingCriteriaList()
+        # start of custom generate
+        expand_input_ids = self.model.local_encoder.add_expanded_embeddings
+        batch_size = len(inputs)
+        if expand_input_ids:
+            expanded_input_ids = []
+            for i in range(len(inputs)):
+                expanded_input_ids.append(torch.tensor(self.model.tokenizer.expand_byte_ids(inputs[i].tolist()), device=self.device, dtype=torch.long))
+            expanded_input_ids = pad_left(expanded_input_ids, value=self.model.tokenizer.pad_token_id, multiple_of=1)  # type: ignore
+        else:
+            expanded_input_ids = None
+        byte_input_ids = inputs
+        sequence_start_indices = (byte_input_ids == self.model.tokenizer.pad_token_id).sum(-1)
+        batch_size, prompt_len = byte_input_ids.shape
+        finished = torch.zeros(batch_size, dtype=torch.bool, device=self.device)
+        boundary_offset = self.model.tokenizer.offset + 256
+        eos = self.model.tokenizer.eos_token_id
+        self.model.local_encoder.free_inference_cache()
+        self.model.local_decoder.free_inference_cache()
+        boundary_mask = self.model.prefill_boundary_prediction_forward(  # type: ignore
+            byte_input_ids,
+            expanded_input_ids=expanded_input_ids,
+            sequence_start_indices=sequence_start_indices,
+        )
+        self.model.local_encoder.prepare_inference_cache(batch_size)
+        self.model.local_decoder.prepare_inference_cache(batch_size)
+        # roll back by one and force decoding to account for lookahead
+        boundary_mask = boundary_mask[:, :-1]
+        # need to roll one byte back and force decoding to detect whether the last byte is a boundary
+        forced_decoding_ids = byte_input_ids[:, -1].cpu().tolist()
+        byte_input_ids = byte_input_ids[:, :-1]
+        expanded_input_ids = expanded_input_ids[:, :-1] if expanded_input_ids is not None else None
+        # stays the same unless last token is pad.
+        sequence_start_indices = (byte_input_ids == self.model.tokenizer.pad_token_id).sum(-1)
+        has_default_max_length = kwargs.get("max_length") is None and generation_config.max_length is not None
+        has_default_min_length = kwargs.get("min_length") is None and generation_config.min_length is not None
+        generation_config = self._prepare_generated_length(
+            generation_config=generation_config,
+            has_default_max_length=has_default_max_length,
+            has_default_min_length=has_default_min_length,
+            model_input_name="input_ids",
+            inputs_tensor=byte_input_ids,
+            input_ids_length=byte_input_ids.shape[1],
+        )
+        logits_processor = self._get_logits_processor(
+            generation_config=generation_config,  # type: ignore
+            input_ids_seq_length=byte_input_ids.shape[1],
+            encoder_input_ids=byte_input_ids,  # type: ignore
+            logits_processor=logits_processor,
+            device=byte_input_ids.device,  # type: ignore
+            model_kwargs=model_kwargs,
+        )
+        stopping_criteria = self._get_stopping_criteria(
+            generation_config=generation_config,  # type: ignore
+            stopping_criteria=stopping_criteria,
+            tokenizer=self.model.tokenizer,
+        )
+        # output container
+        generated = byte_input_ids
+        max_n_prefill_patches = boundary_mask.sum(-1).max().item()
+        tokens_generated_plus_prefilled = max_n_prefill_patches
+        bytes_generated = 0
+        # generation state
+        boundary_state = MaskState(boundary_mask[:, -1].clone())
+        pad_state = MaskState(torch.zeros(batch_size, dtype=torch.bool, device=self.device))
+        next_tokens = torch.full((batch_size,), self.model.tokenizer.bpe_token_end_id, device=self.device, dtype=torch.long)  # type: ignore
+        non_boundary_generated_tokens = [[byte_input_ids[example_idx, -1].item()] for example_idx in range(batch_size)]
+        bytes_since_boundary = (boundary_mask.flip(1).cumsum(-1) == 0).sum(-1)
+        is_first_forward = True
+        global_past_key_values = None
+        while not finished.all():
+            input_ids_for_model = (
+                generated
+                if is_first_forward
+                else torch.tensor([x[-1] for x in non_boundary_generated_tokens], device=generated.device, dtype=generated.dtype).unsqueeze(1)
+            )
+            assert not (
+                (input_ids_for_model == self.model.tokenizer.bpe_token_end_id) |
+                (input_ids_for_model >= boundary_offset)
+            ).any().item()  # type: ignore
+            if expand_input_ids:
+                expanded_input_ids_for_model = torch.zeros_like(input_ids_for_model)
+                for i in range(input_ids_for_model.shape[0]):
+                    expanded_input_ids_for_model[i, :] = torch.tensor(self.model.tokenizer.expand_byte_ids(
+                        generated[i, :].tolist(),
+                        n_last=input_ids_for_model.shape[1],
+                    ), device=expanded_input_ids_for_model.device, dtype=expanded_input_ids_for_model.dtype)
+            else:
+                expanded_input_ids_for_model = None
+            out = self.forward(  # type: ignore
+                input_ids_for_model,
+                expanded_input_ids=expanded_input_ids_for_model,
+                boundary_mask=boundary_mask if is_first_forward else None,
+                boundary_state=boundary_state,
+                pad_state=pad_state,
+                sequence_start_indices=sequence_start_indices,
+                logits_to_keep=1,
+                use_cache=True,
+                past_key_values=global_past_key_values,
+            )
+            next_token_logits = cast(torch.Tensor, out.logits)
+            global_past_key_values = out.past_key_values
+            if boundary_state.all():
+                # new token, must not be boundary
+                bytes_since_boundary[:] = 0
+            else:
+                boundary_state.selective_add(1, bytes_since_boundary, inv=True)
+            if any(x is not None for x in forced_decoding_ids):
+                # only supported for the first token atm, so len(next_token_logits) == batch_size
+                assert len(next_token_logits) == batch_size and is_first_forward
+                for example_idx in range(batch_size):
+                    forced_decoding_id = forced_decoding_ids[example_idx]
+                    if forced_decoding_id is not None:
+                        no_boundary_logit = next_token_logits[example_idx, 0, forced_decoding_id].item()
+                        boundary_logit = next_token_logits[example_idx, 0, forced_decoding_id + boundary_offset].item()
+                        next_token_logits[example_idx, 0, :] = -100_000
+                        next_token_logits[example_idx, 0, forced_decoding_id] = no_boundary_logit
+                        next_token_logits[example_idx, 0, forced_decoding_id + boundary_offset] = boundary_logit
+                        forced_decoding_ids[example_idx] = None  # only force once
+            # passing input_ids to logit processor not implemented
+            next_token_scores = logits_processor(None, next_token_logits[:, -1]) # type: ignore
+            if generation_config is not None and generation_config.do_sample:
+                probs = nn.functional.softmax(next_token_scores, dim=-1)
+                new_next_tokens = torch.multinomial(probs, num_samples=1).squeeze(1)
+            else:
+                new_next_tokens = torch.argmax(next_token_scores, dim=-1)
+            if boundary_state.all() or is_first_forward:
+                tokens_generated_plus_prefilled += 1
+                next_tokens = new_next_tokens
+                next_tokens_cpu = next_tokens.cpu()
+                for example_idx in range(batch_size):
+                    if finished[example_idx].item():
+                        continue
+                    next_token_cpu = next_tokens_cpu[example_idx].item()
+                    if next_token_cpu >= boundary_offset:
+                        next_token_cpu -= boundary_offset
+                    non_boundary_generated_tokens[example_idx].append(next_token_cpu)
+            else:
+                next_tokens[:] = self.model.tokenizer.bpe_token_end_id # type: ignore
+                boundary_state.selective_put(new_next_tokens, next_tokens, inv=True)
+                next_tokens_cpu = next_tokens.cpu()
+                for example_idx in range(batch_size):
+                    if finished[example_idx].item():
+                        continue
+                    next_token_cpu = next_tokens_cpu[example_idx].item()
+                    if not boundary_state.cpu_mask[example_idx].item():
+                        if next_token_cpu >= boundary_offset:
+                            next_token_cpu -= boundary_offset
+                        non_boundary_generated_tokens[example_idx].append(next_token_cpu)
+            is_first_forward = False
+            boundary_state = MaskState(
+                (next_tokens == self.model.tokenizer.bpe_token_end_id) |
+                (next_tokens >= boundary_offset) |
+                finished
+            )  # type: ignore
+            pad_state = MaskState(
+                (next_tokens == self.model.tokenizer.bpe_token_end_id) |
+                finished
+            )
+            # Force EOS for (previously) finished sequences
+            next_tokens = torch.where(finished, torch.full_like(next_tokens, eos), next_tokens)
+            # Append next tokens
+            generated = torch.cat([generated, next_tokens.unsqueeze(-1)], dim=1)
+            # Handle finished sequences
+            stop_hit = next_tokens.eq(eos) | next_tokens.eq(eos + boundary_offset)
+            for i in range(batch_size):
+                # passing `scores` to stopping criteria not implemented
+                if stopping_criteria(torch.tensor(non_boundary_generated_tokens[i], dtype=torch.long).unsqueeze(0), None).squeeze(0).item():  # type: ignore
+                    stop_hit[i] = True
+            finished |= stop_hit
+            bytes_generated += 1
+        return pad_left([
+            torch.cat([byte_input_ids[i, :-1], torch.tensor(x, dtype=torch.long, device=byte_input_ids.device)])
+            for i, x in enumerate(non_boundary_generated_tokens)
+        ], value=self.model.tokenizer.pad_token_id, multiple_of=1)  # type: ignore
+__all__ = ["BolmoForCausalLM", "BolmoModel", "BolmoPreTrainedModel"]

recipe.yaml ADDED Viewed

	@@ -0,0 +1,7 @@

+default_stage:
+  default_modifiers:
+    QuantizationModifier:
+      targets: [Linear]
+      ignore: [lm_head, 're:visual.*', 're:.*vision_tower.*', 're:.*video_tower.*', 're:.*audio_tower.*',
+        're:.*multi_modal_projector.*']
+      scheme: NVFP4

special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,5 @@

+{
+  "bos_token": "<bos>",
+  "eos_token": "<bos>",
+  "pad_token": "<pad>"
+}

tokenization_bolmo.py ADDED Viewed

	@@ -0,0 +1,378 @@

+from dataclasses import dataclass, field
+from functools import lru_cache
+from typing import Optional, Union
+from transformers import AutoTokenizer
+from transformers.tokenization_utils import PreTrainedTokenizer
+# Source: https://github.com/openai/gpt-2/blob/master/src/encoder.py#L9
+# Also implemented in https://docs.rs/tokenizers/latest/src/tokenizers/pre_tokenizers/byte_level.rs.html#13-39
+_CHARS_TO_BYTES = {
+    "Ā": 0, "ā": 1, "Ă": 2, "ă": 3, "Ą": 4, "ą": 5, "Ć": 6, "ć": 7, "Ĉ": 8,
+    "ĉ": 9, "Ċ": 10, "ċ": 11, "Č": 12, "č": 13, "Ď": 14, "ď": 15, "Đ": 16,
+    "đ": 17, "Ē": 18, "ē": 19, "Ĕ": 20, "ĕ": 21, "Ė": 22, "ė": 23, "Ę": 24,
+    "ę": 25, "Ě": 26, "ě": 27, "Ĝ": 28, "ĝ": 29, "Ğ": 30, "ğ": 31, "Ġ": 32,
+    "!": 33, '"': 34, "#": 35, "$": 36, "%": 37, "&": 38, "'": 39, "(": 40,
+    ")": 41, "*": 42, "+": 43, ",": 44, "-": 45, ".": 46, "/": 47, "0": 48,
+    "1": 49, "2": 50, "3": 51, "4": 52, "5": 53, "6": 54, "7": 55, "8": 56,
+    "9": 57, ":": 58, ";": 59, "<": 60, "=": 61, ">": 62, "?": 63, "@": 64,
+    "A": 65, "B": 66, "C": 67, "D": 68, "E": 69, "F": 70, "G": 71, "H": 72,
+    "I": 73, "J": 74, "K": 75, "L": 76, "M": 77, "N": 78, "O": 79, "P": 80,
+    "Q": 81, "R": 82, "S": 83, "T": 84, "U": 85, "V": 86, "W": 87, "X": 88,
+    "Y": 89, "Z": 90, "[": 91, "\\": 92, "]": 93, "^": 94, "_": 95, "`": 96,
+    "a": 97, "b": 98, "c": 99, "d": 100, "e": 101, "f": 102, "g": 103,
+    "h": 104, "i": 105, "j": 106, "k": 107, "l": 108, "m": 109, "n": 110,
+    "o": 111, "p": 112, "q": 113, "r": 114, "s": 115, "t": 116, "u": 117,
+    "v": 118, "w": 119, "x": 120, "y": 121, "z": 122, "{": 123, "|": 124,
+    "}": 125, "~": 126, "ġ": 127, "Ģ": 128, "ģ": 129, "Ĥ": 130, "ĥ": 131,
+    "Ħ": 132, "ħ": 133, "Ĩ": 134, "ĩ": 135, "Ī": 136, "ī": 137, "Ĭ": 138,
+    "ĭ": 139, "Į": 140, "į": 141, "İ": 142, "ı": 143, "Ĳ": 144, "ĳ": 145,
+    "Ĵ": 146, "ĵ": 147, "Ķ": 148, "ķ": 149, "ĸ": 150, "Ĺ": 151, "ĺ": 152,
+    "Ļ": 153, "ļ": 154, "Ľ": 155, "ľ": 156, "Ŀ": 157, "ŀ": 158, "Ł": 159,
+    "ł": 160, "¡": 161, "¢": 162, "£": 163, "¤": 164, "¥": 165, "¦": 166,
+    "§": 167, "¨": 168, "©": 169, "ª": 170, "«": 171, "¬": 172, "Ń": 173,
+    "®": 174, "¯": 175, "°": 176, "±": 177, "²": 178, "³": 179, "´": 180,
+    "µ": 181, "¶": 182, "·": 183, "¸": 184, "¹": 185, "º": 186, "»": 187,
+    "¼": 188, "½": 189, "¾": 190, "¿": 191, "À": 192, "Á": 193, "Â": 194,
+    "Ã": 195, "Ä": 196, "Å": 197, "Æ": 198, "Ç": 199, "È": 200, "É": 201,
+    "Ê": 202, "Ë": 203, "Ì": 204, "Í": 205, "Î": 206, "Ï": 207, "Ð": 208,
+    "Ñ": 209, "Ò": 210, "Ó": 211, "Ô": 212, "Õ": 213, "Ö": 214, "×": 215,
+    "Ø": 216, "Ù": 217, "Ú": 218, "Û": 219, "Ü": 220, "Ý": 221, "Þ": 222,
+    "ß": 223, "à": 224, "á": 225, "â": 226, "ã": 227, "ä": 228, "å": 229,
+    "æ": 230, "ç": 231, "è": 232, "é": 233, "ê": 234, "ë": 235, "ì": 236,
+    "í": 237, "î": 238, "ï": 239, "ð": 240, "ñ": 241, "ò": 242, "ó": 243,
+    "ô": 244, "õ": 245, "ö": 246, "÷": 247, "ø": 248, "ù": 249, "ú": 250,
+    "û": 251, "ü": 252, "ý": 253, "þ": 254, "ÿ": 255,
+}
+_BYTES_TO_CHARS = {v: k for k, v in _CHARS_TO_BYTES.items()}
+def _bytes_to_chars(byte_sequence: bytes) -> str:
+    return "".join(_BYTES_TO_CHARS[byte] for byte in byte_sequence)
+def _chars_to_bytes(char_sequence: str) -> list:
+    return list(bytes(_CHARS_TO_BYTES[char] for char in char_sequence))
+@dataclass
+class BolmoTokenizerConfig:
+    vocab_size: int
+    bos_token_id: int
+    pad_token_id: int
+    eos_token_id: int
+    bpe_token_end_id: int
+    special_tokens: list[str] = field(default_factory=lambda: [])
+    special_tokens_first: bool = True
+    original_identifier: Optional[str] = None
+    @classmethod
+    def bolmo(cls) -> "BolmoTokenizerConfig":
+        special_tokens = [
+            "<pad>",
+            "<bos>",
+            "<eos>",
+            "<bpe_token_end>",
+        ]
+        return cls(
+            # *2 to accomodate fused boundary tokens
+            vocab_size=(len(special_tokens) + 256) * 2,
+            special_tokens=special_tokens,
+            bos_token_id=special_tokens.index("<bos>"),
+            pad_token_id=special_tokens.index("<pad>"),
+            eos_token_id=special_tokens.index("<bos>"),
+            bpe_token_end_id=special_tokens.index("<bpe_token_end>"),
+            original_identifier="allenai/dolma2-tokenizer",
+        )
+    def build(self):
+        return BolmoTokenizer(tokenizer_config=self)
+class BolmoTokenizer(PreTrainedTokenizer):
+    TOKEN_ID_KEY = -1
+    def __init__(self, **kwargs):
+        tokenizer_config = kwargs.pop("tokenizer_config", BolmoTokenizerConfig.bolmo())
+        self.config = tokenizer_config
+        self.hf_tokenizer = AutoTokenizer.from_pretrained(tokenizer_config.original_identifier)
+        if self.config.special_tokens_first:
+            self.offset = len(tokenizer_config.special_tokens)
+            self.special_tokens_offset = 0
+        else:
+            self.offset = 0
+            self.special_tokens_offset = self.config.vocab_size - len(tokenizer_config.special_tokens)
+        self.byte_sequences = {}
+        for key, value in self.hf_tokenizer.get_vocab().items():
+            if key in self.config.special_tokens:
+                byte_sequence = [self.special_tokens_offset + self.config.special_tokens.index(key)]
+            elif value == self.hf_tokenizer.eos_token_id and self.eos_token_id is not None:
+                byte_sequence = [self.eos_token_id]
+            elif value == self.hf_tokenizer.bos_token_id and self.bos_token_id is not None:
+                byte_sequence = [self.bos_token_id]
+            elif value == self.hf_tokenizer.pad_token_id and self.pad_token_id is not None:
+                byte_sequence = [self.pad_token_id]
+            else:
+                byte_sequence = [self.offset + i for i in _chars_to_bytes(key)]
+            assert self.byte_sequences.get(value) is None
+            self.byte_sequences[value] = byte_sequence
+        self.byte_trie = {}
+        for token_id, byte_sequence in self.byte_sequences.items():
+            current_dict = self.byte_trie
+            for byte in byte_sequence[::-1]: # retrieved from the back so store in reverse order
+                if byte not in current_dict:
+                    current_dict[byte] = {}
+                current_dict = current_dict[byte]
+            current_dict[BolmoTokenizer.TOKEN_ID_KEY] = token_id
+        self.add_bos_token = True
+        self.add_eos_token = False
+        self.padding_side = "left" # for generate
+        super().__init__(
+            bos_token=self.config.special_tokens[self.config.bos_token_id],
+            eos_token=self.config.special_tokens[self.config.eos_token_id],
+            pad_token=self.config.special_tokens[self.config.pad_token_id],
+            extra_ids=0,
+        )
+    @property
+    def bos_token_id(self):
+        return self.config.bos_token_id
+    @property
+    def eos_token_id(self):
+        return self.config.eos_token_id
+    @property
+    def pad_token_id(self):
+        return self.config.pad_token_id
+    @property
+    def bpe_token_end_id(self):
+        return self.config.bpe_token_end_id
+    @property
+    def vocab_size(self):
+        return self.config.vocab_size
+    def _convert_id_to_token(self, index):
+        if index < self.offset:
+            return self.config.special_tokens[index - self.special_tokens_offset]
+        if index >= self.offset + 256 and index < self.offset * 2 + 256:
+            # special token with fused boundary
+            return self.config.special_tokens[index - self.offset - 256] + "b"
+        return _BYTES_TO_CHARS[index - self.offset - 256 - self.offset] + "b" if index >= self.offset + 256 else _BYTES_TO_CHARS[index - self.offset]
+    def _convert_token_to_id(self, token):
+        if token in self.config.special_tokens:
+            return self.config.special_tokens.index(token)
+        if token in [x + "b" for x in self.config.special_tokens]:
+            # special token with fused boundary
+            return 256 + self.config.special_tokens.index(token[:-1])
+        if len(token) > 1 and token[-1] == "b":
+            return self.offset + 256 + _CHARS_TO_BYTES[token[0]]
+        else:
+            return self.offset + _CHARS_TO_BYTES[token]
+    def get_vocab(self):
+        vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}
+        return vocab
+    def expand_byte_ids(self, byte_ids: list[int], n_last: Optional[int] = None) -> list[int]:
+        # search in the byte tree for the longest matching token at every byte position
+        expanded_ids = []
+        for i in range(len(byte_ids)):
+            if n_last is not None and i < len(byte_ids) - n_last:
+                continue
+            current_dict = self.byte_trie
+            current_expansion = None
+            for i in range(i, -1, -1):
+                byte = byte_ids[i]
+                if byte == self.bpe_token_end_id:
+                    # skip bpe token end markers, needed for generation
+                    continue
+                if byte >= self.offset + 256:
+                    # ignore fused boundary
+                    byte -= self.offset + 256
+                try:
+                    current_dict = current_dict[byte]
+                    if BolmoTokenizer.TOKEN_ID_KEY in current_dict:
+                        current_expansion = current_dict[BolmoTokenizer.TOKEN_ID_KEY]
+                except KeyError:
+                    assert current_expansion is not None
+                    break
+            expanded_ids.append(current_expansion)
+        return expanded_ids
+    # Copied from transformers.models.llama.tokenization_llama.LlamaTokenizer.build_inputs_with_special_tokens
+    def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
+        bos_token_id = [self.bos_token_id] if self.add_bos_token else []
+        eos_token_id = [self.eos_token_id] if self.add_eos_token else []
+        output = bos_token_id + token_ids_0 + eos_token_id
+        if token_ids_1 is not None:
+            output = output + bos_token_id + token_ids_1 + eos_token_id
+        return output
+    # Copied from transformers.models.llama.tokenization_llama.LlamaTokenizer.get_special_tokens_mask
+    def get_special_tokens_mask(
+        self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None, already_has_special_tokens: bool = False
+    ) -> list[int]:
+        """
+        Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
+        special tokens using the tokenizer `prepare_for_model` method.
+        Args:
+            token_ids_0 (`List[int]`):
+                List of IDs.
+            token_ids_1 (`List[int]`, *optional*):
+                Optional second list of IDs for sequence pairs.
+            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
+                Whether or not the token list is already formatted with special tokens for the model.
+        Returns:
+            `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
+        """
+        if already_has_special_tokens:
+            return super().get_special_tokens_mask(
+                token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
+            )
+        bos_token_id = [1] if self.add_bos_token else []
+        eos_token_id = [1] if self.add_eos_token else []
+        if token_ids_1 is None:
+            return bos_token_id + ([0] * len(token_ids_0)) + eos_token_id
+        return (
+            bos_token_id
+            + ([0] * len(token_ids_0))
+            + eos_token_id
+            + bos_token_id
+            + ([0] * len(token_ids_1))
+            + eos_token_id
+        )
+    # Copied from transformers.models.llama.tokenization_llama.LlamaTokenizer.create_token_type_ids_from_sequences
+    def create_token_type_ids_from_sequences(
+        self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None
+    ) -> list[int]:
+        """
+        Creates a mask from the two sequences passed to be used in a sequence-pair classification task. An ALBERT
+        sequence pair mask has the following format:
+        ```
+        0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
+        | first sequence    | second sequence |
+        ```
+        if token_ids_1 is None, only returns the first portion of the mask (0s).
+        Args:
+            token_ids_0 (`List[int]`):
+                List of ids.
+            token_ids_1 (`List[int]`, *optional*):
+                Optional second list of IDs for sequence pairs.
+        Returns:
+            `List[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given sequence(s).
+        """
+        bos_token_id = [self.bos_token_id] if self.add_bos_token else []
+        eos_token_id = [self.eos_token_id] if self.add_eos_token else []
+        output = [0] * len(bos_token_id + token_ids_0 + eos_token_id)
+        if token_ids_1 is not None:
+            output += [1] * len(bos_token_id + token_ids_1 + eos_token_id)
+        return output
+    def _tokenize(self, text: str, **kwargs) -> list[str]:
+        """Take as input a string and return a list of strings (tokens) for words/sub-words"""
+        tokens = self.convert_ids_to_tokens(self._bolmo_encode(text))
+        return tokens
+    def _patch_ids_to_byte_ids(self, input_ids: list[int]):
+        return [byte_token_id for token_id in input_ids for byte_token_id in self.byte_sequences[token_id]]
+    def _bolmo_encode(self, string: str, add_special_tokens=False):
+        input_ids = self.hf_tokenizer.encode(string, add_special_tokens=add_special_tokens)
+        return self._patch_ids_to_byte_ids(input_ids)
+    def _bolmo_decode(self, tokens: list[int], skip_special_tokens: bool = False) -> str:
+        return self._decode_to_bytes(tokens, skip_special_tokens=skip_special_tokens).decode("utf-8", errors="replace")
+    def _decode_to_bytes(self, tokens: list[int], skip_special_tokens: bool = False) -> bytes:
+        tokens_without_boundary = []
+        for token in tokens:
+            if token >= (self.offset + 256):
+                token -= self.offset + 256
+            tokens_without_boundary.append(token)
+        utf8_bytes = []
+        for token in tokens_without_boundary:
+            if token < self.offset:
+                if skip_special_tokens:
+                    continue
+                else:
+                    utf8_bytes.extend(self.config.special_tokens[token].encode("utf-8"))
+            else:
+                utf8_bytes.append(min(token - self.offset, 255))
+        return bytes(utf8_bytes)
+    def get_tokens_and_patch_lengths(self, original_input_ids: list[int], add_bos=False, strip_pad=False, skip_last=False):
+        if add_bos and self.bos_token_id is not None:
+            byte_tokens = [self.bos_token_id]
+            patch_lengths = [1]
+        else:
+            byte_tokens = []
+            patch_lengths = []
+        for idx, token in enumerate(original_input_ids):
+            # optionally skip last token to keep the length the same if add_bos=True
+            if skip_last and idx == len(original_input_ids) - 1:
+                break
+            token_byte_tokens = self._patch_ids_to_byte_ids([int(token)])
+            if strip_pad and all(t == self.pad_token_id for t in token_byte_tokens):
+                # skip padding tokens
+                continue
+            patch_lengths.append(len(token_byte_tokens))
+            byte_tokens.extend(token_byte_tokens)
+        return byte_tokens, patch_lengths
+    def convert_tokens_to_string(self, tokens: list[str]) -> str:
+        return self._bolmo_decode(self.convert_tokens_to_ids(tokens), skip_special_tokens=False)  # type: ignore
+    def _decode(
+        self,
+        token_ids: Union[int, list[int]],
+        skip_special_tokens: bool = False,
+        clean_up_tokenization_spaces: Optional[bool] = None,
+        spaces_between_special_tokens: bool = True,
+        **kwargs,
+    ) -> str:
+        if isinstance(token_ids, int):
+            token_ids = [token_ids]
+        return self._bolmo_decode(token_ids, skip_special_tokens=skip_special_tokens)
+    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> tuple[str]:
+        return ()  # type: ignore

tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,34 @@

+{
+  "added_tokens_decoder": {
+    "0": {
+      "content": "<pad>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "1": {
+      "content": "<bos>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "auto_map": {
+    "AutoTokenizer": [
+      "tokenization_bolmo.BolmoTokenizer",
+      null
+    ]
+  },
+  "bos_token": "<bos>",
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "<bos>",
+  "extra_ids": 0,
+  "extra_special_tokens": {},
+  "model_max_length": 1000000000000000019884624838656,
+  "pad_token": "<pad>",
+  "tokenizer_class": "BolmoTokenizer"
+}

utils_bolmo.py ADDED Viewed

	@@ -0,0 +1,127 @@

+import math
+import torch
+import torch.nn.functional as F
+def compute_boundary_mask(boundary_logprobs: torch.Tensor, boundary_threshold: str) -> torch.Tensor:
+    if boundary_threshold.startswith("sample:"):
+        _, temperature = boundary_threshold.split(":")
+        temperature = float(temperature)
+        if temperature == 0:
+            return (boundary_logprobs > math.log(0.5))
+        elif temperature == 1:
+            return torch.bernoulli(torch.exp(boundary_logprobs)).to(torch.bool)
+        else:
+            raise NotImplementedError("Temperatures outside {0,1} are not implemented yet.")
+    elif boundary_threshold.startswith("topk:"):
+        _, topk = boundary_threshold.split(":")
+        topk = int(topk)
+        thresholds = torch.quantile(boundary_logprobs, dim=1, q=1 - (topk / boundary_logprobs.shape[1]))
+        return (boundary_logprobs >= thresholds.unsqueeze(-1))
+    elif boundary_threshold.startswith("topk_percent:"):
+        _, topk_percent = boundary_threshold.split(":")
+        topk_percent = float(topk_percent)
+        assert 0 <= topk_percent <= 1
+        thresholds = torch.quantile(boundary_logprobs, dim=1, q=1 - topk_percent)
+        return (boundary_logprobs >= thresholds.unsqueeze(-1))
+    else:
+        raise ValueError(f"Unknown boundary threshold: {boundary_threshold}")
+def _pad(tensors: list[torch.Tensor], multiple_of: int, direction: str, value):
+    max_len = max(t.size(0) for t in tensors)
+    if multiple_of > 1:
+        # Round up max_len to the nearest multiple_of
+        max_len = ((max_len + multiple_of - 1) // multiple_of) * multiple_of
+    padded = []
+    for t in tensors:
+        if direction == "left":
+            pad_shape = (max_len - t.size(0), 0)
+        elif direction == "right":
+            pad_shape = (0, max_len - t.size(0))
+        else:
+            raise ValueError(f"Unknown direction: {direction}. Must be 'left' or 'right'.")
+        padded.append(F.pad(t, pad_shape, value=value))
+    return torch.stack(padded, dim=0)
+def pad_right(
+    tensors: list[torch.Tensor],
+    multiple_of: int = 128,
+    value=0,
+):
+    return _pad(tensors, multiple_of, direction="right", value=value)
+def pad_left(
+    tensors: list[torch.Tensor],
+    multiple_of: int = 128,
+    value=0,
+):
+    return _pad(tensors, multiple_of, direction="left", value=value)
+class MaskState:
+    def __init__(self, mask):
+        self.cpu_mask = mask.cpu()
+        self.mask = mask
+        self.inv_mask = ~mask
+        self._all = self.cpu_mask.all().item()
+        self._any = self.cpu_mask.any().item()
+    def any(self):
+        return self._any
+    def all(self):
+        return self._all
+    def selective_get(self, x, inv=False):
+        # try to avoid sync through nonzero on index
+        if inv:
+            if self.all():
+                return x[[]]
+            elif not self.any():
+                return x
+            else:
+                return x[self.inv_mask]
+        else:
+            if self.all():
+                return x
+            elif not self.any():
+                return x[[]]
+            else:
+                return x[self.mask]
+    def selective_put(self, x, out, inv=False):
+        # try to avoid sync through nonzero on index
+        if inv:
+            if self.all():
+                return
+            elif not self.any():
+                out.copy_(x)
+            else:
+                out[self.inv_mask] = x
+        else:
+            if self.all():
+                out.copy_(x)
+            elif not self.any():
+                return
+            else:
+                out[self.mask] = x
+    def selective_add(self, x, out, inv=False):
+        # try to avoid sync through nonzero on index
+        if inv:
+            if self.all():
+                return
+            elif not self.any():
+                out.add_(x)
+            else:
+                out[self.inv_mask] += x
+        else:
+            if self.all():
+                out.add_(x)
+            elif not self.any():
+                return
+            else:
+                out[self.mask] += x