Text Generation
Transformers
Safetensors
English
bolmo
custom_code
benjamin committed on
Commit
247323b
·
verified ·
1 Parent(s): b5b00b8

Upload folder using huggingface_hub

Browse files
configuration_bolmo.py CHANGED
@@ -3,7 +3,7 @@ from typing import Any
3
 
4
  from transformers.configuration_utils import PretrainedConfig, layer_type_validation
5
  from transformers.modeling_rope_utils import rope_config_validation
6
- from .tokenization_bolmo import ByteTokenizerConfig
7
 
8
  class BolmoConfig(PretrainedConfig):
9
  r"""
@@ -167,7 +167,7 @@ class BolmoConfig(PretrainedConfig):
167
  local_intermediate_size: int = 5504,
168
  local_rms_norm_eps=1e-5,
169
  subword_vocab_size: int = 100278, # dolma2_tokenizer subword vocab size
170
- tokenizer_config: ByteTokenizerConfig | dict[str, Any] | None = None,
171
  **kwargs,
172
  ):
173
  super().__init__(
@@ -220,8 +220,8 @@ class BolmoConfig(PretrainedConfig):
220
  self.subword_vocab_size = subword_vocab_size
221
 
222
  if tokenizer_config is None:
223
- self.tokenizer_config = asdict(ByteTokenizerConfig.bolmo())
224
- elif isinstance(tokenizer_config, ByteTokenizerConfig):
225
  self.tokenizer_config = asdict(tokenizer_config)
226
  else:
227
  self.tokenizer_config = tokenizer_config
 
3
 
4
  from transformers.configuration_utils import PretrainedConfig, layer_type_validation
5
  from transformers.modeling_rope_utils import rope_config_validation
6
+ from .tokenization_bolmo import BolmoTokenizerConfig
7
 
8
  class BolmoConfig(PretrainedConfig):
9
  r"""
 
167
  local_intermediate_size: int = 5504,
168
  local_rms_norm_eps=1e-5,
169
  subword_vocab_size: int = 100278, # dolma2_tokenizer subword vocab size
170
+ tokenizer_config: BolmoTokenizerConfig | dict[str, Any] | None = None,
171
  **kwargs,
172
  ):
173
  super().__init__(
 
220
  self.subword_vocab_size = subword_vocab_size
221
 
222
  if tokenizer_config is None:
223
+ self.tokenizer_config = asdict(BolmoTokenizerConfig.bolmo())
224
+ elif isinstance(tokenizer_config, BolmoTokenizerConfig):
225
  self.tokenizer_config = asdict(tokenizer_config)
226
  else:
227
  self.tokenizer_config = tokenizer_config
modeling_bolmo.py CHANGED
@@ -10,7 +10,8 @@ from transformers.utils.generic import TransformersKwargs
10
 
11
  from transformers.activations import ACT2FN
12
  from transformers.cache_utils import Cache, DynamicCache
13
- from transformers.generation import GenerationMixin
 
14
  from transformers.integrations import use_kernel_forward_from_hub
15
  from transformers.masking_utils import create_causal_mask, create_sliding_window_causal_mask
16
  from transformers.modeling_layers import GradientCheckpointingLayer
@@ -18,15 +19,18 @@ from transformers.modeling_outputs import BaseModelOutputWithPast, CausalLMOutpu
18
  from transformers.modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update
19
  from transformers.modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
20
  from transformers.processing_utils import Unpack
21
- from transformers.utils import auto_docstring, can_return_tuple
22
  from transformers.utils.deprecation import deprecate_kwarg
23
  from transformers.utils.generic import check_model_inputs
24
 
25
  from .configuration_bolmo import BolmoConfig
26
- from .tokenization_bolmo import ByteTokenizerConfig
27
  from .utils_bolmo import compute_boundary_mask, pad_right, pad_left, MaskState
28
 
29
- from xlstm.xlstm_large.model import mLSTMLayer, mLSTMLayerConfig, mLSTMLayerStateType, soft_cap, mLSTMBackendConfig
 
 
 
30
 
31
 
32
  @use_kernel_forward_from_hub("RMSNorm")
@@ -161,7 +165,7 @@ class BolmoAttention(nn.Module):
161
  position_embeddings: tuple[torch.Tensor, torch.Tensor],
162
  attention_mask: Optional[torch.Tensor],
163
  past_key_values: Optional[Cache] = None,
164
- cache_position: Optional[torch.LongTensor] = None,
165
  **kwargs: Unpack[TransformersKwargs],
166
  ) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
167
  input_shape = hidden_states.shape[:-1]
@@ -235,10 +239,10 @@ class BolmoDecoderLayer(GradientCheckpointingLayer):
235
  self,
236
  hidden_states: torch.Tensor,
237
  attention_mask: Optional[torch.Tensor] = None,
238
- position_ids: Optional[torch.LongTensor] = None,
239
  past_key_values: Optional[Cache] = None,
240
  use_cache: Optional[bool] = False,
241
- cache_position: Optional[torch.LongTensor] = None,
242
  position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None, # necessary, but kept here for BC
243
  **kwargs: Unpack[TransformersKwargs],
244
  ) -> torch.Tensor:
@@ -834,7 +838,6 @@ class BolmoRotaryEmbedding(nn.Module):
834
  return cos, sin
835
 
836
 
837
- @auto_docstring
838
  class BolmoPreTrainedModel(PreTrainedModel):
839
  config: BolmoConfig
840
  base_model_prefix = "model"
@@ -853,7 +856,6 @@ class BolmoPreTrainedModel(PreTrainedModel):
853
  }
854
 
855
 
856
- @auto_docstring
857
  class BolmoModel(BolmoPreTrainedModel):
858
  def __init__(self, config: BolmoConfig):
859
  super().__init__(config)
@@ -875,7 +877,7 @@ class BolmoModel(BolmoPreTrainedModel):
875
  }
876
  )
877
 
878
- self.tokenizer_config = ByteTokenizerConfig(**config.tokenizer_config)
879
  self._tokenizer = None
880
 
881
  # Initialize weights and apply final processing
@@ -897,7 +899,7 @@ class BolmoModel(BolmoPreTrainedModel):
897
  def prefill_boundary_prediction_forward(
898
  self,
899
  input_ids: torch.Tensor,
900
- expanded_input_ids: Optional[torch.LongTensor] = None,
901
  sequence_start_indices: Optional[torch.Tensor] = None,
902
  last_token_is_boundary: bool = False,
903
  **kwargs,
@@ -913,16 +915,14 @@ class BolmoModel(BolmoPreTrainedModel):
913
  return cast(torch.Tensor, boundary_mask)
914
 
915
  @check_model_inputs()
916
- @auto_docstring
917
  def forward(
918
  self,
919
- input_ids: torch.LongTensor,
920
- expanded_input_ids: Optional[torch.LongTensor] = None,
921
  attention_mask: Optional[torch.Tensor] = None,
922
- position_ids: Optional[torch.LongTensor] = None,
923
  past_key_values: Optional[Cache] = None,
924
- inputs_embeds: Optional[torch.FloatTensor] = None,
925
- cache_position: Optional[torch.LongTensor] = None,
926
  use_cache: Optional[bool] = None,
927
  boundary_mask: Optional[torch.Tensor] = None,
928
  boundary_state: Optional[MaskState] = None,
@@ -1029,7 +1029,6 @@ class BolmoModel(BolmoPreTrainedModel):
1029
  )
1030
 
1031
 
1032
- @auto_docstring
1033
  class BolmoForCausalLM(BolmoPreTrainedModel, GenerationMixin):
1034
  _tied_weights_keys = ["lm_head.weight"]
1035
  _tp_plan = {"lm_head": "colwise_rep"}
@@ -1051,16 +1050,15 @@ class BolmoForCausalLM(BolmoPreTrainedModel, GenerationMixin):
1051
  self.lm_head = new_embeddings
1052
 
1053
  @can_return_tuple
1054
- @auto_docstring
1055
  def forward(
1056
  self,
1057
- input_ids: torch.LongTensor,
1058
- expanded_input_ids: Optional[torch.LongTensor] = None,
1059
  attention_mask: Optional[torch.Tensor] = None,
1060
- position_ids: Optional[torch.LongTensor] = None,
1061
  past_key_values: Optional[Cache] = None,
1062
  inputs_embeds: Optional[torch.FloatTensor] = None,
1063
- cache_position: Optional[torch.LongTensor] = None,
1064
  use_cache: Optional[bool] = None,
1065
  boundary_mask: Optional[torch.Tensor] = None,
1066
  boundary_state: Optional[MaskState] = None,
@@ -1114,22 +1112,42 @@ class BolmoForCausalLM(BolmoPreTrainedModel, GenerationMixin):
1114
  attentions=outputs.attentions,
1115
  )
1116
 
1117
- def generate(self, input_ids: list[list[int]], max_new_tokens: int = 20):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1118
  expand_input_ids = self.model.local_encoder.add_expanded_embeddings
1119
- batch_size = len(input_ids)
1120
 
1121
  if expand_input_ids:
1122
  expanded_input_ids = []
1123
 
1124
- for i in range(len(input_ids)):
1125
- expanded_input_ids.append(torch.tensor(self.model.tokenizer.expand_byte_ids(input_ids[i]), device=self.device, dtype=torch.long))
1126
 
1127
  expanded_input_ids = pad_left(expanded_input_ids, value=self.model.tokenizer.pad_token_id, multiple_of=1) # type: ignore
1128
  else:
1129
  expanded_input_ids = None
1130
 
1131
- byte_input_ids: torch.Tensor = pad_left([torch.tensor(x, device=self.device, dtype=torch.long) for x in input_ids], value=self.model.tokenizer.pad_token_id, multiple_of=1)
1132
-
1133
  sequence_start_indices = (byte_input_ids == self.model.tokenizer.pad_token_id).sum(-1)
1134
  batch_size, prompt_len = byte_input_ids.shape
1135
  finished = torch.zeros(batch_size, dtype=torch.bool, device=self.device)
@@ -1155,6 +1173,31 @@ class BolmoForCausalLM(BolmoPreTrainedModel, GenerationMixin):
1155
  # stays the same unless last token is pad.
1156
  sequence_start_indices = (byte_input_ids == self.model.tokenizer.pad_token_id).sum(-1)
1157
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1158
  # output container
1159
  generated = byte_input_ids
1160
 
@@ -1162,8 +1205,6 @@ class BolmoForCausalLM(BolmoPreTrainedModel, GenerationMixin):
1162
  tokens_generated_plus_prefilled = max_n_prefill_patches
1163
  bytes_generated = 0
1164
 
1165
- max_length = max_n_prefill_patches + max_new_tokens
1166
-
1167
  # generation state
1168
  boundary_state = MaskState(boundary_mask[:, -1].clone())
1169
  pad_state = MaskState(torch.zeros(batch_size, dtype=torch.bool, device=self.device))
@@ -1173,10 +1214,7 @@ class BolmoForCausalLM(BolmoPreTrainedModel, GenerationMixin):
1173
  is_first_forward = True
1174
  global_past_key_values = None
1175
 
1176
- # TODO: impl
1177
- stop_token_sequences = []
1178
-
1179
- while not ((max_length is not None and tokens_generated_plus_prefilled >= max_length) or finished.all()):
1180
  input_ids_for_model = (
1181
  generated
1182
  if is_first_forward
@@ -1232,15 +1270,24 @@ class BolmoForCausalLM(BolmoPreTrainedModel, GenerationMixin):
1232
 
1233
  forced_decoding_ids[example_idx] = None # only force once
1234
 
1235
- # TODO: impl non-greedy
1236
- new_next_tokens = next_token_logits.squeeze(1).argmax(dim=-1)
1237
 
1238
- if boundary_state.all():
 
 
 
 
 
 
1239
  tokens_generated_plus_prefilled += 1
1240
 
1241
  next_tokens = new_next_tokens
1242
  next_tokens_cpu = next_tokens.cpu()
1243
  for example_idx in range(batch_size):
 
 
 
1244
  next_token_cpu = next_tokens_cpu[example_idx].item()
1245
 
1246
  if next_token_cpu >= boundary_offset:
@@ -1253,6 +1300,9 @@ class BolmoForCausalLM(BolmoPreTrainedModel, GenerationMixin):
1253
  next_tokens_cpu = next_tokens.cpu()
1254
 
1255
  for example_idx in range(batch_size):
 
 
 
1256
  next_token_cpu = next_tokens_cpu[example_idx].item()
1257
 
1258
  if not boundary_state.cpu_mask[example_idx].item():
@@ -1282,14 +1332,17 @@ class BolmoForCausalLM(BolmoPreTrainedModel, GenerationMixin):
1282
  # Handle finished sequences
1283
  stop_hit = next_tokens.eq(eos) | next_tokens.eq(eos + boundary_offset)
1284
 
1285
- # Also check for stop tokens if provided
1286
- # TODO(benjaminm): this is very annoying due to the boundaries
1287
- # make better
1288
- if len(stop_token_sequences) > 0:
1289
- # TODO: implement
1290
- raise NotImplementedError("stop_token_sequences not implemented yet for Bolmo generation.")
1291
 
1292
  finished |= stop_hit
1293
  bytes_generated += 1
1294
 
 
 
 
 
 
1295
  __all__ = ["BolmoForCausalLM", "BolmoModel", "BolmoPreTrainedModel"]
 
10
 
11
  from transformers.activations import ACT2FN
12
  from transformers.cache_utils import Cache, DynamicCache
13
+ from transformers.generation import GenerationMixin, GenerationConfig, LogitsProcessorList, StoppingCriteriaList
14
+ from transformers.generation.utils import GenerateOutput
15
  from transformers.integrations import use_kernel_forward_from_hub
16
  from transformers.masking_utils import create_causal_mask, create_sliding_window_causal_mask
17
  from transformers.modeling_layers import GradientCheckpointingLayer
 
19
  from transformers.modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update
20
  from transformers.modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
21
  from transformers.processing_utils import Unpack
22
+ from transformers.utils import can_return_tuple
23
  from transformers.utils.deprecation import deprecate_kwarg
24
  from transformers.utils.generic import check_model_inputs
25
 
26
  from .configuration_bolmo import BolmoConfig
27
+ from .tokenization_bolmo import BolmoTokenizerConfig
28
  from .utils_bolmo import compute_boundary_mask, pad_right, pad_left, MaskState
29
 
30
+ try:
31
+ from xlstm.xlstm_large.model import mLSTMLayer, mLSTMLayerConfig, mLSTMLayerStateType, soft_cap, mLSTMBackendConfig
32
+ except ImportError:
33
+ raise ImportError("The `xlstm` package is required to use Bolmo. Please install it via `pip install xlstm`.")
34
 
35
 
36
  @use_kernel_forward_from_hub("RMSNorm")
 
165
  position_embeddings: tuple[torch.Tensor, torch.Tensor],
166
  attention_mask: Optional[torch.Tensor],
167
  past_key_values: Optional[Cache] = None,
168
+ cache_position: Optional[torch.Tensor] = None,
169
  **kwargs: Unpack[TransformersKwargs],
170
  ) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
171
  input_shape = hidden_states.shape[:-1]
 
239
  self,
240
  hidden_states: torch.Tensor,
241
  attention_mask: Optional[torch.Tensor] = None,
242
+ position_ids: Optional[torch.Tensor] = None,
243
  past_key_values: Optional[Cache] = None,
244
  use_cache: Optional[bool] = False,
245
+ cache_position: Optional[torch.Tensor] = None,
246
  position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None, # necessary, but kept here for BC
247
  **kwargs: Unpack[TransformersKwargs],
248
  ) -> torch.Tensor:
 
838
  return cos, sin
839
 
840
 
 
841
  class BolmoPreTrainedModel(PreTrainedModel):
842
  config: BolmoConfig
843
  base_model_prefix = "model"
 
856
  }
857
 
858
 
 
859
  class BolmoModel(BolmoPreTrainedModel):
860
  def __init__(self, config: BolmoConfig):
861
  super().__init__(config)
 
877
  }
878
  )
879
 
880
+ self.tokenizer_config = BolmoTokenizerConfig(**config.tokenizer_config)
881
  self._tokenizer = None
882
 
883
  # Initialize weights and apply final processing
 
899
  def prefill_boundary_prediction_forward(
900
  self,
901
  input_ids: torch.Tensor,
902
+ expanded_input_ids: Optional[torch.Tensor] = None,
903
  sequence_start_indices: Optional[torch.Tensor] = None,
904
  last_token_is_boundary: bool = False,
905
  **kwargs,
 
915
  return cast(torch.Tensor, boundary_mask)
916
 
917
  @check_model_inputs()
 
918
  def forward(
919
  self,
920
+ input_ids: torch.Tensor,
921
+ expanded_input_ids: Optional[torch.Tensor] = None,
922
  attention_mask: Optional[torch.Tensor] = None,
923
+ position_ids: Optional[torch.Tensor] = None,
924
  past_key_values: Optional[Cache] = None,
925
+ cache_position: Optional[torch.Tensor] = None,
 
926
  use_cache: Optional[bool] = None,
927
  boundary_mask: Optional[torch.Tensor] = None,
928
  boundary_state: Optional[MaskState] = None,
 
1029
  )
1030
 
1031
 
 
1032
  class BolmoForCausalLM(BolmoPreTrainedModel, GenerationMixin):
1033
  _tied_weights_keys = ["lm_head.weight"]
1034
  _tp_plan = {"lm_head": "colwise_rep"}
 
1050
  self.lm_head = new_embeddings
1051
 
1052
  @can_return_tuple
 
1053
  def forward(
1054
  self,
1055
+ input_ids: torch.Tensor,
1056
+ expanded_input_ids: Optional[torch.Tensor] = None,
1057
  attention_mask: Optional[torch.Tensor] = None,
1058
+ position_ids: Optional[torch.Tensor] = None,
1059
  past_key_values: Optional[Cache] = None,
1060
  inputs_embeds: Optional[torch.FloatTensor] = None,
1061
+ cache_position: Optional[torch.Tensor] = None,
1062
  use_cache: Optional[bool] = None,
1063
  boundary_mask: Optional[torch.Tensor] = None,
1064
  boundary_state: Optional[MaskState] = None,
 
1112
  attentions=outputs.attentions,
1113
  )
1114
 
1115
+ @torch.no_grad()
1116
+ def generate( # type: ignore
1117
+ self,
1118
+ inputs: torch.Tensor,
1119
+ generation_config: Optional[GenerationConfig] = None,
1120
+ logits_processor: Optional[LogitsProcessorList] = None,
1121
+ stopping_criteria: Optional[StoppingCriteriaList] = None,
1122
+ use_model_defaults: Optional[bool] = None,
1123
+ **kwargs,
1124
+ ) -> Union[GenerateOutput, torch.Tensor]:
1125
+ # generic preprocessing
1126
+
1127
+ generation_config, model_kwargs = self._prepare_generation_config(
1128
+ generation_config, use_model_defaults, **kwargs
1129
+ )
1130
+ self._prepare_special_tokens(generation_config, device=self.model.device)
1131
+
1132
+ logits_processor = logits_processor if logits_processor is not None else LogitsProcessorList()
1133
+ stopping_criteria = stopping_criteria if stopping_criteria is not None else StoppingCriteriaList()
1134
+
1135
+ # start of custom generate
1136
+
1137
  expand_input_ids = self.model.local_encoder.add_expanded_embeddings
1138
+ batch_size = len(inputs)
1139
 
1140
  if expand_input_ids:
1141
  expanded_input_ids = []
1142
 
1143
+ for i in range(len(inputs)):
1144
+ expanded_input_ids.append(torch.tensor(self.model.tokenizer.expand_byte_ids(inputs[i].tolist()), device=self.device, dtype=torch.long))
1145
 
1146
  expanded_input_ids = pad_left(expanded_input_ids, value=self.model.tokenizer.pad_token_id, multiple_of=1) # type: ignore
1147
  else:
1148
  expanded_input_ids = None
1149
 
1150
+ byte_input_ids = inputs
 
1151
  sequence_start_indices = (byte_input_ids == self.model.tokenizer.pad_token_id).sum(-1)
1152
  batch_size, prompt_len = byte_input_ids.shape
1153
  finished = torch.zeros(batch_size, dtype=torch.bool, device=self.device)
 
1173
  # stays the same unless last token is pad.
1174
  sequence_start_indices = (byte_input_ids == self.model.tokenizer.pad_token_id).sum(-1)
1175
 
1176
+ has_default_max_length = kwargs.get("max_length") is None and generation_config.max_length is not None
1177
+ has_default_min_length = kwargs.get("min_length") is None and generation_config.min_length is not None
1178
+ generation_config = self._prepare_generated_length(
1179
+ generation_config=generation_config,
1180
+ has_default_max_length=has_default_max_length,
1181
+ has_default_min_length=has_default_min_length,
1182
+ model_input_name="input_ids",
1183
+ inputs_tensor=byte_input_ids,
1184
+ input_ids_length=byte_input_ids.shape[1],
1185
+ )
1186
+
1187
+ logits_processor = self._get_logits_processor(
1188
+ generation_config=generation_config, # type: ignore
1189
+ input_ids_seq_length=byte_input_ids.shape[1],
1190
+ encoder_input_ids=byte_input_ids, # type: ignore
1191
+ logits_processor=logits_processor,
1192
+ device=byte_input_ids.device, # type: ignore
1193
+ model_kwargs=model_kwargs,
1194
+ )
1195
+ stopping_criteria = self._get_stopping_criteria(
1196
+ generation_config=generation_config, # type: ignore
1197
+ stopping_criteria=stopping_criteria,
1198
+ tokenizer=self.model.tokenizer,
1199
+ )
1200
+
1201
  # output container
1202
  generated = byte_input_ids
1203
 
 
1205
  tokens_generated_plus_prefilled = max_n_prefill_patches
1206
  bytes_generated = 0
1207
 
 
 
1208
  # generation state
1209
  boundary_state = MaskState(boundary_mask[:, -1].clone())
1210
  pad_state = MaskState(torch.zeros(batch_size, dtype=torch.bool, device=self.device))
 
1214
  is_first_forward = True
1215
  global_past_key_values = None
1216
 
1217
+ while not finished.all():
 
 
 
1218
  input_ids_for_model = (
1219
  generated
1220
  if is_first_forward
 
1270
 
1271
  forced_decoding_ids[example_idx] = None # only force once
1272
 
1273
+ # passing input_ids to logit processor not implemented
1274
+ next_token_scores = logits_processor(None, next_token_logits[:, -1]) # type: ignore
1275
 
1276
+ if generation_config is not None and generation_config.do_sample:
1277
+ probs = nn.functional.softmax(next_token_scores, dim=-1)
1278
+ new_next_tokens = torch.multinomial(probs, num_samples=1).squeeze(1)
1279
+ else:
1280
+ new_next_tokens = torch.argmax(next_token_scores, dim=-1)
1281
+
1282
+ if boundary_state.all() or is_first_forward:
1283
  tokens_generated_plus_prefilled += 1
1284
 
1285
  next_tokens = new_next_tokens
1286
  next_tokens_cpu = next_tokens.cpu()
1287
  for example_idx in range(batch_size):
1288
+ if finished[example_idx].item():
1289
+ continue
1290
+
1291
  next_token_cpu = next_tokens_cpu[example_idx].item()
1292
 
1293
  if next_token_cpu >= boundary_offset:
 
1300
  next_tokens_cpu = next_tokens.cpu()
1301
 
1302
  for example_idx in range(batch_size):
1303
+ if finished[example_idx].item():
1304
+ continue
1305
+
1306
  next_token_cpu = next_tokens_cpu[example_idx].item()
1307
 
1308
  if not boundary_state.cpu_mask[example_idx].item():
 
1332
  # Handle finished sequences
1333
  stop_hit = next_tokens.eq(eos) | next_tokens.eq(eos + boundary_offset)
1334
 
1335
+ for i in range(batch_size):
1336
+ # passing `scores` to stopping criteria not implemented
1337
+ if stopping_criteria(torch.tensor(non_boundary_generated_tokens[i], dtype=torch.long).unsqueeze(0), None).squeeze(0).item(): # type: ignore
1338
+ stop_hit[i] = True
 
 
1339
 
1340
  finished |= stop_hit
1341
  bytes_generated += 1
1342
 
1343
+ return pad_left([
1344
+ torch.cat([byte_input_ids[i, :-1], torch.tensor(x, dtype=torch.long, device=byte_input_ids.device)])
1345
+ for i, x in enumerate(non_boundary_generated_tokens)
1346
+ ], value=self.model.tokenizer.pad_token_id, multiple_of=1) # type: ignore
1347
+
1348
  __all__ = ["BolmoForCausalLM", "BolmoModel", "BolmoPreTrainedModel"]
special_tokens_map.json ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ {
2
+ "bos_token": "<bos>",
3
+ "eos_token": "<bos>",
4
+ "pad_token": "<pad>"
5
+ }
tokenization_bolmo.py CHANGED
@@ -1,7 +1,8 @@
1
  from dataclasses import dataclass, field
2
  from functools import lru_cache
3
- from typing import Optional
4
  from transformers import AutoTokenizer
 
5
 
6
  # Source: https://github.com/openai/gpt-2/blob/master/src/encoder.py#L9
7
  # Also implemented in https://docs.rs/tokenizers/latest/src/tokenizers/pre_tokenizers/byte_level.rs.html#13-39
@@ -51,7 +52,7 @@ def _chars_to_bytes(char_sequence: str) -> list:
51
  return list(bytes(_CHARS_TO_BYTES[char] for char in char_sequence))
52
 
53
  @dataclass
54
- class ByteTokenizerConfig:
55
  vocab_size: int
56
  bos_token_id: int
57
  pad_token_id: int
@@ -63,7 +64,7 @@ class ByteTokenizerConfig:
63
 
64
 
65
  @classmethod
66
- def bolmo(cls) -> "ByteTokenizerConfig":
67
  special_tokens = [
68
  "<pad>",
69
  "<bos>",
@@ -83,13 +84,15 @@ class ByteTokenizerConfig:
83
  )
84
 
85
  def build(self):
86
- return ByteTokenizer(self)
87
 
88
 
89
- class ByteTokenizer:
90
  TOKEN_ID_KEY = -1
91
 
92
- def __init__(self, tokenizer_config: ByteTokenizerConfig):
 
 
93
  self.config = tokenizer_config
94
  self.hf_tokenizer = AutoTokenizer.from_pretrained(tokenizer_config.original_identifier)
95
  if self.config.special_tokens_first:
@@ -124,7 +127,18 @@ class ByteTokenizer:
124
  if byte not in current_dict:
125
  current_dict[byte] = {}
126
  current_dict = current_dict[byte]
127
- current_dict[ByteTokenizer.TOKEN_ID_KEY] = token_id
 
 
 
 
 
 
 
 
 
 
 
128
 
129
  @property
130
  def bos_token_id(self):
@@ -142,6 +156,37 @@ class ByteTokenizer:
142
  def bpe_token_end_id(self):
143
  return self.config.bpe_token_end_id
144
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
145
  def expand_byte_ids(self, byte_ids: list[int], n_last: Optional[int] = None) -> list[int]:
146
  # search in the byte tree for the longest matching token at every byte position
147
  expanded_ids = []
@@ -165,8 +210,8 @@ class ByteTokenizer:
165
 
166
  try:
167
  current_dict = current_dict[byte]
168
- if ByteTokenizer.TOKEN_ID_KEY in current_dict:
169
- current_expansion = current_dict[ByteTokenizer.TOKEN_ID_KEY]
170
  except KeyError:
171
  assert current_expansion is not None
172
  break
@@ -175,17 +220,100 @@ class ByteTokenizer:
175
 
176
  return expanded_ids
177
 
178
- def patch_ids_to_byte_ids(self, input_ids: list[int]):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
179
  return [byte_token_id for token_id in input_ids for byte_token_id in self.byte_sequences[token_id]]
180
 
181
- def encode(self, string: str, add_special_tokens=False):
182
  input_ids = self.hf_tokenizer.encode(string, add_special_tokens=add_special_tokens)
183
- return self.patch_ids_to_byte_ids(input_ids)
184
 
185
- def decode(self, tokens: list[int]) -> str:
186
- return self.decode_to_bytes(tokens).decode("utf-8", errors="replace")
187
 
188
- def decode_to_bytes(self, tokens: list[int]) -> bytes:
189
  tokens_without_boundary = []
190
  for token in tokens:
191
  if token >= (self.offset + 256):
@@ -193,7 +321,17 @@ class ByteTokenizer:
193
 
194
  tokens_without_boundary.append(token)
195
 
196
- utf8_bytes = [min(token - self.offset, 255) for token in tokens_without_boundary if token >= self.offset]
 
 
 
 
 
 
 
 
 
 
197
  return bytes(utf8_bytes)
198
 
199
  def get_tokens_and_patch_lengths(self, original_input_ids: list[int], add_bos=False, strip_pad=False, skip_last=False):
@@ -209,7 +347,7 @@ class ByteTokenizer:
209
  if skip_last and idx == len(original_input_ids) - 1:
210
  break
211
 
212
- token_byte_tokens = self.patch_ids_to_byte_ids([int(token)])
213
 
214
  if strip_pad and all(t == self.pad_token_id for t in token_byte_tokens):
215
  # skip padding tokens
@@ -220,82 +358,21 @@ class ByteTokenizer:
220
 
221
  return byte_tokens, patch_lengths
222
 
223
- @lru_cache(maxsize=1024)
224
- def _is_spacelike(self, token_id: int) -> bool:
225
- """
226
- Check if a token ID is spacelike.
227
- """
228
- byte = token_id - self.offset
229
- # see https://github.com/kjslag/spacebyte/blob/321111315c92bce0bc2f9f1630cb0bc82b897c57/spacebyte.py#L137-L145.
230
- is_spacelike = (
231
- (byte < ord('0')) |
232
- ((ord('9') < byte) & (byte < ord('A'))) |
233
- ((ord('Z') < byte) & (byte < ord('a'))) |
234
- ((ord('z') < byte) & (byte < 0b1000_0000)) |
235
- (0b1100_0000 <= byte)
236
- )
237
- return is_spacelike
238
-
239
- @lru_cache(maxsize=1024)
240
- def _is_strict_spacelike(self, token_id: int) -> bool:
241
- """
242
- Check if a token ID is strictly spacelike (only space, tab, newline, carriage return).
243
- """
244
- byte = token_id - self.offset
245
- return byte in {ord(' '), ord('\t'), ord('\n'), ord('\r')}
246
-
247
- def get_space_patch_lengths(self, input_ids: list[int], max_patch_length: int = 16, kind: str = "strict_end_before_space") -> list[int]:
248
- patch_lengths = []
249
- current_length = 0
250
-
251
- special_tokens = {self.bos_token_id, self.eos_token_id, self.pad_token_id}
252
-
253
- all_spacelike = [self._is_spacelike(token) for token in input_ids]
254
-
255
- if kind == "spacebyte":
256
- for token_idx, token in enumerate(input_ids):
257
- current_length += 1
258
-
259
- spacelike = all_spacelike[token_idx]
260
- previous_spacelike = all_spacelike[token_idx - 1] if token_idx > 0 else False
261
-
262
- if (not previous_spacelike and spacelike) or current_length >= max_patch_length or token in special_tokens:
263
- patch_lengths.append(current_length)
264
- current_length = 0
265
- elif kind == "spacebyte_end_before_space":
266
- for token_idx, token in enumerate(input_ids):
267
- current_length += 1
268
-
269
- spacelike = all_spacelike[token_idx]
270
- next_spacelike = all_spacelike[token_idx + 1] if token_idx < len(input_ids) - 1 else True
271
-
272
- if (not spacelike and next_spacelike) or current_length >= max_patch_length or token in special_tokens:
273
- patch_lengths.append(current_length)
274
- current_length = 0
275
- elif kind == "strict_end_before_space":
276
- all_strict_spacelike = [self._is_strict_spacelike(token) for token in input_ids]
277
- in_strict_prefix = True
278
-
279
- for token_idx, token in enumerate(input_ids):
280
- current_length += 1
281
-
282
- spacelike = all_spacelike[token_idx]
283
- strict_spacelike = all_strict_spacelike[token_idx]
284
- next_spacelike = all_spacelike[token_idx + 1] if token_idx < len(input_ids) - 1 else True
285
- next_strict_spacelike = all_strict_spacelike[token_idx + 1] if token_idx < len(input_ids) - 1 else True
286
-
287
- if not strict_spacelike:
288
- in_strict_prefix = False
289
-
290
- if in_strict_prefix:
291
- continue
292
 
293
- if (spacelike != next_spacelike) or (strict_spacelike != next_strict_spacelike) or current_length >= max_patch_length or token in special_tokens:
294
- patch_lengths.append(current_length)
295
- in_strict_prefix = True
296
- current_length = 0
 
 
 
 
 
 
297
 
298
- if current_length > 0:
299
- patch_lengths.append(current_length)
300
 
301
- return patch_lengths
 
 
1
  from dataclasses import dataclass, field
2
  from functools import lru_cache
3
+ from typing import Optional, Union
4
  from transformers import AutoTokenizer
5
+ from transformers.tokenization_utils import PreTrainedTokenizer
6
 
7
  # Source: https://github.com/openai/gpt-2/blob/master/src/encoder.py#L9
8
  # Also implemented in https://docs.rs/tokenizers/latest/src/tokenizers/pre_tokenizers/byte_level.rs.html#13-39
 
52
  return list(bytes(_CHARS_TO_BYTES[char] for char in char_sequence))
53
 
54
  @dataclass
55
+ class BolmoTokenizerConfig:
56
  vocab_size: int
57
  bos_token_id: int
58
  pad_token_id: int
 
64
 
65
 
66
  @classmethod
67
+ def bolmo(cls) -> "BolmoTokenizerConfig":
68
  special_tokens = [
69
  "<pad>",
70
  "<bos>",
 
84
  )
85
 
86
  def build(self):
87
+ return BolmoTokenizer(tokenizer_config=self)
88
 
89
 
90
+ class BolmoTokenizer(PreTrainedTokenizer):
91
  TOKEN_ID_KEY = -1
92
 
93
+ def __init__(self, **kwargs):
94
+ tokenizer_config = kwargs.pop("tokenizer_config", BolmoTokenizerConfig.bolmo())
95
+
96
  self.config = tokenizer_config
97
  self.hf_tokenizer = AutoTokenizer.from_pretrained(tokenizer_config.original_identifier)
98
  if self.config.special_tokens_first:
 
127
  if byte not in current_dict:
128
  current_dict[byte] = {}
129
  current_dict = current_dict[byte]
130
+ current_dict[BolmoTokenizer.TOKEN_ID_KEY] = token_id
131
+
132
+ self.add_bos_token = True
133
+ self.add_eos_token = False
134
+ self.padding_side = "left" # for generate
135
+
136
+ super().__init__(
137
+ bos_token=self.config.special_tokens[self.config.bos_token_id],
138
+ eos_token=self.config.special_tokens[self.config.eos_token_id],
139
+ pad_token=self.config.special_tokens[self.config.pad_token_id],
140
+ extra_ids=0,
141
+ )
142
 
143
  @property
144
  def bos_token_id(self):
 
156
  def bpe_token_end_id(self):
157
  return self.config.bpe_token_end_id
158
 
159
@property
def vocab_size(self):
    # Total size of the byte-level vocabulary, as declared in the tokenizer config
    # (covers special tokens, raw bytes, and boundary-fused variants).
    return self.config.vocab_size
162
+
163
def _convert_id_to_token(self, index):
    """Map a vocabulary id to its token string.

    Layout implied by the branches below (TODO confirm against __init__, which is
    not visible here):
      [.., self.offset)                        -> special tokens
      [offset, offset + 256)                   -> plain bytes
      [offset + 256, 2 * offset + 256)         -> special tokens fused with a boundary ("b" suffix)
      [2 * offset + 256, ..)                   -> bytes fused with a boundary ("b" suffix)
    """
    if index < self.offset:
        # Plain special token; special_tokens_offset shifts into the config list.
        return self.config.special_tokens[index - self.special_tokens_offset]

    if index >= self.offset + 256 and index < self.offset * 2 + 256:
        # special token with fused boundary
        return self.config.special_tokens[index - self.offset - 256] + "b"

    # NOTE(review): fused bytes are decoded from [2*offset + 256, ..) here, but
    # _convert_token_to_id encodes them as offset + 256 + byte — these are not
    # inverses unless offset == 0. Confirm which layout the rest of the
    # tokenizer (byte_sequences / decode path) actually uses.
    return _BYTES_TO_CHARS[index - self.offset - 256 - self.offset] + "b" if index >= self.offset + 256 else _BYTES_TO_CHARS[index - self.offset]
172
+
173
def _convert_token_to_id(self, token):
    """Map a token string back to a vocabulary id.

    Intended to be the inverse of _convert_id_to_token; a "b" suffix marks a
    token fused with a patch boundary.
    """
    if token in self.config.special_tokens:
        # NOTE(review): _convert_id_to_token uses special_tokens_offset when
        # decoding specials; this returns the bare list index — consistent only
        # if special_tokens_offset == 0. Verify.
        return self.config.special_tokens.index(token)

    if token in [x + "b" for x in self.config.special_tokens]:
        # special token with fused boundary
        # NOTE(review): _convert_id_to_token places fused specials at
        # offset + 256 + idx, not 256 + idx — the round trip
        # convert_tokens_to_ids(convert_ids_to_tokens(i)) breaks here unless
        # offset == 0. Confirm the intended vocab layout before relying on it.
        return 256 + self.config.special_tokens.index(token[:-1])

    if len(token) > 1 and token[-1] == "b":
        # Byte fused with a boundary; token[0] is the byte's display character.
        # NOTE(review): _convert_id_to_token decodes fused bytes from
        # 2*offset + 256 + byte — mismatched with offset + 256 + byte here.
        return self.offset + 256 + _CHARS_TO_BYTES[token[0]]
    else:
        # Plain byte token.
        return self.offset + _CHARS_TO_BYTES[token]
185
+
186
def get_vocab(self):
    """Return the full token-string -> id mapping for this tokenizer."""
    mapping = {}
    for token_id in range(self.vocab_size):
        mapping[self.convert_ids_to_tokens(token_id)] = token_id
    return mapping
189
+
190
  def expand_byte_ids(self, byte_ids: list[int], n_last: Optional[int] = None) -> list[int]:
191
  # search in the byte tree for the longest matching token at every byte position
192
  expanded_ids = []
 
210
 
211
  try:
212
  current_dict = current_dict[byte]
213
+ if BolmoTokenizer.TOKEN_ID_KEY in current_dict:
214
+ current_expansion = current_dict[BolmoTokenizer.TOKEN_ID_KEY]
215
  except KeyError:
216
  assert current_expansion is not None
217
  break
 
220
 
221
  return expanded_ids
222
 
223
# Mirrors LlamaTokenizer.build_inputs_with_special_tokens in transformers.
def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
    """Wrap one (or two) id sequences with the configured BOS/EOS special ids."""
    prefix = [self.bos_token_id] if self.add_bos_token else []
    suffix = [self.eos_token_id] if self.add_eos_token else []

    sequences = [token_ids_0] if token_ids_1 is None else [token_ids_0, token_ids_1]

    wrapped = []
    for sequence in sequences:
        wrapped.extend(prefix)
        wrapped.extend(sequence)
        wrapped.extend(suffix)
    return wrapped
234
+
235
# Mirrors LlamaTokenizer.get_special_tokens_mask in transformers.
def get_special_tokens_mask(
    self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None, already_has_special_tokens: bool = False
) -> list[int]:
    """
    Build a mask marking special-token positions (1) vs. sequence tokens (0),
    assuming the ids do not yet contain special tokens.

    Args:
        token_ids_0 (`List[int]`): First sequence of ids.
        token_ids_1 (`List[int]`, *optional*): Second sequence for pairs.
        already_has_special_tokens (`bool`, *optional*, defaults to `False`):
            If `True`, defer to the base class, which inspects the ids directly.

    Returns:
        `List[int]`: 1 for each special-token position, 0 elsewhere.
    """
    if already_has_special_tokens:
        return super().get_special_tokens_mask(
            token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
        )

    bos_mask = [1] if self.add_bos_token else []
    eos_mask = [1] if self.add_eos_token else []

    mask = bos_mask + [0] * len(token_ids_0) + eos_mask
    if token_ids_1 is not None:
        mask = mask + bos_mask + [0] * len(token_ids_1) + eos_mask
    return mask
270
+
271
# Mirrors LlamaTokenizer.create_token_type_ids_from_sequences in transformers.
def create_token_type_ids_from_sequences(
    self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None
) -> list[int]:
    """
    Produce token-type ids: 0 for the first sequence (including its BOS/EOS
    slots), 1 for the optional second sequence (including its BOS/EOS slots).

    Args:
        token_ids_0 (`List[int]`): First sequence of ids.
        token_ids_1 (`List[int]`, *optional*): Second sequence for pairs.

    Returns:
        `List[int]`: Token type ids aligned with the special-token layout.
    """
    # Only the count of special tokens matters here, not their values.
    n_special = (1 if self.add_bos_token else 0) + (1 if self.add_eos_token else 0)

    type_ids = [0] * (len(token_ids_0) + n_special)
    if token_ids_1 is not None:
        type_ids = type_ids + [1] * (len(token_ids_1) + n_special)
    return type_ids
300
+
301
+ def _tokenize(self, text: str, **kwargs) -> list[str]:
302
+ """Take as input a string and return a list of strings (tokens) for words/sub-words"""
303
+ tokens = self.convert_ids_to_tokens(self._bolmo_encode(text))
304
+ return tokens
305
+
306
+ def _patch_ids_to_byte_ids(self, input_ids: list[int]):
307
  return [byte_token_id for token_id in input_ids for byte_token_id in self.byte_sequences[token_id]]
308
 
309
+ def _bolmo_encode(self, string: str, add_special_tokens=False):
310
  input_ids = self.hf_tokenizer.encode(string, add_special_tokens=add_special_tokens)
311
+ return self._patch_ids_to_byte_ids(input_ids)
312
 
313
+ def _bolmo_decode(self, tokens: list[int], skip_special_tokens: bool = False) -> str:
314
+ return self._decode_to_bytes(tokens, skip_special_tokens=skip_special_tokens).decode("utf-8", errors="replace")
315
 
316
+ def _decode_to_bytes(self, tokens: list[int], skip_special_tokens: bool = False) -> bytes:
317
  tokens_without_boundary = []
318
  for token in tokens:
319
  if token >= (self.offset + 256):
 
321
 
322
  tokens_without_boundary.append(token)
323
 
324
+ utf8_bytes = []
325
+
326
+ for token in tokens_without_boundary:
327
+ if token < self.offset:
328
+ if skip_special_tokens:
329
+ continue
330
+ else:
331
+ utf8_bytes.extend(self.config.special_tokens[token].encode("utf-8"))
332
+ else:
333
+ utf8_bytes.append(min(token - self.offset, 255))
334
+
335
  return bytes(utf8_bytes)
336
 
337
  def get_tokens_and_patch_lengths(self, original_input_ids: list[int], add_bos=False, strip_pad=False, skip_last=False):
 
347
  if skip_last and idx == len(original_input_ids) - 1:
348
  break
349
 
350
+ token_byte_tokens = self._patch_ids_to_byte_ids([int(token)])
351
 
352
  if strip_pad and all(t == self.pad_token_id for t in token_byte_tokens):
353
  # skip padding tokens
 
358
 
359
  return byte_tokens, patch_lengths
360
 
361
def convert_tokens_to_string(self, tokens: list[str]) -> str:
    """Join a list of token strings back into a single decoded string."""
    token_ids = self.convert_tokens_to_ids(tokens)
    return self._bolmo_decode(token_ids, skip_special_tokens=False)  # type: ignore
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
363
 
364
+ def _decode(
365
+ self,
366
+ token_ids: Union[int, list[int]],
367
+ skip_special_tokens: bool = False,
368
+ clean_up_tokenization_spaces: Optional[bool] = None,
369
+ spaces_between_special_tokens: bool = True,
370
+ **kwargs,
371
+ ) -> str:
372
+ if isinstance(token_ids, int):
373
+ token_ids = [token_ids]
374
 
375
+ return self._bolmo_decode(token_ids, skip_special_tokens=skip_special_tokens)
 
376
 
377
def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> tuple[str]:
    """No-op: the byte-level vocabulary is derived from the config, so no
    vocabulary files are written."""
    return tuple()  # type: ignore
tokenizer_config.json ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "added_tokens_decoder": {
3
+ "0": {
4
+ "content": "<pad>",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false,
9
+ "special": true
10
+ },
11
+ "1": {
12
+ "content": "<bos>",
13
+ "lstrip": false,
14
+ "normalized": false,
15
+ "rstrip": false,
16
+ "single_word": false,
17
+ "special": true
18
+ }
19
+ },
20
+ "auto_map": {
21
+ "AutoTokenizer": [
22
+ "tokenization_bolmo.BolmoTokenizer",
23
+ null
24
+ ]
25
+ },
26
+ "bos_token": "<bos>",
27
+ "clean_up_tokenization_spaces": false,
28
+ "eos_token": "<bos>",
29
+ "extra_ids": 0,
30
+ "extra_special_tokens": {},
31
+ "model_max_length": 1000000000000000019884624838656,
32
+ "pad_token": "<pad>",
33
+ "tokenizer_class": "BolmoTokenizer"
34
+ }