Commit ce59834
Parent(s): 68d8806

Fix tokenizer ID offset: reserve IDs 0-4 for BERT special tokens

The original Latin BERT model expects [PAD]=0, [UNK]=1, [CLS]=2,
[SEP]=3, [MASK]=4, with SubwordTextEncoder IDs shifted by +5. This
commit adds those five special tokens, shifts all subtoken IDs
accordingly, and implements build_inputs_with_special_tokens so that
encode(add_special_tokens=True) wraps sequences with [CLS]/[SEP].

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
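In practice the change makes the tokenizer line up with standard BERT conventions end to end. A minimal sketch of the intended behavior, assuming a local vocab file (the path is illustrative; only the special-token IDs are fixed by this commit):

    from latincy_latinbert.tokenization_latin_bert import LatinBertTokenizer

    tok = LatinBertTokenizer(vocab_file="latin.subword.vocab")  # illustrative path

    ids = tok.encode("arma virumque cano", add_special_tokens=True)
    # The sequence is wrapped as [CLS] ... [SEP]; everything in between
    # is a SubwordTextEncoder subtoken ID shifted up by 5.
    assert ids[0] == tok.cls_token_id == 2
    assert ids[-1] == tok.sep_token_id == 3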
src/latincy_latinbert/tokenization_latin_bert.py
CHANGED
@@ -11,6 +11,9 @@ The tokenization pipeline:
     3. Append trailing underscore (word boundary marker)
     4. Greedy longest-match against subword vocabulary
 
+IDs 0-4 are reserved for BERT special tokens ([PAD], [UNK], [CLS],
+[SEP], [MASK]). SubwordTextEncoder subtokens start at ID 5.
+
 Usage:
     from transformers import AutoModel, AutoTokenizer
 
@@ -83,6 +86,11 @@ def _escape_token(token: str, alphabet: set) -> str:
     return "".join(ret) + "_"
 
 
+# ── BERT special tokens ───────────────────────────────────────────────
+
+SPECIAL_TOKENS = ["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"]
+NUM_SPECIAL = 5  # IDs 0-4 reserved for BERT special tokens
+
 # ── HuggingFace tokenizer ─────────────────────────────────────────────
 
 # Vocab file name expected by HF save/load
@@ -95,6 +103,10 @@ class LatinBertTokenizer(PreTrainedTokenizer):
     Wraps the original tensor2tensor SubwordTextEncoder as a
     PreTrainedTokenizer so it works with AutoTokenizer and standard
     HF pipelines.
+
+    IDs 0-4 are reserved for BERT special tokens:
+    0=[PAD], 1=[UNK], 2=[CLS], 3=[SEP], 4=[MASK]
+    SubwordTextEncoder subtokens are shifted to start at ID 5.
     """
 
     vocab_files_names = VOCAB_FILES_NAMES
@@ -103,9 +115,12 @@ class LatinBertTokenizer(PreTrainedTokenizer):
     def __init__(
         self,
         vocab_file: str,
-        pad_token: str = "<pad>_",
+        pad_token: str = "[PAD]",
+        unk_token: str = "[UNK]",
+        cls_token: str = "[CLS]",
+        sep_token: str = "[SEP]",
+        mask_token: str = "[MASK]",
         eos_token: str = "<EOS>_",
-        unk_token: str = "<unk>",
         **kwargs,
     ):
         # Load subword vocabulary before super().__init__ so that
@@ -122,8 +137,11 @@ class LatinBertTokenizer(PreTrainedTokenizer):
 
         super().__init__(
             pad_token=pad_token,
-            eos_token=eos_token,
             unk_token=unk_token,
+            cls_token=cls_token,
+            sep_token=sep_token,
+            mask_token=mask_token,
+            eos_token=eos_token,
             **kwargs,
         )
 
@@ -140,11 +158,18 @@ class LatinBertTokenizer(PreTrainedTokenizer):
         ):
             s = s[1:-1]
             subtoken_strings.append(s)
+        # IDs 0-4 are reserved for BERT special tokens [PAD],[UNK],[CLS],[SEP],[MASK]
+        # SubwordTextEncoder subtokens are shifted to IDs 5+
         self._subtoken_strings = subtoken_strings
         self._max_subtoken_len = (
            max(len(s) for s in subtoken_strings) if subtoken_strings else 0
         )
-        self._subtoken_to_id = {s: i for i, s in enumerate(subtoken_strings) if s}
+        self._subtoken_to_id = {
+            s: i + NUM_SPECIAL for i, s in enumerate(subtoken_strings) if s
+        }
+        # Also map special tokens to their IDs
+        for i, tok in enumerate(SPECIAL_TOKENS):
+            self._subtoken_to_id[tok] = i
         self._alphabet = {c for token in subtoken_strings for c in token}
         self._alphabet |= _ESCAPE_CHARS
 
@@ -152,10 +177,12 @@ class LatinBertTokenizer(PreTrainedTokenizer):
 
     @property
     def vocab_size(self) -> int:
-        return len(self._subtoken_strings)
+        return len(self._subtoken_strings) + NUM_SPECIAL
 
     def get_vocab(self) -> Dict[str, int]:
-        return dict(self._subtoken_to_id)
+        vocab = {tok: i for i, tok in enumerate(SPECIAL_TOKENS)}
+        vocab.update(self._subtoken_to_id)
+        return vocab
 
     def _tokenize(self, text: str, **kwargs) -> List[str]:
         """Tokenize text into subtoken strings."""
@@ -198,16 +225,21 @@ class LatinBertTokenizer(PreTrainedTokenizer):
         return ret
 
     def _convert_token_to_id(self, token: str) -> int:
-        return self._subtoken_to_id.get(token, 0)
+        return self._subtoken_to_id.get(token, 1)  # 1 = [UNK]
 
     def _convert_id_to_token(self, index: int) -> str:
-        if 0 <= index < len(self._subtoken_strings):
-            return self._subtoken_strings[index]
+        if 0 <= index < NUM_SPECIAL:
+            return SPECIAL_TOKENS[index]
+        subtoken_index = index - NUM_SPECIAL
+        if 0 <= subtoken_index < len(self._subtoken_strings):
+            return self._subtoken_strings[subtoken_index]
         return self.unk_token
 
     def convert_tokens_to_string(self, tokens: List[str]) -> str:
         """Reverse the tokenization: unescape and join."""
-        text = "".join(tokens)
+        # Filter out special tokens before joining
+        filtered = [t for t in tokens if t not in SPECIAL_TOKENS]
+        text = "".join(filtered)
         # Remove trailing underscores (word boundary markers)
         # and unescape: \\u → _, \\\\ → \\, \\<digits>; → chr
         text = re.sub(r"(?<!\\)_", "", text)
@@ -215,6 +247,36 @@ class LatinBertTokenizer(PreTrainedTokenizer):
         text = text.replace("\\u", "_").replace("\\\\", "\\")
         return text
 
+    def build_inputs_with_special_tokens(
+        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
+    ) -> List[int]:
+        cls_id = [self.convert_tokens_to_ids("[CLS]")]
+        sep_id = [self.convert_tokens_to_ids("[SEP]")]
+        if token_ids_1 is None:
+            return cls_id + token_ids_0 + sep_id
+        return cls_id + token_ids_0 + sep_id + token_ids_1 + sep_id
+
+    def get_special_tokens_mask(
+        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None,
+        already_has_special_tokens: bool = False
+    ) -> List[int]:
+        if already_has_special_tokens:
+            return super().get_special_tokens_mask(
+                token_ids_0, token_ids_1, already_has_special_tokens=True
+            )
+        if token_ids_1 is None:
+            return [1] + [0] * len(token_ids_0) + [1]
+        return [1] + [0] * len(token_ids_0) + [1] + [0] * len(token_ids_1) + [1]
+
+    def create_token_type_ids_from_sequences(
+        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
+    ) -> List[int]:
+        sep = [self.convert_tokens_to_ids("[SEP]")]
+        cls_ = [self.convert_tokens_to_ids("[CLS]")]
+        if token_ids_1 is None:
+            return [0] * (len(cls_) + len(token_ids_0) + len(sep))
+        return [0] * (len(cls_) + len(token_ids_0) + len(sep)) + [1] * (len(token_ids_1) + len(sep))
+
     def save_vocabulary(
         self, save_directory: str, filename_prefix: Optional[str] = None
     ) -> Tuple[str]:
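The three helpers added at the end mirror BertTokenizer's behavior. A quick sketch of their combined output for a sentence pair, where a and b are made-up subtoken ID lists (only 2=[CLS] and 3=[SEP] are real IDs here):

    # input_ids:           [CLS] A... [SEP] B... [SEP]
    # special_tokens_mask:   1   0...   1   0...   1
    # token_type_ids:        0   0...   0   1...   1
    a, b = [10, 11], [20]
    input_ids = [2] + a + [3] + b + [3]
    mask      = [1] + [0] * len(a) + [1] + [0] * len(b) + [1]
    type_ids  = [0] * (1 + len(a) + 1) + [1] * (len(b) + 1)
    assert len(input_ids) == len(mask) == len(type_ids) == 6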
src/latincy_latinbert/tokenizer_config.json
CHANGED
@@ -1,10 +1,13 @@
 {
   "tokenizer_class": "LatinBertTokenizer",
   "auto_map": {
-    "AutoTokenizer": "tokenization_latin_bert.LatinBertTokenizer"
+    "AutoTokenizer": ["tokenization_latin_bert.LatinBertTokenizer", null]
   },
-  "pad_token": "<pad>_",
+  "pad_token": "[PAD]",
+  "unk_token": "[UNK]",
+  "cls_token": "[CLS]",
+  "sep_token": "[SEP]",
+  "mask_token": "[MASK]",
   "eos_token": "<EOS>_",
-  "unk_token": "<unk>",
   "model_max_length": 512
 }
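The auto_map entry now uses the two-element [slow, fast] form, with null for the missing fast tokenizer. A sketch of loading through AutoTokenizer, assuming the checkpoint sits in a local directory named latin-bert/:

    from transformers import AutoTokenizer

    # trust_remote_code is needed because the class is resolved from
    # tokenization_latin_bert.py via auto_map, not from transformers itself.
    tok = AutoTokenizer.from_pretrained("latin-bert/", trust_remote_code=True)

    assert tok.pad_token_id == 0 and tok.mask_token_id == 4
    batch = tok(["Gallia est omnis"], padding="max_length", max_length=16)
    # Padding now fills with ID 0, which is what the pretrained BERT
    # embedding matrix expects for [PAD].
    assert batch["input_ids"][0][-1] == 0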
tests/test_tokenizer.py
CHANGED
@@ -18,40 +18,71 @@ def tokenizer():
     return LatinBertTokenizer(vocab_file=VOCAB_FILE)
 
 
+class TestSpecialTokens:
+    def test_special_token_ids(self, tokenizer):
+        """BERT special tokens must occupy IDs 0-4."""
+        assert tokenizer.convert_tokens_to_ids("[PAD]") == 0
+        assert tokenizer.convert_tokens_to_ids("[UNK]") == 1
+        assert tokenizer.convert_tokens_to_ids("[CLS]") == 2
+        assert tokenizer.convert_tokens_to_ids("[SEP]") == 3
+        assert tokenizer.convert_tokens_to_ids("[MASK]") == 4
+
+    def test_special_token_strings(self, tokenizer):
+        assert tokenizer.pad_token == "[PAD]"
+        assert tokenizer.unk_token == "[UNK]"
+        assert tokenizer.cls_token == "[CLS]"
+        assert tokenizer.sep_token == "[SEP]"
+        assert tokenizer.mask_token == "[MASK]"
+
+    def test_vocab_size_includes_specials(self, tokenizer):
+        """vocab_size = 5 special + 32895 subtokens = 32900."""
+        assert tokenizer.vocab_size == 32900
+
+    def test_subtoken_offset(self, tokenizer):
+        """First subtoken '<pad>_' from encoder should be at ID 5, not 0."""
+        assert tokenizer.convert_tokens_to_ids("<pad>_") == 5
+
+    def test_add_special_tokens_encoding(self, tokenizer):
+        """encode with add_special_tokens=True should wrap with [CLS]/[SEP]."""
+        ids = tokenizer.encode("et", add_special_tokens=True)
+        assert ids[0] == 2   # [CLS]
+        assert ids[-1] == 3  # [SEP]
+
+
 class TestVocab:
     def test_vocab_size(self, tokenizer):
-        assert tokenizer.vocab_size == 32895
+        assert tokenizer.vocab_size == 32900
 
     def test_pad_token_id(self, tokenizer):
-        assert tokenizer.pad_token == "<pad>_"
-        assert tokenizer.convert_tokens_to_ids("<pad>_") == 0
+        assert tokenizer.pad_token == "[PAD]"
+        assert tokenizer.convert_tokens_to_ids("[PAD]") == 0
 
     def test_eos_token(self, tokenizer):
         assert tokenizer.eos_token == "<EOS>_"
-        assert tokenizer.convert_tokens_to_ids("<EOS>_") == 1
+        assert tokenizer.convert_tokens_to_ids("<EOS>_") == 6  # was 1, now 1+5
 
 
 class TestEncoding:
-    """Reference IDs from original LatinTokenizer."""
+    """Reference IDs from original LatinTokenizer (with +5 offset)."""
 
     def test_gallia(self, tokenizer):
         ids = tokenizer.encode("Gallia est omnis divisa in partes tres",
                                add_special_tokens=False)
-        expected = [32883, 3508, 32889, 24331, 4, 32883, 7730, 8, 10,
-                    32883, 7730, 8, 338, 32883, 7730, 8, 6768, 32883,
-                    7730, 8, 7, 32883, 7730, 8, 563, 32883, 7730, 8, 559]
+        expected = [32888, 3513, 32894, 24336, 9, 32888, 7735, 13, 15,
+                    32888, 7735, 13, 343, 32888, 7735, 13, 6773, 32888,
+                    7735, 13, 12, 32888, 7735, 13, 568, 32888, 7735, 13, 564]
         assert ids == expected
 
     def test_arma(self, tokenizer):
         ids = tokenizer.encode("arma virumque cano",
                                add_special_tokens=False)
-        expected = [910, 32883, 7730, 8, 18561, 8102, 32883, 7730, 8, 4415]
+        expected = [915, 32888, 7735, 13, 18566, 8107, 32888, 7735, 13, 4420]
         assert ids == expected
 
     def test_uppercase(self, tokenizer):
         ids = tokenizer.encode("ROMA", add_special_tokens=False)
-        expected = [32883, 3500, 32889, 32886, 32883, 2155, 32889,
-                    32883, 2783, 8]
+        expected = [32888, 3505, 32894, 32891, 32888, 2160, 32894,
+                    32888, 2788, 13]
         assert ids == expected
 
     def test_empty(self, tokenizer):
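One behavior the tests shown here don't pin down is decoding. Since convert_tokens_to_string now filters SPECIAL_TOKENS, a companion test along these lines (hypothetical, not part of this commit) would cover the round trip, using the same tokenizer fixture:

    def test_decode_skips_specials(tokenizer):
        ids = tokenizer.encode("arma virumque cano", add_special_tokens=True)
        # [CLS]/[SEP] added by encode must not leak into the decoded text
        text = tokenizer.decode(ids, skip_special_tokens=True)
        assert "[CLS]" not in text and "[SEP]" not in text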