diyclassics Claude Opus 4.6 committed on
Commit
ce59834
·
1 Parent(s): 68d8806

Fix tokenizer ID offset: reserve IDs 0-4 for BERT special tokens

Browse files

The original Latin BERT model expects [PAD]=0, [UNK]=1, [CLS]=2,
[SEP]=3, [MASK]=4 with SubwordTextEncoder IDs shifted by +5. This
commit adds those five special tokens, shifts all subtoken IDs
accordingly, and implements build_inputs_with_special_tokens so
encode(add_special_tokens=True) wraps with [CLS]/[SEP].

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

src/latincy_latinbert/tokenization_latin_bert.py CHANGED
@@ -11,6 +11,9 @@ The tokenization pipeline:
11
  3. Append trailing underscore (word boundary marker)
12
  4. Greedy longest-match against subword vocabulary
13
 
 
 
 
14
  Usage:
15
  from transformers import AutoModel, AutoTokenizer
16
 
@@ -83,6 +86,11 @@ def _escape_token(token: str, alphabet: set) -> str:
83
  return "".join(ret) + "_"
84
 
85
 
 
 
 
 
 
86
  # ── HuggingFace tokenizer ─────────────────────────────────────────────
87
 
88
  # Vocab file name expected by HF save/load
@@ -95,6 +103,10 @@ class LatinBertTokenizer(PreTrainedTokenizer):
95
  Wraps the original tensor2tensor SubwordTextEncoder as a
96
  PreTrainedTokenizer so it works with AutoTokenizer and standard
97
  HF pipelines.
 
 
 
 
98
  """
99
 
100
  vocab_files_names = VOCAB_FILES_NAMES
@@ -103,9 +115,12 @@ class LatinBertTokenizer(PreTrainedTokenizer):
103
  def __init__(
104
  self,
105
  vocab_file: str,
106
- pad_token: str = "<pad>_",
 
 
 
 
107
  eos_token: str = "<EOS>_",
108
- unk_token: str = "<unk>",
109
  **kwargs,
110
  ):
111
  # Load subword vocabulary before super().__init__ so that
@@ -122,8 +137,11 @@ class LatinBertTokenizer(PreTrainedTokenizer):
122
 
123
  super().__init__(
124
  pad_token=pad_token,
125
- eos_token=eos_token,
126
  unk_token=unk_token,
 
 
 
 
127
  **kwargs,
128
  )
129
 
@@ -140,11 +158,18 @@ class LatinBertTokenizer(PreTrainedTokenizer):
140
  ):
141
  s = s[1:-1]
142
  subtoken_strings.append(s)
 
 
143
  self._subtoken_strings = subtoken_strings
144
  self._max_subtoken_len = (
145
  max(len(s) for s in subtoken_strings) if subtoken_strings else 0
146
  )
147
- self._subtoken_to_id = {s: i for i, s in enumerate(subtoken_strings) if s}
 
 
 
 
 
148
  self._alphabet = {c for token in subtoken_strings for c in token}
149
  self._alphabet |= _ESCAPE_CHARS
150
 
@@ -152,10 +177,12 @@ class LatinBertTokenizer(PreTrainedTokenizer):
152
 
153
  @property
154
  def vocab_size(self) -> int:
155
- return len(self._subtoken_strings)
156
 
157
  def get_vocab(self) -> Dict[str, int]:
158
- return dict(self._subtoken_to_id)
 
 
159
 
160
  def _tokenize(self, text: str, **kwargs) -> List[str]:
161
  """Tokenize text into subtoken strings."""
@@ -198,16 +225,21 @@ class LatinBertTokenizer(PreTrainedTokenizer):
198
  return ret
199
 
200
  def _convert_token_to_id(self, token: str) -> int:
201
- return self._subtoken_to_id.get(token, 0)
202
 
203
  def _convert_id_to_token(self, index: int) -> str:
204
- if 0 <= index < len(self._subtoken_strings):
205
- return self._subtoken_strings[index]
 
 
 
206
  return self.unk_token
207
 
208
  def convert_tokens_to_string(self, tokens: List[str]) -> str:
209
  """Reverse the tokenization: unescape and join."""
210
- text = "".join(tokens)
 
 
211
  # Remove trailing underscores (word boundary markers)
212
  # and unescape: \\u → _, \\\\ → \\, \\<digits>; → chr
213
  text = re.sub(r"(?<!\\)_", "", text)
@@ -215,6 +247,36 @@ class LatinBertTokenizer(PreTrainedTokenizer):
215
  text = text.replace("\\u", "_").replace("\\\\", "\\")
216
  return text
217
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
218
  def save_vocabulary(
219
  self, save_directory: str, filename_prefix: Optional[str] = None
220
  ) -> Tuple[str]:
 
11
  3. Append trailing underscore (word boundary marker)
12
  4. Greedy longest-match against subword vocabulary
13
 
14
+ IDs 0-4 are reserved for BERT special tokens ([PAD], [UNK], [CLS],
15
+ [SEP], [MASK]). SubwordTextEncoder subtokens start at ID 5.
16
+
17
  Usage:
18
  from transformers import AutoModel, AutoTokenizer
19
 
 
86
  return "".join(ret) + "_"
87
 
88
 
89
+ # ── BERT special tokens ───────────────────────────────────────────────
90
+
91
+ SPECIAL_TOKENS = ["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"]
92
+ NUM_SPECIAL = 5 # IDs 0-4 reserved for BERT special tokens
93
+
94
  # ── HuggingFace tokenizer ─────────────────────────────────────────────
95
 
96
  # Vocab file name expected by HF save/load
 
103
  Wraps the original tensor2tensor SubwordTextEncoder as a
104
  PreTrainedTokenizer so it works with AutoTokenizer and standard
105
  HF pipelines.
106
+
107
+ IDs 0-4 are reserved for BERT special tokens:
108
+ 0=[PAD], 1=[UNK], 2=[CLS], 3=[SEP], 4=[MASK]
109
+ SubwordTextEncoder subtokens are shifted to start at ID 5.
110
  """
111
 
112
  vocab_files_names = VOCAB_FILES_NAMES
 
115
  def __init__(
116
  self,
117
  vocab_file: str,
118
+ pad_token: str = "[PAD]",
119
+ unk_token: str = "[UNK]",
120
+ cls_token: str = "[CLS]",
121
+ sep_token: str = "[SEP]",
122
+ mask_token: str = "[MASK]",
123
  eos_token: str = "<EOS>_",
 
124
  **kwargs,
125
  ):
126
  # Load subword vocabulary before super().__init__ so that
 
137
 
138
  super().__init__(
139
  pad_token=pad_token,
 
140
  unk_token=unk_token,
141
+ cls_token=cls_token,
142
+ sep_token=sep_token,
143
+ mask_token=mask_token,
144
+ eos_token=eos_token,
145
  **kwargs,
146
  )
147
 
 
158
  ):
159
  s = s[1:-1]
160
  subtoken_strings.append(s)
161
+ # IDs 0-4 are reserved for BERT special tokens [PAD],[UNK],[CLS],[SEP],[MASK]
162
+ # SubwordTextEncoder subtokens are shifted to IDs 5+
163
  self._subtoken_strings = subtoken_strings
164
  self._max_subtoken_len = (
165
  max(len(s) for s in subtoken_strings) if subtoken_strings else 0
166
  )
167
+ self._subtoken_to_id = {
168
+ s: i + NUM_SPECIAL for i, s in enumerate(subtoken_strings) if s
169
+ }
170
+ # Also map special tokens to their IDs
171
+ for i, tok in enumerate(SPECIAL_TOKENS):
172
+ self._subtoken_to_id[tok] = i
173
  self._alphabet = {c for token in subtoken_strings for c in token}
174
  self._alphabet |= _ESCAPE_CHARS
175
 
 
177
 
178
  @property
179
  def vocab_size(self) -> int:
180
+ return len(self._subtoken_strings) + NUM_SPECIAL
181
 
182
  def get_vocab(self) -> Dict[str, int]:
183
+ vocab = {tok: i for i, tok in enumerate(SPECIAL_TOKENS)}
184
+ vocab.update(self._subtoken_to_id)
185
+ return vocab
186
 
187
  def _tokenize(self, text: str, **kwargs) -> List[str]:
188
  """Tokenize text into subtoken strings."""
 
225
  return ret
226
 
227
  def _convert_token_to_id(self, token: str) -> int:
228
+ return self._subtoken_to_id.get(token, 1) # 1 = [UNK]
229
 
230
  def _convert_id_to_token(self, index: int) -> str:
231
+ if 0 <= index < NUM_SPECIAL:
232
+ return SPECIAL_TOKENS[index]
233
+ subtoken_index = index - NUM_SPECIAL
234
+ if 0 <= subtoken_index < len(self._subtoken_strings):
235
+ return self._subtoken_strings[subtoken_index]
236
  return self.unk_token
237
 
238
  def convert_tokens_to_string(self, tokens: List[str]) -> str:
239
  """Reverse the tokenization: unescape and join."""
240
+ # Filter out special tokens before joining
241
+ filtered = [t for t in tokens if t not in SPECIAL_TOKENS]
242
+ text = "".join(filtered)
243
  # Remove trailing underscores (word boundary markers)
244
  # and unescape: \\u → _, \\\\ → \\, \\<digits>; → chr
245
  text = re.sub(r"(?<!\\)_", "", text)
 
247
  text = text.replace("\\u", "_").replace("\\\\", "\\")
248
  return text
249
 
250
+ def build_inputs_with_special_tokens(
251
+ self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
252
+ ) -> List[int]:
253
+ cls_id = [self.convert_tokens_to_ids("[CLS]")]
254
+ sep_id = [self.convert_tokens_to_ids("[SEP]")]
255
+ if token_ids_1 is None:
256
+ return cls_id + token_ids_0 + sep_id
257
+ return cls_id + token_ids_0 + sep_id + token_ids_1 + sep_id
258
+
259
+ def get_special_tokens_mask(
260
+ self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None,
261
+ already_has_special_tokens: bool = False
262
+ ) -> List[int]:
263
+ if already_has_special_tokens:
264
+ return super().get_special_tokens_mask(
265
+ token_ids_0, token_ids_1, already_has_special_tokens=True
266
+ )
267
+ if token_ids_1 is None:
268
+ return [1] + [0] * len(token_ids_0) + [1]
269
+ return [1] + [0] * len(token_ids_0) + [1] + [0] * len(token_ids_1) + [1]
270
+
271
+ def create_token_type_ids_from_sequences(
272
+ self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
273
+ ) -> List[int]:
274
+ sep = [self.convert_tokens_to_ids("[SEP]")]
275
+ cls_ = [self.convert_tokens_to_ids("[CLS]")]
276
+ if token_ids_1 is None:
277
+ return [0] * (len(cls_) + len(token_ids_0) + len(sep))
278
+ return [0] * (len(cls_) + len(token_ids_0) + len(sep)) + [1] * (len(token_ids_1) + len(sep))
279
+
280
  def save_vocabulary(
281
  self, save_directory: str, filename_prefix: Optional[str] = None
282
  ) -> Tuple[str]:
src/latincy_latinbert/tokenizer_config.json CHANGED
@@ -1,10 +1,13 @@
1
  {
2
  "tokenizer_class": "LatinBertTokenizer",
3
  "auto_map": {
4
- "AutoTokenizer": "tokenization_latin_bert.LatinBertTokenizer"
5
  },
6
- "pad_token": "<pad>_",
 
 
 
 
7
  "eos_token": "<EOS>_",
8
- "unk_token": "<unk>",
9
  "model_max_length": 512
10
  }
 
1
  {
2
  "tokenizer_class": "LatinBertTokenizer",
3
  "auto_map": {
4
+ "AutoTokenizer": ["tokenization_latin_bert.LatinBertTokenizer", null]
5
  },
6
+ "pad_token": "[PAD]",
7
+ "unk_token": "[UNK]",
8
+ "cls_token": "[CLS]",
9
+ "sep_token": "[SEP]",
10
+ "mask_token": "[MASK]",
11
  "eos_token": "<EOS>_",
 
12
  "model_max_length": 512
13
  }
tests/test_tokenizer.py CHANGED
@@ -18,40 +18,71 @@ def tokenizer():
18
  return LatinBertTokenizer(vocab_file=VOCAB_FILE)
19
 
20
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
21
  class TestVocab:
22
  def test_vocab_size(self, tokenizer):
23
- assert tokenizer.vocab_size == 32895
24
 
25
  def test_pad_token_id(self, tokenizer):
26
- assert tokenizer.pad_token == "<pad>_"
27
- assert tokenizer.convert_tokens_to_ids("<pad>_") == 0
28
 
29
  def test_eos_token(self, tokenizer):
30
  assert tokenizer.eos_token == "<EOS>_"
31
- assert tokenizer.convert_tokens_to_ids("<EOS>_") == 1
32
 
33
 
34
  class TestEncoding:
35
- """Reference IDs from original standalone_text_encoder.py on cluster."""
36
 
37
  def test_gallia(self, tokenizer):
38
  ids = tokenizer.encode("Gallia est omnis divisa in partes tres",
39
  add_special_tokens=False)
40
- expected = [32883, 3508, 32889, 24331, 4, 32883, 7730, 8, 10,
41
- 32883, 7730, 8, 338, 32883, 7730, 8, 6768, 32883,
42
- 7730, 8, 7, 32883, 7730, 8, 563, 32883, 7730, 8, 559]
43
  assert ids == expected
44
 
45
  def test_arma(self, tokenizer):
46
  ids = tokenizer.encode("arma virumque cano",
47
  add_special_tokens=False)
48
- expected = [910, 32883, 7730, 8, 18561, 8102, 32883, 7730, 8, 4415]
49
  assert ids == expected
50
 
51
  def test_uppercase(self, tokenizer):
52
  ids = tokenizer.encode("ROMA", add_special_tokens=False)
53
- expected = [32883, 3500, 32889, 32886, 32883, 2155, 32889,
54
- 32883, 2783, 8]
55
  assert ids == expected
56
 
57
  def test_empty(self, tokenizer):
 
18
  return LatinBertTokenizer(vocab_file=VOCAB_FILE)
19
 
20
 
21
+ class TestSpecialTokens:
22
+ def test_special_token_ids(self, tokenizer):
23
+ """BERT special tokens must occupy IDs 0-4."""
24
+ assert tokenizer.convert_tokens_to_ids("[PAD]") == 0
25
+ assert tokenizer.convert_tokens_to_ids("[UNK]") == 1
26
+ assert tokenizer.convert_tokens_to_ids("[CLS]") == 2
27
+ assert tokenizer.convert_tokens_to_ids("[SEP]") == 3
28
+ assert tokenizer.convert_tokens_to_ids("[MASK]") == 4
29
+
30
+ def test_special_token_strings(self, tokenizer):
31
+ assert tokenizer.pad_token == "[PAD]"
32
+ assert tokenizer.unk_token == "[UNK]"
33
+ assert tokenizer.cls_token == "[CLS]"
34
+ assert tokenizer.sep_token == "[SEP]"
35
+ assert tokenizer.mask_token == "[MASK]"
36
+
37
+ def test_vocab_size_includes_specials(self, tokenizer):
38
+ """vocab_size = 5 special + 32895 subtokens = 32900."""
39
+ assert tokenizer.vocab_size == 32900
40
+
41
+ def test_subtoken_offset(self, tokenizer):
42
+ """First subtoken '<pad>_' from encoder should be at ID 5, not 0."""
43
+ assert tokenizer.convert_tokens_to_ids("<pad>_") == 5
44
+
45
+ def test_add_special_tokens_encoding(self, tokenizer):
46
+ """encode with add_special_tokens=True should wrap with [CLS]/[SEP]."""
47
+ ids = tokenizer.encode("et", add_special_tokens=True)
48
+ assert ids[0] == 2 # [CLS]
49
+ assert ids[-1] == 3 # [SEP]
50
+
51
+
52
  class TestVocab:
53
  def test_vocab_size(self, tokenizer):
54
+ assert tokenizer.vocab_size == 32900
55
 
56
  def test_pad_token_id(self, tokenizer):
57
+ assert tokenizer.pad_token == "[PAD]"
58
+ assert tokenizer.convert_tokens_to_ids("[PAD]") == 0
59
 
60
  def test_eos_token(self, tokenizer):
61
  assert tokenizer.eos_token == "<EOS>_"
62
+ assert tokenizer.convert_tokens_to_ids("<EOS>_") == 6 # was 1, now 1+5
63
 
64
 
65
  class TestEncoding:
66
+ """Reference IDs from original LatinTokenizer (with +5 offset)."""
67
 
68
  def test_gallia(self, tokenizer):
69
  ids = tokenizer.encode("Gallia est omnis divisa in partes tres",
70
  add_special_tokens=False)
71
+ expected = [32888, 3513, 32894, 24336, 9, 32888, 7735, 13, 15,
72
+ 32888, 7735, 13, 343, 32888, 7735, 13, 6773, 32888,
73
+ 7735, 13, 12, 32888, 7735, 13, 568, 32888, 7735, 13, 564]
74
  assert ids == expected
75
 
76
  def test_arma(self, tokenizer):
77
  ids = tokenizer.encode("arma virumque cano",
78
  add_special_tokens=False)
79
+ expected = [915, 32888, 7735, 13, 18566, 8107, 32888, 7735, 13, 4420]
80
  assert ids == expected
81
 
82
  def test_uppercase(self, tokenizer):
83
  ids = tokenizer.encode("ROMA", add_special_tokens=False)
84
+ expected = [32888, 3505, 32894, 32891, 32888, 2160, 32894,
85
+ 32888, 2788, 13]
86
  assert ids == expected
87
 
88
  def test_empty(self, tokenizer):