# latin-bert / tokenization_latin_bert.py
# (HF Hub page residue preserved as a comment so the module stays importable:
#  author diyclassics, commit f25becd,
#  "fix: apply do_lower_case to root-level HF-facing files")
"""HuggingFace PreTrainedTokenizer for Bamman & Burns (2020) Latin BERT.
Faithfully reproduces the original tensor2tensor SubwordTextEncoder
tokenization pipeline without requiring TensorFlow. Verified against
the original POS tagging case study (Perseus 95.2%, ITTB 99.2%,
PROIEL 98.2% β€” matching or exceeding paper targets).
The tokenization pipeline:
1. Character-class splitting (alphanumeric vs non-alphanumeric boundaries)
2. Escape special characters (backslash, underscore, out-of-alphabet)
3. Append trailing underscore (word boundary marker)
4. Greedy longest-match against subword vocabulary
IDs 0-4 are reserved for BERT special tokens ([PAD], [UNK], [CLS],
[SEP], [MASK]). SubwordTextEncoder subtokens start at ID 5.
Usage:
from transformers import AutoModel, AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained(
"latincy/latin-bert", trust_remote_code=True
)
model = AutoModel.from_pretrained("latincy/latin-bert")
inputs = tokenizer("Gallia est omnis divisa in partes tres",
return_tensors="pt")
outputs = model(**inputs)
"""
import os
import re
import unicodedata
from typing import Dict, List, Optional, Tuple
from transformers import PreTrainedTokenizer
# ── Character-class tokenizer ──────────────────────────────────────────
# Reproduces tensor2tensor.data_generators.tokenizer.encode()
_ALPHANUMERIC_CHAR_SET = set()
for _i in range(0x110000):
_c = chr(_i)
_cat = unicodedata.category(_c)
if _cat.startswith("L") or _cat.startswith("N"):
_ALPHANUMERIC_CHAR_SET.add(_c)
_ESCAPE_CHARS = set("\\_u;0123456789")
def _tokenizer_encode(text: str) -> List[str]:
    """Split *text* into maximal runs of same-class characters.

    A run is either entirely alphanumeric or entirely non-alphanumeric
    (per _ALPHANUMERIC_CHAR_SET); runs are returned in order and their
    concatenation reproduces *text* exactly.
    """
    if not text:
        return []
    runs: List[str] = []
    run_start = 0
    prev_alnum = text[0] in _ALPHANUMERIC_CHAR_SET
    for pos in range(1, len(text)):
        cur_alnum = text[pos] in _ALPHANUMERIC_CHAR_SET
        if cur_alnum != prev_alnum:
            # Character class flipped: close the current run here.
            runs.append(text[run_start:pos])
            run_start = pos
            prev_alnum = cur_alnum
    runs.append(text[run_start:])
    return runs
def _escape_token(token: str, alphabet: set) -> str:
"""Escape a token before subword encoding.
Reproduces tensor2tensor _escape_token():
- \\ β†’ \\\\
- _ β†’ \\u
- out-of-alphabet chars β†’ \\<ordinal>;
- append trailing _ (word boundary marker)
"""
token = token.replace("\\", "\\\\").replace("_", "\\u")
ret = []
for c in token:
if c in alphabet and c != "\n":
ret.append(c)
else:
ret.append("\\%d;" % ord(c))
return "".join(ret) + "_"
# ── BERT special tokens ───────────────────────────────────────────────
# Order matters: list position == token ID (0=[PAD] ... 4=[MASK]).
SPECIAL_TOKENS = ["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"]
NUM_SPECIAL = 5 # IDs 0-4 reserved for BERT special tokens
# ── HuggingFace tokenizer ─────────────────────────────────────────────
# Vocab file name expected by HF save/load machinery
# (consumed by from_pretrained; written back by save_vocabulary).
VOCAB_FILES_NAMES = {"vocab_file": "latin.subword.encoder"}
class LatinBertTokenizer(PreTrainedTokenizer):
    """HuggingFace tokenizer for Bamman & Burns (2020) Latin BERT.

    Wraps the original tensor2tensor SubwordTextEncoder as a
    PreTrainedTokenizer so it works with AutoTokenizer and standard
    HF pipelines.

    IDs 0-4 are reserved for BERT special tokens:
        0=[PAD], 1=[UNK], 2=[CLS], 3=[SEP], 4=[MASK]
    SubwordTextEncoder subtokens are shifted to start at ID 5.
    """

    vocab_files_names = VOCAB_FILES_NAMES
    model_input_names = ["input_ids", "attention_mask"]

    def __init__(
        self,
        vocab_file: str,
        do_lower_case: bool = True,
        pad_token: str = "[PAD]",
        unk_token: str = "[UNK]",
        cls_token: str = "[CLS]",
        sep_token: str = "[SEP]",
        mask_token: str = "[MASK]",
        eos_token: str = "<EOS>_",
        **kwargs,
    ):
        """Load the subword vocabulary and initialize the HF base class.

        Args:
            vocab_file: path to the ``latin.subword.encoder`` vocab file.
            do_lower_case: lowercase input text before tokenizing.
            pad_token / unk_token / cls_token / sep_token / mask_token:
                BERT special-token strings (fixed at IDs 0-4).
            eos_token: the SubwordTextEncoder end-of-sequence marker
                (stored in escaped form, with trailing ``_``).
        """
        # Load subword vocabulary before super().__init__ so that
        # special token resolution can use the vocab.
        self._subtoken_strings: List[str] = []
        self._subtoken_to_id: Dict[str, int] = {}
        self._max_subtoken_len: int = 0
        self._alphabet: set = set()
        # Direct-mapped cache for token -> subtokens; collisions simply
        # overwrite the slot (correctness is guarded by the key check).
        self._cache_size: int = 2**20
        self._cache: List[Tuple] = [(None, None)] * self._cache_size
        self._load_vocab(vocab_file)
        self.vocab_file = vocab_file
        self.do_lower_case = do_lower_case
        super().__init__(
            pad_token=pad_token,
            unk_token=unk_token,
            cls_token=cls_token,
            sep_token=sep_token,
            mask_token=mask_token,
            eos_token=eos_token,
            **kwargs,
        )

    # ── Vocab loading ──────────────────────────────────────────────
    def _load_vocab(self, filename: str) -> None:
        """Read the SubwordTextEncoder vocab file and build lookup tables.

        Each line holds one subtoken, usually wrapped in single or double
        quotes (the tensor2tensor on-disk format); quotes are stripped.
        """
        subtoken_strings = []
        with open(filename, encoding="utf-8") as f:
            for line in f:
                s = line.rstrip()
                if (
                    (s.startswith("'") and s.endswith("'"))
                    or (s.startswith('"') and s.endswith('"'))
                ):
                    s = s[1:-1]
                subtoken_strings.append(s)
        # IDs 0-4 are reserved for BERT special tokens [PAD],[UNK],[CLS],[SEP],[MASK]
        # SubwordTextEncoder subtokens are shifted to IDs 5+
        self._subtoken_strings = subtoken_strings
        self._max_subtoken_len = (
            max(len(s) for s in subtoken_strings) if subtoken_strings else 0
        )
        self._subtoken_to_id = {
            s: i + NUM_SPECIAL for i, s in enumerate(subtoken_strings) if s
        }
        # Also map special tokens to their reserved IDs 0-4.
        for i, tok in enumerate(SPECIAL_TOKENS):
            self._subtoken_to_id[tok] = i
        # Alphabet = every character seen in the vocab, plus the escape
        # machinery characters, so _escape_token round-trips cleanly.
        self._alphabet = {c for token in subtoken_strings for c in token}
        self._alphabet |= _ESCAPE_CHARS

    # ── PreTrainedTokenizer interface ──────────────────────────────
    @property
    def vocab_size(self) -> int:
        """Total vocabulary size including the 5 reserved special tokens."""
        return len(self._subtoken_strings) + NUM_SPECIAL

    def get_vocab(self) -> Dict[str, int]:
        """Return the full token -> id mapping (specials + subtokens)."""
        vocab = {tok: i for i, tok in enumerate(SPECIAL_TOKENS)}
        vocab.update(self._subtoken_to_id)
        return vocab

    def _tokenize(self, text: str, **kwargs) -> List[str]:
        """Tokenize text into subtoken strings."""
        if self.do_lower_case:
            text = text.lower()
        tokens = _tokenizer_encode(text)
        subtokens: List[str] = []
        for token in tokens:
            subtokens.extend(self._token_to_subtokens(token))
        return subtokens

    def _token_to_subtokens(self, token: str) -> List[str]:
        """Convert a single word-level token to subtoken strings.

        Results are memoized in a fixed-size direct-mapped cache keyed
        by hash(token) modulo the cache size.
        """
        cache_location = hash(token) % self._cache_size
        cache_key, cache_value = self._cache[cache_location]
        if cache_key == token:
            return cache_value
        escaped = _escape_token(token, self._alphabet)
        subtokens = self._greedy_match(escaped)
        self._cache[cache_location] = (token, subtokens)
        return subtokens

    def _greedy_match(self, escaped_token: str) -> List[str]:
        """Greedy longest-match of an escaped token against the vocab.

        At each position the longest vocab subtoken is taken; if no
        subtoken matches, the character is skipped and [UNK] is emitted.
        """
        ret: List[str] = []
        start = 0
        token_len = len(escaped_token)
        while start < token_len:
            found = False
            for end in range(
                min(token_len, start + self._max_subtoken_len), start, -1
            ):
                subtoken = escaped_token[start:end]
                if subtoken in self._subtoken_to_id:
                    ret.append(subtoken)
                    start = end
                    found = True
                    break
            if not found:
                ret.append(self.unk_token)
                start += 1
        return ret

    def _convert_token_to_id(self, token: str) -> int:
        return self._subtoken_to_id.get(token, 1)  # 1 = [UNK]

    def _convert_id_to_token(self, index: int) -> str:
        if 0 <= index < NUM_SPECIAL:
            return SPECIAL_TOKENS[index]
        subtoken_index = index - NUM_SPECIAL
        if 0 <= subtoken_index < len(self._subtoken_strings):
            return self._subtoken_strings[subtoken_index]
        return self.unk_token

    def convert_tokens_to_string(self, tokens: List[str]) -> str:
        """Reverse the tokenization: drop specials, unescape, join.

        Unescaping is done in a single left-to-right regex pass
        (mirroring tensor2tensor's _unescape_token) so escape sequences
        cannot interact. The previous multi-pass implementation removed
        ``_`` with a ``(?<!\\\\)_`` lookbehind *before* collapsing
        ``\\\\\\\\`` to ``\\\\``, so a token ending in a literal
        backslash (escaped text ``...\\\\\\\\_``) kept a spurious
        trailing underscore in the decoded output.
        """
        # Filter out special tokens before joining.
        filtered = [t for t in tokens if t not in SPECIAL_TOKENS]
        text = "".join(filtered)

        def _unescape(m) -> str:
            s = m.group(0)
            if s == "\\u":
                return "_"
            if s == "\\\\":
                return "\\"
            if s == "_":
                # Unescaped underscore = word-boundary marker; drop it.
                return ""
            # \<digits>; numeric escape for an out-of-alphabet character.
            return chr(int(m.group(1)))

        return re.sub(r"\\u|\\\\|\\(\d+);|_", _unescape, text)

    def build_inputs_with_special_tokens(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """[CLS] A [SEP] or [CLS] A [SEP] B [SEP] (standard BERT format)."""
        cls_id = [self.convert_tokens_to_ids("[CLS]")]
        sep_id = [self.convert_tokens_to_ids("[SEP]")]
        if token_ids_1 is None:
            return cls_id + token_ids_0 + sep_id
        return cls_id + token_ids_0 + sep_id + token_ids_1 + sep_id

    def get_special_tokens_mask(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None,
        already_has_special_tokens: bool = False
    ) -> List[int]:
        """1 for special-token positions, 0 for sequence tokens."""
        if already_has_special_tokens:
            return super().get_special_tokens_mask(
                token_ids_0, token_ids_1, already_has_special_tokens=True
            )
        if token_ids_1 is None:
            return [1] + [0] * len(token_ids_0) + [1]
        return [1] + [0] * len(token_ids_0) + [1] + [0] * len(token_ids_1) + [1]

    def create_token_type_ids_from_sequences(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """Segment IDs: 0 for [CLS] A [SEP], 1 for B [SEP] (if present)."""
        sep = [self.convert_tokens_to_ids("[SEP]")]
        cls_ = [self.convert_tokens_to_ids("[CLS]")]
        if token_ids_1 is None:
            return [0] * (len(cls_) + len(token_ids_0) + len(sep))
        return [0] * (len(cls_) + len(token_ids_0) + len(sep)) + [1] * (len(token_ids_1) + len(sep))

    def save_vocabulary(
        self, save_directory: str, filename_prefix: Optional[str] = None
    ) -> Tuple[str]:
        """Write the subword vocab back in the quoted one-per-line format.

        Returns:
            A 1-tuple with the path of the written vocab file.
        """
        if not os.path.isdir(save_directory):
            os.makedirs(save_directory, exist_ok=True)
        prefix = filename_prefix + "-" if filename_prefix else ""
        vocab_path = os.path.join(
            save_directory, prefix + VOCAB_FILES_NAMES["vocab_file"]
        )
        with open(vocab_path, "w", encoding="utf-8") as f:
            for subtoken in self._subtoken_strings:
                f.write(f"'{subtoken}'\n")
        return (vocab_path,)