"""GPT-2 tokenizer wrapper. We use the HuggingFace fast tokenizer because its byte-level BPE is fully deterministic given a fixed `transformers` version and produces the same output across CPU/GPU and Python releases. """ from __future__ import annotations from typing import Iterable, List from transformers import AutoTokenizer, PreTrainedTokenizerBase def load_tokenizer(name: str = "gpt2") -> PreTrainedTokenizerBase: """Load the GPT-2 tokenizer with `pad_token = eos_token`. GPT-2 has no native pad token. Setting it to EOS is the standard idiom used by HuggingFace causal-LM training code; padded positions are masked out by the attention mask, so the choice is harmless. """ tok = AutoTokenizer.from_pretrained(name, use_fast=True) if tok.pad_token is None: tok.pad_token = tok.eos_token return tok def encode( tokenizer: PreTrainedTokenizerBase, text: str, add_eos: bool = True, ) -> List[int]: """Encode a single document. Returns a list of token IDs.""" ids = tokenizer.encode(text, add_special_tokens=False) if add_eos: ids.append(tokenizer.eos_token_id) return ids def encode_stream( tokenizer: PreTrainedTokenizerBase, docs: Iterable[str], add_eos: bool = True, ) -> Iterable[List[int]]: """Encode an iterable of documents lazily, preserving order.""" for doc in docs: yield encode(tokenizer, doc, add_eos=add_eos)