from typing import Optional

from transformers import GPT2TokenizerFast


def load_gpt2_tokenizer(cache_dir: Optional[str] = None) -> GPT2TokenizerFast:
    """Load the pretrained GPT-2 tokenizer, optionally caching files under cache_dir."""
    tok = GPT2TokenizerFast.from_pretrained("gpt2", cache_dir=cache_dir)
    # GPT-2's vocab size should be 50257; we deliberately do not add a pad
    # token, since doing so would change the model's embedding size.
    assert tok.vocab_size == 50257, f"Unexpected GPT-2 vocab size: {tok.vocab_size}"
    return tok
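

# A minimal usage sketch (an assumption for illustration, not part of the
# original: it requires the `transformers` package and network access, or a
# prior download, so that the "gpt2" files can be resolved):
if __name__ == "__main__":
    tokenizer = load_gpt2_tokenizer()
    ids = tokenizer.encode("Hello, world!")
    print(ids)                    # list of token ids
    print(tokenizer.decode(ids))  # round-trips back to the original text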