File size: 429 Bytes
8174855 |
1 2 3 4 5 6 7 8 9 10 |
from transformers import GPT2TokenizerFast
from typing import Optional
def load_gpt2_tokenizer(cache_dir: Optional[str] = None) -> GPT2TokenizerFast:
    """Load the pretrained "gpt2" fast tokenizer and validate its vocab size.

    Args:
        cache_dir: Optional directory where downloaded tokenizer files are
            cached. ``None`` uses the transformers default cache location.

    Returns:
        The loaded ``GPT2TokenizerFast`` instance.

    Raises:
        ValueError: If the loaded tokenizer's vocab size is not the expected
            50257, which would indicate a corrupted or wrong checkpoint.
    """
    tok = GPT2TokenizerFast.from_pretrained("gpt2", cache_dir=cache_dir)
    # Deliberately do NOT add a pad token: that would grow the vocabulary and
    # desynchronize it from any model embedding matrix sized for 50257.
    # Use an explicit raise rather than `assert` — asserts are stripped when
    # Python runs with -O, silently disabling this sanity check.
    if tok.vocab_size != 50257:
        raise ValueError(f"Unexpected GPT-2 vocab size: {tok.vocab_size}")
    return tok
|