algorythmtechnologies's picture
Upload folder using huggingface_hub
8174855 verified
raw
history blame contribute delete
429 Bytes
from transformers import GPT2TokenizerFast
from typing import Optional
def load_gpt2_tokenizer(cache_dir: Optional[str] = None) -> GPT2TokenizerFast:
    """Load the pretrained GPT-2 fast (Rust-backed) tokenizer.

    Args:
        cache_dir: Optional directory where downloaded tokenizer files are
            cached; ``None`` uses the transformers default cache location.

    Returns:
        The ``GPT2TokenizerFast`` instance for the base ``"gpt2"`` checkpoint.

    Raises:
        ValueError: If the loaded tokenizer's vocab size is not the expected
            50257, which would indicate a corrupted or mismatched checkpoint.
    """
    tok = GPT2TokenizerFast.from_pretrained("gpt2", cache_dir=cache_dir)
    # GPT-2's base vocab is exactly 50257; no pad token is added on purpose,
    # since that would grow the vocabulary and desync it from the model's
    # embedding matrix. Use an explicit raise (not `assert`) so the check
    # still runs when Python is invoked with -O, which strips assertions.
    if tok.vocab_size != 50257:
        raise ValueError(f"Unexpected GPT-2 vocab size: {tok.vocab_size}")
    return tok