"""Save a raw ``tokenizer.json`` in Hugging Face format and verify the result.

Source: Aegis-1B-Agent/tokenization/save_hf_tokenizer.py
(uploaded via huggingface_hub, commit 238d0e8).
"""
from transformers import PreTrainedTokenizerFast, AutoTokenizer

# BERT-style special tokens this tokenizer is expected to carry.
SPECIAL_TOKENS = {
    "pad_token": "[PAD]",
    "unk_token": "[UNK]",
    "cls_token": "[CLS]",
    "sep_token": "[SEP]",
    "mask_token": "[MASK]",
}


def save_tokenizer(tokenizer_file: str = "tokenizer.json",
                   output_dir: str = "../my-hf-chatbot") -> None:
    """Wrap *tokenizer_file* in a ``PreTrainedTokenizerFast`` and save it.

    Args:
        tokenizer_file: Path to the raw ``tokenizers``-library JSON file.
        output_dir: Directory to write the Hugging Face-format tokenizer to.
    """
    hf_tokenizer = PreTrainedTokenizerFast(tokenizer_file=tokenizer_file)
    # add_special_tokens both registers the roles (pad/unk/...) and appends
    # any token missing from the vocab — plain attribute assignment only does
    # the former, so an absent token would leave its *_token_id unresolved.
    hf_tokenizer.add_special_tokens(SPECIAL_TOKENS)
    hf_tokenizer.save_pretrained(output_dir)


def verify_tokenizer(output_dir: str = "../my-hf-chatbot") -> None:
    """Reload the saved tokenizer and print a quick sanity report.

    Args:
        output_dir: Directory previously written by :func:`save_tokenizer`.
    """
    loaded_tokenizer = AutoTokenizer.from_pretrained(output_dir)
    print("Loaded tokenizer vocab size:", len(loaded_tokenizer))
    print("Special tokens:")
    for name in ("pad", "unk", "cls", "sep", "mask"):
        token = getattr(loaded_tokenizer, f"{name}_token")
        token_id = getattr(loaded_tokenizer, f"{name}_token_id")
        print(f"{name.upper()}:", token, token_id)
    # Round-trip a sample sentence to confirm encoding works end to end.
    sample = "Let's test this tokenizer."
    enc = loaded_tokenizer(sample)
    print("Encoded tokens:", enc.tokens())


if __name__ == "__main__":
    save_tokenizer()
    verify_tokenizer()