from transformers import PreTrainedTokenizerFast, AutoTokenizer

# Wrap the tokenizer.json file produced during training in a fast Hugging
# Face tokenizer so it can be used like any transformers tokenizer.
hf_tokenizer = PreTrainedTokenizerFast(tokenizer_file="tokenizer.json")

# Register the special tokens explicitly; PreTrainedTokenizerFast does not
# infer them from tokenizer.json. These strings should already be part of
# the trained vocabulary.
hf_tokenizer.pad_token = "[PAD]"
hf_tokenizer.unk_token = "[UNK]"
hf_tokenizer.cls_token = "[CLS]"
hf_tokenizer.sep_token = "[SEP]"
hf_tokenizer.mask_token = "[MASK]"
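
# Equivalent alternative (a sketch, not from the original): the same mapping
# can be registered in one call, which also adds any token that is missing
# from the vocabulary instead of silently mapping it to [UNK].
# hf_tokenizer.add_special_tokens(
#     {"pad_token": "[PAD]", "unk_token": "[UNK]", "cls_token": "[CLS]",
#      "sep_token": "[SEP]", "mask_token": "[MASK]"}
# )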

# Save in the standard Hugging Face layout (tokenizer.json,
# tokenizer_config.json, special_tokens_map.json).
hf_tokenizer.save_pretrained("../my-hf-chatbot")
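
# Optional sanity check (illustrative, not part of the original flow):
# list what save_pretrained actually wrote to disk.
import os
print(sorted(os.listdir("../my-hf-chatbot")))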

# Reload through AutoTokenizer to verify the saved directory is
# self-contained.
loaded_tokenizer = AutoTokenizer.from_pretrained("../my-hf-chatbot")

# Inspect the reloaded tokenizer: vocabulary size and special-token ids.
print("Loaded tokenizer vocab size:", len(loaded_tokenizer))
print("Special tokens:")
print("PAD:", loaded_tokenizer.pad_token, loaded_tokenizer.pad_token_id)
print("UNK:", loaded_tokenizer.unk_token, loaded_tokenizer.unk_token_id)
print("CLS:", loaded_tokenizer.cls_token, loaded_tokenizer.cls_token_id)
print("SEP:", loaded_tokenizer.sep_token, loaded_tokenizer.sep_token_id)
print("MASK:", loaded_tokenizer.mask_token, loaded_tokenizer.mask_token_id)

# Encode a sample sentence to check the full round trip.
sample = "Let's test this tokenizer."
enc = loaded_tokenizer(sample)
print("Encoded tokens:", enc.tokens()) |