"""Package a raw ``tokenizer.json`` as a HuggingFace tokenizer and sanity-check it.

Steps: load the tokenizers-library file, attach the special-token names,
save the result in HuggingFace format, reload it via ``AutoTokenizer``,
and print a short round-trip report (vocab size, special-token ids, and
a sample encoding).
"""

from transformers import AutoTokenizer, PreTrainedTokenizerFast

# Special-token attributes and the strings they map to.
# NOTE(review): plain attribute assignment only *names* these tokens; it
# does not add them to the vocabulary. Confirm tokenizer.json already
# contains them, otherwise the *_token_id fields printed below are None.
SPECIAL_TOKENS = {
    "pad_token": "[PAD]",
    "unk_token": "[UNK]",
    "cls_token": "[CLS]",
    "sep_token": "[SEP]",
    "mask_token": "[MASK]",
}


def build_tokenizer(tokenizer_file="tokenizer.json"):
    """Load *tokenizer_file* and set the special-token attributes.

    Returns the configured ``PreTrainedTokenizerFast`` instance.
    """
    tokenizer = PreTrainedTokenizerFast(tokenizer_file=tokenizer_file)
    for attr, token in SPECIAL_TOKENS.items():
        setattr(tokenizer, attr, token)
    return tokenizer


def report_tokenizer(tokenizer, sample="Let's test this tokenizer."):
    """Print vocab size, special-token ids, and the encoding of *sample*."""
    print("Loaded tokenizer vocab size:", len(tokenizer))
    print("Special tokens:")
    print("PAD:", tokenizer.pad_token, tokenizer.pad_token_id)
    print("UNK:", tokenizer.unk_token, tokenizer.unk_token_id)
    print("CLS:", tokenizer.cls_token, tokenizer.cls_token_id)
    print("SEP:", tokenizer.sep_token, tokenizer.sep_token_id)
    print("MASK:", tokenizer.mask_token, tokenizer.mask_token_id)
    enc = tokenizer(sample)
    print("Encoded tokens:", enc.tokens())


def main(save_dir="../my-hf-chatbot"):
    """Build the tokenizer, save it, then reload from disk and report.

    The reload through ``AutoTokenizer.from_pretrained`` is a round-trip
    check that the files written by ``save_pretrained`` are usable.
    """
    hf_tokenizer = build_tokenizer()
    # Save in HuggingFace format so AutoTokenizer can load it by path.
    hf_tokenizer.save_pretrained(save_dir)
    loaded_tokenizer = AutoTokenizer.from_pretrained(save_dir)
    report_tokenizer(loaded_tokenizer)


if __name__ == "__main__":
    main()