File size: 553 Bytes
a433a25
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
import argparse
from pathlib import Path

from transformers import LlamaTokenizerFast

# Defaults reproduce the locations this script originally hard-coded, so running
# it with no arguments behaves exactly as before. The model path is specific to
# the original author's machine; pass --spm-model to run the script elsewhere.
DEFAULT_SPM_MODEL = "/home/aviinashh/projects/Mini-LLM/Tokenizer/BPE/spm.model"
DEFAULT_OUTPUT_DIR = "Tokenizer/"

# Special tokens added to the Hugging Face tokenizer before it is saved.
SPECIAL_TOKENS = {
    "bos_token": "<s>",
    "eos_token": "</s>",
    "unk_token": "<unk>",
    "pad_token": "<pad>",
    "additional_special_tokens": ["<user>", "<assistant>", "<system>"],
}


def parse_args(argv=None) -> argparse.Namespace:
    """Parse the command-line options of the conversion script.

    Args:
        argv: Argument strings to parse. ``None`` makes argparse read
            ``sys.argv[1:]``.

    Returns:
        A namespace with ``spm_model`` and ``output_dir`` attributes.
    """
    parser = argparse.ArgumentParser(
        description="Convert a raw SentencePiece model into Hugging Face tokenizer files.",
    )
    parser.add_argument(
        "--spm-model",
        default=DEFAULT_SPM_MODEL,
        help="path to the SentencePiece .model file (default: %(default)s)",
    )
    parser.add_argument(
        "--output-dir",
        default=DEFAULT_OUTPUT_DIR,
        help="directory passed to save_pretrained (default: %(default)s)",
    )
    return parser.parse_args(argv)


def convert_tokenizer(spm_model: str = DEFAULT_SPM_MODEL, output_dir: str = DEFAULT_OUTPUT_DIR):
    """Load a SentencePiece model, add the special tokens, and save the result.

    Args:
        spm_model: Path to the raw SentencePiece model file.
        output_dir: Directory handed to ``save_pretrained``.

    Returns:
        The ``LlamaTokenizerFast`` instance that was saved.

    Raises:
        FileNotFoundError: If ``spm_model`` does not point to an existing file.
    """
    model_path = Path(spm_model)
    # Fail early with a message that names the missing file, before the
    # tokenizer constructor is asked to load it.
    if not model_path.is_file():
        raise FileNotFoundError(f"SentencePiece model not found: {model_path}")

    # Load the raw spm model
    tokenizer = LlamaTokenizerFast(vocab_file=str(model_path))

    # Add the special tokens manually to the HF config part
    tokenizer.add_special_tokens(SPECIAL_TOKENS)

    # Save the json version
    tokenizer.save_pretrained(output_dir)
    return tokenizer


def main(argv=None) -> None:
    """Run the conversion using command-line options and report completion."""
    args = parse_args(argv)
    convert_tokenizer(args.spm_model, args.output_dir)
    print("Converted to tokenizer.json successfully!")


if __name__ == "__main__":
    main()