"""Convert the dolphin-2.6-mistral-7b tokenizer to use the ChatML end token.

The original approach — mutating ``tokenizer.get_vocab()`` and passing the
edited dict back via ``from_pretrained(..., vocab=...)`` — does not work:
``get_vocab()`` returns a *copy*, and ``from_pretrained`` does not rebuild a
pretrained fast tokenizer from a ``vocab`` kwarg (it is forwarded to the init
and ignored).  The supported way to make ``<|im_end|>`` the EOS/PAD token is
``add_special_tokens``, which registers the token if it is missing and marks
it special, after which ``eos_token``/``pad_token`` can point at it.

NOTE(review): this does NOT reuse token id 2 (the old ``</s>`` slot); if the
model weights were trained with ``<|im_end|>`` at id 2, the embedding matrix
must be resized/remapped separately — confirm against the training setup.
"""

from transformers import AutoTokenizer

MODEL_DIR = "/workspace/dolphin-2.6-mistral-7b-hf"
OUTPUT_DIR = "/workspace/dolphin-new-tokenizer/"
CHATML_EOS = "<|im_end|>"


def build_chatml_tokenizer(model_dir=MODEL_DIR, output_dir=OUTPUT_DIR,
                           eos_token=CHATML_EOS):
    """Load a tokenizer, make *eos_token* its EOS and PAD token, and save it.

    Parameters
    ----------
    model_dir : str
        Directory of the source Hugging Face tokenizer.
    output_dir : str
        Directory the modified tokenizer is written to.
    eos_token : str
        Token string to use as both EOS and PAD (default: ChatML's
        ``<|im_end|>``).

    Returns
    -------
    The modified tokenizer (also saved to *output_dir* as a side effect).
    """
    tokenizer = AutoTokenizer.from_pretrained(model_dir)

    # Register the token as a special token if the checkpoint does not
    # already contain it; without this, assigning eos_token would leave it
    # splitting into multiple sub-tokens.
    if eos_token not in tokenizer.get_vocab():
        tokenizer.add_special_tokens({"additional_special_tokens": [eos_token]})

    tokenizer.eos_token = eos_token
    # Reuse EOS as PAD — common for causal LMs that ship without a pad token.
    tokenizer.pad_token = eos_token

    tokenizer.save_pretrained(output_dir)
    return tokenizer


if __name__ == "__main__":
    build_chatml_tokenizer()