| |
| |
| |
| from transformers import PreTrainedTokenizerFast |
|
|
class LlamaCanonTokenizer(PreTrainedTokenizerFast):
    """Thin wrapper that loads either the Llama2 or the Llama3 tokenizer.

    Which tokenizer is loaded is decided by the ``data.tokenizer.name`` entry
    of the ``<variant>/params.json`` file belonging to the requested model.
    """

    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path, *args, variant="default", **kwargs):
        """Load the tokenizer selected by ``<variant>/params.json``.

        Args:
            pretrained_model_name_or_path: Either a local directory that
                contains ``<variant>/params.json`` or a Hugging Face Hub repo
                id from which that file is downloaded.
            *args: Accepted for signature compatibility; not forwarded.
            variant: Sub-directory (local) or sub-folder (Hub) holding
                ``params.json``. Defaults to ``"default"``.
            **kwargs: Accepted for signature compatibility; not forwarded.

        Returns:
            The result of ``super().from_pretrained(...)`` for
            ``"NousResearch/Llama-2-7b-hf"`` when the configured tokenizer
            name is ``"sp"``, or for
            ``"nvidia/Llama-3.1-Nemotron-70B-Instruct-HF"`` when it is
            ``"tiktoken"``.

        Raises:
            KeyError: If ``params.json`` has no ``data.tokenizer.name`` entry.
            ValueError: If the configured tokenizer name is neither ``"sp"``
                nor ``"tiktoken"``.
        """
        # NOTE(review): *args / **kwargs (e.g. cache_dir, token, revision) are
        # currently ignored by both the params.json download and the base
        # tokenizer load. Confirm whether that is intentional before
        # forwarding them, since they would target different repositories.
        import json
        import os

        # Prefer a params.json that already exists on disk; only reach out to
        # the Hub when the argument is not a local directory containing it.
        local_config = os.path.join(pretrained_model_name_or_path, variant, "params.json")
        if os.path.isfile(local_config):
            config_path = local_config
        else:
            # Imported lazily so the local-path case does not need the Hub client.
            from huggingface_hub import hf_hub_download

            config_path = hf_hub_download(
                repo_id=pretrained_model_name_or_path,
                filename=f"{variant}/params.json",
            )

        print("Please ignore the tokenizer name mismatch warning; this LlamaCanonTokenizer is simply a wrapper of either Llama2 or Llama3 tokenizer, depending on params.json")

        # Read as UTF-8 explicitly rather than relying on the platform's
        # locale-dependent default encoding.
        with open(config_path, "r", encoding="utf-8") as f:
            params = json.load(f)

        tokenizer_name = params['data']['tokenizer']['name']

        if tokenizer_name == 'sp':
            print("Using Llama2 tokenizer")
            return super().from_pretrained("NousResearch/Llama-2-7b-hf")

        if tokenizer_name == 'tiktoken':
            print("Using Llama3 tokenizer")
            return super().from_pretrained("nvidia/Llama-3.1-Nemotron-70B-Instruct-HF")

        raise ValueError(f"Unsupported tokenizer name: {tokenizer_name}")