```python
from transformers import LlamaTokenizer

# Load the custom OpenLLaMA tokenizer from the Hugging Face Hub.
# NOTE(review): `use_auth_token` is deprecated in recent transformers
# releases in favour of `token` — confirm against the version this
# model card targets before changing it.
repo_id = 'ocisd4/openllama_tokenizer_v2'
tokenizer = LlamaTokenizer.from_pretrained(
    repo_id,
    add_bos_token=False,   # do not prepend <s>
    add_eos_token=True,    # append </s> automatically on encode
    force_download=False,
    use_auth_token=True,
    # additional_special_tokens=['<|spcout|>', '<|sep|>', '<|eot|>', '<|output|>']
)

print('vocab size:', tokenizer.vocab_size)
# vocab size: 51456

text = '今天天氣真好!'

# Tokenize into subword pieces.
print(tokenizer.tokenize(text))
# ['▁', '今天', '天氣', '真', '好', '!']

# Encode to ids (note the trailing 2 — the </s> eos token).
print(tokenizer.encode(text))
# [29500, 32097, 32916, 30615, 30192, 30042, 2]

# Round-trip back to text; the eos marker is rendered as </s>.
print(tokenizer.decode(tokenizer.encode(text)))
# 今天天氣真好!</s>
```