| ```python | |
| from transformers import LlamaTokenizer | |
| tokenizer = LlamaTokenizer.from_pretrained( | |
| 'ocisd4/llama_tokenizer_ext_zhtw', | |
| pad_token='<unk>', | |
| add_bos_token=True, | |
| add_eos_token=False | |
| ) | |
| #vocab size: 36128 | |
| print(tokenizer.tokenize('今天天氣真好!')) | |
| #['▁', '今', '天', '天', '氣', '真', '好', '!'] | |
| print(tokenizer.encode('今天天氣真好!')) | |
| #[1, 29871, 31482, 30408, 30408, 32045, 30848, 31076, 30584] | |
| print(tokenizer.decode(tokenizer.encode('今天天氣真好!'))) | |
| # <s>今天天氣真好! | |
| ``` |