ocisd4
/

openllama_tokenizer_ext_zh

Model card · Files and versions

samleeasus committed on Jun 5, 2023

Commit

966a300

·

1 Parent(s): c3a8fbf

Update README.md

Files changed (1) hide show

README.md +1 -2

README.md CHANGED Viewed

@@ -10,11 +10,10 @@ tokenizer = LlamaTokenizer.from_pretrained(
 )
 print('vocab size:',tokenizer.vocab_size)
-#vocab size: 52992
 text = '今天天氣真好！'
-print([k for k, v in tokenizer.get_vocab().items() if v  > tokenizer.vocab_size -7])
 print(tokenizer.tokenize(text))
 #['▁', '今天', '天氣', '真', '好', '<0xEF>', '<0xBC>', '<0x81>']

 )
 print('vocab size:',tokenizer.vocab_size)
+#vocab size: 52928
 text = '今天天氣真好！'
 print(tokenizer.tokenize(text))
 #['▁', '今天', '天氣', '真', '好', '<0xEF>', '<0xBC>', '<0x81>']