samleeasus commited on
Commit
bccccd8
·
1 Parent(s): 1853fcd

Create README.md

Browse files
Files changed (1) hide show
  1. README.md +37 -0
README.md ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ```python
2
+ from transformers import LlamaTokenizer
3
+
4
+ tokenizer = LlamaTokenizer.from_pretrained(
5
+ 'ocisd4/openllama_tokenizer_ext_zh',
6
+ pad_token="<pad>",
7
+ add_bos_token=False,
8
+ add_eos_token=True,
9
+ use_auth_token='True',
10
+ )
11
+
12
+ print('vocab size:',tokenizer.vocab_size)
13
+ #vocab size: 52992
14
+
15
+ text = '今天天氣真好!'
16
+
17
+ print([k for k, v in tokenizer.get_vocab().items() if v > tokenizer.vocab_size -7])
18
+
19
+ print(tokenizer.tokenize(text))
20
+ #['▁', '今天', '天氣', '真', '好', '<0xEF>', '<0xBC>', '<0x81>']
21
+
22
+ print(tokenizer.encode(text))
23
+ #[1, 31822, 32101, 32927, 45489, 45301, 242, 191, 132]
24
+
25
+ print(tokenizer.decode(tokenizer.encode(text)))
26
+ # 今天天氣真好!</s>
27
+ ```
28
+
29
+ **Note:**
30
+ - The first token might be whitespace in LlamaTokenizer.
31
+ - Open LLaMA's tokenizer is incompatible with the original LLaMA tokenizer.
32
+ - This tokenizer encodes consecutive spaces as a single space.
33
+
34
+
35
+ ### updated
36
+ #### 2023-06-02
37
+ - add special tokens: <|output|>, <|input|>, <|sep|>, <|emb|>, <|rwd|>, <|ctx|>