samleeasus committed on
Commit
d157dac
·
verified ·
1 Parent(s): b7d28aa

Update README.md

Browse files
Files changed (1) hide show
  1. README.md +13 -6
README.md CHANGED
@@ -1,24 +1,31 @@
 
 
 
 
1
  ```python
2
- from transformers import LlamaTokenizer
3
 
4
- tokenizer = LlamaTokenizer.from_pretrained(
5
- 'ocisd4/llama_tokenizer_ext_zhtw',
6
  pad_token='<unk>',
7
  add_bos_token=True,
8
  add_eos_token=False
9
  )
10
 
11
- #vocab size: 36128
 
12
 
13
  print(tokenizer.tokenize('今天天氣真好!'))
14
  #['▁', '今', '天', '天', '氣', '真', '好', '!']
15
 
16
  print(tokenizer.encode('今天天氣真好!'))
17
- #[1, 29871, 31482, 30408, 30408, 32045, 30848, 31076, 30584]
18
 
19
  print(tokenizer.decode(tokenizer.encode('今天天氣真好!')))
20
- # <s>今天天氣真好!
21
  ```
22
 
23
 
 
 
24
 
 
1
+
2
+ Mistral擴充詞表只包含教育部常用8000字
3
+
4
+
5
  ```python
6
+ from transformers import AutoTokenizer
7
 
8
+ tokenizer = AutoTokenizer.from_pretrained(
9
+ 'ocisd4/mistral_tokenizer_ext',
10
  pad_token='<unk>',
11
  add_bos_token=True,
12
  add_eos_token=False
13
  )
14
 
15
+ print('vocab size:', tokenizer.vocab_size)
16
+ #vocab size: 35712
17
 
18
  print(tokenizer.tokenize('今天天氣真好!'))
19
  #['▁', '今', '天', '天', '氣', '真', '好', '!']
20
 
21
  print(tokenizer.encode('今天天氣真好!'))
22
+ #[1, 28705, 30316, 29354, 29354, 32004, 29974, 29530, 29267]
23
 
24
  print(tokenizer.decode(tokenizer.encode('今天天氣真好!')))
25
+ #<s> 今天天氣真好!
26
  ```
27
 
28
 
29
+ Mistral
30
+
31