Update README.md
Browse files
README.md
CHANGED
|
@@ -1,24 +1,31 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
```python
|
| 2 |
-
from transformers import
|
| 3 |
|
| 4 |
-
tokenizer =
|
| 5 |
-
'ocisd4/
|
| 6 |
pad_token='<unk>',
|
| 7 |
add_bos_token=True,
|
| 8 |
add_eos_token=False
|
| 9 |
)
|
| 10 |
|
| 11 |
-
|
|
|
|
| 12 |
|
| 13 |
print(tokenizer.tokenize('今天天氣真好!'))
|
| 14 |
#['▁', '今', '天', '天', '氣', '真', '好', '!']
|
| 15 |
|
| 16 |
print(tokenizer.encode('今天天氣真好!'))
|
| 17 |
-
#[1,
|
| 18 |
|
| 19 |
print(tokenizer.decode(tokenizer.encode('今天天氣真好!')))
|
| 20 |
-
|
| 21 |
```
|
| 22 |
|
| 23 |
|
|
|
|
|
|
|
| 24 |
|
|
|
|
| 1 |
+
|
| 2 |
+
Mistral擴充詞表只包含教育部常用8000字
|
| 3 |
+
|
| 4 |
+
|
| 5 |
```python
|
| 6 |
+
from transformers import AutoTokenizer
|
| 7 |
|
| 8 |
+
tokenizer = AutoTokenizer.from_pretrained(
|
| 9 |
+
'ocisd4/mistral_tokenizer_ext',
|
| 10 |
pad_token='<unk>',
|
| 11 |
add_bos_token=True,
|
| 12 |
add_eos_token=False
|
| 13 |
)
|
| 14 |
|
| 15 |
+
print('vocab size:', tokenizer.vocab_size)
|
| 16 |
+
#vocab size: 35712
|
| 17 |
|
| 18 |
print(tokenizer.tokenize('今天天氣真好!'))
|
| 19 |
#['▁', '今', '天', '天', '氣', '真', '好', '!']
|
| 20 |
|
| 21 |
print(tokenizer.encode('今天天氣真好!'))
|
| 22 |
+
#[1, 28705, 30316, 29354, 29354, 32004, 29974, 29530, 29267]
|
| 23 |
|
| 24 |
print(tokenizer.decode(tokenizer.encode('今天天氣真好!')))
|
| 25 |
+
#<s> 今天天氣真好!
|
| 26 |
```
|
| 27 |
|
| 28 |
|
| 29 |
+
Mistral
|
| 30 |
+
|
| 31 |
|