Commit
·
b08f9b9
1
Parent(s):
26bb9ac
Update README.md
Browse files
README.md
CHANGED
|
@@ -3,12 +3,13 @@ from transformers import LlamaTokenizer
|
|
| 3 |
|
| 4 |
tokenizer = LlamaTokenizer.from_pretrained(
|
| 5 |
'ocisd4/openllama_tokenizer_ext_zh',
|
| 6 |
-
pad_token="<pad>",
|
| 7 |
add_bos_token=True,
|
| 8 |
add_eos_token=False,
|
| 9 |
use_auth_token='True',
|
| 10 |
)
|
| 11 |
|
|
|
|
|
|
|
| 12 |
print('vocab size:',tokenizer.vocab_size)
|
| 13 |
#vocab size: 52928
|
| 14 |
|
|
@@ -33,4 +34,4 @@ print(tokenizer.decode(tokenizer.encode(text)))
|
|
| 33 |
|
| 34 |
### updated
|
| 35 |
#### 2023-06-02
|
| 36 |
-
- add special tokens: <|output|>, <|input|>, <|sep|>, <|emb|>, <|rwd|>, <|ctx|>
|
|
|
|
| 3 |
|
| 4 |
tokenizer = LlamaTokenizer.from_pretrained(
|
| 5 |
'ocisd4/openllama_tokenizer_ext_zh',
|
|
|
|
| 6 |
add_bos_token=True,
|
| 7 |
add_eos_token=False,
|
| 8 |
use_auth_token='True',
|
| 9 |
)
|
| 10 |
|
| 11 |
+
tokenizer.pad_token_id = tokenizer.vocab_size -1
|
| 12 |
+
|
| 13 |
print('vocab size:',tokenizer.vocab_size)
|
| 14 |
#vocab size: 52928
|
| 15 |
|
|
|
|
| 34 |
|
| 35 |
### updated
|
| 36 |
#### 2023-06-02
|
| 37 |
+
- add special tokens: <|pad|>, <|output|>, <|input|>, <|sep|>, <|emb|>, <|rwd|>, <|ctx|>
|