AlexHT_Hung
committed on
Commit
·
9c962ca
1
Parent(s):
a4300b1
Remove dummy tokens, add func_start, func_end
Browse files- README.md +5 -3
- added_tokens.json +2 -2
- tokenizer.json +3 -29
- tokenizer_config.json +2 -2
README.md
CHANGED
|
@@ -1,8 +1,10 @@
|
|
| 1 |
|
| 2 |
Mistral擴充詞表只包含與教育部常用4808字的交集
|
| 3 |
|
| 4 |
-
|
| 5 |
-
|
|
|
|
|
|
|
| 6 |
|
| 7 |
|
| 8 |
```python
|
|
@@ -16,7 +18,7 @@ tokenizer = AutoTokenizer.from_pretrained(
|
|
| 16 |
)
|
| 17 |
|
| 18 |
print('vocab size:', tokenizer.vocab_size)
|
| 19 |
-
#vocab size:
|
| 20 |
|
| 21 |
print(tokenizer.tokenize('今天天氣真好!'))
|
| 22 |
#['▁', '今', '天', '天', '氣', '真', '好', '!']
|
|
|
|
| 1 |
|
| 2 |
Mistral擴充詞表只包含與教育部常用4808字的交集
|
| 3 |
|
| 4 |
+
~~後面補了25個dummy token,補到64的倍數可以增加訓練效率~~
|
| 5 |
+
~~未來可以作為special token的預留空間~~
|
| 6 |
+
- 移除dummy token
|
| 7 |
+
- 增加`<|func_start|>`, `<|func_end|>`
|
| 8 |
|
| 9 |
|
| 10 |
```python
|
|
|
|
| 18 |
)
|
| 19 |
|
| 20 |
print('vocab size:', tokenizer.vocab_size)
|
| 21 |
+
#vocab size: 35686
|
| 22 |
|
| 23 |
print(tokenizer.tokenize('今天天氣真好!'))
|
| 24 |
#['▁', '今', '天', '天', '氣', '真', '好', '!']
|
added_tokens.json
CHANGED
|
@@ -1,4 +1,4 @@
|
|
| 1 |
{
|
| 2 |
-
"<|func_end|>":
|
| 3 |
-
"<|func_start|>":
|
| 4 |
}
|
|
|
|
| 1 |
{
|
| 2 |
+
"<|func_end|>": 35686,
|
| 3 |
+
"<|func_start|>": 35687
|
| 4 |
}
|
tokenizer.json
CHANGED
|
@@ -31,7 +31,7 @@
|
|
| 31 |
"special": true
|
| 32 |
},
|
| 33 |
{
|
| 34 |
-
"id":
|
| 35 |
"content": "<|func_end|>",
|
| 36 |
"single_word": true,
|
| 37 |
"lstrip": true,
|
|
@@ -40,7 +40,7 @@
|
|
| 40 |
"special": false
|
| 41 |
},
|
| 42 |
{
|
| 43 |
-
"id":
|
| 44 |
"content": "<|func_start|>",
|
| 45 |
"single_word": true,
|
| 46 |
"lstrip": true,
|
|
@@ -35838,33 +35838,7 @@
|
|
| 35838 |
"賅": 35682,
|
| 35839 |
"簞": 35683,
|
| 35840 |
"鼴": 35684,
|
| 35841 |
-
"躂": 35685
|
| 35842 |
-
"<DUMMY_0>": 35686,
|
| 35843 |
-
"<DUMMY_1>": 35687,
|
| 35844 |
-
"<DUMMY_2>": 35688,
|
| 35845 |
-
"<DUMMY_3>": 35689,
|
| 35846 |
-
"<DUMMY_4>": 35690,
|
| 35847 |
-
"<DUMMY_5>": 35691,
|
| 35848 |
-
"<DUMMY_6>": 35692,
|
| 35849 |
-
"<DUMMY_7>": 35693,
|
| 35850 |
-
"<DUMMY_8>": 35694,
|
| 35851 |
-
"<DUMMY_9>": 35695,
|
| 35852 |
-
"<DUMMY_10>": 35696,
|
| 35853 |
-
"<DUMMY_11>": 35697,
|
| 35854 |
-
"<DUMMY_12>": 35698,
|
| 35855 |
-
"<DUMMY_13>": 35699,
|
| 35856 |
-
"<DUMMY_14>": 35700,
|
| 35857 |
-
"<DUMMY_15>": 35701,
|
| 35858 |
-
"<DUMMY_16>": 35702,
|
| 35859 |
-
"<DUMMY_17>": 35703,
|
| 35860 |
-
"<DUMMY_18>": 35704,
|
| 35861 |
-
"<DUMMY_19>": 35705,
|
| 35862 |
-
"<DUMMY_20>": 35706,
|
| 35863 |
-
"<DUMMY_21>": 35707,
|
| 35864 |
-
"<DUMMY_22>": 35708,
|
| 35865 |
-
"<DUMMY_23>": 35709,
|
| 35866 |
-
"<DUMMY_24>": 35710,
|
| 35867 |
-
"<DUMMY_25>": 35711
|
| 35868 |
},
|
| 35869 |
"merges": [
|
| 35870 |
"▁ t",
|
|
|
|
| 31 |
"special": true
|
| 32 |
},
|
| 33 |
{
|
| 34 |
+
"id": 35686,
|
| 35 |
"content": "<|func_end|>",
|
| 36 |
"single_word": true,
|
| 37 |
"lstrip": true,
|
|
|
|
| 40 |
"special": false
|
| 41 |
},
|
| 42 |
{
|
| 43 |
+
"id": 35687,
|
| 44 |
"content": "<|func_start|>",
|
| 45 |
"single_word": true,
|
| 46 |
"lstrip": true,
|
|
|
|
| 35838 |
"賅": 35682,
|
| 35839 |
"簞": 35683,
|
| 35840 |
"鼴": 35684,
|
| 35841 |
+
"躂": 35685
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 35842 |
},
|
| 35843 |
"merges": [
|
| 35844 |
"▁ t",
|
tokenizer_config.json
CHANGED
|
@@ -26,7 +26,7 @@
|
|
| 26 |
"single_word": false,
|
| 27 |
"special": true
|
| 28 |
},
|
| 29 |
-
"
|
| 30 |
"content": "<|func_end|>",
|
| 31 |
"lstrip": true,
|
| 32 |
"normalized": false,
|
|
@@ -34,7 +34,7 @@
|
|
| 34 |
"single_word": true,
|
| 35 |
"special": false
|
| 36 |
},
|
| 37 |
-
"
|
| 38 |
"content": "<|func_start|>",
|
| 39 |
"lstrip": true,
|
| 40 |
"normalized": false,
|
|
|
|
| 26 |
"single_word": false,
|
| 27 |
"special": true
|
| 28 |
},
|
| 29 |
+
"35686": {
|
| 30 |
"content": "<|func_end|>",
|
| 31 |
"lstrip": true,
|
| 32 |
"normalized": false,
|
|
|
|
| 34 |
"single_word": true,
|
| 35 |
"special": false
|
| 36 |
},
|
| 37 |
+
"35687": {
|
| 38 |
"content": "<|func_start|>",
|
| 39 |
"lstrip": true,
|
| 40 |
"normalized": false,
|