infi
committed on
Commit
·
a1de04e
1
Parent(s):
859935e
Upload tokenizer
Browse files
- added_tokens.json +12 -1
- tokenizer_config.json +88 -0
added_tokens.json
CHANGED
|
@@ -2,5 +2,16 @@
|
|
| 2 |
"\n": 64001,
|
| 3 |
"<mask>": 64000,
|
| 4 |
"<token_echap>": 64003,
|
| 5 |
-
"<token_schap>": 64002
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 6 |
}
|
|
|
|
| 2 |
"\n": 64001,
|
| 3 |
"<mask>": 64000,
|
| 4 |
"<token_echap>": 64003,
|
| 5 |
+
"<token_schap>": 64002,
|
| 6 |
+
"<|answer|>": 64010,
|
| 7 |
+
"<|chap|>": 64004,
|
| 8 |
+
"<|endbox|>": 64008,
|
| 9 |
+
"<|para|>": 64009,
|
| 10 |
+
"<|question|>": 64012,
|
| 11 |
+
"<|section|>": 64005,
|
| 12 |
+
"<|startbox|>": 64007,
|
| 13 |
+
"<|subsection|>": 64006,
|
| 14 |
+
"<|teaser|>": 64014,
|
| 15 |
+
"<|title|>": 64011,
|
| 16 |
+
"<|topic|>": 64013
|
| 17 |
}
|
tokenizer_config.json
CHANGED
|
@@ -63,6 +63,94 @@
|
|
| 63 |
"rstrip": false,
|
| 64 |
"single_word": false,
|
| 65 |
"special": false
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 66 |
}
|
| 67 |
},
|
| 68 |
"bos_token": "<s>",
|
|
|
|
| 63 |
"rstrip": false,
|
| 64 |
"single_word": false,
|
| 65 |
"special": false
|
| 66 |
+
},
|
| 67 |
+
"64004": {
|
| 68 |
+
"content": "<|chap|>",
|
| 69 |
+
"lstrip": false,
|
| 70 |
+
"normalized": true,
|
| 71 |
+
"rstrip": false,
|
| 72 |
+
"single_word": false,
|
| 73 |
+
"special": false
|
| 74 |
+
},
|
| 75 |
+
"64005": {
|
| 76 |
+
"content": "<|section|>",
|
| 77 |
+
"lstrip": false,
|
| 78 |
+
"normalized": true,
|
| 79 |
+
"rstrip": false,
|
| 80 |
+
"single_word": false,
|
| 81 |
+
"special": false
|
| 82 |
+
},
|
| 83 |
+
"64006": {
|
| 84 |
+
"content": "<|subsection|>",
|
| 85 |
+
"lstrip": false,
|
| 86 |
+
"normalized": true,
|
| 87 |
+
"rstrip": false,
|
| 88 |
+
"single_word": false,
|
| 89 |
+
"special": false
|
| 90 |
+
},
|
| 91 |
+
"64007": {
|
| 92 |
+
"content": "<|startbox|>",
|
| 93 |
+
"lstrip": false,
|
| 94 |
+
"normalized": true,
|
| 95 |
+
"rstrip": false,
|
| 96 |
+
"single_word": false,
|
| 97 |
+
"special": false
|
| 98 |
+
},
|
| 99 |
+
"64008": {
|
| 100 |
+
"content": "<|endbox|>",
|
| 101 |
+
"lstrip": false,
|
| 102 |
+
"normalized": true,
|
| 103 |
+
"rstrip": false,
|
| 104 |
+
"single_word": false,
|
| 105 |
+
"special": false
|
| 106 |
+
},
|
| 107 |
+
"64009": {
|
| 108 |
+
"content": "<|para|>",
|
| 109 |
+
"lstrip": false,
|
| 110 |
+
"normalized": true,
|
| 111 |
+
"rstrip": false,
|
| 112 |
+
"single_word": false,
|
| 113 |
+
"special": false
|
| 114 |
+
},
|
| 115 |
+
"64010": {
|
| 116 |
+
"content": "<|answer|>",
|
| 117 |
+
"lstrip": false,
|
| 118 |
+
"normalized": true,
|
| 119 |
+
"rstrip": false,
|
| 120 |
+
"single_word": false,
|
| 121 |
+
"special": false
|
| 122 |
+
},
|
| 123 |
+
"64011": {
|
| 124 |
+
"content": "<|title|>",
|
| 125 |
+
"lstrip": false,
|
| 126 |
+
"normalized": true,
|
| 127 |
+
"rstrip": false,
|
| 128 |
+
"single_word": false,
|
| 129 |
+
"special": false
|
| 130 |
+
},
|
| 131 |
+
"64012": {
|
| 132 |
+
"content": "<|question|>",
|
| 133 |
+
"lstrip": false,
|
| 134 |
+
"normalized": true,
|
| 135 |
+
"rstrip": false,
|
| 136 |
+
"single_word": false,
|
| 137 |
+
"special": false
|
| 138 |
+
},
|
| 139 |
+
"64013": {
|
| 140 |
+
"content": "<|topic|>",
|
| 141 |
+
"lstrip": false,
|
| 142 |
+
"normalized": true,
|
| 143 |
+
"rstrip": false,
|
| 144 |
+
"single_word": false,
|
| 145 |
+
"special": false
|
| 146 |
+
},
|
| 147 |
+
"64014": {
|
| 148 |
+
"content": "<|teaser|>",
|
| 149 |
+
"lstrip": false,
|
| 150 |
+
"normalized": true,
|
| 151 |
+
"rstrip": false,
|
| 152 |
+
"single_word": false,
|
| 153 |
+
"special": false
|
| 154 |
}
|
| 155 |
},
|
| 156 |
"bos_token": "<s>",
|