Add tokenizer_config.json
Browse files- tokenizer_config.json +22 -0
tokenizer_config.json
ADDED
|
@@ -0,0 +1,22 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"vocab_target": 6000,
|
| 3 |
+
"max_merge_steps": 200000,
|
| 4 |
+
"special_tokens": [
|
| 5 |
+
"<PAD>",
|
| 6 |
+
"<UNK>",
|
| 7 |
+
"<BOS>",
|
| 8 |
+
"<EOS>"
|
| 9 |
+
],
|
| 10 |
+
"placeholders": {
|
| 11 |
+
"URL": "<URL>",
|
| 12 |
+
"USER": "<USER>"
|
| 13 |
+
},
|
| 14 |
+
"protect_regex": {
|
| 15 |
+
"url": "https?://\\S+",
|
| 16 |
+
"user": "@[A-Za-z0-9_]+",
|
| 17 |
+
"time": "\\b\\d{1,2}:\\d{2}\\b",
|
| 18 |
+
"number": "\\b\\d+(?:\\.\\d+)?\\b",
|
| 19 |
+
"emote": "(?:[A-Za-z]{2,}[A-Z]{2,}|[A-Za-z]+(?:LOL|KEKW|LUL|OMEGA)\\b)"
|
| 20 |
+
},
|
| 21 |
+
"version": 1
|
| 22 |
+
}
|