Soldier-Boy committed on
Commit
a1c81d5
·
verified ·
1 Parent(s): 08ebe76

Add tokenizer_config.json

Browse files
Files changed (1) hide show
  1. tokenizer_config.json +22 -0
tokenizer_config.json ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "vocab_target": 6000,
3
+ "max_merge_steps": 200000,
4
+ "special_tokens": [
5
+ "<PAD>",
6
+ "<UNK>",
7
+ "<BOS>",
8
+ "<EOS>"
9
+ ],
10
+ "placeholders": {
11
+ "URL": "<URL>",
12
+ "USER": "<USER>"
13
+ },
14
+ "protect_regex": {
15
+ "url": "https?://\\S+",
16
+ "user": "@[A-Za-z0-9_]+",
17
+ "time": "\\b\\d{1,2}:\\d{2}\\b",
18
+ "number": "\\b\\d+(?:\\.\\d+)?\\b",
19
+ "emote": "(?:[A-Za-z]{2,}[A-Z]{2,}|[A-Za-z]+(?:LOL|KEKW|LUL|OMEGA)\\b)"
20
+ },
21
+ "version": 1
22
+ }