suku9 committed
Commit 3ac082b (verified)
Parent: 1bd3fed

Upload tokenizer

Files changed (4)
  1. merges.txt +1 -0
  2. special_tokens_map.json +28 -4
  3. tokenizer_config.json +45 -36
  4. vocab.json +32 -0
merges.txt ADDED
@@ -0,0 +1 @@
+#version: 0.2
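This merges.txt contains only the BPE version header and no merge rules, so the tokenizer never joins characters: tokenization is effectively character-level over vocab.json (added below). A quick sanity check as a sketch; the ./tokenizer path is a placeholder for wherever these files are downloaded:

# Confirm the merge list is empty: every line after the "#version: 0.2"
# header would be a merge rule, and there are none.
with open("./tokenizer/merges.txt") as f:
    merges = [ln for ln in f if ln.strip() and not ln.startswith("#")]
assert merges == []  # no merges -> every token is a single character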
special_tokens_map.json CHANGED
@@ -1,6 +1,30 @@
 {
-  "bos_token": "<go>",
-  "eos_token": "</s>",
-  "pad_token": "<pad>",
-  "unk_token": "<unk>"
+  "bos_token": {
+    "content": "<go>",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "</s>",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "<pad>",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  },
+  "unk_token": {
+    "content": "<unk>",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  }
 }
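The change expands each special token from a bare string into the serialized form of an AddedToken, making the stripping and normalization behavior explicit instead of implied. A sketch of the equivalent object in the tokenizers library, with values taken from the diff above:

from tokenizers import AddedToken

# The new bos_token entry, rebuilt as the object it deserializes to.
bos = AddedToken("<go>", lstrip=False, normalized=True,
                 rstrip=False, single_word=False)
print(bos.content)  # <go>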
tokenizer_config.json CHANGED
@@ -1,38 +1,47 @@
 {
-  "pad_token": "<pad>",
-  "eos_token": "</s>",
+  "add_bos_token": false,
+  "add_prefix_space": false,
+  "added_tokens_decoder": {
+    "0": {
+      "content": "<pad>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "1": {
+      "content": "</s>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "2": {
+      "content": "<unk>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "3": {
+      "content": "<go>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
   "bos_token": "<go>",
-  "unk_token": "<unk>",
-  "vocab": [
-    "<pad>",
-    "</s>",
-    "<unk>",
-    "<go>",
-    "S",
-    "O",
-    "2",
-    "n",
-    "l",
-    "F",
-    "H",
-    "C",
-    "o",
-    "5",
-    "r",
-    "s",
-    "=",
-    "6",
-    "[",
-    "N",
-    "4",
-    "c",
-    "-",
-    "3",
-    ")",
-    "#",
-    "]",
-    "B",
-    "(",
-    "1"
-  ]
-}
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "</s>",
+  "errors": "replace",
+  "extra_special_tokens": {},
+  "model_max_length": 1000000000000000019884624838656,
+  "pad_token": "<pad>",
+  "tokenizer_class": "GPT2Tokenizer",
+  "unk_token": "<unk>"
+}
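With tokenizer_class pinned to GPT2Tokenizer and added_tokens_decoder mapping ids 0-3 to the four specials, the upload loads through the standard transformers entry point; the huge model_max_length is the library's sentinel for "no length limit". A minimal loading sketch, assuming the four files sit in a local ./tokenizer directory (the path is a placeholder):

from transformers import AutoTokenizer

# Point this at the directory holding the uploaded files.
tok = AutoTokenizer.from_pretrained("./tokenizer")

print(tok.bos_token, tok.eos_token, tok.pad_token, tok.unk_token)  # <go> </s> <pad> <unk>
print(len(tok))  # 30: four specials plus 26 single characters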
vocab.json ADDED
@@ -0,0 +1,32 @@
+{
+  "#": 25,
+  "(": 28,
+  ")": 24,
+  "-": 22,
+  "1": 29,
+  "2": 6,
+  "3": 23,
+  "4": 20,
+  "5": 13,
+  "6": 17,
+  "</s>": 1,
+  "<go>": 3,
+  "<pad>": 0,
+  "<unk>": 2,
+  "=": 16,
+  "B": 27,
+  "C": 11,
+  "F": 9,
+  "H": 10,
+  "N": 19,
+  "O": 5,
+  "S": 4,
+  "[": 18,
+  "]": 26,
+  "c": 21,
+  "l": 8,
+  "n": 7,
+  "o": 12,
+  "r": 14,
+  "s": 15
+}
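The vocabulary is the four specials plus 26 single characters: element symbols, aromatic lowercase atoms, ring-closure digits, and bond/branch punctuation, i.e. a character-level SMILES alphabet (the lone l and r exist so Cl and Br can be spelled as two tokens). A usage sketch reusing tok from the previous snippet; benzene is just an illustrative input, and the ids follow vocab.json above:

# Character-level encoding of a SMILES string.
ids = tok.encode("c1ccccc1")  # benzene
print(ids)              # [21, 29, 21, 21, 21, 21, 21, 29]
print(tok.decode(ids))  # c1ccccc1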