speakingPotato committed
Commit 437282d · verified · 1 parent: bd3207d

Upload tokenizer
Files changed (4)
  1. special_tokens_map.json +14 -0
  2. tokenizer.json +12 -12
  3. tokenizer_config.json +6 -4
  4. vocab.txt +2 -2
special_tokens_map.json CHANGED
@@ -1,4 +1,11 @@
 {
+  "bos_token": {
+    "content": "[CLS]",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
   "cls_token": {
     "content": "[CLS]",
     "lstrip": false,
@@ -6,6 +13,13 @@
     "rstrip": false,
     "single_word": false
   },
+  "eos_token": {
+    "content": "[SEP]",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
   "mask_token": {
     "content": "[MASK]",
     "lstrip": false,
tokenizer.json CHANGED
@@ -5,7 +5,7 @@
   "added_tokens": [
     {
       "id": 0,
-      "content": "[PAD]",
+      "content": "[CLS]",
       "single_word": false,
       "lstrip": false,
       "rstrip": false,
@@ -14,7 +14,7 @@
     },
     {
       "id": 1,
-      "content": "[UNK]",
+      "content": "[PAD]",
       "single_word": false,
       "lstrip": false,
       "rstrip": false,
@@ -23,7 +23,7 @@
     },
     {
       "id": 2,
-      "content": "[CLS]",
+      "content": "[SEP]",
       "single_word": false,
       "lstrip": false,
       "rstrip": false,
@@ -32,7 +32,7 @@
     },
     {
       "id": 3,
-      "content": "[SEP]",
+      "content": "[UNK]",
       "single_word": false,
       "lstrip": false,
       "rstrip": false,
@@ -103,13 +103,13 @@
       {
         "Sequence": {
           "id": "B",
-          "type_id": 1
+          "type_id": 0
         }
       },
       {
         "SpecialToken": {
           "id": "[SEP]",
-          "type_id": 1
+          "type_id": 0
         }
       }
     ],
@@ -117,7 +117,7 @@
       "[CLS]": {
         "id": "[CLS]",
         "ids": [
-          2
+          0
         ],
         "tokens": [
           "[CLS]"
@@ -126,7 +126,7 @@
       "[SEP]": {
        "id": "[SEP]",
        "ids": [
-          3
+          2
        ],
        "tokens": [
          "[SEP]"
@@ -145,10 +145,10 @@
     "continuing_subword_prefix": "##",
     "max_input_chars_per_word": 100,
     "vocab": {
-      "[PAD]": 0,
-      "[UNK]": 1,
-      "[CLS]": 2,
-      "[SEP]": 3,
+      "[CLS]": 0,
+      "[PAD]": 1,
+      "[SEP]": 2,
+      "[UNK]": 3,
       "[MASK]": 4,
       "!": 5,
       "\"": 6,
tokenizer_config.json CHANGED
@@ -1,7 +1,7 @@
 {
   "added_tokens_decoder": {
     "0": {
-      "content": "[PAD]",
+      "content": "[CLS]",
       "lstrip": false,
       "normalized": false,
       "rstrip": false,
@@ -9,7 +9,7 @@
       "special": true
     },
     "1": {
-      "content": "[UNK]",
+      "content": "[PAD]",
       "lstrip": false,
       "normalized": false,
       "rstrip": false,
@@ -17,7 +17,7 @@
       "special": true
     },
     "2": {
-      "content": "[CLS]",
+      "content": "[SEP]",
       "lstrip": false,
       "normalized": false,
       "rstrip": false,
@@ -25,7 +25,7 @@
       "special": true
     },
     "3": {
-      "content": "[SEP]",
+      "content": "[UNK]",
       "lstrip": false,
       "normalized": false,
       "rstrip": false,
@@ -41,10 +41,12 @@
       "special": true
     }
   },
+  "bos_token": "[CLS]",
   "clean_up_tokenization_spaces": true,
   "cls_token": "[CLS]",
   "do_basic_tokenize": true,
   "do_lower_case": false,
+  "eos_token": "[SEP]",
   "mask_token": "[MASK]",
   "model_max_length": 512,
   "never_split": null,
vocab.txt CHANGED
@@ -1,7 +1,7 @@
-[PAD]
-[UNK]
 [CLS]
+[PAD]
 [SEP]
+[UNK]
 [MASK]
 !
 "