Franso committed on
Commit
cfbcd30
·
verified ·
1 Parent(s): 8318575

Upload tokenizer

Browse files
Files changed (2) hide show
  1. tokenizer.json +174 -38
  2. tokenizer_config.json +4 -4
tokenizer.json CHANGED
@@ -4,7 +4,7 @@
4
  "padding": null,
5
  "added_tokens": [
6
  {
7
- "id": 33,
8
  "content": "<pad>",
9
  "single_word": false,
10
  "lstrip": false,
@@ -13,7 +13,7 @@
13
  "special": true
14
  },
15
  {
16
- "id": 34,
17
  "content": "<s>",
18
  "single_word": false,
19
  "lstrip": false,
@@ -22,7 +22,7 @@
22
  "special": true
23
  },
24
  {
25
- "id": 35,
26
  "content": "</s>",
27
  "single_word": false,
28
  "lstrip": false,
@@ -31,7 +31,7 @@
31
  "special": true
32
  },
33
  {
34
- "id": 36,
35
  "content": "<unk>",
36
  "single_word": false,
37
  "lstrip": false,
@@ -44,7 +44,7 @@
44
  "pre_tokenizer": {
45
  "type": "Split",
46
  "pattern": {
47
- "Regex": "(|)"
48
  },
49
  "behavior": "Isolated",
50
  "invert": false
@@ -65,39 +65,175 @@
65
  "ignore_merges": false,
66
  "vocab": {
67
  "#": 0,
68
- "(": 1,
69
- ")": 2,
70
- "+": 3,
71
- "-": 4,
72
- "/": 5,
73
- "1": 6,
74
- "2": 7,
75
- "3": 8,
76
- "4": 9,
77
- "5": 10,
78
- "6": 11,
79
- "=": 12,
80
- "@": 13,
81
- "B": 14,
82
- "C": 15,
83
- "F": 16,
84
- "H": 17,
85
- "I": 18,
86
- "N": 19,
87
- "O": 20,
88
- "P": 21,
89
- "S": 22,
90
- "[": 23,
91
- "\\": 24,
92
- "]": 25,
93
- "c": 26,
94
- "i": 27,
95
- "l": 28,
96
- "n": 29,
97
- "o": 30,
98
- "r": 31,
99
- "s": 32
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
100
  },
101
- "merges": []
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
102
  }
103
  }
 
4
  "padding": null,
5
  "added_tokens": [
6
  {
7
+ "id": 64,
8
  "content": "<pad>",
9
  "single_word": false,
10
  "lstrip": false,
 
13
  "special": true
14
  },
15
  {
16
+ "id": 65,
17
  "content": "<s>",
18
  "single_word": false,
19
  "lstrip": false,
 
22
  "special": true
23
  },
24
  {
25
+ "id": 66,
26
  "content": "</s>",
27
  "single_word": false,
28
  "lstrip": false,
 
31
  "special": true
32
  },
33
  {
34
+ "id": 67,
35
  "content": "<unk>",
36
  "single_word": false,
37
  "lstrip": false,
 
44
  "pre_tokenizer": {
45
  "type": "Split",
46
  "pattern": {
47
+ "Regex": "\\(|\\)"
48
  },
49
  "behavior": "Isolated",
50
  "invert": false
 
65
  "ignore_merges": false,
66
  "vocab": {
67
  "#": 0,
68
+ "%": 1,
69
+ "(": 2,
70
+ ")": 3,
71
+ "+": 4,
72
+ "-": 5,
73
+ "/": 6,
74
+ "0": 7,
75
+ "1": 8,
76
+ "2": 9,
77
+ "3": 10,
78
+ "4": 11,
79
+ "5": 12,
80
+ "6": 13,
81
+ "7": 14,
82
+ "8": 15,
83
+ "9": 16,
84
+ "=": 17,
85
+ "@": 18,
86
+ "B": 19,
87
+ "C": 20,
88
+ "F": 21,
89
+ "H": 22,
90
+ "I": 23,
91
+ "N": 24,
92
+ "O": 25,
93
+ "P": 26,
94
+ "S": 27,
95
+ "[": 28,
96
+ "\\": 29,
97
+ "]": 30,
98
+ "c": 31,
99
+ "i": 32,
100
+ "l": 33,
101
+ "n": 34,
102
+ "o": 35,
103
+ "r": 36,
104
+ "s": 37,
105
+ "cc": 38,
106
+ "CC": 39,
107
+ "c1": 40,
108
+ "=O": 41,
109
+ "c2": 42,
110
+ "H]": 43,
111
+ "[C": 44,
112
+ "[C@": 45,
113
+ "c1cc": 46,
114
+ "[C@@": 47,
115
+ "c3": 48,
116
+ "c2cc": 49,
117
+ "[C@H]": 50,
118
+ "[C@@H]": 51,
119
+ "NC": 52,
120
+ "c1ccc": 53,
121
+ "CCC": 54,
122
+ "CO": 55,
123
+ "cc1": 56,
124
+ "=C": 57,
125
+ "c1cccc": 58,
126
+ "n1": 59,
127
+ "N1": 60,
128
+ "nc": 61,
129
+ "c2cccc": 62,
130
+ "OC": 63
131
  },
132
+ "merges": [
133
+ [
134
+ "c",
135
+ "c"
136
+ ],
137
+ [
138
+ "C",
139
+ "C"
140
+ ],
141
+ [
142
+ "c",
143
+ "1"
144
+ ],
145
+ [
146
+ "=",
147
+ "O"
148
+ ],
149
+ [
150
+ "c",
151
+ "2"
152
+ ],
153
+ [
154
+ "H",
155
+ "]"
156
+ ],
157
+ [
158
+ "[",
159
+ "C"
160
+ ],
161
+ [
162
+ "[C",
163
+ "@"
164
+ ],
165
+ [
166
+ "c1",
167
+ "cc"
168
+ ],
169
+ [
170
+ "[C@",
171
+ "@"
172
+ ],
173
+ [
174
+ "c",
175
+ "3"
176
+ ],
177
+ [
178
+ "c2",
179
+ "cc"
180
+ ],
181
+ [
182
+ "[C@",
183
+ "H]"
184
+ ],
185
+ [
186
+ "[C@@",
187
+ "H]"
188
+ ],
189
+ [
190
+ "N",
191
+ "C"
192
+ ],
193
+ [
194
+ "c1cc",
195
+ "c"
196
+ ],
197
+ [
198
+ "CC",
199
+ "C"
200
+ ],
201
+ [
202
+ "C",
203
+ "O"
204
+ ],
205
+ [
206
+ "cc",
207
+ "1"
208
+ ],
209
+ [
210
+ "=",
211
+ "C"
212
+ ],
213
+ [
214
+ "c1cc",
215
+ "cc"
216
+ ],
217
+ [
218
+ "n",
219
+ "1"
220
+ ],
221
+ [
222
+ "N",
223
+ "1"
224
+ ],
225
+ [
226
+ "n",
227
+ "c"
228
+ ],
229
+ [
230
+ "c2cc",
231
+ "cc"
232
+ ],
233
+ [
234
+ "O",
235
+ "C"
236
+ ]
237
+ ]
238
  }
239
  }
tokenizer_config.json CHANGED
@@ -1,6 +1,6 @@
1
  {
2
  "added_tokens_decoder": {
3
- "33": {
4
  "content": "<pad>",
5
  "lstrip": false,
6
  "normalized": false,
@@ -8,7 +8,7 @@
8
  "single_word": false,
9
  "special": true
10
  },
11
- "34": {
12
  "content": "<s>",
13
  "lstrip": false,
14
  "normalized": false,
@@ -16,7 +16,7 @@
16
  "single_word": false,
17
  "special": true
18
  },
19
- "35": {
20
  "content": "</s>",
21
  "lstrip": false,
22
  "normalized": false,
@@ -24,7 +24,7 @@
24
  "single_word": false,
25
  "special": true
26
  },
27
- "36": {
28
  "content": "<unk>",
29
  "lstrip": false,
30
  "normalized": false,
 
1
  {
2
  "added_tokens_decoder": {
3
+ "64": {
4
  "content": "<pad>",
5
  "lstrip": false,
6
  "normalized": false,
 
8
  "single_word": false,
9
  "special": true
10
  },
11
+ "65": {
12
  "content": "<s>",
13
  "lstrip": false,
14
  "normalized": false,
 
16
  "single_word": false,
17
  "special": true
18
  },
19
+ "66": {
20
  "content": "</s>",
21
  "lstrip": false,
22
  "normalized": false,
 
24
  "single_word": false,
25
  "special": true
26
  },
27
+ "67": {
28
  "content": "<unk>",
29
  "lstrip": false,
30
  "normalized": false,