import { EsmTokenizer } from "../../../src/tokenizers.js";
import { BASE_TEST_STRINGS, ESM_TEST_STRINGS } from "../test_strings.js";
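
// Expected tokenization outputs for `EsmTokenizer`, keyed by Hugging Face model ID.
// Each case pairs an input string with the expected token ids and the decoded
// round-trip string. Commented-out "tokens" arrays record the word-level splits of
// the input; since the ESM-family vocabularies cover biological sequence alphabets
// rather than natural language, most of these pieces resolve to <unk> (id 0 for the
// nucleotide transformer, id 3 for esm2), as the `ids` and `decoded` fields show.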

export const TOKENIZER_CLASS = EsmTokenizer;
export const TEST_CONFIG = {
  "Xenova/nucleotide-transformer-500m-human-ref": {
    SIMPLE: {
      text: BASE_TEST_STRINGS.SIMPLE,
      // "tokens": ["How", "are", "you", "doing?"],
      ids: [3, 0, 0, 0, 0],
      decoded: "<cls> <unk> <unk> <unk> <unk>",
    },
    SIMPLE_WITH_PUNCTUATION: {
      text: BASE_TEST_STRINGS.SIMPLE_WITH_PUNCTUATION,
      // "tokens": ["You", "should've", "done", "this"],
      ids: [3, 0, 0, 0, 0],
      decoded: "<cls> <unk> <unk> <unk> <unk>",
    },
    NUMBERS: {
      text: BASE_TEST_STRINGS.NUMBERS,
      // "tokens": ["0123456789", "0", "1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "100", "1000"],
      ids: [3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
      decoded: "<cls> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk>",
    },
    TEXT_WITH_NUMBERS: {
      text: BASE_TEST_STRINGS.TEXT_WITH_NUMBERS,
      // "tokens": ["T", "he", "company", "was", "founded", "in", "2016."],
      ids: [3, 4101, 0, 0, 0, 0, 0, 0],
      decoded: "<cls> T <unk> <unk> <unk> <unk> <unk> <unk>",
    },
    PUNCTUATION: {
      text: BASE_TEST_STRINGS.PUNCTUATION,
      // "tokens": ["A", "'ll", "!!to?'d''d", "of,", "can't."],
      ids: [3, 4100, 0, 0, 0, 0],
      decoded: "<cls> A <unk> <unk> <unk> <unk>",
    },
    PYTHON_CODE: {
      text: BASE_TEST_STRINGS.PYTHON_CODE,
      // "tokens": ["def", "main():", "pass"],
      ids: [3, 0, 0, 0],
      decoded: "<cls> <unk> <unk> <unk>",
    },
    JAVASCRIPT_CODE: {
      text: BASE_TEST_STRINGS.JAVASCRIPT_CODE,
      // "tokens": ["let", "a", "=", "obj.toString();", "toString();"],
      ids: [3, 0, 0, 0, 0, 0],
      decoded: "<cls> <unk> <unk> <unk> <unk> <unk>",
    },
    NEWLINES: {
      text: BASE_TEST_STRINGS.NEWLINES,
      // "tokens": ["T", "his", "is", "a", "test."],
      ids: [3, 4101, 0, 0, 0, 0],
      decoded: "<cls> T <unk> <unk> <unk> <unk>",
    },
    BASIC: {
      text: BASE_TEST_STRINGS.BASIC,
      // "tokens": ["U", "N", "want\u00e9d,running"],
      ids: [3, 0, 4104, 0],
      decoded: "<cls> <unk> N <unk>",
    },
    CONTROL_TOKENS: {
      text: BASE_TEST_STRINGS.CONTROL_TOKENS,
      // "tokens": ["1\u00002\ufffd3"],
      ids: [3, 0],
      decoded: "<cls> <unk>",
    },
    HELLO_WORLD_TITLECASE: {
      text: BASE_TEST_STRINGS.HELLO_WORLD_TITLECASE,
      // "tokens": ["Hello", "World"],
      ids: [3, 0, 0],
      decoded: "<cls> <unk> <unk>",
    },
    HELLO_WORLD_LOWERCASE: {
      text: BASE_TEST_STRINGS.HELLO_WORLD_LOWERCASE,
      // "tokens": ["hello", "world"],
      ids: [3, 0, 0],
      decoded: "<cls> <unk> <unk>",
    },
    CHINESE_ONLY: {
      text: BASE_TEST_STRINGS.CHINESE_ONLY,
      // "tokens": ["\u751f\u6d3b\u7684\u771f\u8c1b\u662f"],
      ids: [3, 0],
      decoded: "<cls> <unk>",
    },
    LEADING_SPACE: {
      text: BASE_TEST_STRINGS.LEADING_SPACE,
      // "tokens": ["leading", "space"],
      ids: [3, 0, 0],
      decoded: "<cls> <unk> <unk>",
    },
    TRAILING_SPACE: {
      text: BASE_TEST_STRINGS.TRAILING_SPACE,
      // "tokens": ["trailing", "space"],
      ids: [3, 0, 0],
      decoded: "<cls> <unk> <unk>",
    },
    DOUBLE_SPACE: {
      text: BASE_TEST_STRINGS.DOUBLE_SPACE,
      // "tokens": ["Hi", "Hello"],
      ids: [3, 0, 0],
      decoded: "<cls> <unk> <unk>",
    },
    CURRENCY: {
      text: BASE_TEST_STRINGS.CURRENCY,
      // "tokens": ["test", "$1", "R2", "#3", "\u20ac4", "\u00a35", "\u00a56", "\u20a37", "\u20b98", "\u20b19", "test"],
      ids: [3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
      decoded: "<cls> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk>",
    },
    CURRENCY_WITH_DECIMALS: {
      text: BASE_TEST_STRINGS.CURRENCY_WITH_DECIMALS,
      // "tokens": ["I", "bought", "an", "apple", "for", "$1.00", "at", "the", "store."],
      ids: [3, 0, 0, 0, 0, 0, 0, 0, 0, 0],
      decoded: "<cls> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk>",
    },
    ELLIPSIS: {
      text: BASE_TEST_STRINGS.ELLIPSIS,
      // "tokens": ["you\u2026"],
      ids: [3, 0],
      decoded: "<cls> <unk>",
    },
    TEXT_WITH_ESCAPE_CHARACTERS: {
      text: BASE_TEST_STRINGS.TEXT_WITH_ESCAPE_CHARACTERS,
      // "tokens": ["you\u2026"],
      ids: [3, 0],
      decoded: "<cls> <unk>",
    },
    TEXT_WITH_ESCAPE_CHARACTERS_2: {
      text: BASE_TEST_STRINGS.TEXT_WITH_ESCAPE_CHARACTERS_2,
      // "tokens": ["you\u2026", "you\u2026"],
      ids: [3, 0, 0],
      decoded: "<cls> <unk> <unk>",
    },
    TILDE_NORMALIZATION: {
      text: BASE_TEST_STRINGS.TILDE_NORMALIZATION,
      // "tokens": ["weird", "\uff5e", "edge", "\uff5e", "case"],
      ids: [3, 0, 0, 0, 0, 0],
      decoded: "<cls> <unk> <unk> <unk> <unk> <unk>",
    },
    SPIECE_UNDERSCORE: {
      text: BASE_TEST_STRINGS.SPIECE_UNDERSCORE,
      // "tokens": ["\u2581", "T", "his", "\u2581is", "\u2581a", "\u2581test", "\u2581."],
      ids: [3, 0, 4101, 0, 0, 0, 0, 0],
      decoded: "<cls> <unk> T <unk> <unk> <unk> <unk> <unk>",
    },
    SPECIAL_TOKENS: {
      text: ESM_TEST_STRINGS.SPECIAL_TOKENS,
      tokens: ["<unk>", "<pad>", "<mask>", "<cls>", "<eos>", "<bos>"],
      ids: [3, 0, 1, 2, 3, 4105, 4106],
      decoded: "<cls> <unk> <pad> <mask> <cls> <eos> <bos>",
    },
    PROTEIN_SEQUENCES_1: {
      text: ESM_TEST_STRINGS.PROTEIN_SEQUENCES_1,
      tokens: ["ATTCCG", "ATTCCG", "ATTCCG"],
      ids: [3, 367, 367, 367],
      decoded: "<cls> ATTCCG ATTCCG ATTCCG",
    },
    PROTEIN_SEQUENCES_2: {
      text: ESM_TEST_STRINGS.PROTEIN_SEQUENCES_2,
      tokens: ["ATTTCT", "CTCTCT", "CTCTGA", "GATCGA", "TCGATC", "G", "A", "T"],
      ids: [3, 349, 2461, 2464, 3184, 1738, 4103, 4100, 4101],
      decoded: "<cls> ATTTCT CTCTCT CTCTGA GATCGA TCGATC G A T",
    },
  },
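  // Unlike the nucleotide transformer above, which only prepends <cls> (id 3),
  // the esm2 tokenizer prepends <cls> (id 0) and appends <eos> (id 2), so each
  // expected `ids` array below is two tokens longer than its token list.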
  "Xenova/esm2_t12_35M_UR50D": {
    SIMPLE: {
      text: BASE_TEST_STRINGS.SIMPLE,
      // "tokens": ["H", "ow", "are", "you", "doing?"],
      ids: [0, 21, 3, 3, 3, 3, 2],
      decoded: "<cls> H <unk> <unk> <unk> <unk> <eos>",
    },
    SIMPLE_WITH_PUNCTUATION: {
      text: BASE_TEST_STRINGS.SIMPLE_WITH_PUNCTUATION,
      // "tokens": ["Y", "ou", "should've", "done", "this"],
      ids: [0, 19, 3, 3, 3, 3, 2],
      decoded: "<cls> Y <unk> <unk> <unk> <unk> <eos>",
    },
    NUMBERS: {
      text: BASE_TEST_STRINGS.NUMBERS,
      // "tokens": ["0123456789", "0", "1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "100", "1000"],
      ids: [0, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 2],
      decoded: "<cls> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <eos>",
    },
    TEXT_WITH_NUMBERS: {
      text: BASE_TEST_STRINGS.TEXT_WITH_NUMBERS,
      // "tokens": ["T", "he", "company", "was", "founded", "in", "2016", "."],
      ids: [0, 11, 3, 3, 3, 3, 3, 3, 29, 2],
      decoded: "<cls> T <unk> <unk> <unk> <unk> <unk> <unk>. <eos>",
    },
    PUNCTUATION: {
      text: BASE_TEST_STRINGS.PUNCTUATION,
      // "tokens": ["A", "'ll", "!!to?'d''d", "of,", "can't", "."],
      ids: [0, 5, 3, 3, 3, 3, 29, 2],
      decoded: "<cls> A <unk> <unk> <unk> <unk>. <eos>",
    },
    PYTHON_CODE: {
      text: BASE_TEST_STRINGS.PYTHON_CODE,
      // "tokens": ["def", "main():", "pass"],
      ids: [0, 3, 3, 3, 2],
      decoded: "<cls> <unk> <unk> <unk> <eos>",
    },
    JAVASCRIPT_CODE: {
      text: BASE_TEST_STRINGS.JAVASCRIPT_CODE,
      // "tokens": ["let", "a", "=", "obj", ".", "to", "S", "tring();", "to", "S", "tring();"],
      ids: [0, 3, 3, 3, 3, 29, 3, 8, 3, 3, 8, 3, 2],
      decoded: "<cls> <unk> <unk> <unk> <unk>. <unk> S <unk> <unk> S <unk> <eos>",
    },
    NEWLINES: {
      text: BASE_TEST_STRINGS.NEWLINES,
      // "tokens": ["T", "his", "is", "a", "test", "."],
      ids: [0, 11, 3, 3, 3, 3, 29, 2],
      decoded: "<cls> T <unk> <unk> <unk> <unk>. <eos>",
    },
    BASIC: {
      text: BASE_TEST_STRINGS.BASIC,
      // "tokens": ["U", "N", "want\u00e9d,running"],
      ids: [0, 26, 17, 3, 2],
      decoded: "<cls> U N <unk> <eos>",
    },
    CONTROL_TOKENS: {
      text: BASE_TEST_STRINGS.CONTROL_TOKENS,
      // "tokens": ["1\u00002\ufffd3"],
      ids: [0, 3, 2],
      decoded: "<cls> <unk> <eos>",
    },
    HELLO_WORLD_TITLECASE: {
      text: BASE_TEST_STRINGS.HELLO_WORLD_TITLECASE,
      // "tokens": ["H", "ello", "W", "orld"],
      ids: [0, 21, 3, 22, 3, 2],
      decoded: "<cls> H <unk> W <unk> <eos>",
    },
    HELLO_WORLD_LOWERCASE: {
      text: BASE_TEST_STRINGS.HELLO_WORLD_LOWERCASE,
      // "tokens": ["hello", "world"],
      ids: [0, 3, 3, 2],
      decoded: "<cls> <unk> <unk> <eos>",
    },
    CHINESE_ONLY: {
      text: BASE_TEST_STRINGS.CHINESE_ONLY,
      // "tokens": ["\u751f\u6d3b\u7684\u771f\u8c1b\u662f"],
      ids: [0, 3, 2],
      decoded: "<cls> <unk> <eos>",
    },
    LEADING_SPACE: {
      text: BASE_TEST_STRINGS.LEADING_SPACE,
      // "tokens": ["leading", "space"],
      ids: [0, 3, 3, 2],
      decoded: "<cls> <unk> <unk> <eos>",
    },
    TRAILING_SPACE: {
      text: BASE_TEST_STRINGS.TRAILING_SPACE,
      // "tokens": ["trailing", "space"],
      ids: [0, 3, 3, 2],
      decoded: "<cls> <unk> <unk> <eos>",
    },
    DOUBLE_SPACE: {
      text: BASE_TEST_STRINGS.DOUBLE_SPACE,
      // "tokens": ["H", "i", "H", "ello"],
      ids: [0, 21, 3, 21, 3, 2],
      decoded: "<cls> H <unk> H <unk> <eos>",
    },
    CURRENCY: {
      text: BASE_TEST_STRINGS.CURRENCY,
      // "tokens": ["test", "$1", "R", "2", "#3", "\u20ac4", "\u00a35", "\u00a56", "\u20a37", "\u20b98", "\u20b19", "test"],
      ids: [0, 3, 3, 10, 3, 3, 3, 3, 3, 3, 3, 3, 3, 2],
      decoded: "<cls> <unk> <unk> R <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <eos>",
    },
    CURRENCY_WITH_DECIMALS: {
      text: BASE_TEST_STRINGS.CURRENCY_WITH_DECIMALS,
      // "tokens": ["I", "bought", "an", "apple", "for", "$1", ".", "00", "at", "the", "store", "."],
      ids: [0, 12, 3, 3, 3, 3, 3, 29, 3, 3, 3, 3, 29, 2],
      decoded: "<cls> I <unk> <unk> <unk> <unk> <unk>. <unk> <unk> <unk> <unk>. <eos>",
    },
    ELLIPSIS: {
      text: BASE_TEST_STRINGS.ELLIPSIS,
      // "tokens": ["you\u2026"],
      ids: [0, 3, 2],
      decoded: "<cls> <unk> <eos>",
    },
    TEXT_WITH_ESCAPE_CHARACTERS: {
      text: BASE_TEST_STRINGS.TEXT_WITH_ESCAPE_CHARACTERS,
      // "tokens": ["you\u2026"],
      ids: [0, 3, 2],
      decoded: "<cls> <unk> <eos>",
    },
    TEXT_WITH_ESCAPE_CHARACTERS_2: {
      text: BASE_TEST_STRINGS.TEXT_WITH_ESCAPE_CHARACTERS_2,
      // "tokens": ["you\u2026", "you\u2026"],
      ids: [0, 3, 3, 2],
      decoded: "<cls> <unk> <unk> <eos>",
    },
    TILDE_NORMALIZATION: {
      text: BASE_TEST_STRINGS.TILDE_NORMALIZATION,
      // "tokens": ["weird", "\uff5e", "edge", "\uff5e", "case"],
      ids: [0, 3, 3, 3, 3, 3, 2],
      decoded: "<cls> <unk> <unk> <unk> <unk> <unk> <eos>",
    },
    SPIECE_UNDERSCORE: {
      text: BASE_TEST_STRINGS.SPIECE_UNDERSCORE,
      // "tokens": ["\u2581", "T", "his", "\u2581is", "\u2581a", "\u2581test", "\u2581", "."],
      ids: [0, 3, 11, 3, 3, 3, 3, 3, 29, 2],
      decoded: "<cls> <unk> T <unk> <unk> <unk> <unk> <unk>. <eos>",
    },
    SPECIAL_TOKENS: {
      text: ESM_TEST_STRINGS.SPECIAL_TOKENS,
      // "tokens": ["<unk>", "<pad>", "<mask>", "<cls>", "<eos>", "<bos>"],
      ids: [0, 3, 1, 32, 0, 2, 3, 2],
      decoded: "<cls> <unk> <pad> <mask> <cls> <eos> <unk> <eos>",
    },
    PROTEIN_SEQUENCES_1: {
      text: ESM_TEST_STRINGS.PROTEIN_SEQUENCES_1,
      tokens: ["A", "T", "T", "C", "C", "G", "A", "T", "T", "C", "C", "G", "A", "T", "T", "C", "C", "G"],
      ids: [0, 5, 11, 11, 23, 23, 6, 5, 11, 11, 23, 23, 6, 5, 11, 11, 23, 23, 6, 2],
      decoded: "<cls> A T T C C G A T T C C G A T T C C G <eos>",
    },
    PROTEIN_SEQUENCES_2: {
      text: ESM_TEST_STRINGS.PROTEIN_SEQUENCES_2,
      tokens: ["A", "T", "T", "T", "C", "T", "C", "T", "C", "T", "C", "T", "C", "T", "C", "T", "G", "A", "G", "A", "T", "C", "G", "A", "T", "C", "G", "A", "T", "C", "G", "A", "T"],
      ids: [0, 5, 11, 11, 11, 23, 11, 23, 11, 23, 11, 23, 11, 23, 11, 23, 11, 6, 5, 6, 5, 11, 23, 6, 5, 11, 23, 6, 5, 11, 23, 6, 5, 11, 2],
      decoded: "<cls> A T T T C T C T C T C T C T C T G A G A T C G A T C G A T C G A T <eos>",
    },
  },
};
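
// Minimal usage sketch (hypothetical wiring; the shared harness that actually
// consumes these fixtures lives elsewhere in the test suite):
//
//   const tokenizer = await TOKENIZER_CLASS.from_pretrained(model_id);
//   for (const [name, { text, ids, decoded }] of Object.entries(TEST_CONFIG[model_id])) {
//     const { input_ids } = tokenizer(text);            // encode the test string
//     // compare input_ids against `ids`, and tokenizer.decode(ids) against `decoded`
//   }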