supanthadey1 committed on
Commit
fdbd126
·
verified ·
1 Parent(s): b64c0a4

Harden ambiguity token map for WURCS uncertainty markers

Browse files
Files changed (3) hide show
  1. README.md +1 -1
  2. SHA256SUMS +2 -2
  3. vocab/bpe_ambiguity_tokens.json +853 -424
README.md CHANGED
@@ -20,7 +20,7 @@ This repository contains the contrastive Bertose checkpoint used to score ambigu
20
 
21
  - `checkpoints/best_v51_contrastive_model.pt` - contrastive ambiguity-resolution checkpoint.
22
  - `vocab/bpe_vocabulary.json` - WURCS BPE vocabulary.
23
- - `vocab/bpe_ambiguity_tokens.json` - ambiguous BPE token map used by the resolver.
24
  - `src/multimodal_glycan_bert_v3.py` - model definition.
25
  - `src/glycan_bert.py` - base BERT layers used by the multimodal model.
26
  - `src/wurcs_bpe_tokenizer.py` - WURCS BPE tokenizer.
 
20
 
21
  - `checkpoints/best_v51_contrastive_model.pt` - contrastive ambiguity-resolution checkpoint.
22
  - `vocab/bpe_vocabulary.json` - WURCS BPE vocabulary.
23
+ - `vocab/bpe_ambiguity_tokens.json` - ambiguous BPE token map used by the resolver, including common WURCS uncertainty-marker tokens.
24
  - `src/multimodal_glycan_bert_v3.py` - model definition.
25
  - `src/glycan_bert.py` - base BERT layers used by the multimodal model.
26
  - `src/wurcs_bpe_tokenizer.py` - WURCS BPE tokenizer.
SHA256SUMS CHANGED
@@ -20,7 +20,7 @@ eb200fe67e613751c0571950e9a7f22f9f44fde0f85b73a40d392189a203f465 ./.cache/huggi
20
  e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855 ./.cache/huggingface/upload/vocab/bpe_vocabulary.json.lock
21
  c00560217b399adfb341aacc38053299c7d4b33b4229e89e68275cd454bb7f5b ./.cache/huggingface/upload/vocab/bpe_vocabulary.json.metadata
22
  622368f62c23e97e9137c277eaadcc93ee3901cbb420b591422bb1c2e19689a5 ./.gitattributes
23
- 21912ebe4c2b720eac3164c3628f37a39d6c918221c84e04b76a914fd709752d ./README.md
24
  ae468f4e8c06dc0c3848138a474dc43249aa6d14dfd0df8f58d68fcaad371152 ./checkpoints/best_v51_contrastive_model.pt
25
  daf55c190fece0678064e41697a9545592beb1285f8aa74e595b933b9d37b4c2 ./config.json
26
  6a56e6f73b8f874470ecde6e538f3f5029ae23aa6c10559817d1c2a8b59b7c0f ./requirements.txt
@@ -30,5 +30,5 @@ daf55c190fece0678064e41697a9545592beb1285f8aa74e595b933b9d37b4c2 ./config.json
30
  b69f14c9976951325e3a0a4e8107a16126e67d410e966650f513f1f538a732bb ./src/glycan_bert.py
31
  0d9ce16bf90242f38621d64cd974ea5679bff4c2013bea8d7bffe1b8dd120794 ./src/multimodal_glycan_bert_v3.py
32
  0bc54399362945601bcfd403441fc80968d173200dd0561f57568b2053a94839 ./src/wurcs_bpe_tokenizer.py
33
- c68cd003370b2dcdb162f848f766e4e62f2653c6c38d205f8cbe53a9aabe2d74 ./vocab/bpe_ambiguity_tokens.json
34
  6a572afdf53f1494ab96c896876b824ca7ea749777352606aa9f96bf270ceecc ./vocab/bpe_vocabulary.json
 
20
  e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855 ./.cache/huggingface/upload/vocab/bpe_vocabulary.json.lock
21
  c00560217b399adfb341aacc38053299c7d4b33b4229e89e68275cd454bb7f5b ./.cache/huggingface/upload/vocab/bpe_vocabulary.json.metadata
22
  622368f62c23e97e9137c277eaadcc93ee3901cbb420b591422bb1c2e19689a5 ./.gitattributes
23
+ 4d7dc814c23492bd75c154e35436fff108221285956106deeb117e561a6c560b ./README.md
24
  ae468f4e8c06dc0c3848138a474dc43249aa6d14dfd0df8f58d68fcaad371152 ./checkpoints/best_v51_contrastive_model.pt
25
  daf55c190fece0678064e41697a9545592beb1285f8aa74e595b933b9d37b4c2 ./config.json
26
  6a56e6f73b8f874470ecde6e538f3f5029ae23aa6c10559817d1c2a8b59b7c0f ./requirements.txt
 
30
  b69f14c9976951325e3a0a4e8107a16126e67d410e966650f513f1f538a732bb ./src/glycan_bert.py
31
  0d9ce16bf90242f38621d64cd974ea5679bff4c2013bea8d7bffe1b8dd120794 ./src/multimodal_glycan_bert_v3.py
32
  0bc54399362945601bcfd403441fc80968d173200dd0561f57568b2053a94839 ./src/wurcs_bpe_tokenizer.py
33
+ ae6ab1ee4f2be992099ee5766de073954c74ccb005c490179cc70418c587c5b7 ./vocab/bpe_ambiguity_tokens.json
34
  6a572afdf53f1494ab96c896876b824ca7ea749777352606aa9f96bf270ceecc ./vocab/bpe_vocabulary.json
vocab/bpe_ambiguity_tokens.json CHANGED
@@ -1,7 +1,10 @@
1
  {
2
  "ambiguous_tokens": {
3
  "?": 32,
 
 
4
  "?|": 90,
 
5
  "a?|": 108,
6
  "a?|b": 109,
7
  "?|c": 110,
@@ -13,354 +16,568 @@
13
  "?|f": 116,
14
  "a?|b?|c?|d?|e?|f": 117,
15
  "?-": 118,
16
- "?|g": 119,
17
- "a?|b?|c?|d?|e?|f?|g": 120,
18
  "?|h": 122,
19
  "?|i": 123,
20
  "?|h?|i": 124,
21
- "?|j": 125,
22
- "?|h?|i?|j": 126,
23
  "?|k": 128,
24
  "?|h?|i?|j?|k": 129,
25
  "?|l": 130,
26
  "?|h?|i?|j?|k?|l": 131,
27
- "?|m": 132,
28
- "?|h?|i?|j?|k?|l?|m": 133,
29
  "?|h?|i?|j?|k?|l?|m?|": 138,
30
  "n?|": 141,
31
  "n?|o": 142,
32
  "?}": 143,
33
- "n?|o?|": 146,
34
- "n?|o?|p": 147,
35
  "?}-": 149,
36
  "?}-{": 150,
37
- "n?|o?|p?|": 153,
38
- "n?|o?|p?|q": 154,
 
39
  "n?|o?|p?|q?|": 157,
40
  "n?|o?|p?|q?|r": 158,
 
 
41
  "n?|o?|p?|q?|r?|": 165,
42
  "n?|o?|p?|q?|r?|s": 166,
43
  "n?|o?|p?|q?|r?|s?|": 170,
44
  "n?|o?|p?|q?|r?|s?|t": 171,
45
- "?|u": 189,
46
- "a?-": 197,
 
 
 
 
 
 
 
47
  "c?-": 201,
48
- "?|u?|": 209,
49
- "?|u?|v": 210,
50
- "b?-": 211,
 
51
  "a?-b1": 213,
52
  "d?-": 217,
 
53
  "b?-c1": 221,
54
- "c?-d1": 230,
55
- "?|u?|v?|": 231,
56
- "?|u?|v?|w": 232,
 
 
 
57
  "1-?": 242,
58
  "d?-e1": 244,
59
- "e?-": 245,
 
60
  "e?-f1": 262,
61
  "?|u?|v?|w?|": 266,
62
  "?|u?|v?|w?|x": 267,
63
- "f?-": 273,
 
 
64
  "?|u?|v?|w?|x?|": 288,
65
  "?|u?|v?|w?|x?|y": 289,
66
- "g?-": 298,
67
- "n?|o?|p?|q?|r?|s?|t?": 304,
 
68
  "i?-": 306,
 
69
  "h?-": 308,
70
- "?|u?}-{": 312,
71
- "?|u?": 313,
72
- "n?|o?|p?|q?|r?}-{": 314,
73
- "n?|o?|p?|q?|r?": 315,
74
- "f?-g1": 318,
75
  "n?|o?}-{": 322,
76
  "n?|o?": 323,
77
  "?|u?|v?|w?|x?|y?|": 325,
78
  "?|u?|v?|w?|x?|y?|z": 326,
79
- "n?|o?|p?|q?|r?|s?}-{": 328,
80
- "n?|o?|p?|q?|r?|s?": 329,
81
- "n?|o?|p?}-{": 331,
82
- "n?|o?|p?": 332,
 
83
  "n?|o?|p?|q?}-{": 336,
84
  "n?|o?|p?|q?": 337,
85
  "g?-h1": 339,
86
- "?|h?|i?|j?|k?|l?}-{": 342,
87
- "?|h?|i?|j?|k?|l?": 343,
88
- "n?}-{": 344,
89
- "n?": 345,
90
- "j?-": 346,
91
  "h?-i1": 347,
92
  "?|u?|v?}-{": 351,
93
  "?|u?|v?": 352,
94
  "k?-": 353,
95
- "i?-j1": 355,
96
- "?|h?|i?|j?|k?|l?|m?": 363,
97
- "?|h?|i?|j?|k?}-{": 364,
98
- "?|h?|i?|j?|k?": 365,
99
- "?|u?|v?|w?|x?|y?|z?|": 369,
 
 
100
  "?|h?|i?|j?}-{": 375,
101
  "?|h?|i?|j?": 376,
102
- "l?-": 377,
103
- "A?|": 392,
104
- "A?|B": 393,
 
 
 
 
105
  "j?-k1": 401,
106
- "m?-": 404,
107
  "?|h?|i?}-{": 408,
108
  "?|h?|i?": 409,
109
- "k?-l1": 418,
110
- "?|h?}-{": 420,
111
- "?|h?": 421,
112
- "A?|B?|": 424,
113
- "A?|B?|C": 425,
114
- "a?|b?|c?|d?|e?|f?|g?": 427,
115
- "f?-g2": 431,
116
- "a?|b?|c?|d?|e?|f?}-{": 437,
117
- "a?|b?|c?|d?|e?|f?": 438,
118
- "l?-m1": 442,
119
- "?|u?|v?|w?}-{": 450,
120
- "?|u?|v?|w?": 451,
 
 
 
 
 
 
 
121
  "A?|B?|C?|": 464,
122
  "A?|B?|C?|D": 465,
123
- "a?|b?|c?|d?|e?}-{": 475,
124
- "a?|b?|c?|d?|e?": 476,
125
- "n?-": 499,
 
126
  "a?|b?|c?|d?}-{": 502,
127
  "a?|b?|c?|d?": 503,
128
- "m?-n1": 518,
129
- "A?|B?|C?|D?|": 521,
130
- "A?|B?|C?|D?|E": 522,
131
- "o?-": 534,
132
- "d?-h1": 536,
133
- "A?|B?|C?|D?|E?|": 542,
134
- "A?|B?|C?|D?|E?|F": 543,
135
- "c?-i1": 544,
136
- "c?-h1": 549,
 
 
 
 
 
 
137
  "A?|B?|C?|D?|E?|F?|": 550,
138
  "A?|B?|C?|D?|E?|F?|G": 551,
139
- "?|u?|v?|w?|x?}-{": 563,
140
- "?|u?|v?|w?|x?": 564,
141
  "a?|b?|c?}-{": 571,
142
  "a?|b?|c?}-{a?|b?|c": 572,
143
  "a?|b?|c?}-{a?|b?|c?": 573,
144
- "?|H": 581,
145
- "2-?": 592,
 
 
 
 
146
  "?|H?|": 598,
147
  "?|H?|I": 599,
148
  "?}*OC": 600,
149
- "c?-k1": 607,
 
150
  "c?-g1": 609,
151
- "?|H?|I?|": 615,
152
- "?|H?|I?|J": 616,
153
- "n?-o1": 617,
154
- "d?-g1": 629,
155
- "o?-p1": 634,
 
 
 
156
  "p?-": 646,
157
- "?|u?|v?|w?|x?|y?}-{": 653,
158
- "?|u?|v?|w?|x?|y?": 654,
159
- "b?-c2": 656,
160
- "d?-i1": 658,
161
- "c?-j1": 691,
162
- "?}*OSO": 696,
163
- "e?-h1": 701,
164
- "q?-": 713,
165
- "c?-f1": 720,
166
- "i?-j2": 728,
167
- "?|h?|i?}": 742,
168
- "h?-i2": 747,
169
- "g?-h2": 753,
170
- "c?-l1": 756,
171
- "j?-k2": 758,
172
- "?|h?}": 759,
173
- "c?-e1": 760,
 
 
 
 
 
 
 
 
 
 
 
174
  "?|H?|I?|J?|": 761,
175
  "?|H?|I?|J?|K": 762,
176
- "a?|b?|c?|d?|e?|f?}": 772,
177
- "b?-e1": 774,
 
 
178
  "b?-f1": 791,
179
- "d?-f1": 794,
180
- "p?-q1": 796,
181
- "a?|b?|c?|d?|e?}": 798,
182
- "a?-d1": 800,
183
- "m?-n2": 803,
184
- "e?-g1": 809,
185
- "?|h?|i?|j?}": 812,
186
- "r?-": 817,
187
- "a?-c1": 818,
188
- "?|u?|v?|w?|x?|y?|z?": 822,
189
- "a?-e1": 826,
190
- "d?-j1": 833,
191
- "b?-g1": 834,
192
- "q?-r1": 847,
193
- "d?-e2": 854,
194
- "c?-m1": 860,
 
 
 
 
 
 
 
 
 
195
  "a?-f1": 875,
196
- "b?-d1": 887,
197
- "?|H?|I?|J?|K?|": 892,
198
- "?|H?|I?|J?|K?|L": 893,
199
- "?|H?|I?|J?|K?|L?|": 894,
200
- "?|H?|I?|J?|K?|L?|M": 895,
201
- "?|H?|I?|J?|K?|L?|M?|": 896,
202
- "a?-l1": 920,
203
- "?*OSO/3=O/3=O": 923,
204
- "k?-l2": 940,
205
- "k?-o1": 942,
206
- "N?|": 965,
207
- "N?|O": 966,
208
- "N?|O?|": 967,
209
- "N?|O?|P": 968,
210
- "N?|O?|P?|": 969,
211
- "N?|O?|P?|Q": 970,
212
- "N?|O?|P?|Q?|": 971,
213
- "N?|O?|P?|Q?|R": 972,
214
- "N?|O?|P?|Q?|R?|": 973,
215
- "N?|O?|P?|Q?|R?|S": 974,
216
- "N?|O?|P?|Q?|R?|S?|": 975,
217
- "N?|O?|P?|Q?|R?|S?|T": 976,
218
- "?|U": 977,
219
- "?|U?|": 978,
220
- "?|U?|V": 979,
221
- "c?-d2": 983,
222
- "r?-s1": 988,
223
- "a?|b?}-{": 995,
224
- "a?|b?}-{a?|b": 996,
225
- "a?|b?}-{a?|b?": 997,
226
- "e?-f2": 1001,
227
- "g?-i1": 1006,
228
- "i?-l1": 1010,
229
- "s?-": 1011,
230
- "?|h?|i?|j?|k?}": 1017,
231
- "b?-h1": 1034,
232
- "a?-j1": 1038,
233
- "n?-o2": 1046,
234
- "a?-b2": 1069,
235
- "e?-i1": 1095,
236
- "h?-j1": 1102,
237
- "a?-k1": 1108,
238
- "i?-k1": 1115,
239
- "a?-g1": 1116,
240
- "?}*OPO": 1122,
241
- "d?-k1": 1129,
242
- "a?-m1": 1151,
243
- "a?-i1": 1159,
244
- "A?}-{": 1174,
245
- "A?": 1175,
246
- "?}*OCC": 1177,
247
- "l?-m2": 1179,
248
- "A?|B?}-{": 1180,
249
- "A?|B?": 1181,
250
- "f?-h1": 1183,
251
- "a?-n1": 1189,
252
- "p?-q2": 1192,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
253
  "c?-n1": 1197,
254
- "?|U?|V?|": 1202,
255
- "?|U?|V?|W": 1203,
256
- "?|U?|V?|W?|": 1204,
257
- "?|U?|V?|W?|X": 1205,
258
- "?|U?|V?|W?|X?|": 1206,
259
- "?|U?|V?|W?|X?|Y": 1207,
260
- "?|a": 1208,
 
 
 
 
261
  "s?-t1": 1223,
262
- "?|h?|i?|j?|k?|l?|m?}": 1228,
263
- "g?-j1": 1234,
264
- "A?|B?|C?|D?}-{": 1242,
265
- "A?|B?|C?|D?": 1243,
266
- "a?-h1": 1253,
267
- "?|H?|I?|J?}-{": 1257,
268
- "?|H?|I?|J?": 1258,
269
- "o?-p2": 1261,
270
- "b?-i1": 1273,
271
- "?|h?|i?|j?|k?|l?}": 1309,
272
- "j?-m1": 1317,
273
- "c?-o1": 1318,
274
- "a?-o1": 1330,
275
- "a?|b?|c?}*OC": 1331,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
276
  "b?-j1": 1357,
277
  "a?-r1": 1361,
278
  "n?}": 1363,
 
279
  "A?|B?|C?}-{": 1371,
280
  "A?|B?|C?": 1372,
281
- "m?-p1": 1375,
282
- "l?-p1": 1383,
283
- "a?-p1": 1444,
284
- "k?-n1": 1446,
285
- "j?-l1": 1470,
286
- "?|U?|V?|W?|X?|Y?|": 1471,
287
- "?|U?|V?|W?|X?|Y?|Z": 1472,
288
- "?|aa?|": 1473,
289
- "?|aa?|a": 1474,
290
- "?|aa?|ab": 1475,
291
- "?*OPO/3O/3=O": 1476,
292
- "l?-q1": 1489,
293
- "l?-n1": 1499,
294
- "a?-s1": 1517,
295
- "k?-m1": 1524,
296
- "a?-q1": 1546,
297
- "c?-q1": 1547,
298
- "t?-": 1551,
299
- "a?|b?|c?|d?}*OC": 1565,
300
- "f?-i1": 1590,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
301
  "c?-p1": 1591,
302
  "n?-q1": 1593,
303
- "?|i?}": 1611,
304
- "a?|b?|c?|d?|e?}*OC": 1612,
305
- "m?-q1": 1617,
306
- "q?-r2": 1623,
307
- "l?-o1": 1624,
308
- "m?-r1": 1628,
309
- "a?-t1": 1630,
 
 
 
 
 
 
 
310
  "a?|b?|c?|d?}*OSO": 1649,
311
- "c?-r1": 1675,
312
- "1-d?|i?}": 1683,
313
- "j?-n1": 1691,
314
- "u?-": 1694,
 
 
 
 
 
 
 
315
  "a?|b?|c?|d?|e?}*OSO": 1718,
316
- "?*OCC/3=O": 1723,
317
- "?%": 1752,
318
- "?*OP^XOCCN/3O/3=O": 1770,
319
- "t?-u1": 1772,
320
- "?*": 1774,
321
- "c?-s1": 1775,
322
- "a?-u1": 1793,
323
- "f?-h2": 1808,
324
- "e?-j1": 1811,
325
- "c?-t1": 1818,
326
- "f1-a?|b?|c?|d?|e?}": 1822,
327
- "u?-v1": 1835,
328
- "h?-k1": 1841,
329
- "?|H?|I?|J?|K?}-{": 1846,
330
- "?|H?|I?|J?|K?": 1847,
331
- "n?|o?}": 1851,
332
- "1-d?|h?}": 1852,
333
- "q?-s1": 1872,
334
- "%?%": 1880,
335
- "b?-g2": 1881,
336
- "r?-s2": 1882,
337
- "d?-l1": 1898,
338
- "v?-": 1917,
339
- "b?-k1": 1927,
340
- "?|aa?|ab?|a": 1942,
341
- "?|aa?|ab?|ac": 1943,
342
- "?|aa?|ab?|ac?|": 1944,
343
- "?|aa?|ab?|ac?|ad": 1945,
344
- "a?|b?}*OC": 1949,
345
- "?*OC": 1952,
346
- "e?-k1": 1955,
347
- "a?-d2": 1999,
348
- "s?-t2": 2013,
349
- "a?-f2": 2027,
350
- "o?-q1": 2030,
351
- "?}*OP^XOCCN": 2040,
352
- "a?|b?|c?}*OCC": 2047,
353
- "m?-o1": 2048,
354
- "c?-f2": 2058,
355
- "A?|B?|C?|D?|E?|F?|G?": 2060,
356
- "a?|b?|c?}*OSO": 2071,
357
- "?|U?|V?}-{": 2079,
358
- "?|U?|V?": 2080,
359
- "c?-u1": 2087
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
360
  },
361
  "ambiguous_ids": [
362
  32,
 
 
363
  90,
 
364
  108,
365
  109,
366
  110,
@@ -372,73 +589,94 @@
372
  116,
373
  117,
374
  118,
375
- 119,
376
  120,
 
377
  122,
378
  123,
379
  124,
380
- 125,
381
  126,
 
382
  128,
383
  129,
384
  130,
385
  131,
386
- 132,
387
  133,
 
388
  138,
389
  141,
390
  142,
391
  143,
392
- 146,
393
  147,
 
394
  149,
395
  150,
396
  153,
397
  154,
 
398
  157,
399
  158,
 
 
400
  165,
401
  166,
402
  170,
403
  171,
 
 
 
 
404
  189,
 
 
405
  197,
 
406
  201,
 
407
  209,
408
  210,
409
  211,
410
  213,
411
  217,
 
412
  221,
 
413
  230,
414
- 231,
415
- 232,
 
 
416
  242,
417
  244,
418
- 245,
 
419
  262,
420
  266,
421
  267,
422
- 273,
 
 
423
  288,
424
  289,
425
- 298,
426
- 304,
 
427
  306,
 
428
  308,
429
- 312,
430
  313,
431
  314,
432
  315,
433
- 318,
 
434
  322,
435
  323,
436
  325,
437
  326,
438
- 328,
439
  329,
440
- 331,
441
  332,
 
 
442
  336,
443
  337,
444
  339,
@@ -451,123 +689,181 @@
451
  351,
452
  352,
453
  353,
 
454
  355,
455
- 363,
456
  364,
457
- 365,
458
- 369,
 
 
459
  375,
460
  376,
461
- 377,
462
- 392,
 
463
  393,
 
 
 
464
  401,
465
- 404,
466
  408,
467
  409,
468
- 418,
469
- 420,
470
  421,
471
- 424,
472
  425,
473
- 427,
 
 
474
  431,
475
- 437,
476
  438,
477
- 442,
478
- 450,
 
 
 
 
479
  451,
 
 
480
  464,
481
  465,
482
- 475,
483
  476,
484
- 499,
 
 
485
  502,
486
  503,
487
- 518,
488
- 521,
489
- 522,
490
- 534,
 
 
 
 
 
491
  536,
492
- 542,
493
  543,
494
  544,
 
 
495
  549,
496
  550,
497
  551,
498
- 563,
499
- 564,
500
  571,
501
  572,
502
  573,
503
- 581,
504
- 592,
 
 
 
 
505
  598,
506
  599,
507
  600,
508
- 607,
 
509
  609,
510
- 615,
511
- 616,
512
  617,
513
- 629,
514
- 634,
 
 
 
 
 
515
  646,
516
- 653,
517
- 654,
518
  656,
 
519
  658,
520
- 691,
 
 
 
 
521
  696,
522
- 701,
 
523
  713,
 
 
 
524
  720,
525
- 728,
 
526
  742,
527
- 747,
528
- 753,
 
 
529
  756,
 
530
  758,
531
- 759,
532
  760,
533
  761,
534
  762,
535
- 772,
536
- 774,
 
 
537
  791,
538
- 794,
539
- 796,
540
- 798,
541
  800,
 
 
542
  803,
543
- 809,
544
  812,
545
- 817,
546
  818,
547
- 822,
548
- 826,
549
- 833,
550
- 834,
551
- 847,
552
- 854,
553
- 860,
 
 
 
 
 
 
 
 
554
  875,
 
 
555
  887,
556
- 892,
557
- 893,
558
  894,
559
  895,
560
  896,
561
- 920,
 
 
 
 
562
  923,
563
- 940,
564
- 942,
565
- 965,
566
- 966,
 
 
 
 
 
 
 
567
  967,
568
- 968,
569
- 969,
570
- 970,
571
  971,
572
  972,
573
  973,
@@ -577,145 +873,278 @@
577
  977,
578
  978,
579
  979,
 
 
 
580
  983,
581
- 988,
582
- 995,
 
 
583
  996,
584
  997,
 
 
585
  1001,
586
- 1006,
587
- 1010,
588
- 1011,
589
- 1017,
 
 
 
590
  1034,
591
- 1038,
592
- 1046,
593
- 1069,
594
- 1095,
595
- 1102,
 
 
 
 
596
  1108,
597
- 1115,
598
- 1116,
599
- 1122,
600
- 1129,
601
- 1151,
 
 
 
 
 
602
  1159,
603
- 1174,
604
- 1175,
605
- 1177,
 
606
  1179,
607
  1180,
608
- 1181,
609
- 1183,
 
 
610
  1189,
611
- 1192,
 
 
612
  1197,
613
- 1202,
 
614
  1203,
615
  1204,
616
  1205,
617
  1206,
618
  1207,
619
  1208,
 
 
 
620
  1223,
621
- 1228,
622
- 1234,
 
 
623
  1242,
624
- 1243,
625
- 1253,
 
 
 
 
626
  1257,
627
- 1258,
628
  1261,
629
- 1273,
630
- 1309,
631
- 1317,
632
- 1318,
633
- 1330,
 
 
 
 
 
 
 
 
 
 
634
  1331,
 
 
635
  1357,
636
  1361,
637
  1363,
 
638
  1371,
639
  1372,
640
- 1375,
641
- 1383,
642
- 1444,
 
 
 
 
 
643
  1446,
644
- 1470,
645
- 1471,
 
646
  1472,
647
  1473,
648
  1474,
649
  1475,
650
  1476,
 
 
 
 
651
  1489,
652
- 1499,
653
- 1517,
654
- 1524,
655
- 1546,
656
- 1547,
 
 
 
 
 
657
  1551,
658
- 1565,
659
- 1590,
 
 
 
660
  1591,
661
  1593,
662
  1611,
663
  1612,
664
- 1617,
665
- 1623,
666
- 1624,
667
- 1628,
 
 
 
668
  1630,
 
 
 
 
669
  1649,
670
- 1675,
671
- 1683,
672
- 1691,
673
- 1694,
 
 
 
 
 
 
 
674
  1718,
675
- 1723,
676
- 1752,
677
- 1770,
678
- 1772,
679
- 1774,
680
- 1775,
 
 
 
 
 
 
 
 
 
 
 
 
681
  1793,
 
 
682
  1808,
683
- 1811,
684
- 1818,
685
  1822,
686
- 1835,
 
 
 
687
  1841,
688
- 1846,
689
- 1847,
690
  1851,
691
  1852,
 
 
692
  1872,
693
- 1880,
694
- 1881,
695
  1882,
696
- 1898,
 
 
 
 
 
 
 
 
 
697
  1917,
698
- 1927,
699
- 1942,
700
- 1943,
 
 
 
 
 
701
  1944,
702
  1945,
 
 
703
  1949,
704
  1952,
705
- 1955,
706
- 1999,
707
- 2013,
 
 
 
 
 
 
 
 
 
 
 
 
708
  2027,
 
 
709
  2030,
710
- 2040,
711
- 2047,
712
- 2048,
713
- 2058,
714
- 2060,
 
 
 
 
715
  2071,
716
- 2079,
717
  2080,
718
- 2087
 
 
 
719
  ],
720
- "source_vocab": "data/bpe_vocabulary_clean.json"
721
- }
 
 
1
  {
2
  "ambiguous_tokens": {
3
  "?": 32,
4
+ "u": 80,
5
+ "x": 83,
6
  "?|": 90,
7
+ "xx": 105,
8
  "a?|": 108,
9
  "a?|b": 109,
10
  "?|c": 110,
 
16
  "?|f": 116,
17
  "a?|b?|c?|d?|e?|f": 117,
18
  "?-": 118,
19
+ "?|g": 120,
20
+ "a?|b?|c?|d?|e?|f?|g": 121,
21
  "?|h": 122,
22
  "?|i": 123,
23
  "?|h?|i": 124,
24
+ "?|j": 126,
25
+ "?|h?|i?|j": 127,
26
  "?|k": 128,
27
  "?|h?|i?|j?|k": 129,
28
  "?|l": 130,
29
  "?|h?|i?|j?|k?|l": 131,
30
+ "?|m": 133,
31
+ "?|h?|i?|j?|k?|l?|m": 134,
32
  "?|h?|i?|j?|k?|l?|m?|": 138,
33
  "n?|": 141,
34
  "n?|o": 142,
35
  "?}": 143,
36
+ "n?|o?|": 147,
37
+ "n?|o?|p": 148,
38
  "?}-": 149,
39
  "?}-{": 150,
40
+ "xxxx": 153,
41
+ "n?|o?|p?|": 154,
42
+ "n?|o?|p?|q": 155,
43
  "n?|o?|p?|q?|": 157,
44
  "n?|o?|p?|q?|r": 158,
45
+ "a2122h-1x": 159,
46
+ "u2": 160,
47
  "n?|o?|p?|q?|r?|": 165,
48
  "n?|o?|p?|q?|r?|s": 166,
49
  "n?|o?|p?|q?|r?|s?|": 170,
50
  "n?|o?|p?|q?|r?|s?|t": 171,
51
+ "uxxxx": 174,
52
+ "h-1x": 176,
53
+ "a2112h-1x": 179,
54
+ "uxxxxh": 188,
55
+ "u2122h": 189,
56
+ "?|u": 190,
57
+ "axxxx": 195,
58
+ "a1122h-1x": 197,
59
+ "a?-": 198,
60
  "c?-": 201,
61
+ "axxxxh-1x": 202,
62
+ "b?-": 209,
63
+ "?|u?|": 210,
64
+ "?|u?|v": 211,
65
  "a?-b1": 213,
66
  "d?-": 217,
67
+ "u2112h": 220,
68
  "b?-c1": 221,
69
+ "c?-d1": 229,
70
+ "u1": 230,
71
+ "?|u?|v?|": 233,
72
+ "?|u?|v?|w": 234,
73
+ "Aad21122h-2x": 235,
74
+ "a1221m-1x": 241,
75
  "1-?": 242,
76
  "d?-e1": 244,
77
+ "e?-": 246,
78
+ "u1122h": 254,
79
  "e?-f1": 262,
80
  "?|u?|v?|w?|": 266,
81
  "?|u?|v?|w?|x": 267,
82
+ "f?-": 272,
83
+ "xh": 280,
84
+ "m-1x": 284,
85
  "?|u?|v?|w?|x?|": 288,
86
  "?|u?|v?|w?|x?|y": 289,
87
+ "g?-": 296,
88
+ "u1221": 297,
89
+ "u1221m": 300,
90
  "i?-": 306,
91
+ "n?|o?|p?|q?|r?|s?|t?": 307,
92
  "h?-": 308,
93
+ "?|u?}-{": 313,
94
+ "?|u?": 314,
95
+ "n?|o?|p?|q?|r?}-{": 315,
96
+ "n?|o?|p?|q?|r?": 316,
97
+ "f?-g1": 317,
98
  "n?|o?}-{": 322,
99
  "n?|o?": 323,
100
  "?|u?|v?|w?|x?|y?|": 325,
101
  "?|u?|v?|w?|x?|y?|z": 326,
102
+ "n?|o?|p?|q?|r?|s?}-{": 329,
103
+ "n?|o?|p?|q?|r?|s?": 330,
104
+ "n?|o?|p?}-{": 332,
105
+ "n?|o?|p?": 333,
106
+ "uxxxxm": 335,
107
  "n?|o?|p?|q?}-{": 336,
108
  "n?|o?|p?|q?": 337,
109
  "g?-h1": 339,
110
+ "j?-": 342,
111
+ "?|h?|i?|j?|k?|l?}-{": 343,
112
+ "?|h?|i?|j?|k?|l?": 344,
113
+ "n?}-{": 345,
114
+ "n?": 346,
115
  "h?-i1": 347,
116
  "?|u?|v?}-{": 351,
117
  "?|u?|v?": 352,
118
  "k?-": 353,
119
+ "i?-j1": 354,
120
+ "a11221h-1x": 355,
121
+ "?|h?|i?|j?|k?|l?|m?": 364,
122
+ "?|h?|i?|j?|k?}-{": 366,
123
+ "?|h?|i?|j?|k?": 367,
124
+ "?|u?|v?|w?|x?|y?|z?|": 370,
125
+ "xh-1": 372,
126
  "?|h?|i?|j?}-{": 375,
127
  "?|h?|i?|j?": 376,
128
+ "l?-": 379,
129
+ "dxxxx": 381,
130
+ "AUdxxxx": 388,
131
+ "A?|": 393,
132
+ "A?|B": 394,
133
+ "A-1x": 395,
134
+ "ax": 399,
135
  "j?-k1": 401,
136
+ "m?-": 403,
137
  "?|h?|i?}-{": 408,
138
  "?|h?|i?": 409,
139
+ "k?-l1": 417,
140
+ "f?-g2": 419,
141
+ "?|h?}-{": 421,
142
+ "?|h?": 422,
143
+ "A?|B?|": 425,
144
+ "A?|B?|C": 426,
145
+ "Aad1122h-2x": 428,
146
+ "a?|b?|c?|d?|e?|f?|g?": 429,
147
+ "a2211m-1x": 431,
148
+ "a?|b?|c?|d?|e?|f?}-{": 438,
149
+ "a?|b?|c?|d?|e?|f?": 439,
150
+ "AUdxxxxxh": 440,
151
+ "l?-m1": 443,
152
+ "xh-1u": 445,
153
+ "axx": 446,
154
+ "a212h-1x": 448,
155
+ "?|u?|v?|w?}-{": 451,
156
+ "?|u?|v?|w?": 452,
157
+ "a2122A-1x": 461,
158
  "A?|B?|C?|": 464,
159
  "A?|B?|C?|D": 465,
160
+ "a?|b?|c?|d?|e?}-{": 476,
161
+ "a?|b?|c?|d?|e?": 477,
162
+ "u11221h": 492,
163
+ "n?-": 497,
164
  "a?|b?|c?|d?}-{": 502,
165
  "a?|b?|c?|d?": 503,
166
+ "uxxxxxh": 508,
167
+ "u2211": 513,
168
+ "axxxxm-1x": 515,
169
+ "m?-n1": 520,
170
+ "A?|B?|C?|D?|": 525,
171
+ "A?|B?|C?|D?|E": 526,
172
+ "xh-1x": 527,
173
+ "u2211m": 529,
174
+ "d?-h1": 535,
175
+ "o?-": 536,
176
+ "c?-i1": 543,
177
+ "A?|B?|C?|D?|E?|": 544,
178
+ "A?|B?|C?|D?|E?|F": 545,
179
+ "c?-h1": 546,
180
+ "u2122": 549,
181
  "A?|B?|C?|D?|E?|F?|": 550,
182
  "A?|B?|C?|D?|E?|F?|G": 551,
183
+ "?|u?|v?|w?|x?}-{": 566,
184
+ "?|u?|v?|w?|x?": 567,
185
  "a?|b?|c?}-{": 571,
186
  "a?|b?|c?}-{a?|b?|c": 572,
187
  "a?|b?|c?}-{a?|b?|c?": 573,
188
+ "uxx": 574,
189
+ "?|H": 582,
190
+ "xh-1d": 591,
191
+ "2-?": 593,
192
+ "u212h": 594,
193
+ "ax12": 595,
194
  "?|H?|": 598,
195
  "?|H?|I": 599,
196
  "?}*OC": 600,
197
+ "c?-k1": 606,
198
+ "x2": 608,
199
  "c?-g1": 609,
200
+ "?|H?|I?|": 617,
201
+ "?|H?|I?|J": 618,
202
+ "n?-o1": 619,
203
+ "d?-g1": 628,
204
+ "x1": 632,
205
+ "o?-p1": 633,
206
+ "u2112": 636,
207
+ "u2122A": 640,
208
  "p?-": 646,
209
+ "uxxxxA": 651,
210
+ "d?-i1": 656,
211
+ "?|u?|v?|w?|x?|y?}-{": 657,
212
+ "?|u?|v?|w?|x?|y?": 658,
213
+ "b?-c2": 659,
214
+ "a211h-1x": 663,
215
+ "a2112A-1x": 664,
216
+ "c?-j1": 690,
217
+ "a2112m-1x": 695,
218
+ "e?-h1": 696,
219
+ "?}*OSO": 699,
220
+ "q?-": 712,
221
+ "ax11": 713,
222
+ "c?-f1": 715,
223
+ "AUdxxxxh": 716,
224
+ "ud": 718,
225
+ "hxh": 720,
226
+ "i?-j2": 724,
227
+ "axx2": 739,
228
+ "h?-i2": 742,
229
+ "g?-h2": 743,
230
+ "?|h?|i?}": 744,
231
+ "j?-k2": 750,
232
+ "uxxxh": 755,
233
+ "c?-e1": 756,
234
+ "a2121A-1x": 757,
235
+ "c?-l1": 758,
236
+ "?|h?}": 760,
237
  "?|H?|I?|J?|": 761,
238
  "?|H?|I?|J?|K": 762,
239
+ "ux": 766,
240
+ "b?-e1": 771,
241
+ "a?|b?|c?|d?|e?|f?}": 773,
242
+ "-2x": 777,
243
  "b?-f1": 791,
244
+ "d?-f1": 792,
245
+ "p?-q1": 799,
246
+ "m?-n2": 800,
247
+ "a?|b?|c?|d?|e?}": 801,
248
+ "a?-d1": 802,
249
+ "u2112m": 803,
250
+ "e?-g1": 804,
251
+ "r?-": 812,
252
+ "?|h?|i?|j?}": 815,
253
+ "u211h": 818,
254
+ "a?-c1": 820,
255
+ "a?-e1": 824,
256
+ "?|u?|v?|w?|x?|y?|z?": 825,
257
+ "d?-j1": 828,
258
+ "a11222h-1x": 831,
259
+ "b?-g1": 835,
260
+ "a122h-1x": 841,
261
+ "x21": 845,
262
+ "q?-r1": 849,
263
+ "d?-e2": 852,
264
+ "axx1": 855,
265
+ "c?-m1": 861,
266
+ "Aax21": 865,
267
+ "ax12xh-1u": 867,
268
+ "ax11xh-1u": 868,
269
  "a?-f1": 875,
270
+ "b?-d1": 877,
271
+ "axxxxxh-1x": 886,
272
+ "Aax21122h-2b": 887,
273
+ "?|H?|I?|J?|K?|": 894,
274
+ "?|H?|I?|J?|K?|L": 895,
275
+ "?|H?|I?|J?|K?|L?|": 896,
276
+ "?|H?|I?|J?|K?|L?|M": 897,
277
+ "?|H?|I?|J?|K?|L?|M?|": 898,
278
+ "xxxxh": 913,
279
+ "a2122m-1x": 917,
280
+ "ha122h-2x": 919,
281
+ "xm-1": 923,
282
+ "a?-l1": 926,
283
+ "k?-l2": 928,
284
+ "?*OSO/3=O/3=O": 929,
285
+ "dx": 933,
286
+ "k?-o1": 943,
287
+ "hxxxxh": 946,
288
+ "u2112A": 947,
289
+ "a1122m-1x": 953,
290
+ "axxxh-1x": 956,
291
+ "u12": 960,
292
+ "u212": 964,
293
+ "c?-d2": 967,
294
+ "N?|": 971,
295
+ "N?|O": 972,
296
+ "N?|O?|": 973,
297
+ "N?|O?|P": 974,
298
+ "N?|O?|P?|": 975,
299
+ "N?|O?|P?|Q": 976,
300
+ "N?|O?|P?|Q?|": 977,
301
+ "N?|O?|P?|Q?|R": 978,
302
+ "N?|O?|P?|Q?|R?|": 979,
303
+ "N?|O?|P?|Q?|R?|S": 980,
304
+ "N?|O?|P?|Q?|R?|S?|": 981,
305
+ "N?|O?|P?|Q?|R?|S?|T": 982,
306
+ "?|U": 983,
307
+ "?|U?|": 984,
308
+ "?|U?|V": 985,
309
+ "r?-s1": 991,
310
+ "e?-f2": 992,
311
+ "a?|b?}-{": 996,
312
+ "a?|b?}-{a?|b": 997,
313
+ "a?|b?}-{a?|b?": 998,
314
+ "xm-1u": 1000,
315
+ "a2222h-1x": 1001,
316
+ "g?-i1": 1002,
317
+ "u2122m": 1003,
318
+ "i?-l1": 1007,
319
+ "s?-": 1008,
320
+ "a1x2": 1014,
321
+ "?|h?|i?|j?|k?}": 1020,
322
+ "ax12xh-1d": 1031,
323
+ "n?-o2": 1034,
324
+ "b?-h1": 1035,
325
+ "a?-j1": 1040,
326
+ "hdx": 1050,
327
+ "hdxdh": 1053,
328
+ "u22": 1067,
329
+ "a?-b2": 1073,
330
+ "axxxxA-1x": 1085,
331
+ "e?-i1": 1092,
332
+ "h?-j1": 1100,
333
+ "u122h": 1108,
334
+ "a?-k1": 1111,
335
+ "u21": 1112,
336
+ "i?-k1": 1113,
337
+ "a?-g1": 1119,
338
+ "?}*OPO": 1124,
339
+ "d?-k1": 1130,
340
+ "a1x1": 1147,
341
+ "a?-m1": 1153,
342
+ "l?-m2": 1154,
343
+ "xm": 1158,
344
+ "f?-h1": 1159,
345
+ "axx1xh-1u": 1160,
346
+ "a?-i1": 1161,
347
+ "u11222h": 1162,
348
+ "a4334h-1x": 1169,
349
+ "A?}-{": 1179,
350
+ "A?": 1180,
351
+ "?}*OCC": 1182,
352
+ "p?-q2": 1184,
353
+ "A?|B?}-{": 1185,
354
+ "A?|B?": 1186,
355
+ "a1112h-1x": 1189,
356
+ "u2121A": 1190,
357
+ "a?-n1": 1191,
358
+ "axx2xh-1d": 1196,
359
  "c?-n1": 1197,
360
+ "uxx2": 1199,
361
+ "xA-1": 1200,
362
+ "?|U?|V?|": 1203,
363
+ "?|U?|V?|W": 1204,
364
+ "?|U?|V?|W?|": 1205,
365
+ "?|U?|V?|W?|X": 1206,
366
+ "?|U?|V?|W?|X?|": 1207,
367
+ "?|U?|V?|W?|X?|Y": 1208,
368
+ "?|a": 1209,
369
+ "u1122m": 1218,
370
+ "xxxh": 1220,
371
  "s?-t1": 1223,
372
+ "?|h?|i?|j?|k?|l?|m?}": 1229,
373
+ "g?-j1": 1232,
374
+ "a1211h-1x": 1233,
375
+ "o?-p2": 1241,
376
+ "u4": 1242,
377
+ "axx2xh-1u": 1245,
378
+ "A?|B?|C?|D?}-{": 1247,
379
+ "A?|B?|C?|D?": 1248,
380
+ "u222h": 1249,
381
+ "ax22": 1255,
382
+ "a4344h-1x": 1256,
383
+ "a?-h1": 1257,
384
+ "?|H?|I?|J?}-{": 1261,
385
+ "?|H?|I?|J?": 1262,
386
+ "b?-i1": 1274,
387
+ "a1122A-1x": 1278,
388
+ "a222h-1x": 1283,
389
+ "a1221h-1x": 1286,
390
+ "xA-1u": 1287,
391
+ "ux12": 1305,
392
+ "u112": 1306,
393
+ "j?-m1": 1308,
394
+ "?|h?|i?|j?|k?|l?}": 1311,
395
+ "xh-2x": 1314,
396
+ "c?-o1": 1319,
397
+ "a2211h-1x": 1323,
398
+ "dxx": 1325,
399
+ "axxxxx": 1329,
400
+ "a?-o1": 1331,
401
+ "a?|b?|c?}*OC": 1332,
402
+ "uxx2xh": 1345,
403
  "b?-j1": 1357,
404
  "a?-r1": 1361,
405
  "n?}": 1363,
406
+ "axx21": 1370,
407
  "A?|B?|C?}-{": 1371,
408
  "A?|B?|C?": 1372,
409
+ "m?-p1": 1374,
410
+ "ux12xh": 1378,
411
+ "l?-p1": 1385,
412
+ "ax1": 1386,
413
+ "u2222h": 1411,
414
+ "ax21": 1419,
415
+ "k?-n1": 1441,
416
+ "x2h": 1445,
417
+ "a2212h-1x": 1446,
418
+ "a?-p1": 1447,
419
+ "xxh": 1454,
420
+ "j?-l1": 1459,
421
+ "?|U?|V?|W?|X?|Y?|": 1472,
422
+ "?|U?|V?|W?|X?|Y?|Z": 1473,
423
+ "?|aa?|": 1474,
424
+ "?|aa?|a": 1475,
425
+ "?|aa?|ab": 1476,
426
+ "?*OPO/3O/3=O": 1477,
427
+ "a1x2xh-1u": 1483,
428
+ "ad222m-1x": 1485,
429
+ "l?-q1": 1487,
430
+ "x22h": 1489,
431
+ "l?-n1": 1492,
432
+ "a3344h-1x": 1493,
433
+ "122h-2x": 1504,
434
+ "a?-s1": 1520,
435
+ "11xh": 1523,
436
+ "k?-m1": 1526,
437
+ "a1x1xh-1u": 1527,
438
+ "axxxxxh-1a": 1529,
439
+ "t?-": 1544,
440
+ "a?-q1": 1550,
441
+ "c?-q1": 1551,
442
+ "f?-i1": 1553,
443
+ "a?|b?|c?|d?}*OC": 1567,
444
+ "a21EEA-1x": 1569,
445
+ "u1211h": 1573,
446
+ "a1222h-1x": 1579,
447
  "c?-p1": 1591,
448
  "n?-q1": 1593,
449
+ "q?-r2": 1611,
450
+ "?|i?}": 1612,
451
+ "a?|b?|c?|d?|e?}*OC": 1613,
452
+ "x2xh": 1616,
453
+ "m?-q1": 1618,
454
+ "a21eEA-1x": 1622,
455
+ "l?-o1": 1625,
456
+ "a21FFA-1x": 1627,
457
+ "haxx": 1629,
458
+ "m?-r1": 1630,
459
+ "adxx": 1631,
460
+ "a?-t1": 1632,
461
+ "a2221h-1x": 1634,
462
+ "ud222m": 1648,
463
  "a?|b?|c?|d?}*OSO": 1649,
464
+ "u2-": 1651,
465
+ "u1221h": 1654,
466
+ "xm-1x": 1663,
467
+ "c?-r1": 1676,
468
+ "u1121h": 1679,
469
+ "1-d?|i?}": 1684,
470
+ "a2x": 1689,
471
+ "j?-n1": 1690,
472
+ "u?-": 1692,
473
+ "ax1xx": 1701,
474
+ "Aadxxxx": 1710,
475
  "a?|b?|c?|d?|e?}*OSO": 1718,
476
+ "u1112h": 1722,
477
+ "?*OCC/3=O": 1724,
478
+ "a344h-1x": 1731,
479
+ "a2x2": 1734,
480
+ "ad122m-1x": 1740,
481
+ "xd": 1742,
482
+ "?%": 1753,
483
+ "u2221h": 1754,
484
+ "ux11xh": 1762,
485
+ "haxxxh-2x": 1765,
486
+ "uxx1": 1766,
487
+ "?*OP^XOCCN/3O/3=O": 1771,
488
+ "t?-u1": 1773,
489
+ "?*": 1776,
490
+ "c?-s1": 1777,
491
+ "ax11xh-1d": 1785,
492
+ "xxxm": 1786,
493
+ "e?-j1": 1787,
494
+ "u1122": 1793,
495
+ "a?-u1": 1798,
496
+ "f?-h2": 1805,
497
+ "a2221m-1x": 1808,
498
+ "a21x2": 1815,
499
+ "c?-t1": 1822,
500
+ "f1-a?|b?|c?|d?|e?}": 1827,
501
+ "udxxxm": 1831,
502
+ "h?-k1": 1832,
503
+ "r?-s2": 1838,
504
+ "u?-v1": 1841,
505
+ "u1222h": 1845,
506
+ "?|H?|I?|J?|K?}-{": 1851,
507
+ "?|H?|I?|J?|K?": 1852,
508
+ "n?|o?}": 1856,
509
+ "1-d?|h?}": 1857,
510
+ "a2121h-1x": 1872,
511
+ "q?-s1": 1874,
512
+ "hUxxxh": 1878,
513
+ "%?%": 1882,
514
+ "b?-g2": 1883,
515
+ "ud122m": 1887,
516
+ "ud2": 1890,
517
+ "u2121h": 1895,
518
+ "u1x2xh": 1899,
519
+ "d?-l1": 1900,
520
+ "Aax1": 1904,
521
+ "ax12xh-1x": 1908,
522
+ "v?-": 1912,
523
+ "d2h-1x": 1914,
524
+ "u4334h": 1917,
525
+ "Aadxxxxxh-2x": 1923,
526
+ "uxx1xh": 1924,
527
+ "dxh": 1925,
528
+ "b?-k1": 1929,
529
+ "a2111h-1x": 1932,
530
+ "u1122A": 1933,
531
+ "Aa11122h-2x": 1936,
532
+ "e?-k1": 1940,
533
+ "?|aa?|ab?|a": 1944,
534
+ "?|aa?|ab?|ac": 1945,
535
+ "?|aa?|ab?|ac?|": 1946,
536
+ "?|aa?|ab?|ac?|ad": 1947,
537
+ "axx1xh-1d": 1949,
538
+ "a?|b?}*OC": 1952,
539
+ "?*OC": 1957,
540
+ "a2x1": 1958,
541
+ "ax22xh-1d": 1959,
542
+ "u4-": 1963,
543
+ "u2-t3|t": 1978,
544
+ "u2-t3|t6": 1979,
545
+ "s?-t2": 1985,
546
+ "u2212h": 1991,
547
+ "u4344h": 1997,
548
+ "a?-d2": 2001,
549
+ "a1121h-1x": 2003,
550
+ "axx21h-1b": 2008,
551
+ "u2211h": 2014,
552
+ "a3344m-1x": 2018,
553
+ "ax12xA-1u": 2025,
554
+ "ud122h": 2027,
555
+ "o?-q1": 2028,
556
+ "a1121A-1x": 2029,
557
+ "a?-f2": 2030,
558
+ "a1111h-1x": 2034,
559
+ "m?-o1": 2035,
560
+ "ud1": 2041,
561
+ "?}*OP^XOCCN": 2044,
562
+ "a?|b?|c?}*OCC": 2049,
563
+ "a2d12m-1x": 2051,
564
+ "c?-f2": 2059,
565
+ "A?|B?|C?|D?|E?|F?|G?": 2061,
566
+ "ad122h-1x": 2069,
567
+ "a112h-1x": 2071,
568
+ "a?|b?|c?}*OSO": 2072,
569
+ "?|U?|V?}-{": 2080,
570
+ "?|U?|V?": 2081,
571
+ "u2221m": 2083,
572
+ "x22h-2a": 2086,
573
+ "c?-u1": 2088
574
  },
575
  "ambiguous_ids": [
576
  32,
577
+ 80,
578
+ 83,
579
  90,
580
+ 105,
581
  108,
582
  109,
583
  110,
 
589
  116,
590
  117,
591
  118,
 
592
  120,
593
+ 121,
594
  122,
595
  123,
596
  124,
 
597
  126,
598
+ 127,
599
  128,
600
  129,
601
  130,
602
  131,
 
603
  133,
604
+ 134,
605
  138,
606
  141,
607
  142,
608
  143,
 
609
  147,
610
+ 148,
611
  149,
612
  150,
613
  153,
614
  154,
615
+ 155,
616
  157,
617
  158,
618
+ 159,
619
+ 160,
620
  165,
621
  166,
622
  170,
623
  171,
624
+ 174,
625
+ 176,
626
+ 179,
627
+ 188,
628
  189,
629
+ 190,
630
+ 195,
631
  197,
632
+ 198,
633
  201,
634
+ 202,
635
  209,
636
  210,
637
  211,
638
  213,
639
  217,
640
+ 220,
641
  221,
642
+ 229,
643
  230,
644
+ 233,
645
+ 234,
646
+ 235,
647
+ 241,
648
  242,
649
  244,
650
+ 246,
651
+ 254,
652
  262,
653
  266,
654
  267,
655
+ 272,
656
+ 280,
657
+ 284,
658
  288,
659
  289,
660
+ 296,
661
+ 297,
662
+ 300,
663
  306,
664
+ 307,
665
  308,
 
666
  313,
667
  314,
668
  315,
669
+ 316,
670
+ 317,
671
  322,
672
  323,
673
  325,
674
  326,
 
675
  329,
676
+ 330,
677
  332,
678
+ 333,
679
+ 335,
680
  336,
681
  337,
682
  339,
 
689
  351,
690
  352,
691
  353,
692
+ 354,
693
  355,
 
694
  364,
695
+ 366,
696
+ 367,
697
+ 370,
698
+ 372,
699
  375,
700
  376,
701
+ 379,
702
+ 381,
703
+ 388,
704
  393,
705
+ 394,
706
+ 395,
707
+ 399,
708
  401,
709
+ 403,
710
  408,
711
  409,
712
+ 417,
713
+ 419,
714
  421,
715
+ 422,
716
  425,
717
+ 426,
718
+ 428,
719
+ 429,
720
  431,
 
721
  438,
722
+ 439,
723
+ 440,
724
+ 443,
725
+ 445,
726
+ 446,
727
+ 448,
728
  451,
729
+ 452,
730
+ 461,
731
  464,
732
  465,
 
733
  476,
734
+ 477,
735
+ 492,
736
+ 497,
737
  502,
738
  503,
739
+ 508,
740
+ 513,
741
+ 515,
742
+ 520,
743
+ 525,
744
+ 526,
745
+ 527,
746
+ 529,
747
+ 535,
748
  536,
 
749
  543,
750
  544,
751
+ 545,
752
+ 546,
753
  549,
754
  550,
755
  551,
756
+ 566,
757
+ 567,
758
  571,
759
  572,
760
  573,
761
+ 574,
762
+ 582,
763
+ 591,
764
+ 593,
765
+ 594,
766
+ 595,
767
  598,
768
  599,
769
  600,
770
+ 606,
771
+ 608,
772
  609,
 
 
773
  617,
774
+ 618,
775
+ 619,
776
+ 628,
777
+ 632,
778
+ 633,
779
+ 636,
780
+ 640,
781
  646,
782
+ 651,
 
783
  656,
784
+ 657,
785
  658,
786
+ 659,
787
+ 663,
788
+ 664,
789
+ 690,
790
+ 695,
791
  696,
792
+ 699,
793
+ 712,
794
  713,
795
+ 715,
796
+ 716,
797
+ 718,
798
  720,
799
+ 724,
800
+ 739,
801
  742,
802
+ 743,
803
+ 744,
804
+ 750,
805
+ 755,
806
  756,
807
+ 757,
808
  758,
 
809
  760,
810
  761,
811
  762,
812
+ 766,
813
+ 771,
814
+ 773,
815
+ 777,
816
  791,
817
+ 792,
818
+ 799,
 
819
  800,
820
+ 801,
821
+ 802,
822
  803,
823
+ 804,
824
  812,
825
+ 815,
826
  818,
827
+ 820,
828
+ 824,
829
+ 825,
830
+ 828,
831
+ 831,
832
+ 835,
833
+ 841,
834
+ 845,
835
+ 849,
836
+ 852,
837
+ 855,
838
+ 861,
839
+ 865,
840
+ 867,
841
+ 868,
842
  875,
843
+ 877,
844
+ 886,
845
  887,
 
 
846
  894,
847
  895,
848
  896,
849
+ 897,
850
+ 898,
851
+ 913,
852
+ 917,
853
+ 919,
854
  923,
855
+ 926,
856
+ 928,
857
+ 929,
858
+ 933,
859
+ 943,
860
+ 946,
861
+ 947,
862
+ 953,
863
+ 956,
864
+ 960,
865
+ 964,
866
  967,
 
 
 
867
  971,
868
  972,
869
  973,
 
873
  977,
874
  978,
875
  979,
876
+ 980,
877
+ 981,
878
+ 982,
879
  983,
880
+ 984,
881
+ 985,
882
+ 991,
883
+ 992,
884
  996,
885
  997,
886
+ 998,
887
+ 1000,
888
  1001,
889
+ 1002,
890
+ 1003,
891
+ 1007,
892
+ 1008,
893
+ 1014,
894
+ 1020,
895
+ 1031,
896
  1034,
897
+ 1035,
898
+ 1040,
899
+ 1050,
900
+ 1053,
901
+ 1067,
902
+ 1073,
903
+ 1085,
904
+ 1092,
905
+ 1100,
906
  1108,
907
+ 1111,
908
+ 1112,
909
+ 1113,
910
+ 1119,
911
+ 1124,
912
+ 1130,
913
+ 1147,
914
+ 1153,
915
+ 1154,
916
+ 1158,
917
  1159,
918
+ 1160,
919
+ 1161,
920
+ 1162,
921
+ 1169,
922
  1179,
923
  1180,
924
+ 1182,
925
+ 1184,
926
+ 1185,
927
+ 1186,
928
  1189,
929
+ 1190,
930
+ 1191,
931
+ 1196,
932
  1197,
933
+ 1199,
934
+ 1200,
935
  1203,
936
  1204,
937
  1205,
938
  1206,
939
  1207,
940
  1208,
941
+ 1209,
942
+ 1218,
943
+ 1220,
944
  1223,
945
+ 1229,
946
+ 1232,
947
+ 1233,
948
+ 1241,
949
  1242,
950
+ 1245,
951
+ 1247,
952
+ 1248,
953
+ 1249,
954
+ 1255,
955
+ 1256,
956
  1257,
 
957
  1261,
958
+ 1262,
959
+ 1274,
960
+ 1278,
961
+ 1283,
962
+ 1286,
963
+ 1287,
964
+ 1305,
965
+ 1306,
966
+ 1308,
967
+ 1311,
968
+ 1314,
969
+ 1319,
970
+ 1323,
971
+ 1325,
972
+ 1329,
973
  1331,
974
+ 1332,
975
+ 1345,
976
  1357,
977
  1361,
978
  1363,
979
+ 1370,
980
  1371,
981
  1372,
982
+ 1374,
983
+ 1378,
984
+ 1385,
985
+ 1386,
986
+ 1411,
987
+ 1419,
988
+ 1441,
989
+ 1445,
990
  1446,
991
+ 1447,
992
+ 1454,
993
+ 1459,
994
  1472,
995
  1473,
996
  1474,
997
  1475,
998
  1476,
999
+ 1477,
1000
+ 1483,
1001
+ 1485,
1002
+ 1487,
1003
  1489,
1004
+ 1492,
1005
+ 1493,
1006
+ 1504,
1007
+ 1520,
1008
+ 1523,
1009
+ 1526,
1010
+ 1527,
1011
+ 1529,
1012
+ 1544,
1013
+ 1550,
1014
  1551,
1015
+ 1553,
1016
+ 1567,
1017
+ 1569,
1018
+ 1573,
1019
+ 1579,
1020
  1591,
1021
  1593,
1022
  1611,
1023
  1612,
1024
+ 1613,
1025
+ 1616,
1026
+ 1618,
1027
+ 1622,
1028
+ 1625,
1029
+ 1627,
1030
+ 1629,
1031
  1630,
1032
+ 1631,
1033
+ 1632,
1034
+ 1634,
1035
+ 1648,
1036
  1649,
1037
+ 1651,
1038
+ 1654,
1039
+ 1663,
1040
+ 1676,
1041
+ 1679,
1042
+ 1684,
1043
+ 1689,
1044
+ 1690,
1045
+ 1692,
1046
+ 1701,
1047
+ 1710,
1048
  1718,
1049
+ 1722,
1050
+ 1724,
1051
+ 1731,
1052
+ 1734,
1053
+ 1740,
1054
+ 1742,
1055
+ 1753,
1056
+ 1754,
1057
+ 1762,
1058
+ 1765,
1059
+ 1766,
1060
+ 1771,
1061
+ 1773,
1062
+ 1776,
1063
+ 1777,
1064
+ 1785,
1065
+ 1786,
1066
+ 1787,
1067
  1793,
1068
+ 1798,
1069
+ 1805,
1070
  1808,
1071
+ 1815,
 
1072
  1822,
1073
+ 1827,
1074
+ 1831,
1075
+ 1832,
1076
+ 1838,
1077
  1841,
1078
+ 1845,
 
1079
  1851,
1080
  1852,
1081
+ 1856,
1082
+ 1857,
1083
  1872,
1084
+ 1874,
1085
+ 1878,
1086
  1882,
1087
+ 1883,
1088
+ 1887,
1089
+ 1890,
1090
+ 1895,
1091
+ 1899,
1092
+ 1900,
1093
+ 1904,
1094
+ 1908,
1095
+ 1912,
1096
+ 1914,
1097
  1917,
1098
+ 1923,
1099
+ 1924,
1100
+ 1925,
1101
+ 1929,
1102
+ 1932,
1103
+ 1933,
1104
+ 1936,
1105
+ 1940,
1106
  1944,
1107
  1945,
1108
+ 1946,
1109
+ 1947,
1110
  1949,
1111
  1952,
1112
+ 1957,
1113
+ 1958,
1114
+ 1959,
1115
+ 1963,
1116
+ 1978,
1117
+ 1979,
1118
+ 1985,
1119
+ 1991,
1120
+ 1997,
1121
+ 2001,
1122
+ 2003,
1123
+ 2008,
1124
+ 2014,
1125
+ 2018,
1126
+ 2025,
1127
  2027,
1128
+ 2028,
1129
+ 2029,
1130
  2030,
1131
+ 2034,
1132
+ 2035,
1133
+ 2041,
1134
+ 2044,
1135
+ 2049,
1136
+ 2051,
1137
+ 2059,
1138
+ 2061,
1139
+ 2069,
1140
  2071,
1141
+ 2072,
1142
  2080,
1143
+ 2081,
1144
+ 2083,
1145
+ 2086,
1146
+ 2088
1147
  ],
1148
+ "source_vocab": "data/bpe_vocabulary_clean.json",
1149
+ "augmentation_rule": "Includes original ambiguity map plus WURCS uncertainty marker tokens containing '?', containing 'x', or starting with lowercase 'u'."
1150
+ }