roeyba5 committed on
Commit
4f63bef
·
verified ·
1 Parent(s): 752750d

Upload 6 files

Browse files
onnx/config.json ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "roeyba5/urlbert-tiny-base-v4-onnx",
3
+ "architectures": [
4
+ "BertForSequenceClassification"
5
+ ],
6
+ "attention_probs_dropout_prob": 0.1,
7
+ "classifier_dropout": null,
8
+ "hidden_act": "gelu",
9
+ "hidden_dropout_prob": 0.1,
10
+ "hidden_size": 192,
11
+ "initializer_range": 0.02,
12
+ "intermediate_size": 768,
13
+ "layer_norm_eps": 1e-12,
14
+ "max_position_embeddings": 64,
15
+ "model_type": "bert",
16
+ "num_attention_heads": 8,
17
+ "num_hidden_layers": 8,
18
+ "pad_token_id": 0,
19
+ "position_embedding_type": "absolute",
20
+ "problem_type": "single_label_classification",
21
+ "torch_dtype": "float32",
22
+ "transformers_version": "4.44.2",
23
+ "type_vocab_size": 2,
24
+ "use_cache": true,
25
+ "vocab_size": 400
26
+ }
onnx/model.onnx ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:91126e31c80cda9a1711bddd80c48d7674d092ce4b9ff9f24708777b494887f1
3
+ size 14917307
onnx/special_tokens_map.json ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cls_token": {
3
+ "content": "[CLS]",
4
+ "lstrip": false,
5
+ "normalized": false,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "mask_token": {
10
+ "content": "[MASK]",
11
+ "lstrip": false,
12
+ "normalized": false,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "pad_token": {
17
+ "content": "[PAD]",
18
+ "lstrip": false,
19
+ "normalized": false,
20
+ "rstrip": false,
21
+ "single_word": false
22
+ },
23
+ "sep_token": {
24
+ "content": "[SEP]",
25
+ "lstrip": false,
26
+ "normalized": false,
27
+ "rstrip": false,
28
+ "single_word": false
29
+ },
30
+ "unk_token": {
31
+ "content": "[UNK]",
32
+ "lstrip": false,
33
+ "normalized": false,
34
+ "rstrip": false,
35
+ "single_word": false
36
+ }
37
+ }
onnx/tokenizer.json ADDED
@@ -0,0 +1,555 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "version": "1.0",
3
+ "truncation": {
4
+ "direction": "Right",
5
+ "max_length": 64,
6
+ "strategy": "LongestFirst",
7
+ "stride": 0
8
+ },
9
+ "padding": null,
10
+ "added_tokens": [
11
+ {
12
+ "id": 0,
13
+ "content": "[PAD]",
14
+ "single_word": false,
15
+ "lstrip": false,
16
+ "rstrip": false,
17
+ "normalized": false,
18
+ "special": true
19
+ },
20
+ {
21
+ "id": 1,
22
+ "content": "[UNK]",
23
+ "single_word": false,
24
+ "lstrip": false,
25
+ "rstrip": false,
26
+ "normalized": false,
27
+ "special": true
28
+ },
29
+ {
30
+ "id": 2,
31
+ "content": "[CLS]",
32
+ "single_word": false,
33
+ "lstrip": false,
34
+ "rstrip": false,
35
+ "normalized": false,
36
+ "special": true
37
+ },
38
+ {
39
+ "id": 3,
40
+ "content": "[SEP]",
41
+ "single_word": false,
42
+ "lstrip": false,
43
+ "rstrip": false,
44
+ "normalized": false,
45
+ "special": true
46
+ },
47
+ {
48
+ "id": 4,
49
+ "content": "[MASK]",
50
+ "single_word": false,
51
+ "lstrip": false,
52
+ "rstrip": false,
53
+ "normalized": false,
54
+ "special": true
55
+ }
56
+ ],
57
+ "normalizer": {
58
+ "type": "BertNormalizer",
59
+ "clean_text": true,
60
+ "handle_chinese_chars": true,
61
+ "strip_accents": null,
62
+ "lowercase": true
63
+ },
64
+ "pre_tokenizer": {
65
+ "type": "BertPreTokenizer"
66
+ },
67
+ "post_processor": {
68
+ "type": "TemplateProcessing",
69
+ "single": [
70
+ {
71
+ "SpecialToken": {
72
+ "id": "[CLS]",
73
+ "type_id": 0
74
+ }
75
+ },
76
+ {
77
+ "Sequence": {
78
+ "id": "A",
79
+ "type_id": 0
80
+ }
81
+ },
82
+ {
83
+ "SpecialToken": {
84
+ "id": "[SEP]",
85
+ "type_id": 0
86
+ }
87
+ }
88
+ ],
89
+ "pair": [
90
+ {
91
+ "SpecialToken": {
92
+ "id": "[CLS]",
93
+ "type_id": 0
94
+ }
95
+ },
96
+ {
97
+ "Sequence": {
98
+ "id": "A",
99
+ "type_id": 0
100
+ }
101
+ },
102
+ {
103
+ "SpecialToken": {
104
+ "id": "[SEP]",
105
+ "type_id": 0
106
+ }
107
+ },
108
+ {
109
+ "Sequence": {
110
+ "id": "B",
111
+ "type_id": 1
112
+ }
113
+ },
114
+ {
115
+ "SpecialToken": {
116
+ "id": "[SEP]",
117
+ "type_id": 1
118
+ }
119
+ }
120
+ ],
121
+ "special_tokens": {
122
+ "[CLS]": {
123
+ "id": "[CLS]",
124
+ "ids": [
125
+ 2
126
+ ],
127
+ "tokens": [
128
+ "[CLS]"
129
+ ]
130
+ },
131
+ "[SEP]": {
132
+ "id": "[SEP]",
133
+ "ids": [
134
+ 3
135
+ ],
136
+ "tokens": [
137
+ "[SEP]"
138
+ ]
139
+ }
140
+ }
141
+ },
142
+ "decoder": {
143
+ "type": "WordPiece",
144
+ "prefix": "##",
145
+ "cleanup": true
146
+ },
147
+ "model": {
148
+ "type": "WordPiece",
149
+ "unk_token": "[UNK]",
150
+ "continuing_subword_prefix": "##",
151
+ "max_input_chars_per_word": 100,
152
+ "vocab": {
153
+ "[PAD]": 0,
154
+ "[UNK]": 1,
155
+ "[CLS]": 2,
156
+ "[SEP]": 3,
157
+ "[MASK]": 4,
158
+ "http": 5,
159
+ "https": 6,
160
+ "www": 7,
161
+ "/": 8,
162
+ ".": 9,
163
+ ":": 10,
164
+ "&": 11,
165
+ "?": 12,
166
+ "-": 13,
167
+ "_": 14,
168
+ "%": 15,
169
+ "##0": 16,
170
+ "##1": 17,
171
+ "##2": 18,
172
+ "##3": 19,
173
+ "##4": 20,
174
+ "##5": 21,
175
+ "##6": 22,
176
+ "##7": 23,
177
+ "##8": 24,
178
+ "##9": 25,
179
+ "0": 26,
180
+ "1": 27,
181
+ "2": 28,
182
+ "3": 29,
183
+ "4": 30,
184
+ "5": 31,
185
+ "6": 32,
186
+ "7": 33,
187
+ "8": 34,
188
+ "9": 35,
189
+ "z": 36,
190
+ "y": 37,
191
+ "x": 38,
192
+ "w": 39,
193
+ "v": 40,
194
+ "u": 41,
195
+ "t": 42,
196
+ "s": 43,
197
+ "r": 44,
198
+ "q": 45,
199
+ "p": 46,
200
+ "o": 47,
201
+ "n": 48,
202
+ "m": 49,
203
+ "l": 50,
204
+ "k": 51,
205
+ "j": 52,
206
+ "i": 53,
207
+ "h": 54,
208
+ "g": 55,
209
+ "f": 56,
210
+ "e": 57,
211
+ "d": 58,
212
+ "c": 59,
213
+ "b": 60,
214
+ "a": 61,
215
+ "##z": 62,
216
+ "##y": 63,
217
+ "##x": 64,
218
+ "##w": 65,
219
+ "##v": 66,
220
+ "##u": 67,
221
+ "##t": 68,
222
+ "##s": 69,
223
+ "##r": 70,
224
+ "##q": 71,
225
+ "##p": 72,
226
+ "##o": 73,
227
+ "##n": 74,
228
+ "##m": 75,
229
+ "##l": 76,
230
+ "##k": 77,
231
+ "##j": 78,
232
+ "##i": 79,
233
+ "##h": 80,
234
+ "##g": 81,
235
+ "##f": 82,
236
+ "##e": 83,
237
+ "##d": 84,
238
+ "##c": 85,
239
+ "##b": 86,
240
+ "##a": 87,
241
+ "##ing": 88,
242
+ "##ly": 89,
243
+ "##er": 90,
244
+ "##in": 91,
245
+ "##tion": 92,
246
+ "##re": 93,
247
+ "##un": 94,
248
+ "##ed": 95,
249
+ "##al": 96,
250
+ "##ter": 97,
251
+ "##de": 98,
252
+ "##con": 99,
253
+ "##an": 100,
254
+ "##ti": 101,
255
+ "##ic": 102,
256
+ "##cal": 103,
257
+ "##to": 104,
258
+ "##ty": 105,
259
+ "##ness": 106,
260
+ "##ta": 107,
261
+ "##di": 108,
262
+ "##la": 109,
263
+ "##en": 110,
264
+ "##es": 111,
265
+ "##ma": 112,
266
+ "##per": 113,
267
+ "##man": 114,
268
+ "##ri": 115,
269
+ "##na": 116,
270
+ "##ca": 117,
271
+ "##ex": 118,
272
+ "##dis": 119,
273
+ "##ra": 120,
274
+ "##ers": 121,
275
+ "##non": 122,
276
+ "##tions": 123,
277
+ "##com": 124,
278
+ "##ni": 125,
279
+ "##co": 126,
280
+ "##pro": 127,
281
+ "##tive": 128,
282
+ "##mi": 129,
283
+ "##pre": 130,
284
+ "##der": 131,
285
+ "##sub": 132,
286
+ "##able": 133,
287
+ "##tor": 134,
288
+ "##li": 135,
289
+ "##si": 136,
290
+ "##hy": 137,
291
+ "##mo": 138,
292
+ "##men": 139,
293
+ "##ar": 140,
294
+ "##im": 141,
295
+ "##ton": 142,
296
+ "##sis": 143,
297
+ "##tic": 144,
298
+ "##da": 145,
299
+ "##at": 146,
300
+ "##ci": 147,
301
+ "##or": 148,
302
+ "##lar": 149,
303
+ "##car": 150,
304
+ "##ment": 151,
305
+ "##lo": 152,
306
+ "##ac": 153,
307
+ "##cy": 154,
308
+ "##tu": 155,
309
+ "##less": 156,
310
+ "##as": 157,
311
+ "##um": 158,
312
+ "##pa": 159,
313
+ "##tal": 160,
314
+ "##ry": 161,
315
+ "##ro": 162,
316
+ "##fi": 163,
317
+ "##over": 164,
318
+ "##po": 165,
319
+ "##is": 166,
320
+ "##son": 167,
321
+ "##so": 168,
322
+ "##do": 169,
323
+ "##cu": 170,
324
+ "##bi": 171,
325
+ "##be": 172,
326
+ "##tri": 173,
327
+ "##ful": 174,
328
+ "##vi": 175,
329
+ "##mis": 176,
330
+ "##su": 177,
331
+ "##va": 178,
332
+ "##ous": 179,
333
+ "ftp": 180,
334
+ "tel": 181,
335
+ "file": 182,
336
+ "ws": 183,
337
+ "wss": 184,
338
+ "ssh": 185,
339
+ "ldaps": 186,
340
+ "gopher": 187,
341
+ "view": 188,
342
+ "source": 189,
343
+ "about": 190,
344
+ "chrome": 191,
345
+ "data": 192,
346
+ "irc": 193,
347
+ "magnet": 194,
348
+ "mms": 195,
349
+ "redis": 196,
350
+ "svn": 197,
351
+ "vnc": 198,
352
+ "dns": 199,
353
+ "ntp": 200,
354
+ "ip": 201,
355
+ "com": 202,
356
+ "de": 203,
357
+ "net": 204,
358
+ "uk": 205,
359
+ "cn": 206,
360
+ "org": 207,
361
+ "info": 208,
362
+ "nl": 209,
363
+ "eu": 210,
364
+ "ru": 211,
365
+ "su": 212,
366
+ "br": 213,
367
+ "htm": 214,
368
+ "php": 215,
369
+ "co": 216,
370
+ "ly": 217,
371
+ "bit": 218,
372
+ "log": 219,
373
+ "index": 220,
374
+ "bank": 221,
375
+ "za": 222,
376
+ "direct": 223,
377
+ "mail": 224,
378
+ "it": 225,
379
+ "run": 226,
380
+ "security": 227,
381
+ "code": 228,
382
+ "promo": 229,
383
+ "jpg": 230,
384
+ "img": 231,
385
+ "pay": 232,
386
+ "form": 233,
387
+ "docs": 234,
388
+ "host": 235,
389
+ "ec": 236,
390
+ "cx": 237,
391
+ "free": 238,
392
+ "true": 239,
393
+ "amp": 240,
394
+ "blog": 241,
395
+ "key": 242,
396
+ "pal": 243,
397
+ "contact": 244,
398
+ "online": 245,
399
+ "abc": 246,
400
+ "media": 247,
401
+ "admin": 248,
402
+ "etc": 249,
403
+ "login": 250,
404
+ "cmd": 251,
405
+ "bin": 252,
406
+ "web": 253,
407
+ "verif": 254,
408
+ "the": 255,
409
+ "in": 256,
410
+ "of": 257,
411
+ "la": 258,
412
+ "en": 259,
413
+ "and": 260,
414
+ "to": 261,
415
+ "der": 262,
416
+ "un": 263,
417
+ "di": 264,
418
+ "que": 265,
419
+ "is": 266,
420
+ "el": 267,
421
+ "se": 268,
422
+ "del": 269,
423
+ "die": 270,
424
+ "und": 271,
425
+ "et": 272,
426
+ "na": 273,
427
+ "was": 274,
428
+ "on": 275,
429
+ "des": 276,
430
+ "den": 277,
431
+ "le": 278,
432
+ "for": 279,
433
+ "da": 280,
434
+ "je": 281,
435
+ "van": 282,
436
+ "as": 283,
437
+ "sa": 284,
438
+ "do": 285,
439
+ "an": 286,
440
+ "les": 287,
441
+ "una": 288,
442
+ "il": 289,
443
+ "by": 290,
444
+ "og": 291,
445
+ "at": 292,
446
+ "er": 293,
447
+ "al": 294,
448
+ "von": 295,
449
+ "du": 296,
450
+ "av": 297,
451
+ "med": 298,
452
+ "con": 299,
453
+ "est": 300,
454
+ "per": 301,
455
+ "som": 302,
456
+ "los": 303,
457
+ "por": 304,
458
+ "from": 305,
459
+ "that": 306,
460
+ "no": 307,
461
+ "11": 308,
462
+ "es": 309,
463
+ "ja": 310,
464
+ "km": 311,
465
+ "om": 312,
466
+ "im": 313,
467
+ "dan": 314,
468
+ "para": 315,
469
+ "mit": 316,
470
+ "El": 317,
471
+ "his": 318,
472
+ "ha": 319,
473
+ "une": 320,
474
+ "das": 321,
475
+ "par": 322,
476
+ "au": 323,
477
+ "dans": 324,
478
+ "he": 325,
479
+ "che": 326,
480
+ "em": 327,
481
+ "dem": 328,
482
+ "til": 329,
483
+ "се": 330,
484
+ "han": 331,
485
+ "las": 332,
486
+ "della": 333,
487
+ "new": 334,
488
+ "um": 335,
489
+ "si": 336,
490
+ "var": 337,
491
+ "are": 338,
492
+ "op": 339,
493
+ "zu": 340,
494
+ "were": 341,
495
+ "od": 342,
496
+ "son": 343,
497
+ "which": 344,
498
+ "va": 345,
499
+ "pour": 346,
500
+ "ve": 347,
501
+ "sur": 348,
502
+ "war": 349,
503
+ "be": 350,
504
+ "det": 351,
505
+ "gov": 352,
506
+ "qui": 353,
507
+ "az": 354,
508
+ "te": 355,
509
+ "had": 356,
510
+ "also": 357,
511
+ "so": 358,
512
+ "am": 359,
513
+ "has": 360,
514
+ "dos": 361,
515
+ "ur": 362,
516
+ "entre": 363,
517
+ "lo": 364,
518
+ "era": 365,
519
+ "ni": 366,
520
+ "first": 367,
521
+ "os": 368,
522
+ "met": 369,
523
+ "ou": 370,
524
+ "all": 371,
525
+ "aus": 372,
526
+ "non": 373,
527
+ "film": 374,
528
+ "po": 375,
529
+ "into": 376,
530
+ "till": 377,
531
+ "ble": 378,
532
+ "ka": 379,
533
+ "mai": 380,
534
+ "up": 381,
535
+ "ng": 382,
536
+ "aux": 383,
537
+ "ad": 384,
538
+ "ki": 385,
539
+ "me": 386,
540
+ "ze": 387,
541
+ "can": 388,
542
+ "out": 389,
543
+ "wie": 390,
544
+ "со": 391,
545
+ "fu": 392,
546
+ "vom": 393,
547
+ "nu": 394,
548
+ "club": 395,
549
+ "team": 396,
550
+ "ca": 397,
551
+ "pe": 398,
552
+ "ke": 399
553
+ }
554
+ }
555
+ }
onnx/tokenizer_config.json ADDED
@@ -0,0 +1,66 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "added_tokens_decoder": {
3
+ "0": {
4
+ "content": "[PAD]",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false,
9
+ "special": true
10
+ },
11
+ "1": {
12
+ "content": "[UNK]",
13
+ "lstrip": false,
14
+ "normalized": false,
15
+ "rstrip": false,
16
+ "single_word": false,
17
+ "special": true
18
+ },
19
+ "2": {
20
+ "content": "[CLS]",
21
+ "lstrip": false,
22
+ "normalized": false,
23
+ "rstrip": false,
24
+ "single_word": false,
25
+ "special": true
26
+ },
27
+ "3": {
28
+ "content": "[SEP]",
29
+ "lstrip": false,
30
+ "normalized": false,
31
+ "rstrip": false,
32
+ "single_word": false,
33
+ "special": true
34
+ },
35
+ "4": {
36
+ "content": "[MASK]",
37
+ "lstrip": false,
38
+ "normalized": false,
39
+ "rstrip": false,
40
+ "single_word": false,
41
+ "special": true
42
+ }
43
+ },
44
+ "clean_up_tokenization_spaces": true,
45
+ "cls_token": "[CLS]",
46
+ "do_basic_tokenize": true,
47
+ "do_lower_case": true,
48
+ "extra_special_tokens": {},
49
+ "mask_token": "[MASK]",
50
+ "max_len": 64,
51
+ "max_length": 64,
52
+ "model_max_length": 64,
53
+ "never_split": null,
54
+ "pad_to_multiple_of": null,
55
+ "pad_token": "[PAD]",
56
+ "pad_token_type_id": 0,
57
+ "padding_side": "right",
58
+ "sep_token": "[SEP]",
59
+ "stride": 0,
60
+ "strip_accents": null,
61
+ "tokenize_chinese_chars": true,
62
+ "tokenizer_class": "BertTokenizer",
63
+ "truncation_side": "right",
64
+ "truncation_strategy": "longest_first",
65
+ "unk_token": "[UNK]"
66
+ }
onnx/vocab.txt ADDED
@@ -0,0 +1,400 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [PAD]
2
+ [UNK]
3
+ [CLS]
4
+ [SEP]
5
+ [MASK]
6
+ http
7
+ https
8
+ www
9
+ /
10
+ .
11
+ :
12
+ &
13
+ ?
14
+ -
15
+ _
16
+ %
17
+ ##0
18
+ ##1
19
+ ##2
20
+ ##3
21
+ ##4
22
+ ##5
23
+ ##6
24
+ ##7
25
+ ##8
26
+ ##9
27
+ 0
28
+ 1
29
+ 2
30
+ 3
31
+ 4
32
+ 5
33
+ 6
34
+ 7
35
+ 8
36
+ 9
37
+ z
38
+ y
39
+ x
40
+ w
41
+ v
42
+ u
43
+ t
44
+ s
45
+ r
46
+ q
47
+ p
48
+ o
49
+ n
50
+ m
51
+ l
52
+ k
53
+ j
54
+ i
55
+ h
56
+ g
57
+ f
58
+ e
59
+ d
60
+ c
61
+ b
62
+ a
63
+ ##z
64
+ ##y
65
+ ##x
66
+ ##w
67
+ ##v
68
+ ##u
69
+ ##t
70
+ ##s
71
+ ##r
72
+ ##q
73
+ ##p
74
+ ##o
75
+ ##n
76
+ ##m
77
+ ##l
78
+ ##k
79
+ ##j
80
+ ##i
81
+ ##h
82
+ ##g
83
+ ##f
84
+ ##e
85
+ ##d
86
+ ##c
87
+ ##b
88
+ ##a
89
+ ##ing
90
+ ##ly
91
+ ##er
92
+ ##in
93
+ ##tion
94
+ ##re
95
+ ##un
96
+ ##ed
97
+ ##al
98
+ ##ter
99
+ ##de
100
+ ##con
101
+ ##an
102
+ ##ti
103
+ ##ic
104
+ ##cal
105
+ ##to
106
+ ##ty
107
+ ##ness
108
+ ##ta
109
+ ##di
110
+ ##la
111
+ ##en
112
+ ##es
113
+ ##ma
114
+ ##per
115
+ ##man
116
+ ##ri
117
+ ##na
118
+ ##ca
119
+ ##ex
120
+ ##dis
121
+ ##ra
122
+ ##ers
123
+ ##non
124
+ ##tions
125
+ ##com
126
+ ##ni
127
+ ##co
128
+ ##pro
129
+ ##tive
130
+ ##mi
131
+ ##pre
132
+ ##der
133
+ ##sub
134
+ ##able
135
+ ##tor
136
+ ##li
137
+ ##si
138
+ ##hy
139
+ ##mo
140
+ ##men
141
+ ##ar
142
+ ##im
143
+ ##ton
144
+ ##sis
145
+ ##tic
146
+ ##da
147
+ ##at
148
+ ##ci
149
+ ##or
150
+ ##lar
151
+ ##car
152
+ ##ment
153
+ ##lo
154
+ ##ac
155
+ ##cy
156
+ ##tu
157
+ ##less
158
+ ##as
159
+ ##um
160
+ ##pa
161
+ ##tal
162
+ ##ry
163
+ ##ro
164
+ ##fi
165
+ ##over
166
+ ##po
167
+ ##is
168
+ ##son
169
+ ##so
170
+ ##do
171
+ ##cu
172
+ ##bi
173
+ ##be
174
+ ##tri
175
+ ##ful
176
+ ##vi
177
+ ##mis
178
+ ##su
179
+ ##va
180
+ ##ous
181
+ ftp
182
+ tel
183
+ file
184
+ ws
185
+ wss
186
+ ssh
187
+ ldaps
188
+ gopher
189
+ view
190
+ source
191
+ about
192
+ chrome
193
+ data
194
+ irc
195
+ magnet
196
+ mms
197
+ redis
198
+ svn
199
+ vnc
200
+ dns
201
+ ntp
202
+ ip
203
+ com
204
+ de
205
+ net
206
+ uk
207
+ cn
208
+ org
209
+ info
210
+ nl
211
+ eu
212
+ ru
213
+ su
214
+ br
215
+ htm
216
+ php
217
+ co
218
+ ly
219
+ bit
220
+ log
221
+ index
222
+ bank
223
+ za
224
+ direct
225
+ mail
226
+ it
227
+ run
228
+ security
229
+ code
230
+ promo
231
+ jpg
232
+ img
233
+ pay
234
+ form
235
+ docs
236
+ host
237
+ ec
238
+ cx
239
+ free
240
+ true
241
+ amp
242
+ blog
243
+ key
244
+ pal
245
+ contact
246
+ online
247
+ abc
248
+ media
249
+ admin
250
+ etc
251
+ login
252
+ cmd
253
+ bin
254
+ web
255
+ verif
256
+ the
257
+ in
258
+ of
259
+ la
260
+ en
261
+ and
262
+ to
263
+ der
264
+ un
265
+ di
266
+ que
267
+ is
268
+ el
269
+ se
270
+ del
271
+ die
272
+ und
273
+ et
274
+ na
275
+ was
276
+ on
277
+ des
278
+ den
279
+ le
280
+ for
281
+ da
282
+ je
283
+ van
284
+ as
285
+ sa
286
+ do
287
+ an
288
+ les
289
+ una
290
+ il
291
+ by
292
+ og
293
+ at
294
+ er
295
+ al
296
+ von
297
+ du
298
+ av
299
+ med
300
+ con
301
+ est
302
+ per
303
+ som
304
+ los
305
+ por
306
+ from
307
+ that
308
+ no
309
+ 11
310
+ es
311
+ ja
312
+ km
313
+ om
314
+ im
315
+ dan
316
+ para
317
+ mit
318
+ El
319
+ his
320
+ ha
321
+ une
322
+ das
323
+ par
324
+ au
325
+ dans
326
+ he
327
+ che
328
+ em
329
+ dem
330
+ til
331
+ се
332
+ han
333
+ las
334
+ della
335
+ new
336
+ um
337
+ si
338
+ var
339
+ are
340
+ op
341
+ zu
342
+ were
343
+ od
344
+ son
345
+ which
346
+ va
347
+ pour
348
+ ve
349
+ sur
350
+ war
351
+ be
352
+ det
353
+ gov
354
+ qui
355
+ az
356
+ te
357
+ had
358
+ also
359
+ so
360
+ am
361
+ has
362
+ dos
363
+ ur
364
+ entre
365
+ lo
366
+ era
367
+ ni
368
+ first
369
+ os
370
+ met
371
+ ou
372
+ all
373
+ aus
374
+ non
375
+ film
376
+ po
377
+ into
378
+ till
379
+ ble
380
+ ka
381
+ mai
382
+ up
383
+ ng
384
+ aux
385
+ ad
386
+ ki
387
+ me
388
+ ze
389
+ can
390
+ out
391
+ wie
392
+ со
393
+ fu
394
+ vom
395
+ nu
396
+ club
397
+ team
398
+ ca
399
+ pe
400
+ ke