5dimension committed on
Commit
2ebe2b1
·
verified ·
1 Parent(s): 119d6f8

Add deep benchmark results (32 test cases, 4-tokenizer comparison)

Browse files
Files changed (1) hide show
  1. deep_benchmark_results.json +1124 -0
deep_benchmark_results.json ADDED
@@ -0,0 +1,1124 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "per_sample": {
3
+ "Sentinel-SUT (61K)": {
4
+ "English": {
5
+ "tokens": 55,
6
+ "bytes": 251,
7
+ "words": 27,
8
+ "fertility": 2.037037037037037,
9
+ "compression": 4.5636363636363635,
10
+ "roundtrip": true
11
+ },
12
+ "French": {
13
+ "tokens": 58,
14
+ "bytes": 221,
15
+ "words": 29,
16
+ "fertility": 2.0,
17
+ "compression": 3.810344827586207,
18
+ "roundtrip": true
19
+ },
20
+ "German": {
21
+ "tokens": 56,
22
+ "bytes": 204,
23
+ "words": 17,
24
+ "fertility": 3.2941176470588234,
25
+ "compression": 3.642857142857143,
26
+ "roundtrip": true
27
+ },
28
+ "Spanish": {
29
+ "tokens": 49,
30
+ "bytes": 209,
31
+ "words": 24,
32
+ "fertility": 2.0416666666666665,
33
+ "compression": 4.26530612244898,
34
+ "roundtrip": true
35
+ },
36
+ "Portuguese": {
37
+ "tokens": 37,
38
+ "bytes": 145,
39
+ "words": 19,
40
+ "fertility": 1.9473684210526316,
41
+ "compression": 3.918918918918919,
42
+ "roundtrip": true
43
+ },
44
+ "Italian": {
45
+ "tokens": 41,
46
+ "bytes": 146,
47
+ "words": 17,
48
+ "fertility": 2.411764705882353,
49
+ "compression": 3.5609756097560976,
50
+ "roundtrip": true
51
+ },
52
+ "Dutch": {
53
+ "tokens": 41,
54
+ "bytes": 123,
55
+ "words": 12,
56
+ "fertility": 3.4166666666666665,
57
+ "compression": 3.0,
58
+ "roundtrip": true
59
+ },
60
+ "Polish": {
61
+ "tokens": 41,
62
+ "bytes": 122,
63
+ "words": 13,
64
+ "fertility": 3.1538461538461537,
65
+ "compression": 2.975609756097561,
66
+ "roundtrip": true
67
+ },
68
+ "Swedish": {
69
+ "tokens": 34,
70
+ "bytes": 109,
71
+ "words": 7,
72
+ "fertility": 4.857142857142857,
73
+ "compression": 3.2058823529411766,
74
+ "roundtrip": true
75
+ },
76
+ "Turkish": {
77
+ "tokens": 42,
78
+ "bytes": 134,
79
+ "words": 13,
80
+ "fertility": 3.230769230769231,
81
+ "compression": 3.1904761904761907,
82
+ "roundtrip": true
83
+ },
84
+ "Ukrainian": {
85
+ "tokens": 44,
86
+ "bytes": 225,
87
+ "words": 13,
88
+ "fertility": 3.3846153846153846,
89
+ "compression": 5.113636363636363,
90
+ "roundtrip": true
91
+ },
92
+ "Chinese": {
93
+ "tokens": 48,
94
+ "bytes": 173,
95
+ "words": 1,
96
+ "fertility": 48.0,
97
+ "compression": 3.6041666666666665,
98
+ "roundtrip": true
99
+ },
100
+ "Japanese": {
101
+ "tokens": 74,
102
+ "bytes": 296,
103
+ "words": 1,
104
+ "fertility": 74.0,
105
+ "compression": 4.0,
106
+ "roundtrip": true
107
+ },
108
+ "Korean": {
109
+ "tokens": 62,
110
+ "bytes": 285,
111
+ "words": 24,
112
+ "fertility": 2.5833333333333335,
113
+ "compression": 4.596774193548387,
114
+ "roundtrip": true
115
+ },
116
+ "Vietnamese": {
117
+ "tokens": 31,
118
+ "bytes": 144,
119
+ "words": 25,
120
+ "fertility": 1.24,
121
+ "compression": 4.645161290322581,
122
+ "roundtrip": true
123
+ },
124
+ "Thai": {
125
+ "tokens": 71,
126
+ "bytes": 339,
127
+ "words": 1,
128
+ "fertility": 71.0,
129
+ "compression": 4.774647887323944,
130
+ "roundtrip": false
131
+ },
132
+ "Hindi": {
133
+ "tokens": 91,
134
+ "bytes": 325,
135
+ "words": 16,
136
+ "fertility": 5.6875,
137
+ "compression": 3.5714285714285716,
138
+ "roundtrip": true
139
+ },
140
+ "Arabic": {
141
+ "tokens": 55,
142
+ "bytes": 265,
143
+ "words": 22,
144
+ "fertility": 2.5,
145
+ "compression": 4.818181818181818,
146
+ "roundtrip": true
147
+ },
148
+ "Russian": {
149
+ "tokens": 72,
150
+ "bytes": 390,
151
+ "words": 19,
152
+ "fertility": 3.789473684210526,
153
+ "compression": 5.416666666666667,
154
+ "roundtrip": true
155
+ },
156
+ "Python": {
157
+ "tokens": 115,
158
+ "bytes": 291,
159
+ "words": 39,
160
+ "fertility": 2.948717948717949,
161
+ "compression": 2.5304347826086957,
162
+ "roundtrip": true
163
+ },
164
+ "JavaScript": {
165
+ "tokens": 121,
166
+ "bytes": 314,
167
+ "words": 40,
168
+ "fertility": 3.025,
169
+ "compression": 2.5950413223140494,
170
+ "roundtrip": true
171
+ },
172
+ "Rust": {
173
+ "tokens": 136,
174
+ "bytes": 289,
175
+ "words": 44,
176
+ "fertility": 3.090909090909091,
177
+ "compression": 2.125,
178
+ "roundtrip": true
179
+ },
180
+ "LaTeX_Complex": {
181
+ "tokens": 143,
182
+ "bytes": 248,
183
+ "words": 26,
184
+ "fertility": 5.5,
185
+ "compression": 1.7342657342657342,
186
+ "roundtrip": true
187
+ },
188
+ "Unicode_Math": {
189
+ "tokens": 58,
190
+ "bytes": 115,
191
+ "words": 17,
192
+ "fertility": 3.411764705882353,
193
+ "compression": 1.9827586206896552,
194
+ "roundtrip": false
195
+ },
196
+ "Mixed_Notation": {
197
+ "tokens": 69,
198
+ "bytes": 138,
199
+ "words": 20,
200
+ "fertility": 3.45,
201
+ "compression": 2.0,
202
+ "roundtrip": false
203
+ },
204
+ "Emoji_Heavy": {
205
+ "tokens": 68,
206
+ "bytes": 155,
207
+ "words": 17,
208
+ "fertility": 4.0,
209
+ "compression": 2.2794117647058822,
210
+ "roundtrip": true
211
+ },
212
+ "Numbers_Heavy": {
213
+ "tokens": 67,
214
+ "bytes": 130,
215
+ "words": 15,
216
+ "fertility": 4.466666666666667,
217
+ "compression": 1.9402985074626866,
218
+ "roundtrip": false
219
+ },
220
+ "URL_Path": {
221
+ "tokens": 46,
222
+ "bytes": 113,
223
+ "words": 1,
224
+ "fertility": 46.0,
225
+ "compression": 2.4565217391304346,
226
+ "roundtrip": true
227
+ },
228
+ "Mixed_Script": {
229
+ "tokens": 44,
230
+ "bytes": 122,
231
+ "words": 17,
232
+ "fertility": 2.588235294117647,
233
+ "compression": 2.772727272727273,
234
+ "roundtrip": true
235
+ },
236
+ "Repetition": {
237
+ "tokens": 17,
238
+ "bytes": 109,
239
+ "words": 14,
240
+ "fertility": 1.2142857142857142,
241
+ "compression": 6.411764705882353,
242
+ "roundtrip": true
243
+ },
244
+ "Whitespace": {
245
+ "tokens": 22,
246
+ "bytes": 54,
247
+ "words": 6,
248
+ "fertility": 3.6666666666666665,
249
+ "compression": 2.4545454545454546,
250
+ "roundtrip": true
251
+ },
252
+ "Empty_Adjacent": {
253
+ "tokens": 14,
254
+ "bytes": 39,
255
+ "words": 5,
256
+ "fertility": 2.8,
257
+ "compression": 2.7857142857142856,
258
+ "roundtrip": true
259
+ }
260
+ },
261
+ "GPT-2 (50K)": {
262
+ "English": {
263
+ "tokens": 34,
264
+ "bytes": 251,
265
+ "words": 27,
266
+ "fertility": 1.2592592592592593,
267
+ "compression": 7.382352941176471,
268
+ "roundtrip": true
269
+ },
270
+ "French": {
271
+ "tokens": 60,
272
+ "bytes": 221,
273
+ "words": 29,
274
+ "fertility": 2.0689655172413794,
275
+ "compression": 3.683333333333333,
276
+ "roundtrip": true
277
+ },
278
+ "German": {
279
+ "tokens": 66,
280
+ "bytes": 204,
281
+ "words": 17,
282
+ "fertility": 3.8823529411764706,
283
+ "compression": 3.090909090909091,
284
+ "roundtrip": true
285
+ },
286
+ "Spanish": {
287
+ "tokens": 66,
288
+ "bytes": 209,
289
+ "words": 24,
290
+ "fertility": 2.75,
291
+ "compression": 3.1666666666666665,
292
+ "roundtrip": true
293
+ },
294
+ "Portuguese": {
295
+ "tokens": 52,
296
+ "bytes": 145,
297
+ "words": 19,
298
+ "fertility": 2.736842105263158,
299
+ "compression": 2.7884615384615383,
300
+ "roundtrip": true
301
+ },
302
+ "Italian": {
303
+ "tokens": 51,
304
+ "bytes": 146,
305
+ "words": 17,
306
+ "fertility": 3.0,
307
+ "compression": 2.8627450980392157,
308
+ "roundtrip": true
309
+ },
310
+ "Dutch": {
311
+ "tokens": 36,
312
+ "bytes": 123,
313
+ "words": 12,
314
+ "fertility": 3.0,
315
+ "compression": 3.4166666666666665,
316
+ "roundtrip": true
317
+ },
318
+ "Polish": {
319
+ "tokens": 53,
320
+ "bytes": 122,
321
+ "words": 13,
322
+ "fertility": 4.076923076923077,
323
+ "compression": 2.30188679245283,
324
+ "roundtrip": true
325
+ },
326
+ "Swedish": {
327
+ "tokens": 36,
328
+ "bytes": 109,
329
+ "words": 7,
330
+ "fertility": 5.142857142857143,
331
+ "compression": 3.0277777777777777,
332
+ "roundtrip": true
333
+ },
334
+ "Turkish": {
335
+ "tokens": 55,
336
+ "bytes": 134,
337
+ "words": 13,
338
+ "fertility": 4.230769230769231,
339
+ "compression": 2.4363636363636365,
340
+ "roundtrip": true
341
+ },
342
+ "Ukrainian": {
343
+ "tokens": 142,
344
+ "bytes": 225,
345
+ "words": 13,
346
+ "fertility": 10.923076923076923,
347
+ "compression": 1.5845070422535212,
348
+ "roundtrip": true
349
+ },
350
+ "Chinese": {
351
+ "tokens": 117,
352
+ "bytes": 173,
353
+ "words": 1,
354
+ "fertility": 117.0,
355
+ "compression": 1.4786324786324787,
356
+ "roundtrip": true
357
+ },
358
+ "Japanese": {
359
+ "tokens": 150,
360
+ "bytes": 296,
361
+ "words": 1,
362
+ "fertility": 150.0,
363
+ "compression": 1.9733333333333334,
364
+ "roundtrip": true
365
+ },
366
+ "Korean": {
367
+ "tokens": 242,
368
+ "bytes": 285,
369
+ "words": 24,
370
+ "fertility": 10.083333333333334,
371
+ "compression": 1.177685950413223,
372
+ "roundtrip": true
373
+ },
374
+ "Vietnamese": {
375
+ "tokens": 95,
376
+ "bytes": 144,
377
+ "words": 25,
378
+ "fertility": 3.8,
379
+ "compression": 1.5157894736842106,
380
+ "roundtrip": true
381
+ },
382
+ "Thai": {
383
+ "tokens": 226,
384
+ "bytes": 339,
385
+ "words": 1,
386
+ "fertility": 226.0,
387
+ "compression": 1.5,
388
+ "roundtrip": true
389
+ },
390
+ "Hindi": {
391
+ "tokens": 203,
392
+ "bytes": 325,
393
+ "words": 16,
394
+ "fertility": 12.6875,
395
+ "compression": 1.6009852216748768,
396
+ "roundtrip": true
397
+ },
398
+ "Arabic": {
399
+ "tokens": 142,
400
+ "bytes": 265,
401
+ "words": 22,
402
+ "fertility": 6.454545454545454,
403
+ "compression": 1.8661971830985915,
404
+ "roundtrip": true
405
+ },
406
+ "Russian": {
407
+ "tokens": 228,
408
+ "bytes": 390,
409
+ "words": 19,
410
+ "fertility": 12.0,
411
+ "compression": 1.7105263157894737,
412
+ "roundtrip": true
413
+ },
414
+ "Python": {
415
+ "tokens": 135,
416
+ "bytes": 291,
417
+ "words": 39,
418
+ "fertility": 3.4615384615384617,
419
+ "compression": 2.1555555555555554,
420
+ "roundtrip": true
421
+ },
422
+ "JavaScript": {
423
+ "tokens": 118,
424
+ "bytes": 314,
425
+ "words": 40,
426
+ "fertility": 2.95,
427
+ "compression": 2.6610169491525424,
428
+ "roundtrip": true
429
+ },
430
+ "Rust": {
431
+ "tokens": 144,
432
+ "bytes": 289,
433
+ "words": 44,
434
+ "fertility": 3.272727272727273,
435
+ "compression": 2.0069444444444446,
436
+ "roundtrip": true
437
+ },
438
+ "LaTeX_Complex": {
439
+ "tokens": 130,
440
+ "bytes": 248,
441
+ "words": 26,
442
+ "fertility": 5.0,
443
+ "compression": 1.9076923076923078,
444
+ "roundtrip": true
445
+ },
446
+ "Unicode_Math": {
447
+ "tokens": 74,
448
+ "bytes": 115,
449
+ "words": 17,
450
+ "fertility": 4.352941176470588,
451
+ "compression": 1.554054054054054,
452
+ "roundtrip": true
453
+ },
454
+ "Mixed_Notation": {
455
+ "tokens": 78,
456
+ "bytes": 138,
457
+ "words": 20,
458
+ "fertility": 3.9,
459
+ "compression": 1.7692307692307692,
460
+ "roundtrip": true
461
+ },
462
+ "Emoji_Heavy": {
463
+ "tokens": 70,
464
+ "bytes": 155,
465
+ "words": 17,
466
+ "fertility": 4.117647058823529,
467
+ "compression": 2.2142857142857144,
468
+ "roundtrip": true
469
+ },
470
+ "Numbers_Heavy": {
471
+ "tokens": 67,
472
+ "bytes": 130,
473
+ "words": 15,
474
+ "fertility": 4.466666666666667,
475
+ "compression": 1.9402985074626866,
476
+ "roundtrip": true
477
+ },
478
+ "URL_Path": {
479
+ "tokens": 39,
480
+ "bytes": 113,
481
+ "words": 1,
482
+ "fertility": 39.0,
483
+ "compression": 2.8974358974358974,
484
+ "roundtrip": true
485
+ },
486
+ "Mixed_Script": {
487
+ "tokens": 47,
488
+ "bytes": 122,
489
+ "words": 17,
490
+ "fertility": 2.764705882352941,
491
+ "compression": 2.595744680851064,
492
+ "roundtrip": true
493
+ },
494
+ "Repetition": {
495
+ "tokens": 14,
496
+ "bytes": 109,
497
+ "words": 14,
498
+ "fertility": 1.0,
499
+ "compression": 7.785714285714286,
500
+ "roundtrip": true
501
+ },
502
+ "Whitespace": {
503
+ "tokens": 24,
504
+ "bytes": 54,
505
+ "words": 6,
506
+ "fertility": 4.0,
507
+ "compression": 2.25,
508
+ "roundtrip": true
509
+ },
510
+ "Empty_Adjacent": {
511
+ "tokens": 20,
512
+ "bytes": 39,
513
+ "words": 5,
514
+ "fertility": 4.0,
515
+ "compression": 1.95,
516
+ "roundtrip": true
517
+ }
518
+ },
519
+ "Gemma (256K)": {
520
+ "English": {
521
+ "tokens": 32,
522
+ "bytes": 251,
523
+ "words": 27,
524
+ "fertility": 1.1851851851851851,
525
+ "compression": 7.84375,
526
+ "roundtrip": true
527
+ },
528
+ "French": {
529
+ "tokens": 41,
530
+ "bytes": 221,
531
+ "words": 29,
532
+ "fertility": 1.4137931034482758,
533
+ "compression": 5.390243902439025,
534
+ "roundtrip": true
535
+ },
536
+ "German": {
537
+ "tokens": 39,
538
+ "bytes": 204,
539
+ "words": 17,
540
+ "fertility": 2.2941176470588234,
541
+ "compression": 5.230769230769231,
542
+ "roundtrip": true
543
+ },
544
+ "Spanish": {
545
+ "tokens": 33,
546
+ "bytes": 209,
547
+ "words": 24,
548
+ "fertility": 1.375,
549
+ "compression": 6.333333333333333,
550
+ "roundtrip": true
551
+ },
552
+ "Portuguese": {
553
+ "tokens": 28,
554
+ "bytes": 145,
555
+ "words": 19,
556
+ "fertility": 1.4736842105263157,
557
+ "compression": 5.178571428571429,
558
+ "roundtrip": true
559
+ },
560
+ "Italian": {
561
+ "tokens": 26,
562
+ "bytes": 146,
563
+ "words": 17,
564
+ "fertility": 1.5294117647058822,
565
+ "compression": 5.615384615384615,
566
+ "roundtrip": true
567
+ },
568
+ "Dutch": {
569
+ "tokens": 27,
570
+ "bytes": 123,
571
+ "words": 12,
572
+ "fertility": 2.25,
573
+ "compression": 4.555555555555555,
574
+ "roundtrip": true
575
+ },
576
+ "Polish": {
577
+ "tokens": 33,
578
+ "bytes": 122,
579
+ "words": 13,
580
+ "fertility": 2.5384615384615383,
581
+ "compression": 3.696969696969697,
582
+ "roundtrip": true
583
+ },
584
+ "Swedish": {
585
+ "tokens": 25,
586
+ "bytes": 109,
587
+ "words": 7,
588
+ "fertility": 3.5714285714285716,
589
+ "compression": 4.36,
590
+ "roundtrip": true
591
+ },
592
+ "Turkish": {
593
+ "tokens": 30,
594
+ "bytes": 134,
595
+ "words": 13,
596
+ "fertility": 2.3076923076923075,
597
+ "compression": 4.466666666666667,
598
+ "roundtrip": true
599
+ },
600
+ "Ukrainian": {
601
+ "tokens": 35,
602
+ "bytes": 225,
603
+ "words": 13,
604
+ "fertility": 2.6923076923076925,
605
+ "compression": 6.428571428571429,
606
+ "roundtrip": true
607
+ },
608
+ "Chinese": {
609
+ "tokens": 33,
610
+ "bytes": 173,
611
+ "words": 1,
612
+ "fertility": 33.0,
613
+ "compression": 5.242424242424242,
614
+ "roundtrip": true
615
+ },
616
+ "Japanese": {
617
+ "tokens": 55,
618
+ "bytes": 296,
619
+ "words": 1,
620
+ "fertility": 55.0,
621
+ "compression": 5.381818181818182,
622
+ "roundtrip": true
623
+ },
624
+ "Korean": {
625
+ "tokens": 74,
626
+ "bytes": 285,
627
+ "words": 24,
628
+ "fertility": 3.0833333333333335,
629
+ "compression": 3.8513513513513513,
630
+ "roundtrip": true
631
+ },
632
+ "Vietnamese": {
633
+ "tokens": 28,
634
+ "bytes": 144,
635
+ "words": 25,
636
+ "fertility": 1.12,
637
+ "compression": 5.142857142857143,
638
+ "roundtrip": true
639
+ },
640
+ "Thai": {
641
+ "tokens": 51,
642
+ "bytes": 339,
643
+ "words": 1,
644
+ "fertility": 51.0,
645
+ "compression": 6.647058823529412,
646
+ "roundtrip": true
647
+ },
648
+ "Hindi": {
649
+ "tokens": 54,
650
+ "bytes": 325,
651
+ "words": 16,
652
+ "fertility": 3.375,
653
+ "compression": 6.018518518518518,
654
+ "roundtrip": true
655
+ },
656
+ "Arabic": {
657
+ "tokens": 51,
658
+ "bytes": 265,
659
+ "words": 22,
660
+ "fertility": 2.3181818181818183,
661
+ "compression": 5.196078431372549,
662
+ "roundtrip": true
663
+ },
664
+ "Russian": {
665
+ "tokens": 43,
666
+ "bytes": 390,
667
+ "words": 19,
668
+ "fertility": 2.263157894736842,
669
+ "compression": 9.069767441860465,
670
+ "roundtrip": true
671
+ },
672
+ "Python": {
673
+ "tokens": 115,
674
+ "bytes": 291,
675
+ "words": 39,
676
+ "fertility": 2.948717948717949,
677
+ "compression": 2.5304347826086957,
678
+ "roundtrip": true
679
+ },
680
+ "JavaScript": {
681
+ "tokens": 105,
682
+ "bytes": 314,
683
+ "words": 40,
684
+ "fertility": 2.625,
685
+ "compression": 2.9904761904761905,
686
+ "roundtrip": true
687
+ },
688
+ "Rust": {
689
+ "tokens": 142,
690
+ "bytes": 289,
691
+ "words": 44,
692
+ "fertility": 3.227272727272727,
693
+ "compression": 2.035211267605634,
694
+ "roundtrip": true
695
+ },
696
+ "LaTeX_Complex": {
697
+ "tokens": 110,
698
+ "bytes": 248,
699
+ "words": 26,
700
+ "fertility": 4.230769230769231,
701
+ "compression": 2.2545454545454544,
702
+ "roundtrip": true
703
+ },
704
+ "Unicode_Math": {
705
+ "tokens": 61,
706
+ "bytes": 115,
707
+ "words": 17,
708
+ "fertility": 3.588235294117647,
709
+ "compression": 1.8852459016393444,
710
+ "roundtrip": true
711
+ },
712
+ "Mixed_Notation": {
713
+ "tokens": 61,
714
+ "bytes": 138,
715
+ "words": 20,
716
+ "fertility": 3.05,
717
+ "compression": 2.262295081967213,
718
+ "roundtrip": true
719
+ },
720
+ "Emoji_Heavy": {
721
+ "tokens": 42,
722
+ "bytes": 155,
723
+ "words": 17,
724
+ "fertility": 2.4705882352941178,
725
+ "compression": 3.6904761904761907,
726
+ "roundtrip": true
727
+ },
728
+ "Numbers_Heavy": {
729
+ "tokens": 115,
730
+ "bytes": 130,
731
+ "words": 15,
732
+ "fertility": 7.666666666666667,
733
+ "compression": 1.1304347826086956,
734
+ "roundtrip": true
735
+ },
736
+ "URL_Path": {
737
+ "tokens": 34,
738
+ "bytes": 113,
739
+ "words": 1,
740
+ "fertility": 34.0,
741
+ "compression": 3.323529411764706,
742
+ "roundtrip": true
743
+ },
744
+ "Mixed_Script": {
745
+ "tokens": 33,
746
+ "bytes": 122,
747
+ "words": 17,
748
+ "fertility": 1.9411764705882353,
749
+ "compression": 3.696969696969697,
750
+ "roundtrip": true
751
+ },
752
+ "Repetition": {
753
+ "tokens": 14,
754
+ "bytes": 109,
755
+ "words": 14,
756
+ "fertility": 1.0,
757
+ "compression": 7.785714285714286,
758
+ "roundtrip": true
759
+ },
760
+ "Whitespace": {
761
+ "tokens": 16,
762
+ "bytes": 54,
763
+ "words": 6,
764
+ "fertility": 2.6666666666666665,
765
+ "compression": 3.375,
766
+ "roundtrip": true
767
+ },
768
+ "Empty_Adjacent": {
769
+ "tokens": 14,
770
+ "bytes": 39,
771
+ "words": 5,
772
+ "fertility": 2.8,
773
+ "compression": 2.7857142857142856,
774
+ "roundtrip": true
775
+ }
776
+ },
777
+ "Qwen2 (152K)": {
778
+ "English": {
779
+ "tokens": 31,
780
+ "bytes": 251,
781
+ "words": 27,
782
+ "fertility": 1.1481481481481481,
783
+ "compression": 8.096774193548388,
784
+ "roundtrip": true
785
+ },
786
+ "French": {
787
+ "tokens": 50,
788
+ "bytes": 221,
789
+ "words": 29,
790
+ "fertility": 1.7241379310344827,
791
+ "compression": 4.42,
792
+ "roundtrip": true
793
+ },
794
+ "German": {
795
+ "tokens": 50,
796
+ "bytes": 204,
797
+ "words": 17,
798
+ "fertility": 2.9411764705882355,
799
+ "compression": 4.08,
800
+ "roundtrip": true
801
+ },
802
+ "Spanish": {
803
+ "tokens": 46,
804
+ "bytes": 209,
805
+ "words": 24,
806
+ "fertility": 1.9166666666666667,
807
+ "compression": 4.543478260869565,
808
+ "roundtrip": true
809
+ },
810
+ "Portuguese": {
811
+ "tokens": 34,
812
+ "bytes": 145,
813
+ "words": 19,
814
+ "fertility": 1.7894736842105263,
815
+ "compression": 4.264705882352941,
816
+ "roundtrip": true
817
+ },
818
+ "Italian": {
819
+ "tokens": 39,
820
+ "bytes": 146,
821
+ "words": 17,
822
+ "fertility": 2.2941176470588234,
823
+ "compression": 3.7435897435897436,
824
+ "roundtrip": true
825
+ },
826
+ "Dutch": {
827
+ "tokens": 30,
828
+ "bytes": 123,
829
+ "words": 12,
830
+ "fertility": 2.5,
831
+ "compression": 4.1,
832
+ "roundtrip": true
833
+ },
834
+ "Polish": {
835
+ "tokens": 39,
836
+ "bytes": 122,
837
+ "words": 13,
838
+ "fertility": 3.0,
839
+ "compression": 3.128205128205128,
840
+ "roundtrip": true
841
+ },
842
+ "Swedish": {
843
+ "tokens": 30,
844
+ "bytes": 109,
845
+ "words": 7,
846
+ "fertility": 4.285714285714286,
847
+ "compression": 3.6333333333333333,
848
+ "roundtrip": true
849
+ },
850
+ "Turkish": {
851
+ "tokens": 40,
852
+ "bytes": 134,
853
+ "words": 13,
854
+ "fertility": 3.076923076923077,
855
+ "compression": 3.35,
856
+ "roundtrip": true
857
+ },
858
+ "Ukrainian": {
859
+ "tokens": 57,
860
+ "bytes": 225,
861
+ "words": 13,
862
+ "fertility": 4.384615384615385,
863
+ "compression": 3.9473684210526314,
864
+ "roundtrip": true
865
+ },
866
+ "Chinese": {
867
+ "tokens": 33,
868
+ "bytes": 173,
869
+ "words": 1,
870
+ "fertility": 33.0,
871
+ "compression": 5.242424242424242,
872
+ "roundtrip": true
873
+ },
874
+ "Japanese": {
875
+ "tokens": 77,
876
+ "bytes": 296,
877
+ "words": 1,
878
+ "fertility": 77.0,
879
+ "compression": 3.844155844155844,
880
+ "roundtrip": true
881
+ },
882
+ "Korean": {
883
+ "tokens": 70,
884
+ "bytes": 285,
885
+ "words": 24,
886
+ "fertility": 2.9166666666666665,
887
+ "compression": 4.071428571428571,
888
+ "roundtrip": true
889
+ },
890
+ "Vietnamese": {
891
+ "tokens": 32,
892
+ "bytes": 144,
893
+ "words": 25,
894
+ "fertility": 1.28,
895
+ "compression": 4.5,
896
+ "roundtrip": true
897
+ },
898
+ "Thai": {
899
+ "tokens": 68,
900
+ "bytes": 339,
901
+ "words": 1,
902
+ "fertility": 68.0,
903
+ "compression": 4.985294117647059,
904
+ "roundtrip": true
905
+ },
906
+ "Hindi": {
907
+ "tokens": 115,
908
+ "bytes": 325,
909
+ "words": 16,
910
+ "fertility": 7.1875,
911
+ "compression": 2.8260869565217392,
912
+ "roundtrip": true
913
+ },
914
+ "Arabic": {
915
+ "tokens": 54,
916
+ "bytes": 265,
917
+ "words": 22,
918
+ "fertility": 2.4545454545454546,
919
+ "compression": 4.907407407407407,
920
+ "roundtrip": true
921
+ },
922
+ "Russian": {
923
+ "tokens": 65,
924
+ "bytes": 390,
925
+ "words": 19,
926
+ "fertility": 3.4210526315789473,
927
+ "compression": 6.0,
928
+ "roundtrip": true
929
+ },
930
+ "Python": {
931
+ "tokens": 97,
932
+ "bytes": 291,
933
+ "words": 39,
934
+ "fertility": 2.4871794871794872,
935
+ "compression": 3.0,
936
+ "roundtrip": true
937
+ },
938
+ "JavaScript": {
939
+ "tokens": 91,
940
+ "bytes": 314,
941
+ "words": 40,
942
+ "fertility": 2.275,
943
+ "compression": 3.4505494505494507,
944
+ "roundtrip": true
945
+ },
946
+ "Rust": {
947
+ "tokens": 132,
948
+ "bytes": 289,
949
+ "words": 44,
950
+ "fertility": 3.0,
951
+ "compression": 2.1893939393939394,
952
+ "roundtrip": true
953
+ },
954
+ "LaTeX_Complex": {
955
+ "tokens": 114,
956
+ "bytes": 248,
957
+ "words": 26,
958
+ "fertility": 4.384615384615385,
959
+ "compression": 2.175438596491228,
960
+ "roundtrip": true
961
+ },
962
+ "Unicode_Math": {
963
+ "tokens": 64,
964
+ "bytes": 115,
965
+ "words": 17,
966
+ "fertility": 3.764705882352941,
967
+ "compression": 1.796875,
968
+ "roundtrip": true
969
+ },
970
+ "Mixed_Notation": {
971
+ "tokens": 66,
972
+ "bytes": 138,
973
+ "words": 20,
974
+ "fertility": 3.3,
975
+ "compression": 2.090909090909091,
976
+ "roundtrip": true
977
+ },
978
+ "Emoji_Heavy": {
979
+ "tokens": 46,
980
+ "bytes": 155,
981
+ "words": 17,
982
+ "fertility": 2.7058823529411766,
983
+ "compression": 3.369565217391304,
984
+ "roundtrip": true
985
+ },
986
+ "Numbers_Heavy": {
987
+ "tokens": 114,
988
+ "bytes": 130,
989
+ "words": 15,
990
+ "fertility": 7.6,
991
+ "compression": 1.1403508771929824,
992
+ "roundtrip": true
993
+ },
994
+ "URL_Path": {
995
+ "tokens": 30,
996
+ "bytes": 113,
997
+ "words": 1,
998
+ "fertility": 30.0,
999
+ "compression": 3.7666666666666666,
1000
+ "roundtrip": true
1001
+ },
1002
+ "Mixed_Script": {
1003
+ "tokens": 37,
1004
+ "bytes": 122,
1005
+ "words": 17,
1006
+ "fertility": 2.176470588235294,
1007
+ "compression": 3.2972972972972974,
1008
+ "roundtrip": true
1009
+ },
1010
+ "Repetition": {
1011
+ "tokens": 14,
1012
+ "bytes": 109,
1013
+ "words": 14,
1014
+ "fertility": 1.0,
1015
+ "compression": 7.785714285714286,
1016
+ "roundtrip": true
1017
+ },
1018
+ "Whitespace": {
1019
+ "tokens": 15,
1020
+ "bytes": 54,
1021
+ "words": 6,
1022
+ "fertility": 2.5,
1023
+ "compression": 3.6,
1024
+ "roundtrip": true
1025
+ },
1026
+ "Empty_Adjacent": {
1027
+ "tokens": 14,
1028
+ "bytes": 39,
1029
+ "words": 5,
1030
+ "fertility": 2.8,
1031
+ "compression": 2.7857142857142856,
1032
+ "roundtrip": true
1033
+ }
1034
+ }
1035
+ },
1036
+ "overall": {
1037
+ "Sentinel-SUT (61K)": {
1038
+ "avg_fertility": 10.210548371110242,
1039
+ "std_fertility": 19.290933783132925,
1040
+ "median_fertility": 3.262443438914027,
1041
+ "avg_compression": 3.4607235916418793,
1042
+ "median_compression": 3.3834289813486373,
1043
+ "fairness": 0.04928309414874054,
1044
+ "wins": 2.0,
1045
+ "total_tests": 32.0
1046
+ },
1047
+ "GPT-2 (50K)": {
1048
+ "avg_fertility": 20.730707859469526,
1049
+ "std_fertility": 48.701493147558395,
1050
+ "median_fertility": 4.038461538461538,
1051
+ "avg_compression": 2.5703998033314455,
1052
+ "median_compression": 2.184920634920635,
1053
+ "fairness": 0.020120119873081224,
1054
+ "wins": 0.0,
1055
+ "total_tests": 32.0
1056
+ },
1057
+ "Gemma (256K)": {
1058
+ "avg_fertility": 7.687682759598745,
1059
+ "std_fertility": 13.936255807387061,
1060
+ "median_fertility": 2.645833333333333,
1061
+ "avg_compression": 4.543616791377602,
1062
+ "median_compression": 4.511111111111111,
1063
+ "fairness": 0.0669511832748223,
1064
+ "wins": 20.0,
1065
+ "total_tests": 32.0
1066
+ },
1067
+ "Qwen2 (152K)": {
1068
+ "avg_fertility": 9.134830991971093,
1069
+ "std_fertility": 17.843274072731045,
1070
+ "median_fertility": 2.928921568627451,
1071
+ "avg_compression": 3.8791477128080354,
1072
+ "median_compression": 3.755128205128205,
1073
+ "fairness": 0.05306933371240114,
1074
+ "wins": 6.0,
1075
+ "total_tests": 32.0
1076
+ }
1077
+ },
1078
+ "categories": {
1079
+ "European": [
1080
+ "English",
1081
+ "French",
1082
+ "German",
1083
+ "Spanish",
1084
+ "Portuguese",
1085
+ "Italian",
1086
+ "Dutch",
1087
+ "Polish",
1088
+ "Swedish",
1089
+ "Turkish",
1090
+ "Ukrainian"
1091
+ ],
1092
+ "Asian": [
1093
+ "Chinese",
1094
+ "Japanese",
1095
+ "Korean",
1096
+ "Vietnamese",
1097
+ "Thai",
1098
+ "Hindi"
1099
+ ],
1100
+ "Semitic/RTL": [
1101
+ "Arabic",
1102
+ "Russian"
1103
+ ],
1104
+ "Code": [
1105
+ "Python",
1106
+ "JavaScript",
1107
+ "Rust"
1108
+ ],
1109
+ "Mathematics": [
1110
+ "LaTeX_Complex",
1111
+ "Unicode_Math",
1112
+ "Mixed_Notation"
1113
+ ],
1114
+ "Edge Cases": [
1115
+ "Emoji_Heavy",
1116
+ "Numbers_Heavy",
1117
+ "URL_Path",
1118
+ "Mixed_Script",
1119
+ "Repetition",
1120
+ "Whitespace",
1121
+ "Empty_Adjacent"
1122
+ ]
1123
+ }
1124
+ }