{ "per_sample": { "Sentinel-SUT (61K)": { "English": { "tokens": 55, "bytes": 251, "words": 27, "fertility": 2.037037037037037, "compression": 4.5636363636363635, "roundtrip": true }, "French": { "tokens": 58, "bytes": 221, "words": 29, "fertility": 2.0, "compression": 3.810344827586207, "roundtrip": true }, "German": { "tokens": 56, "bytes": 204, "words": 17, "fertility": 3.2941176470588234, "compression": 3.642857142857143, "roundtrip": true }, "Spanish": { "tokens": 49, "bytes": 209, "words": 24, "fertility": 2.0416666666666665, "compression": 4.26530612244898, "roundtrip": true }, "Portuguese": { "tokens": 37, "bytes": 145, "words": 19, "fertility": 1.9473684210526316, "compression": 3.918918918918919, "roundtrip": true }, "Italian": { "tokens": 41, "bytes": 146, "words": 17, "fertility": 2.411764705882353, "compression": 3.5609756097560976, "roundtrip": true }, "Dutch": { "tokens": 41, "bytes": 123, "words": 12, "fertility": 3.4166666666666665, "compression": 3.0, "roundtrip": true }, "Polish": { "tokens": 41, "bytes": 122, "words": 13, "fertility": 3.1538461538461537, "compression": 2.975609756097561, "roundtrip": true }, "Swedish": { "tokens": 34, "bytes": 109, "words": 7, "fertility": 4.857142857142857, "compression": 3.2058823529411766, "roundtrip": true }, "Turkish": { "tokens": 42, "bytes": 134, "words": 13, "fertility": 3.230769230769231, "compression": 3.1904761904761907, "roundtrip": true }, "Ukrainian": { "tokens": 44, "bytes": 225, "words": 13, "fertility": 3.3846153846153846, "compression": 5.113636363636363, "roundtrip": true }, "Chinese": { "tokens": 48, "bytes": 173, "words": 1, "fertility": 48.0, "compression": 3.6041666666666665, "roundtrip": true }, "Japanese": { "tokens": 74, "bytes": 296, "words": 1, "fertility": 74.0, "compression": 4.0, "roundtrip": true }, "Korean": { "tokens": 62, "bytes": 285, "words": 24, "fertility": 2.5833333333333335, "compression": 4.596774193548387, "roundtrip": true }, "Vietnamese": { "tokens": 31, "bytes": 144, "words": 25, "fertility": 1.24, "compression": 4.645161290322581, "roundtrip": true }, "Thai": { "tokens": 71, "bytes": 339, "words": 1, "fertility": 71.0, "compression": 4.774647887323944, "roundtrip": false }, "Hindi": { "tokens": 91, "bytes": 325, "words": 16, "fertility": 5.6875, "compression": 3.5714285714285716, "roundtrip": true }, "Arabic": { "tokens": 55, "bytes": 265, "words": 22, "fertility": 2.5, "compression": 4.818181818181818, "roundtrip": true }, "Russian": { "tokens": 72, "bytes": 390, "words": 19, "fertility": 3.789473684210526, "compression": 5.416666666666667, "roundtrip": true }, "Python": { "tokens": 115, "bytes": 291, "words": 39, "fertility": 2.948717948717949, "compression": 2.5304347826086957, "roundtrip": true }, "JavaScript": { "tokens": 121, "bytes": 314, "words": 40, "fertility": 3.025, "compression": 2.5950413223140494, "roundtrip": true }, "Rust": { "tokens": 136, "bytes": 289, "words": 44, "fertility": 3.090909090909091, "compression": 2.125, "roundtrip": true }, "LaTeX_Complex": { "tokens": 143, "bytes": 248, "words": 26, "fertility": 5.5, "compression": 1.7342657342657342, "roundtrip": true }, "Unicode_Math": { "tokens": 58, "bytes": 115, "words": 17, "fertility": 3.411764705882353, "compression": 1.9827586206896552, "roundtrip": false }, "Mixed_Notation": { "tokens": 69, "bytes": 138, "words": 20, "fertility": 3.45, "compression": 2.0, "roundtrip": false }, "Emoji_Heavy": { "tokens": 68, "bytes": 155, "words": 17, "fertility": 4.0, "compression": 2.2794117647058822, "roundtrip": true }, "Numbers_Heavy": { "tokens": 67, "bytes": 130, "words": 15, "fertility": 4.466666666666667, "compression": 1.9402985074626866, "roundtrip": false }, "URL_Path": { "tokens": 46, "bytes": 113, "words": 1, "fertility": 46.0, "compression": 2.4565217391304346, "roundtrip": true }, "Mixed_Script": { "tokens": 44, "bytes": 122, "words": 17, "fertility": 2.588235294117647, "compression": 2.772727272727273, "roundtrip": true }, "Repetition": { "tokens": 17, "bytes": 109, "words": 14, "fertility": 1.2142857142857142, "compression": 6.411764705882353, "roundtrip": true }, "Whitespace": { "tokens": 22, "bytes": 54, "words": 6, "fertility": 3.6666666666666665, "compression": 2.4545454545454546, "roundtrip": true }, "Empty_Adjacent": { "tokens": 14, "bytes": 39, "words": 5, "fertility": 2.8, "compression": 2.7857142857142856, "roundtrip": true } }, "GPT-2 (50K)": { "English": { "tokens": 34, "bytes": 251, "words": 27, "fertility": 1.2592592592592593, "compression": 7.382352941176471, "roundtrip": true }, "French": { "tokens": 60, "bytes": 221, "words": 29, "fertility": 2.0689655172413794, "compression": 3.683333333333333, "roundtrip": true }, "German": { "tokens": 66, "bytes": 204, "words": 17, "fertility": 3.8823529411764706, "compression": 3.090909090909091, "roundtrip": true }, "Spanish": { "tokens": 66, "bytes": 209, "words": 24, "fertility": 2.75, "compression": 3.1666666666666665, "roundtrip": true }, "Portuguese": { "tokens": 52, "bytes": 145, "words": 19, "fertility": 2.736842105263158, "compression": 2.7884615384615383, "roundtrip": true }, "Italian": { "tokens": 51, "bytes": 146, "words": 17, "fertility": 3.0, "compression": 2.8627450980392157, "roundtrip": true }, "Dutch": { "tokens": 36, "bytes": 123, "words": 12, "fertility": 3.0, "compression": 3.4166666666666665, "roundtrip": true }, "Polish": { "tokens": 53, "bytes": 122, "words": 13, "fertility": 4.076923076923077, "compression": 2.30188679245283, "roundtrip": true }, "Swedish": { "tokens": 36, "bytes": 109, "words": 7, "fertility": 5.142857142857143, "compression": 3.0277777777777777, "roundtrip": true }, "Turkish": { "tokens": 55, "bytes": 134, "words": 13, "fertility": 4.230769230769231, "compression": 2.4363636363636365, "roundtrip": true }, "Ukrainian": { "tokens": 142, "bytes": 225, "words": 13, "fertility": 10.923076923076923, "compression": 1.5845070422535212, "roundtrip": true }, "Chinese": { "tokens": 117, "bytes": 173, "words": 1, "fertility": 117.0, "compression": 1.4786324786324787, "roundtrip": true }, "Japanese": { "tokens": 150, "bytes": 296, "words": 1, "fertility": 150.0, "compression": 1.9733333333333334, "roundtrip": true }, "Korean": { "tokens": 242, "bytes": 285, "words": 24, "fertility": 10.083333333333334, "compression": 1.177685950413223, "roundtrip": true }, "Vietnamese": { "tokens": 95, "bytes": 144, "words": 25, "fertility": 3.8, "compression": 1.5157894736842106, "roundtrip": true }, "Thai": { "tokens": 226, "bytes": 339, "words": 1, "fertility": 226.0, "compression": 1.5, "roundtrip": true }, "Hindi": { "tokens": 203, "bytes": 325, "words": 16, "fertility": 12.6875, "compression": 1.6009852216748768, "roundtrip": true }, "Arabic": { "tokens": 142, "bytes": 265, "words": 22, "fertility": 6.454545454545454, "compression": 1.8661971830985915, "roundtrip": true }, "Russian": { "tokens": 228, "bytes": 390, "words": 19, "fertility": 12.0, "compression": 1.7105263157894737, "roundtrip": true }, "Python": { "tokens": 135, "bytes": 291, "words": 39, "fertility": 3.4615384615384617, "compression": 2.1555555555555554, "roundtrip": true }, "JavaScript": { "tokens": 118, "bytes": 314, "words": 40, "fertility": 2.95, "compression": 2.6610169491525424, "roundtrip": true }, "Rust": { "tokens": 144, "bytes": 289, "words": 44, "fertility": 3.272727272727273, "compression": 2.0069444444444446, "roundtrip": true }, "LaTeX_Complex": { "tokens": 130, "bytes": 248, "words": 26, "fertility": 5.0, "compression": 1.9076923076923078, "roundtrip": true }, "Unicode_Math": { "tokens": 74, "bytes": 115, "words": 17, "fertility": 4.352941176470588, "compression": 1.554054054054054, "roundtrip": true }, "Mixed_Notation": { "tokens": 78, "bytes": 138, "words": 20, "fertility": 3.9, "compression": 1.7692307692307692, "roundtrip": true }, "Emoji_Heavy": { "tokens": 70, "bytes": 155, "words": 17, "fertility": 4.117647058823529, "compression": 2.2142857142857144, "roundtrip": true }, "Numbers_Heavy": { "tokens": 67, "bytes": 130, "words": 15, "fertility": 4.466666666666667, "compression": 1.9402985074626866, "roundtrip": true }, "URL_Path": { "tokens": 39, "bytes": 113, "words": 1, "fertility": 39.0, "compression": 2.8974358974358974, "roundtrip": true }, "Mixed_Script": { "tokens": 47, "bytes": 122, "words": 17, "fertility": 2.764705882352941, "compression": 2.595744680851064, "roundtrip": true }, "Repetition": { "tokens": 14, "bytes": 109, "words": 14, "fertility": 1.0, "compression": 7.785714285714286, "roundtrip": true }, "Whitespace": { "tokens": 24, "bytes": 54, "words": 6, "fertility": 4.0, "compression": 2.25, "roundtrip": true }, "Empty_Adjacent": { "tokens": 20, "bytes": 39, "words": 5, "fertility": 4.0, "compression": 1.95, "roundtrip": true } }, "Gemma (256K)": { "English": { "tokens": 32, "bytes": 251, "words": 27, "fertility": 1.1851851851851851, "compression": 7.84375, "roundtrip": true }, "French": { "tokens": 41, "bytes": 221, "words": 29, "fertility": 1.4137931034482758, "compression": 5.390243902439025, "roundtrip": true }, "German": { "tokens": 39, "bytes": 204, "words": 17, "fertility": 2.2941176470588234, "compression": 5.230769230769231, "roundtrip": true }, "Spanish": { "tokens": 33, "bytes": 209, "words": 24, "fertility": 1.375, "compression": 6.333333333333333, "roundtrip": true }, "Portuguese": { "tokens": 28, "bytes": 145, "words": 19, "fertility": 1.4736842105263157, "compression": 5.178571428571429, "roundtrip": true }, "Italian": { "tokens": 26, "bytes": 146, "words": 17, "fertility": 1.5294117647058822, "compression": 5.615384615384615, "roundtrip": true }, "Dutch": { "tokens": 27, "bytes": 123, "words": 12, "fertility": 2.25, "compression": 4.555555555555555, "roundtrip": true }, "Polish": { "tokens": 33, "bytes": 122, "words": 13, "fertility": 2.5384615384615383, "compression": 3.696969696969697, "roundtrip": true }, "Swedish": { "tokens": 25, "bytes": 109, "words": 7, "fertility": 3.5714285714285716, "compression": 4.36, "roundtrip": true }, "Turkish": { "tokens": 30, "bytes": 134, "words": 13, "fertility": 2.3076923076923075, "compression": 4.466666666666667, "roundtrip": true }, "Ukrainian": { "tokens": 35, "bytes": 225, "words": 13, "fertility": 2.6923076923076925, "compression": 6.428571428571429, "roundtrip": true }, "Chinese": { "tokens": 33, "bytes": 173, "words": 1, "fertility": 33.0, "compression": 5.242424242424242, "roundtrip": true }, "Japanese": { "tokens": 55, "bytes": 296, "words": 1, "fertility": 55.0, "compression": 5.381818181818182, "roundtrip": true }, "Korean": { "tokens": 74, "bytes": 285, "words": 24, "fertility": 3.0833333333333335, "compression": 3.8513513513513513, "roundtrip": true }, "Vietnamese": { "tokens": 28, "bytes": 144, "words": 25, "fertility": 1.12, "compression": 5.142857142857143, "roundtrip": true }, "Thai": { "tokens": 51, "bytes": 339, "words": 1, "fertility": 51.0, "compression": 6.647058823529412, "roundtrip": true }, "Hindi": { "tokens": 54, "bytes": 325, "words": 16, "fertility": 3.375, "compression": 6.018518518518518, "roundtrip": true }, "Arabic": { "tokens": 51, "bytes": 265, "words": 22, "fertility": 2.3181818181818183, "compression": 5.196078431372549, "roundtrip": true }, "Russian": { "tokens": 43, "bytes": 390, "words": 19, "fertility": 2.263157894736842, "compression": 9.069767441860465, "roundtrip": true }, "Python": { "tokens": 115, "bytes": 291, "words": 39, "fertility": 2.948717948717949, "compression": 2.5304347826086957, "roundtrip": true }, "JavaScript": { "tokens": 105, "bytes": 314, "words": 40, "fertility": 2.625, "compression": 2.9904761904761905, "roundtrip": true }, "Rust": { "tokens": 142, "bytes": 289, "words": 44, "fertility": 3.227272727272727, "compression": 2.035211267605634, "roundtrip": true }, "LaTeX_Complex": { "tokens": 110, "bytes": 248, "words": 26, "fertility": 4.230769230769231, "compression": 2.2545454545454544, "roundtrip": true }, "Unicode_Math": { "tokens": 61, "bytes": 115, "words": 17, "fertility": 3.588235294117647, "compression": 1.8852459016393444, "roundtrip": true }, "Mixed_Notation": { "tokens": 61, "bytes": 138, "words": 20, "fertility": 3.05, "compression": 2.262295081967213, "roundtrip": true }, "Emoji_Heavy": { "tokens": 42, "bytes": 155, "words": 17, "fertility": 2.4705882352941178, "compression": 3.6904761904761907, "roundtrip": true }, "Numbers_Heavy": { "tokens": 115, "bytes": 130, "words": 15, "fertility": 7.666666666666667, "compression": 1.1304347826086956, "roundtrip": true }, "URL_Path": { "tokens": 34, "bytes": 113, "words": 1, "fertility": 34.0, "compression": 3.323529411764706, "roundtrip": true }, "Mixed_Script": { "tokens": 33, "bytes": 122, "words": 17, "fertility": 1.9411764705882353, "compression": 3.696969696969697, "roundtrip": true }, "Repetition": { "tokens": 14, "bytes": 109, "words": 14, "fertility": 1.0, "compression": 7.785714285714286, "roundtrip": true }, "Whitespace": { "tokens": 16, "bytes": 54, "words": 6, "fertility": 2.6666666666666665, "compression": 3.375, "roundtrip": true }, "Empty_Adjacent": { "tokens": 14, "bytes": 39, "words": 5, "fertility": 2.8, "compression": 2.7857142857142856, "roundtrip": true } }, "Qwen2 (152K)": { "English": { "tokens": 31, "bytes": 251, "words": 27, "fertility": 1.1481481481481481, "compression": 8.096774193548388, "roundtrip": true }, "French": { "tokens": 50, "bytes": 221, "words": 29, "fertility": 1.7241379310344827, "compression": 4.42, "roundtrip": true }, "German": { "tokens": 50, "bytes": 204, "words": 17, "fertility": 2.9411764705882355, "compression": 4.08, "roundtrip": true }, "Spanish": { "tokens": 46, "bytes": 209, "words": 24, "fertility": 1.9166666666666667, "compression": 4.543478260869565, "roundtrip": true }, "Portuguese": { "tokens": 34, "bytes": 145, "words": 19, "fertility": 1.7894736842105263, "compression": 4.264705882352941, "roundtrip": true }, "Italian": { "tokens": 39, "bytes": 146, "words": 17, "fertility": 2.2941176470588234, "compression": 3.7435897435897436, "roundtrip": true }, "Dutch": { "tokens": 30, "bytes": 123, "words": 12, "fertility": 2.5, "compression": 4.1, "roundtrip": true }, "Polish": { "tokens": 39, "bytes": 122, "words": 13, "fertility": 3.0, "compression": 3.128205128205128, "roundtrip": true }, "Swedish": { "tokens": 30, "bytes": 109, "words": 7, "fertility": 4.285714285714286, "compression": 3.6333333333333333, "roundtrip": true }, "Turkish": { "tokens": 40, "bytes": 134, "words": 13, "fertility": 3.076923076923077, "compression": 3.35, "roundtrip": true }, "Ukrainian": { "tokens": 57, "bytes": 225, "words": 13, "fertility": 4.384615384615385, "compression": 3.9473684210526314, "roundtrip": true }, "Chinese": { "tokens": 33, "bytes": 173, "words": 1, "fertility": 33.0, "compression": 5.242424242424242, "roundtrip": true }, "Japanese": { "tokens": 77, "bytes": 296, "words": 1, "fertility": 77.0, "compression": 3.844155844155844, "roundtrip": true }, "Korean": { "tokens": 70, "bytes": 285, "words": 24, "fertility": 2.9166666666666665, "compression": 4.071428571428571, "roundtrip": true }, "Vietnamese": { "tokens": 32, "bytes": 144, "words": 25, "fertility": 1.28, "compression": 4.5, "roundtrip": true }, "Thai": { "tokens": 68, "bytes": 339, "words": 1, "fertility": 68.0, "compression": 4.985294117647059, "roundtrip": true }, "Hindi": { "tokens": 115, "bytes": 325, "words": 16, "fertility": 7.1875, "compression": 2.8260869565217392, "roundtrip": true }, "Arabic": { "tokens": 54, "bytes": 265, "words": 22, "fertility": 2.4545454545454546, "compression": 4.907407407407407, "roundtrip": true }, "Russian": { "tokens": 65, "bytes": 390, "words": 19, "fertility": 3.4210526315789473, "compression": 6.0, "roundtrip": true }, "Python": { "tokens": 97, "bytes": 291, "words": 39, "fertility": 2.4871794871794872, "compression": 3.0, "roundtrip": true }, "JavaScript": { "tokens": 91, "bytes": 314, "words": 40, "fertility": 2.275, "compression": 3.4505494505494507, "roundtrip": true }, "Rust": { "tokens": 132, "bytes": 289, "words": 44, "fertility": 3.0, "compression": 2.1893939393939394, "roundtrip": true }, "LaTeX_Complex": { "tokens": 114, "bytes": 248, "words": 26, "fertility": 4.384615384615385, "compression": 2.175438596491228, "roundtrip": true }, "Unicode_Math": { "tokens": 64, "bytes": 115, "words": 17, "fertility": 3.764705882352941, "compression": 1.796875, "roundtrip": true }, "Mixed_Notation": { "tokens": 66, "bytes": 138, "words": 20, "fertility": 3.3, "compression": 2.090909090909091, "roundtrip": true }, "Emoji_Heavy": { "tokens": 46, "bytes": 155, "words": 17, "fertility": 2.7058823529411766, "compression": 3.369565217391304, "roundtrip": true }, "Numbers_Heavy": { "tokens": 114, "bytes": 130, "words": 15, "fertility": 7.6, "compression": 1.1403508771929824, "roundtrip": true }, "URL_Path": { "tokens": 30, "bytes": 113, "words": 1, "fertility": 30.0, "compression": 3.7666666666666666, "roundtrip": true }, "Mixed_Script": { "tokens": 37, "bytes": 122, "words": 17, "fertility": 2.176470588235294, "compression": 3.2972972972972974, "roundtrip": true }, "Repetition": { "tokens": 14, "bytes": 109, "words": 14, "fertility": 1.0, "compression": 7.785714285714286, "roundtrip": true }, "Whitespace": { "tokens": 15, "bytes": 54, "words": 6, "fertility": 2.5, "compression": 3.6, "roundtrip": true }, "Empty_Adjacent": { "tokens": 14, "bytes": 39, "words": 5, "fertility": 2.8, "compression": 2.7857142857142856, "roundtrip": true } } }, "overall": { "Sentinel-SUT (61K)": { "avg_fertility": 10.210548371110242, "std_fertility": 19.290933783132925, "median_fertility": 3.262443438914027, "avg_compression": 3.4607235916418793, "median_compression": 3.3834289813486373, "fairness": 0.04928309414874054, "wins": 2.0, "total_tests": 32.0 }, "GPT-2 (50K)": { "avg_fertility": 20.730707859469526, "std_fertility": 48.701493147558395, "median_fertility": 4.038461538461538, "avg_compression": 2.5703998033314455, "median_compression": 2.184920634920635, "fairness": 0.020120119873081224, "wins": 0.0, "total_tests": 32.0 }, "Gemma (256K)": { "avg_fertility": 7.687682759598745, "std_fertility": 13.936255807387061, "median_fertility": 2.645833333333333, "avg_compression": 4.543616791377602, "median_compression": 4.511111111111111, "fairness": 0.0669511832748223, "wins": 20.0, "total_tests": 32.0 }, "Qwen2 (152K)": { "avg_fertility": 9.134830991971093, "std_fertility": 17.843274072731045, "median_fertility": 2.928921568627451, "avg_compression": 3.8791477128080354, "median_compression": 3.755128205128205, "fairness": 0.05306933371240114, "wins": 6.0, "total_tests": 32.0 } }, "categories": { "European": [ "English", "French", "German", "Spanish", "Portuguese", "Italian", "Dutch", "Polish", "Swedish", "Turkish", "Ukrainian" ], "Asian": [ "Chinese", "Japanese", "Korean", "Vietnamese", "Thai", "Hindi" ], "Semitic/RTL": [ "Arabic", "Russian" ], "Code": [ "Python", "JavaScript", "Rust" ], "Mathematics": [ "LaTeX_Complex", "Unicode_Math", "Mixed_Notation" ], "Edge Cases": [ "Emoji_Heavy", "Numbers_Heavy", "URL_Path", "Mixed_Script", "Repetition", "Whitespace", "Empty_Adjacent" ] } }