binary-tokenizer-001-32k / analysis_results.json
mjbommar's picture
Upload binary-tokenizer-001-32k tokenizer
f709006 verified
{
"vocab_size": {
"total": 32761,
"total_with_special": 32768,
"base": 256,
"merges": 32505,
"special": 7,
"is_power_of_2": true,
"power": 15,
"matches_expected": true
},
"reachability": {
"valid_merges": 32505,
"invalid_merges": 0,
"reachable": 32761,
"unreachable": 0,
"all_reachable": true
},
"length_dist": {
"distribution": {
"1": 256,
"2": 13428,
"3": 6380,
"4": 6236,
"5": 1763,
"6": 1395,
"7": 676,
"8": 963,
"9": 191,
"10": 220,
"11": 109,
"12": 318,
"13": 86,
"14": 102,
"15": 69,
"16": 233,
"17": 26,
"18": 31,
"19": 23,
"20": 58,
"21": 16,
"22": 16,
"23": 19,
"24": 44,
"25": 6,
"26": 7,
"27": 8,
"28": 13,
"29": 7,
"30": 4,
"31": 3,
"32": 54
},
"avg_length": 3.812393162393162,
"min_length": 1,
"max_length": 32,
"length_3_count": 6380,
"length_3_percent": 19.474969474969473
},
"byte_content": {
"null_tokens": 8350,
"ascii_printable": 6460,
"ascii_only": 13796,
"high_byte": 18964,
"mixed": 10141,
"byte_distribution": {
"0": 20462,
"255": 3502,
"72": 2883,
"1": 2622,
"3": 1967,
"139": 1934,
"32": 1901,
"2": 1856,
"64": 1609,
"116": 1546,
"101": 1482,
"36": 1435,
"204": 1366,
"128": 1212,
"65": 1186,
"4": 1150,
"97": 1109,
"114": 1088,
"249": 1069,
"137": 1059,
"111": 990,
"8": 978,
"105": 964,
"115": 940,
"15": 917,
"110": 917,
"99": 879,
"16": 837,
"192": 814,
"232": 810,
"108": 798,
"131": 788,
"68": 777,
"84": 740,
"224": 737,
"112": 732,
"117": 723,
"48": 701,
"5": 690,
"169": 687,
"76": 684,
"69": 663,
"100": 653,
"95": 650,
"6": 647,
"73": 623,
"141": 614,
"10": 570,
"7": 562,
"66": 546
}
},
"diversity": {
"1": {
"learned": 256,
"possible": 256,
"coverage": 100.0
},
"2": {
"learned": 13428,
"possible": 65536,
"coverage": 20.489501953125
},
"3": {
"learned": 6380,
"possible": 16777216,
"coverage": 0.03802776336669922
},
"4": {
"learned": 6236,
"possible": 4294967296,
"coverage": 0.0001451931893825531
}
}
}