binary-tokenizer-001-16k / analysis_results.json
mjbommar's picture
Upload binary-tokenizer-001-16k tokenizer
9db8b79 verified
{
"vocab_size": {
"total": 16377,
"total_with_special": 16384,
"base": 256,
"merges": 16121,
"special": 7,
"is_power_of_2": true,
"power": 14,
"matches_expected": true
},
"reachability": {
"valid_merges": 16121,
"invalid_merges": 0,
"reachable": 16377,
"unreachable": 0,
"all_reachable": true
},
"length_dist": {
"distribution": {
"1": 256,
"2": 7149,
"3": 3360,
"4": 3082,
"5": 719,
"6": 606,
"7": 228,
"8": 377,
"9": 78,
"10": 92,
"11": 42,
"12": 99,
"13": 34,
"14": 38,
"15": 30,
"16": 86,
"17": 4,
"18": 12,
"19": 9,
"20": 15,
"21": 6,
"22": 6,
"23": 3,
"24": 11,
"25": 2,
"26": 1,
"27": 4,
"28": 3,
"29": 1,
"30": 2,
"31": 1,
"32": 21
},
"avg_length": 3.4977712645783723,
"min_length": 1,
"max_length": 32,
"length_3_count": 3360,
"length_3_percent": 20.516578127862246
},
"byte_content": {
"null_tokens": 4128,
"ascii_printable": 3513,
"ascii_only": 7256,
"high_byte": 9121,
"mixed": 4506,
"byte_distribution": {
"0": 9741,
"255": 1718,
"1": 1386,
"72": 1352,
"32": 958,
"139": 955,
"3": 913,
"2": 866,
"116": 752,
"204": 751,
"36": 735,
"64": 734,
"101": 685,
"128": 579,
"65": 556,
"4": 543,
"137": 522,
"249": 518,
"97": 515,
"114": 494,
"232": 482,
"105": 456,
"110": 436,
"115": 435,
"8": 432,
"111": 432,
"15": 423,
"16": 405,
"99": 384,
"131": 367,
"48": 360,
"108": 358,
"68": 357,
"117": 346,
"224": 329,
"84": 327,
"169": 325,
"112": 323,
"192": 323,
"100": 316,
"76": 307,
"5": 304,
"6": 298,
"69": 298,
"95": 296,
"73": 280,
"145": 267,
"66": 266,
"141": 265,
"31": 262
}
},
"diversity": {
"1": {
"learned": 256,
"possible": 256,
"coverage": 100.0
},
"2": {
"learned": 7149,
"possible": 65536,
"coverage": 10.90850830078125
},
"3": {
"learned": 3360,
"possible": 16777216,
"coverage": 0.02002716064453125
},
"4": {
"learned": 3082,
"possible": 4294967296,
"coverage": 7.175840437412262e-05
}
}
}