| { | |
| "text": "tokenizer/artifacts/heldout_eval.txt", | |
| "limit": 10000, | |
| "overall": [ | |
| { | |
| "name": "tiktoken_gpt2", | |
| "total_chars": 30775982, | |
| "total_bytes": 42250192, | |
| "total_tokens": 20869716, | |
| "tokens_per_1k_chars": 678.116980962622, | |
| "tokens_per_1k_bytes": 493.95553042693865, | |
| "bytes_per_token": 2.0244737398438963, | |
| "chars_per_token": 1.4746718163294603, | |
| "p50_tokens_per_line": 588, | |
| "p95_tokens_per_line": 8239, | |
| "p95_tokens_per_1k_bytes_per_line": 976.7055970387389 | |
| }, | |
| { | |
| "name": "tiktoken_cl100k_base", | |
| "total_chars": 30775982, | |
| "total_bytes": 42250192, | |
| "total_tokens": 15598584, | |
| "tokens_per_1k_chars": 506.8427710933805, | |
| "tokens_per_1k_bytes": 369.19557667335573, | |
| "bytes_per_token": 2.7085914977923635, | |
| "chars_per_token": 1.972998446525659, | |
| "p50_tokens_per_line": 552, | |
| "p95_tokens_per_line": 5871, | |
| "p95_tokens_per_1k_bytes_per_line": 641.7086633069355 | |
| }, | |
| { | |
| "name": "mgpt2_RegexTokenizer_candidate (tokenizer/artifacts/mgpt2.model)", | |
| "total_chars": 30775982, | |
| "total_bytes": 42250192, | |
| "total_tokens": 9485138, | |
| "tokens_per_1k_chars": 308.19936143711027, | |
| "tokens_per_1k_bytes": 224.49928748252788, | |
| "bytes_per_token": 4.454357121635974, | |
| "chars_per_token": 3.244653056181154, | |
| "p50_tokens_per_line": 469, | |
| "p95_tokens_per_line": 3096, | |
| "p95_tokens_per_1k_bytes_per_line": 285.07596067917785 | |
| } | |
| ], | |
| "by_bucket": { | |
| "devanagari": [ | |
| { | |
| "name": "tiktoken_gpt2", | |
| "total_chars": 1971486, | |
| "total_bytes": 5015290, | |
| "total_tokens": 2978527, | |
| "tokens_per_1k_chars": 1510.8030186367034, | |
| "tokens_per_1k_bytes": 593.8892865616943, | |
| "bytes_per_token": 1.6838155235792727, | |
| "chars_per_token": 0.6618996571123915, | |
| "p50_tokens_per_line": 1715, | |
| "p95_tokens_per_line": 9396, | |
| "p95_tokens_per_1k_bytes_per_line": 615.3846153846154 | |
| }, | |
| { | |
| "name": "tiktoken_cl100k_base", | |
| "total_chars": 1971486, | |
| "total_bytes": 5015290, | |
| "total_tokens": 1928837, | |
| "tokens_per_1k_chars": 978.3670794517435, | |
| "tokens_per_1k_bytes": 384.5913197442222, | |
| "bytes_per_token": 2.6001626887082736, | |
| "chars_per_token": 1.0221112514950719, | |
| "p50_tokens_per_line": 1112, | |
| "p95_tokens_per_line": 6036, | |
| "p95_tokens_per_1k_bytes_per_line": 445.6521739130435 | |
| }, | |
| { | |
| "name": "mgpt2_RegexTokenizer_candidate (tokenizer/artifacts/mgpt2.model)", | |
| "total_chars": 1971486, | |
| "total_bytes": 5015290, | |
| "total_tokens": 1102384, | |
| "tokens_per_1k_chars": 559.1640011646037, | |
| "tokens_per_1k_bytes": 219.80463741877338, | |
| "bytes_per_token": 4.549494549993469, | |
| "chars_per_token": 1.7883840839489689, | |
| "p50_tokens_per_line": 636, | |
| "p95_tokens_per_line": 3472, | |
| "p95_tokens_per_1k_bytes_per_line": 261.53846153846155 | |
| } | |
| ], | |
| "kannada": [ | |
| { | |
| "name": "tiktoken_gpt2", | |
| "total_chars": 2158481, | |
| "total_bytes": 5770626, | |
| "total_tokens": 5633881, | |
| "tokens_per_1k_chars": 2610.1137790881644, | |
| "tokens_per_1k_bytes": 976.3032641519309, | |
| "bytes_per_token": 1.0242719006666985, | |
| "chars_per_token": 0.3831250606819704, | |
| "p50_tokens_per_line": 3068, | |
| "p95_tokens_per_line": 19541, | |
| "p95_tokens_per_1k_bytes_per_line": 984.8484848484849 | |
| }, | |
| { | |
| "name": "tiktoken_cl100k_base", | |
| "total_chars": 2158481, | |
| "total_bytes": 5770626, | |
| "total_tokens": 3701089, | |
| "tokens_per_1k_chars": 1714.6729575103973, | |
| "tokens_per_1k_bytes": 641.3669851416466, | |
| "bytes_per_token": 1.5591697470663364, | |
| "chars_per_token": 0.5832015928284892, | |
| "p50_tokens_per_line": 2016, | |
| "p95_tokens_per_line": 12796, | |
| "p95_tokens_per_1k_bytes_per_line": 645.6692913385826 | |
| }, | |
| { | |
| "name": "mgpt2_RegexTokenizer_candidate (tokenizer/artifacts/mgpt2.model)", | |
| "total_chars": 2158481, | |
| "total_bytes": 5770626, | |
| "total_tokens": 1250792, | |
| "tokens_per_1k_chars": 579.4778828259317, | |
| "tokens_per_1k_bytes": 216.75152747726156, | |
| "bytes_per_token": 4.613577637209064, | |
| "chars_per_token": 1.7256914019277385, | |
| "p50_tokens_per_line": 689, | |
| "p95_tokens_per_line": 4247, | |
| "p95_tokens_per_1k_bytes_per_line": 248.54651162790697 | |
| } | |
| ], | |
| "latin": [ | |
| { | |
| "name": "tiktoken_gpt2", | |
| "total_chars": 23547886, | |
| "total_bytes": 23664351, | |
| "total_tokens": 6049914, | |
| "tokens_per_1k_chars": 256.91962327318896, | |
| "tokens_per_1k_bytes": 255.65518361352906, | |
| "bytes_per_token": 3.911518576958284, | |
| "chars_per_token": 3.892267890089016, | |
| "p50_tokens_per_line": 423, | |
| "p95_tokens_per_line": 2501, | |
| "p95_tokens_per_1k_bytes_per_line": 394.47852760736197 | |
| }, | |
| { | |
| "name": "tiktoken_cl100k_base", | |
| "total_chars": 23547886, | |
| "total_bytes": 23664351, | |
| "total_tokens": 5891120, | |
| "tokens_per_1k_chars": 250.17617292694555, | |
| "tokens_per_1k_bytes": 248.94492141364873, | |
| "bytes_per_token": 4.016952803541602, | |
| "chars_per_token": 3.9971832181316964, | |
| "p50_tokens_per_line": 413, | |
| "p95_tokens_per_line": 2433, | |
| "p95_tokens_per_1k_bytes_per_line": 378.1904305978987 | |
| }, | |
| { | |
| "name": "mgpt2_RegexTokenizer_candidate (tokenizer/artifacts/mgpt2.model)", | |
| "total_chars": 23547886, | |
| "total_bytes": 23664351, | |
| "total_tokens": 5415481, | |
| "tokens_per_1k_chars": 229.9773746144346, | |
| "tokens_per_1k_bytes": 228.84553225228953, | |
| "bytes_per_token": 4.369759768338214, | |
| "chars_per_token": 4.34825383008453, | |
| "p50_tokens_per_line": 402, | |
| "p95_tokens_per_line": 2171, | |
| "p95_tokens_per_1k_bytes_per_line": 288.46153846153845 | |
| } | |
| ], | |
| "mixed": [ | |
| { | |
| "name": "tiktoken_gpt2", | |
| "total_chars": 3098129, | |
| "total_bytes": 7799925, | |
| "total_tokens": 6207394, | |
| "tokens_per_1k_chars": 2003.5944274754215, | |
| "tokens_per_1k_bytes": 795.8273957762415, | |
| "bytes_per_token": 1.2565538775209049, | |
| "chars_per_token": 0.49910300522248147, | |
| "p50_tokens_per_line": 5384, | |
| "p95_tokens_per_line": 47503, | |
| "p95_tokens_per_1k_bytes_per_line": 980.3065905784939 | |
| }, | |
| { | |
| "name": "tiktoken_cl100k_base", | |
| "total_chars": 3098129, | |
| "total_bytes": 7799925, | |
| "total_tokens": 4077538, | |
| "tokens_per_1k_chars": 1316.1291863573144, | |
| "tokens_per_1k_bytes": 522.7663086504037, | |
| "bytes_per_token": 1.9129006277807834, | |
| "chars_per_token": 0.7598038326068328, | |
| "p50_tokens_per_line": 3720, | |
| "p95_tokens_per_line": 31014, | |
| "p95_tokens_per_1k_bytes_per_line": 642.8864288642886 | |
| }, | |
| { | |
| "name": "mgpt2_RegexTokenizer_candidate (tokenizer/artifacts/mgpt2.model)", | |
| "total_chars": 3098129, | |
| "total_bytes": 7799925, | |
| "total_tokens": 1716481, | |
| "tokens_per_1k_chars": 554.0379370904182, | |
| "tokens_per_1k_bytes": 220.0637826645769, | |
| "bytes_per_token": 4.5441371037605425, | |
| "chars_per_token": 1.8049305526830766, | |
| "p50_tokens_per_line": 1719, | |
| "p95_tokens_per_line": 13591, | |
| "p95_tokens_per_1k_bytes_per_line": 249.65580541532813 | |
| } | |
| ] | |
| } | |
| } | |