{ "vocab_size": 64000, "avg_token_length_chars": 5.7, "max_token_length_chars": 15, "single_byte_tokens": 256, "special_tokens": [ { "token": "", "id": 0 }, { "token": "<|bos|>", "id": 1 }, { "token": "<|endoftext|>", "id": 2 }, { "token": "<", "id": 30 }, { "token": "[", "id": 61 }, { "token": "[User", "id": 1111 }, { "token": "[System", "id": 1170 }, { "token": "[]", "id": 3492 }, { "token": "", "<|bos|>", "<|endoftext|>" ], "sample_ratio": 0.1 }, "per_language_evaluation": { "ar": { "chars": 135, "bytes": 250, "tokens": 45, "words": 20, "chars_per_token": 3.0, "bytes_per_token": 5.556, "tokens_per_word": 2.25, "byte_fragmentation_ratio": 0.156 }, "arithmetic": { "chars": 88, "bytes": 88, "tokens": 78, "words": 18, "chars_per_token": 1.128, "bytes_per_token": 1.128, "tokens_per_word": 4.333, "byte_fragmentation_ratio": 0.987 }, "bn": { "chars": 129, "bytes": 347, "tokens": 70, "words": 21, "chars_per_token": 1.843, "bytes_per_token": 4.957, "tokens_per_word": 3.333, "byte_fragmentation_ratio": 0.014 }, "de": { "chars": 177, "bytes": 180, "tokens": 38, "words": 23, "chars_per_token": 4.658, "bytes_per_token": 4.737, "tokens_per_word": 1.652, "byte_fragmentation_ratio": 0.158 }, "el": { "chars": 158, "bytes": 289, "tokens": 57, "words": 26, "chars_per_token": 2.772, "bytes_per_token": 5.07, "tokens_per_word": 2.192, "byte_fragmentation_ratio": 0.175 }, "en": { "chars": 224, "bytes": 224, "tokens": 43, "words": 30, "chars_per_token": 5.209, "bytes_per_token": 5.209, "tokens_per_word": 1.433, "byte_fragmentation_ratio": 0.163 }, "es": { "chars": 162, "bytes": 168, "tokens": 40, "words": 26, "chars_per_token": 4.05, "bytes_per_token": 4.2, "tokens_per_word": 1.538, "byte_fragmentation_ratio": 0.25 }, "fa": { "chars": 153, "bytes": 281, "tokens": 53, "words": 28, "chars_per_token": 2.887, "bytes_per_token": 5.302, "tokens_per_word": 1.893, "byte_fragmentation_ratio": 0.132 }, "fr": { "chars": 175, "bytes": 183, "tokens": 43, "words": 26, "chars_per_token": 4.07, "bytes_per_token": 4.256, "tokens_per_word": 1.654, "byte_fragmentation_ratio": 0.349 }, "he": { "chars": 136, "bytes": 249, "tokens": 55, "words": 22, "chars_per_token": 2.473, "bytes_per_token": 4.527, "tokens_per_word": 2.5, "byte_fragmentation_ratio": 0.2 }, "hi": { "chars": 137, "bytes": 353, "tokens": 67, "words": 30, "chars_per_token": 2.045, "bytes_per_token": 5.269, "tokens_per_word": 2.233, "byte_fragmentation_ratio": 0.0 }, "hy": { "chars": 140, "bytes": 257, "tokens": 55, "words": 20, "chars_per_token": 2.545, "bytes_per_token": 4.673, "tokens_per_word": 2.75, "byte_fragmentation_ratio": 0.309 }, "ja": { "chars": 70, "bytes": 210, "tokens": 49, "words": 1, "chars_per_token": 1.429, "bytes_per_token": 4.286, "tokens_per_word": 49.0, "byte_fragmentation_ratio": 0.041 }, "javascript": { "chars": 212, "bytes": 212, "tokens": 53, "words": 31, "chars_per_token": 4.0, "bytes_per_token": 4.0, "tokens_per_word": 1.71, "byte_fragmentation_ratio": 0.245 }, "ka": { "chars": 126, "bytes": 346, "tokens": 53, "words": 14, "chars_per_token": 2.377, "bytes_per_token": 6.528, "tokens_per_word": 3.786, "byte_fragmentation_ratio": 0.151 }, "ko": { "chars": 73, "bytes": 185, "tokens": 41, "words": 16, "chars_per_token": 1.78, "bytes_per_token": 4.512, "tokens_per_word": 2.562, "byte_fragmentation_ratio": 0.049 }, "math": { "chars": 152, "bytes": 172, "tokens": 86, "words": 28, "chars_per_token": 1.767, "bytes_per_token": 2.0, "tokens_per_word": 3.071, "byte_fragmentation_ratio": 0.791 }, "python": { "chars": 202, "bytes": 202, "tokens": 63, "words": 32, "chars_per_token": 3.206, "bytes_per_token": 3.206, "tokens_per_word": 1.969, "byte_fragmentation_ratio": 0.508 }, "ru": { "chars": 190, "bytes": 357, "tokens": 51, "words": 23, "chars_per_token": 3.725, "bytes_per_token": 7.0, "tokens_per_word": 2.217, "byte_fragmentation_ratio": 0.059 }, "th": { "chars": 123, "bytes": 367, "tokens": 59, "words": 2, "chars_per_token": 2.085, "bytes_per_token": 6.22, "tokens_per_word": 29.5, "byte_fragmentation_ratio": 0.0 }, "uk": { "chars": 157, "bytes": 291, "tokens": 47, "words": 22, "chars_per_token": 3.34, "bytes_per_token": 6.191, "tokens_per_word": 2.136, "byte_fragmentation_ratio": 0.149 }, "vi": { "chars": 143, "bytes": 190, "tokens": 47, "words": 34, "chars_per_token": 3.043, "bytes_per_token": 4.043, "tokens_per_word": 1.382, "byte_fragmentation_ratio": 0.277 }, "zh": { "chars": 62, "bytes": 186, "tokens": 42, "words": 1, "chars_per_token": 1.476, "bytes_per_token": 4.429, "tokens_per_word": 42.0, "byte_fragmentation_ratio": 0.0 } } }