{ "vocab_size": 114688, "avg_token_length_chars": 6.24, "max_token_length_chars": 15, "single_byte_tokens": 256, "special_tokens": [ { "token": "", "id": 0 }, { "token": "<|bos|>", "id": 1 }, { "token": "<|endoftext|>", "id": 2 }, { "token": "<", "id": 30 }, { "token": "[", "id": 61 }, { "token": "[User", "id": 984 }, { "token": "[System", "id": 1019 }, { "token": "[]", "id": 2190 }, { "token": "['", "id": 2206 }, { "token": "[\"", "id": 3088 }, { "token": "", "<|bos|>", "<|endoftext|>" ], "sample_ratio": 0.12 }, "per_language_evaluation": { "ar": { "chars": 135, "bytes": 250, "tokens": 38, "words": 20, "chars_per_token": 3.553, "bytes_per_token": 6.579, "tokens_per_word": 1.9, "byte_fragmentation_ratio": 0.132 }, "arithmetic": { "chars": 88, "bytes": 88, "tokens": 78, "words": 18, "chars_per_token": 1.128, "bytes_per_token": 1.128, "tokens_per_word": 4.333, "byte_fragmentation_ratio": 0.987 }, "bn": { "chars": 129, "bytes": 347, "tokens": 68, "words": 21, "chars_per_token": 1.897, "bytes_per_token": 5.103, "tokens_per_word": 3.238, "byte_fragmentation_ratio": 0.0 }, "de": { "chars": 177, "bytes": 180, "tokens": 29, "words": 23, "chars_per_token": 6.103, "bytes_per_token": 6.207, "tokens_per_word": 1.261, "byte_fragmentation_ratio": 0.069 }, "el": { "chars": 158, "bytes": 289, "tokens": 52, "words": 26, "chars_per_token": 3.038, "bytes_per_token": 5.558, "tokens_per_word": 2.0, "byte_fragmentation_ratio": 0.154 }, "en": { "chars": 224, "bytes": 224, "tokens": 40, "words": 30, "chars_per_token": 5.6, "bytes_per_token": 5.6, "tokens_per_word": 1.333, "byte_fragmentation_ratio": 0.175 }, "es": { "chars": 162, "bytes": 168, "tokens": 36, "words": 26, "chars_per_token": 4.5, "bytes_per_token": 4.667, "tokens_per_word": 1.385, "byte_fragmentation_ratio": 0.25 }, "fa": { "chars": 153, "bytes": 281, "tokens": 42, "words": 28, "chars_per_token": 3.643, "bytes_per_token": 6.69, "tokens_per_word": 1.5, "byte_fragmentation_ratio": 0.024 }, "fr": { "chars": 175, "bytes": 183, "tokens": 39, "words": 26, "chars_per_token": 4.487, "bytes_per_token": 4.692, "tokens_per_word": 1.5, "byte_fragmentation_ratio": 0.333 }, "he": { "chars": 136, "bytes": 249, "tokens": 43, "words": 22, "chars_per_token": 3.163, "bytes_per_token": 5.791, "tokens_per_word": 1.955, "byte_fragmentation_ratio": 0.14 }, "hi": { "chars": 137, "bytes": 353, "tokens": 67, "words": 30, "chars_per_token": 2.045, "bytes_per_token": 5.269, "tokens_per_word": 2.233, "byte_fragmentation_ratio": 0.0 }, "hy": { "chars": 140, "bytes": 257, "tokens": 48, "words": 20, "chars_per_token": 2.917, "bytes_per_token": 5.354, "tokens_per_word": 2.4, "byte_fragmentation_ratio": 0.229 }, "ja": { "chars": 70, "bytes": 210, "tokens": 41, "words": 1, "chars_per_token": 1.707, "bytes_per_token": 5.122, "tokens_per_word": 41.0, "byte_fragmentation_ratio": 0.0 }, "javascript": { "chars": 212, "bytes": 212, "tokens": 53, "words": 31, "chars_per_token": 4.0, "bytes_per_token": 4.0, "tokens_per_word": 1.71, "byte_fragmentation_ratio": 0.245 }, "ka": { "chars": 126, "bytes": 346, "tokens": 47, "words": 14, "chars_per_token": 2.681, "bytes_per_token": 7.362, "tokens_per_word": 3.357, "byte_fragmentation_ratio": 0.085 }, "ko": { "chars": 73, "bytes": 185, "tokens": 33, "words": 16, "chars_per_token": 2.212, "bytes_per_token": 5.606, "tokens_per_word": 2.062, "byte_fragmentation_ratio": 0.061 }, "math": { "chars": 152, "bytes": 172, "tokens": 83, "words": 28, "chars_per_token": 1.831, "bytes_per_token": 2.072, "tokens_per_word": 2.964, "byte_fragmentation_ratio": 0.771 }, "python": { "chars": 202, "bytes": 202, "tokens": 60, "words": 32, "chars_per_token": 3.367, "bytes_per_token": 3.367, "tokens_per_word": 1.875, "byte_fragmentation_ratio": 0.5 }, "ru": { "chars": 190, "bytes": 357, "tokens": 45, "words": 23, "chars_per_token": 4.222, "bytes_per_token": 7.933, "tokens_per_word": 1.957, "byte_fragmentation_ratio": 0.022 }, "th": { "chars": 123, "bytes": 367, "tokens": 51, "words": 2, "chars_per_token": 2.412, "bytes_per_token": 7.196, "tokens_per_word": 25.5, "byte_fragmentation_ratio": 0.0 }, "uk": { "chars": 157, "bytes": 291, "tokens": 43, "words": 22, "chars_per_token": 3.651, "bytes_per_token": 6.767, "tokens_per_word": 1.955, "byte_fragmentation_ratio": 0.116 }, "vi": { "chars": 143, "bytes": 190, "tokens": 41, "words": 34, "chars_per_token": 3.488, "bytes_per_token": 4.634, "tokens_per_word": 1.206, "byte_fragmentation_ratio": 0.146 }, "zh": { "chars": 62, "bytes": 186, "tokens": 34, "words": 1, "chars_per_token": 1.824, "bytes_per_token": 5.471, "tokens_per_word": 34.0, "byte_fragmentation_ratio": 0.0 } } }