{ "vocab_size": 96000, "avg_token_length_chars": 6.1, "max_token_length_chars": 15, "single_byte_tokens": 256, "special_tokens": [ { "token": "", "id": 0 }, { "token": "<|bos|>", "id": 1 }, { "token": "<|endoftext|>", "id": 2 }, { "token": "<", "id": 30 }, { "token": "[", "id": 61 }, { "token": "[User", "id": 1157 }, { "token": "[System", "id": 1217 }, { "token": "[]", "id": 3325 }, { "token": "", "<|bos|>", "<|endoftext|>" ], "sample_ratio": 0.15 }, "per_language_evaluation": { "ar": { "chars": 135, "bytes": 250, "tokens": 40, "words": 20, "chars_per_token": 3.375, "bytes_per_token": 6.25, "tokens_per_word": 2.0, "byte_fragmentation_ratio": 0.1 }, "arithmetic": { "chars": 88, "bytes": 88, "tokens": 78, "words": 18, "chars_per_token": 1.128, "bytes_per_token": 1.128, "tokens_per_word": 4.333, "byte_fragmentation_ratio": 0.987 }, "bn": { "chars": 129, "bytes": 347, "tokens": 68, "words": 21, "chars_per_token": 1.897, "bytes_per_token": 5.103, "tokens_per_word": 3.238, "byte_fragmentation_ratio": 0.0 }, "de": { "chars": 177, "bytes": 180, "tokens": 30, "words": 23, "chars_per_token": 5.9, "bytes_per_token": 6.0, "tokens_per_word": 1.304, "byte_fragmentation_ratio": 0.1 }, "el": { "chars": 158, "bytes": 289, "tokens": 56, "words": 26, "chars_per_token": 2.821, "bytes_per_token": 5.161, "tokens_per_word": 2.154, "byte_fragmentation_ratio": 0.179 }, "en": { "chars": 224, "bytes": 224, "tokens": 41, "words": 30, "chars_per_token": 5.463, "bytes_per_token": 5.463, "tokens_per_word": 1.367, "byte_fragmentation_ratio": 0.171 }, "es": { "chars": 162, "bytes": 168, "tokens": 37, "words": 26, "chars_per_token": 4.378, "bytes_per_token": 4.541, "tokens_per_word": 1.423, "byte_fragmentation_ratio": 0.243 }, "fa": { "chars": 153, "bytes": 281, "tokens": 45, "words": 28, "chars_per_token": 3.4, "bytes_per_token": 6.244, "tokens_per_word": 1.607, "byte_fragmentation_ratio": 0.089 }, "fr": { "chars": 175, "bytes": 183, "tokens": 40, "words": 26, "chars_per_token": 4.375, "bytes_per_token": 4.575, "tokens_per_word": 1.538, "byte_fragmentation_ratio": 0.35 }, "he": { "chars": 136, "bytes": 249, "tokens": 48, "words": 22, "chars_per_token": 2.833, "bytes_per_token": 5.188, "tokens_per_word": 2.182, "byte_fragmentation_ratio": 0.229 }, "hi": { "chars": 137, "bytes": 353, "tokens": 67, "words": 30, "chars_per_token": 2.045, "bytes_per_token": 5.269, "tokens_per_word": 2.233, "byte_fragmentation_ratio": 0.0 }, "hy": { "chars": 140, "bytes": 257, "tokens": 47, "words": 20, "chars_per_token": 2.979, "bytes_per_token": 5.468, "tokens_per_word": 2.35, "byte_fragmentation_ratio": 0.234 }, "ja": { "chars": 70, "bytes": 210, "tokens": 44, "words": 1, "chars_per_token": 1.591, "bytes_per_token": 4.773, "tokens_per_word": 44.0, "byte_fragmentation_ratio": 0.0 }, "javascript": { "chars": 212, "bytes": 212, "tokens": 53, "words": 31, "chars_per_token": 4.0, "bytes_per_token": 4.0, "tokens_per_word": 1.71, "byte_fragmentation_ratio": 0.245 }, "ka": { "chars": 126, "bytes": 346, "tokens": 47, "words": 14, "chars_per_token": 2.681, "bytes_per_token": 7.362, "tokens_per_word": 3.357, "byte_fragmentation_ratio": 0.085 }, "ko": { "chars": 73, "bytes": 185, "tokens": 37, "words": 16, "chars_per_token": 1.973, "bytes_per_token": 5.0, "tokens_per_word": 2.312, "byte_fragmentation_ratio": 0.054 }, "math": { "chars": 152, "bytes": 172, "tokens": 85, "words": 28, "chars_per_token": 1.788, "bytes_per_token": 2.024, "tokens_per_word": 3.036, "byte_fragmentation_ratio": 0.776 }, "python": { "chars": 202, "bytes": 202, "tokens": 61, "words": 32, "chars_per_token": 3.311, "bytes_per_token": 3.311, "tokens_per_word": 1.906, "byte_fragmentation_ratio": 0.492 }, "ru": { "chars": 190, "bytes": 357, "tokens": 46, "words": 23, "chars_per_token": 4.13, "bytes_per_token": 7.761, "tokens_per_word": 2.0, "byte_fragmentation_ratio": 0.043 }, "th": { "chars": 123, "bytes": 367, "tokens": 53, "words": 2, "chars_per_token": 2.321, "bytes_per_token": 6.925, "tokens_per_word": 26.5, "byte_fragmentation_ratio": 0.0 }, "uk": { "chars": 157, "bytes": 291, "tokens": 42, "words": 22, "chars_per_token": 3.738, "bytes_per_token": 6.929, "tokens_per_word": 1.909, "byte_fragmentation_ratio": 0.095 }, "vi": { "chars": 143, "bytes": 190, "tokens": 42, "words": 34, "chars_per_token": 3.405, "bytes_per_token": 4.524, "tokens_per_word": 1.235, "byte_fragmentation_ratio": 0.167 }, "zh": { "chars": 62, "bytes": 186, "tokens": 35, "words": 1, "chars_per_token": 1.771, "bytes_per_token": 5.314, "tokens_per_word": 35.0, "byte_fragmentation_ratio": 0.0 } } }