| { | |
| "vocab_size": 114688, | |
| "avg_token_length_chars": 6.24, | |
| "max_token_length_chars": 15, | |
| "single_byte_tokens": 256, | |
| "special_tokens": [ | |
| { | |
| "token": "<pad>", | |
| "id": 0 | |
| }, | |
| { | |
| "token": "<|bos|>", | |
| "id": 1 | |
| }, | |
| { | |
| "token": "<|endoftext|>", | |
| "id": 2 | |
| }, | |
| { | |
| "token": "<", | |
| "id": 30 | |
| }, | |
| { | |
| "token": "[", | |
| "id": 61 | |
| }, | |
| { | |
| "token": "[User", | |
| "id": 984 | |
| }, | |
| { | |
| "token": "[System", | |
| "id": 1019 | |
| }, | |
| { | |
| "token": "[]", | |
| "id": 2190 | |
| }, | |
| { | |
| "token": "['", | |
| "id": 2206 | |
| }, | |
| { | |
| "token": "[\"", | |
| "id": 3088 | |
| }, | |
| { | |
| "token": "</", | |
| "id": 3103 | |
| }, | |
| { | |
| "token": "[i", | |
| "id": 3156 | |
| }, | |
| { | |
| "token": "<T", | |
| "id": 4720 | |
| }, | |
| { | |
| "token": "[:", | |
| "id": 6758 | |
| }, | |
| { | |
| "token": "[Oracle", | |
| "id": 8458 | |
| }, | |
| { | |
| "token": "<String", | |
| "id": 8677 | |
| }, | |
| { | |
| "token": "<int", | |
| "id": 8793 | |
| }, | |
| { | |
| "token": "[string", | |
| "id": 9942 | |
| }, | |
| { | |
| "token": "<string", | |
| "id": 10055 | |
| }, | |
| { | |
| "token": "[str", | |
| "id": 10751 | |
| } | |
| ], | |
| "compression_ratio_chars_per_token": 3.597, | |
| "fertility_tokens_per_word": 4.042, | |
| "training_files": 1086, | |
| "training_time_seconds": 308.3, | |
| "config": { | |
| "preset": "llama3", | |
| "vocab_size": 114688, | |
| "min_frequency": 2, | |
| "bpe_dropout": null, | |
| "single_digit": true, | |
| "max_token_length": 16, | |
| "special_tokens": [ | |
| "<pad>", | |
| "<|bos|>", | |
| "<|endoftext|>" | |
| ], | |
| "sample_ratio": 0.12 | |
| }, | |
| "per_language_evaluation": { | |
| "ar": { | |
| "chars": 135, | |
| "bytes": 250, | |
| "tokens": 38, | |
| "words": 20, | |
| "chars_per_token": 3.553, | |
| "bytes_per_token": 6.579, | |
| "tokens_per_word": 1.9, | |
| "byte_fragmentation_ratio": 0.132 | |
| }, | |
| "arithmetic": { | |
| "chars": 88, | |
| "bytes": 88, | |
| "tokens": 78, | |
| "words": 18, | |
| "chars_per_token": 1.128, | |
| "bytes_per_token": 1.128, | |
| "tokens_per_word": 4.333, | |
| "byte_fragmentation_ratio": 0.987 | |
| }, | |
| "bn": { | |
| "chars": 129, | |
| "bytes": 347, | |
| "tokens": 68, | |
| "words": 21, | |
| "chars_per_token": 1.897, | |
| "bytes_per_token": 5.103, | |
| "tokens_per_word": 3.238, | |
| "byte_fragmentation_ratio": 0.0 | |
| }, | |
| "de": { | |
| "chars": 177, | |
| "bytes": 180, | |
| "tokens": 29, | |
| "words": 23, | |
| "chars_per_token": 6.103, | |
| "bytes_per_token": 6.207, | |
| "tokens_per_word": 1.261, | |
| "byte_fragmentation_ratio": 0.069 | |
| }, | |
| "el": { | |
| "chars": 158, | |
| "bytes": 289, | |
| "tokens": 52, | |
| "words": 26, | |
| "chars_per_token": 3.038, | |
| "bytes_per_token": 5.558, | |
| "tokens_per_word": 2.0, | |
| "byte_fragmentation_ratio": 0.154 | |
| }, | |
| "en": { | |
| "chars": 224, | |
| "bytes": 224, | |
| "tokens": 40, | |
| "words": 30, | |
| "chars_per_token": 5.6, | |
| "bytes_per_token": 5.6, | |
| "tokens_per_word": 1.333, | |
| "byte_fragmentation_ratio": 0.175 | |
| }, | |
| "es": { | |
| "chars": 162, | |
| "bytes": 168, | |
| "tokens": 36, | |
| "words": 26, | |
| "chars_per_token": 4.5, | |
| "bytes_per_token": 4.667, | |
| "tokens_per_word": 1.385, | |
| "byte_fragmentation_ratio": 0.25 | |
| }, | |
| "fa": { | |
| "chars": 153, | |
| "bytes": 281, | |
| "tokens": 42, | |
| "words": 28, | |
| "chars_per_token": 3.643, | |
| "bytes_per_token": 6.69, | |
| "tokens_per_word": 1.5, | |
| "byte_fragmentation_ratio": 0.024 | |
| }, | |
| "fr": { | |
| "chars": 175, | |
| "bytes": 183, | |
| "tokens": 39, | |
| "words": 26, | |
| "chars_per_token": 4.487, | |
| "bytes_per_token": 4.692, | |
| "tokens_per_word": 1.5, | |
| "byte_fragmentation_ratio": 0.333 | |
| }, | |
| "he": { | |
| "chars": 136, | |
| "bytes": 249, | |
| "tokens": 43, | |
| "words": 22, | |
| "chars_per_token": 3.163, | |
| "bytes_per_token": 5.791, | |
| "tokens_per_word": 1.955, | |
| "byte_fragmentation_ratio": 0.14 | |
| }, | |
| "hi": { | |
| "chars": 137, | |
| "bytes": 353, | |
| "tokens": 67, | |
| "words": 30, | |
| "chars_per_token": 2.045, | |
| "bytes_per_token": 5.269, | |
| "tokens_per_word": 2.233, | |
| "byte_fragmentation_ratio": 0.0 | |
| }, | |
| "hy": { | |
| "chars": 140, | |
| "bytes": 257, | |
| "tokens": 48, | |
| "words": 20, | |
| "chars_per_token": 2.917, | |
| "bytes_per_token": 5.354, | |
| "tokens_per_word": 2.4, | |
| "byte_fragmentation_ratio": 0.229 | |
| }, | |
| "ja": { | |
| "chars": 70, | |
| "bytes": 210, | |
| "tokens": 41, | |
| "words": 1, | |
| "chars_per_token": 1.707, | |
| "bytes_per_token": 5.122, | |
| "tokens_per_word": 41.0, | |
| "byte_fragmentation_ratio": 0.0 | |
| }, | |
| "javascript": { | |
| "chars": 212, | |
| "bytes": 212, | |
| "tokens": 53, | |
| "words": 31, | |
| "chars_per_token": 4.0, | |
| "bytes_per_token": 4.0, | |
| "tokens_per_word": 1.71, | |
| "byte_fragmentation_ratio": 0.245 | |
| }, | |
| "ka": { | |
| "chars": 126, | |
| "bytes": 346, | |
| "tokens": 47, | |
| "words": 14, | |
| "chars_per_token": 2.681, | |
| "bytes_per_token": 7.362, | |
| "tokens_per_word": 3.357, | |
| "byte_fragmentation_ratio": 0.085 | |
| }, | |
| "ko": { | |
| "chars": 73, | |
| "bytes": 185, | |
| "tokens": 33, | |
| "words": 16, | |
| "chars_per_token": 2.212, | |
| "bytes_per_token": 5.606, | |
| "tokens_per_word": 2.062, | |
| "byte_fragmentation_ratio": 0.061 | |
| }, | |
| "math": { | |
| "chars": 152, | |
| "bytes": 172, | |
| "tokens": 83, | |
| "words": 28, | |
| "chars_per_token": 1.831, | |
| "bytes_per_token": 2.072, | |
| "tokens_per_word": 2.964, | |
| "byte_fragmentation_ratio": 0.771 | |
| }, | |
| "python": { | |
| "chars": 202, | |
| "bytes": 202, | |
| "tokens": 60, | |
| "words": 32, | |
| "chars_per_token": 3.367, | |
| "bytes_per_token": 3.367, | |
| "tokens_per_word": 1.875, | |
| "byte_fragmentation_ratio": 0.5 | |
| }, | |
| "ru": { | |
| "chars": 190, | |
| "bytes": 357, | |
| "tokens": 45, | |
| "words": 23, | |
| "chars_per_token": 4.222, | |
| "bytes_per_token": 7.933, | |
| "tokens_per_word": 1.957, | |
| "byte_fragmentation_ratio": 0.022 | |
| }, | |
| "th": { | |
| "chars": 123, | |
| "bytes": 367, | |
| "tokens": 51, | |
| "words": 2, | |
| "chars_per_token": 2.412, | |
| "bytes_per_token": 7.196, | |
| "tokens_per_word": 25.5, | |
| "byte_fragmentation_ratio": 0.0 | |
| }, | |
| "uk": { | |
| "chars": 157, | |
| "bytes": 291, | |
| "tokens": 43, | |
| "words": 22, | |
| "chars_per_token": 3.651, | |
| "bytes_per_token": 6.767, | |
| "tokens_per_word": 1.955, | |
| "byte_fragmentation_ratio": 0.116 | |
| }, | |
| "vi": { | |
| "chars": 143, | |
| "bytes": 190, | |
| "tokens": 41, | |
| "words": 34, | |
| "chars_per_token": 3.488, | |
| "bytes_per_token": 4.634, | |
| "tokens_per_word": 1.206, | |
| "byte_fragmentation_ratio": 0.146 | |
| }, | |
| "zh": { | |
| "chars": 62, | |
| "bytes": 186, | |
| "tokens": 34, | |
| "words": 1, | |
| "chars_per_token": 1.824, | |
| "bytes_per_token": 5.471, | |
| "tokens_per_word": 34.0, | |
| "byte_fragmentation_ratio": 0.0 | |
| } | |
| } | |
| } |