| { | |
| "vocab_size": 64000, | |
| "avg_token_length_chars": 5.7, | |
| "max_token_length_chars": 15, | |
| "single_byte_tokens": 256, | |
| "special_tokens": [ | |
| { | |
| "token": "<pad>", | |
| "id": 0 | |
| }, | |
| { | |
| "token": "<|bos|>", | |
| "id": 1 | |
| }, | |
| { | |
| "token": "<|endoftext|>", | |
| "id": 2 | |
| }, | |
| { | |
| "token": "<", | |
| "id": 30 | |
| }, | |
| { | |
| "token": "[", | |
| "id": 61 | |
| }, | |
| { | |
| "token": "[User", | |
| "id": 1111 | |
| }, | |
| { | |
| "token": "[System", | |
| "id": 1170 | |
| }, | |
| { | |
| "token": "[]", | |
| "id": 3492 | |
| }, | |
| { | |
| "token": "</", | |
| "id": 4725 | |
| }, | |
| { | |
| "token": "['", | |
| "id": 5154 | |
| }, | |
| { | |
| "token": "[\"", | |
| "id": 5307 | |
| }, | |
| { | |
| "token": "[i", | |
| "id": 6326 | |
| }, | |
| { | |
| "token": "<T", | |
| "id": 8656 | |
| }, | |
| { | |
| "token": "[Oracle", | |
| "id": 9580 | |
| }, | |
| { | |
| "token": "[:", | |
| "id": 12198 | |
| }, | |
| { | |
| "token": "<String", | |
| "id": 15705 | |
| }, | |
| { | |
| "token": "<string", | |
| "id": 17256 | |
| }, | |
| { | |
| "token": "[string", | |
| "id": 17624 | |
| }, | |
| { | |
| "token": "[\\", | |
| "id": 18218 | |
| }, | |
| { | |
| "token": "<?", | |
| "id": 19095 | |
| } | |
| ], | |
| "compression_ratio_chars_per_token": 3.033, | |
| "fertility_tokens_per_word": 4.906, | |
| "training_files": 1085, | |
| "training_time_seconds": 390.5, | |
| "config": { | |
| "preset": "llama3", | |
| "vocab_size": 64000, | |
| "min_frequency": 2, | |
| "bpe_dropout": null, | |
| "single_digit": true, | |
| "max_token_length": 16, | |
| "special_tokens": [ | |
| "<pad>", | |
| "<|bos|>", | |
| "<|endoftext|>" | |
| ], | |
| "sample_ratio": 0.1 | |
| }, | |
| "per_language_evaluation": { | |
| "ar": { | |
| "chars": 135, | |
| "bytes": 250, | |
| "tokens": 45, | |
| "words": 20, | |
| "chars_per_token": 3.0, | |
| "bytes_per_token": 5.556, | |
| "tokens_per_word": 2.25, | |
| "byte_fragmentation_ratio": 0.156 | |
| }, | |
| "arithmetic": { | |
| "chars": 88, | |
| "bytes": 88, | |
| "tokens": 78, | |
| "words": 18, | |
| "chars_per_token": 1.128, | |
| "bytes_per_token": 1.128, | |
| "tokens_per_word": 4.333, | |
| "byte_fragmentation_ratio": 0.987 | |
| }, | |
| "bn": { | |
| "chars": 129, | |
| "bytes": 347, | |
| "tokens": 70, | |
| "words": 21, | |
| "chars_per_token": 1.843, | |
| "bytes_per_token": 4.957, | |
| "tokens_per_word": 3.333, | |
| "byte_fragmentation_ratio": 0.014 | |
| }, | |
| "de": { | |
| "chars": 177, | |
| "bytes": 180, | |
| "tokens": 38, | |
| "words": 23, | |
| "chars_per_token": 4.658, | |
| "bytes_per_token": 4.737, | |
| "tokens_per_word": 1.652, | |
| "byte_fragmentation_ratio": 0.158 | |
| }, | |
| "el": { | |
| "chars": 158, | |
| "bytes": 289, | |
| "tokens": 57, | |
| "words": 26, | |
| "chars_per_token": 2.772, | |
| "bytes_per_token": 5.07, | |
| "tokens_per_word": 2.192, | |
| "byte_fragmentation_ratio": 0.175 | |
| }, | |
| "en": { | |
| "chars": 224, | |
| "bytes": 224, | |
| "tokens": 43, | |
| "words": 30, | |
| "chars_per_token": 5.209, | |
| "bytes_per_token": 5.209, | |
| "tokens_per_word": 1.433, | |
| "byte_fragmentation_ratio": 0.163 | |
| }, | |
| "es": { | |
| "chars": 162, | |
| "bytes": 168, | |
| "tokens": 40, | |
| "words": 26, | |
| "chars_per_token": 4.05, | |
| "bytes_per_token": 4.2, | |
| "tokens_per_word": 1.538, | |
| "byte_fragmentation_ratio": 0.25 | |
| }, | |
| "fa": { | |
| "chars": 153, | |
| "bytes": 281, | |
| "tokens": 53, | |
| "words": 28, | |
| "chars_per_token": 2.887, | |
| "bytes_per_token": 5.302, | |
| "tokens_per_word": 1.893, | |
| "byte_fragmentation_ratio": 0.132 | |
| }, | |
| "fr": { | |
| "chars": 175, | |
| "bytes": 183, | |
| "tokens": 43, | |
| "words": 26, | |
| "chars_per_token": 4.07, | |
| "bytes_per_token": 4.256, | |
| "tokens_per_word": 1.654, | |
| "byte_fragmentation_ratio": 0.349 | |
| }, | |
| "he": { | |
| "chars": 136, | |
| "bytes": 249, | |
| "tokens": 55, | |
| "words": 22, | |
| "chars_per_token": 2.473, | |
| "bytes_per_token": 4.527, | |
| "tokens_per_word": 2.5, | |
| "byte_fragmentation_ratio": 0.2 | |
| }, | |
| "hi": { | |
| "chars": 137, | |
| "bytes": 353, | |
| "tokens": 67, | |
| "words": 30, | |
| "chars_per_token": 2.045, | |
| "bytes_per_token": 5.269, | |
| "tokens_per_word": 2.233, | |
| "byte_fragmentation_ratio": 0.0 | |
| }, | |
| "hy": { | |
| "chars": 140, | |
| "bytes": 257, | |
| "tokens": 55, | |
| "words": 20, | |
| "chars_per_token": 2.545, | |
| "bytes_per_token": 4.673, | |
| "tokens_per_word": 2.75, | |
| "byte_fragmentation_ratio": 0.309 | |
| }, | |
| "ja": { | |
| "chars": 70, | |
| "bytes": 210, | |
| "tokens": 49, | |
| "words": 1, | |
| "chars_per_token": 1.429, | |
| "bytes_per_token": 4.286, | |
| "tokens_per_word": 49.0, | |
| "byte_fragmentation_ratio": 0.041 | |
| }, | |
| "javascript": { | |
| "chars": 212, | |
| "bytes": 212, | |
| "tokens": 53, | |
| "words": 31, | |
| "chars_per_token": 4.0, | |
| "bytes_per_token": 4.0, | |
| "tokens_per_word": 1.71, | |
| "byte_fragmentation_ratio": 0.245 | |
| }, | |
| "ka": { | |
| "chars": 126, | |
| "bytes": 346, | |
| "tokens": 53, | |
| "words": 14, | |
| "chars_per_token": 2.377, | |
| "bytes_per_token": 6.528, | |
| "tokens_per_word": 3.786, | |
| "byte_fragmentation_ratio": 0.151 | |
| }, | |
| "ko": { | |
| "chars": 73, | |
| "bytes": 185, | |
| "tokens": 41, | |
| "words": 16, | |
| "chars_per_token": 1.78, | |
| "bytes_per_token": 4.512, | |
| "tokens_per_word": 2.562, | |
| "byte_fragmentation_ratio": 0.049 | |
| }, | |
| "math": { | |
| "chars": 152, | |
| "bytes": 172, | |
| "tokens": 86, | |
| "words": 28, | |
| "chars_per_token": 1.767, | |
| "bytes_per_token": 2.0, | |
| "tokens_per_word": 3.071, | |
| "byte_fragmentation_ratio": 0.791 | |
| }, | |
| "python": { | |
| "chars": 202, | |
| "bytes": 202, | |
| "tokens": 63, | |
| "words": 32, | |
| "chars_per_token": 3.206, | |
| "bytes_per_token": 3.206, | |
| "tokens_per_word": 1.969, | |
| "byte_fragmentation_ratio": 0.508 | |
| }, | |
| "ru": { | |
| "chars": 190, | |
| "bytes": 357, | |
| "tokens": 51, | |
| "words": 23, | |
| "chars_per_token": 3.725, | |
| "bytes_per_token": 7.0, | |
| "tokens_per_word": 2.217, | |
| "byte_fragmentation_ratio": 0.059 | |
| }, | |
| "th": { | |
| "chars": 123, | |
| "bytes": 367, | |
| "tokens": 59, | |
| "words": 2, | |
| "chars_per_token": 2.085, | |
| "bytes_per_token": 6.22, | |
| "tokens_per_word": 29.5, | |
| "byte_fragmentation_ratio": 0.0 | |
| }, | |
| "uk": { | |
| "chars": 157, | |
| "bytes": 291, | |
| "tokens": 47, | |
| "words": 22, | |
| "chars_per_token": 3.34, | |
| "bytes_per_token": 6.191, | |
| "tokens_per_word": 2.136, | |
| "byte_fragmentation_ratio": 0.149 | |
| }, | |
| "vi": { | |
| "chars": 143, | |
| "bytes": 190, | |
| "tokens": 47, | |
| "words": 34, | |
| "chars_per_token": 3.043, | |
| "bytes_per_token": 4.043, | |
| "tokens_per_word": 1.382, | |
| "byte_fragmentation_ratio": 0.277 | |
| }, | |
| "zh": { | |
| "chars": 62, | |
| "bytes": 186, | |
| "tokens": 42, | |
| "words": 1, | |
| "chars_per_token": 1.476, | |
| "bytes_per_token": 4.429, | |
| "tokens_per_word": 42.0, | |
| "byte_fragmentation_ratio": 0.0 | |
| } | |
| } | |
| } |