| { | |
| "vocab_size": 96000, | |
| "avg_token_length_chars": 6.1, | |
| "max_token_length_chars": 15, | |
| "single_byte_tokens": 256, | |
| "special_tokens": [ | |
| { | |
| "token": "<pad>", | |
| "id": 0 | |
| }, | |
| { | |
| "token": "<|bos|>", | |
| "id": 1 | |
| }, | |
| { | |
| "token": "<|endoftext|>", | |
| "id": 2 | |
| }, | |
| { | |
| "token": "<", | |
| "id": 30 | |
| }, | |
| { | |
| "token": "[", | |
| "id": 61 | |
| }, | |
| { | |
| "token": "[User", | |
| "id": 1157 | |
| }, | |
| { | |
| "token": "[System", | |
| "id": 1217 | |
| }, | |
| { | |
| "token": "[]", | |
| "id": 3325 | |
| }, | |
| { | |
| "token": "</", | |
| "id": 4477 | |
| }, | |
| { | |
| "token": "['", | |
| "id": 4641 | |
| }, | |
| { | |
| "token": "[\"", | |
| "id": 5119 | |
| }, | |
| { | |
| "token": "[i", | |
| "id": 5453 | |
| }, | |
| { | |
| "token": "<T", | |
| "id": 7073 | |
| }, | |
| { | |
| "token": "[Oracle", | |
| "id": 9667 | |
| }, | |
| { | |
| "token": "[:", | |
| "id": 11907 | |
| }, | |
| { | |
| "token": "<String", | |
| "id": 13683 | |
| }, | |
| { | |
| "token": "<string", | |
| "id": 14770 | |
| }, | |
| { | |
| "token": "[string", | |
| "id": 15770 | |
| }, | |
| { | |
| "token": "[str", | |
| "id": 16887 | |
| }, | |
| { | |
| "token": "<int", | |
| "id": 18424 | |
| } | |
| ], | |
| "compression_ratio_chars_per_token": 3.174, | |
| "fertility_tokens_per_word": 4.711, | |
| "training_files": 1085, | |
| "training_time_seconds": 647.3, | |
| "config": { | |
| "preset": "llama3", | |
| "vocab_size": 96000, | |
| "min_frequency": 2, | |
| "bpe_dropout": null, | |
| "single_digit": true, | |
| "max_token_length": 16, | |
| "special_tokens": [ | |
| "<pad>", | |
| "<|bos|>", | |
| "<|endoftext|>" | |
| ], | |
| "sample_ratio": 0.15 | |
| }, | |
| "per_language_evaluation": { | |
| "ar": { | |
| "chars": 135, | |
| "bytes": 250, | |
| "tokens": 40, | |
| "words": 20, | |
| "chars_per_token": 3.375, | |
| "bytes_per_token": 6.25, | |
| "tokens_per_word": 2.0, | |
| "byte_fragmentation_ratio": 0.1 | |
| }, | |
| "arithmetic": { | |
| "chars": 88, | |
| "bytes": 88, | |
| "tokens": 78, | |
| "words": 18, | |
| "chars_per_token": 1.128, | |
| "bytes_per_token": 1.128, | |
| "tokens_per_word": 4.333, | |
| "byte_fragmentation_ratio": 0.987 | |
| }, | |
| "bn": { | |
| "chars": 129, | |
| "bytes": 347, | |
| "tokens": 68, | |
| "words": 21, | |
| "chars_per_token": 1.897, | |
| "bytes_per_token": 5.103, | |
| "tokens_per_word": 3.238, | |
| "byte_fragmentation_ratio": 0.0 | |
| }, | |
| "de": { | |
| "chars": 177, | |
| "bytes": 180, | |
| "tokens": 30, | |
| "words": 23, | |
| "chars_per_token": 5.9, | |
| "bytes_per_token": 6.0, | |
| "tokens_per_word": 1.304, | |
| "byte_fragmentation_ratio": 0.1 | |
| }, | |
| "el": { | |
| "chars": 158, | |
| "bytes": 289, | |
| "tokens": 56, | |
| "words": 26, | |
| "chars_per_token": 2.821, | |
| "bytes_per_token": 5.161, | |
| "tokens_per_word": 2.154, | |
| "byte_fragmentation_ratio": 0.179 | |
| }, | |
| "en": { | |
| "chars": 224, | |
| "bytes": 224, | |
| "tokens": 41, | |
| "words": 30, | |
| "chars_per_token": 5.463, | |
| "bytes_per_token": 5.463, | |
| "tokens_per_word": 1.367, | |
| "byte_fragmentation_ratio": 0.171 | |
| }, | |
| "es": { | |
| "chars": 162, | |
| "bytes": 168, | |
| "tokens": 37, | |
| "words": 26, | |
| "chars_per_token": 4.378, | |
| "bytes_per_token": 4.541, | |
| "tokens_per_word": 1.423, | |
| "byte_fragmentation_ratio": 0.243 | |
| }, | |
| "fa": { | |
| "chars": 153, | |
| "bytes": 281, | |
| "tokens": 45, | |
| "words": 28, | |
| "chars_per_token": 3.4, | |
| "bytes_per_token": 6.244, | |
| "tokens_per_word": 1.607, | |
| "byte_fragmentation_ratio": 0.089 | |
| }, | |
| "fr": { | |
| "chars": 175, | |
| "bytes": 183, | |
| "tokens": 40, | |
| "words": 26, | |
| "chars_per_token": 4.375, | |
| "bytes_per_token": 4.575, | |
| "tokens_per_word": 1.538, | |
| "byte_fragmentation_ratio": 0.35 | |
| }, | |
| "he": { | |
| "chars": 136, | |
| "bytes": 249, | |
| "tokens": 48, | |
| "words": 22, | |
| "chars_per_token": 2.833, | |
| "bytes_per_token": 5.188, | |
| "tokens_per_word": 2.182, | |
| "byte_fragmentation_ratio": 0.229 | |
| }, | |
| "hi": { | |
| "chars": 137, | |
| "bytes": 353, | |
| "tokens": 67, | |
| "words": 30, | |
| "chars_per_token": 2.045, | |
| "bytes_per_token": 5.269, | |
| "tokens_per_word": 2.233, | |
| "byte_fragmentation_ratio": 0.0 | |
| }, | |
| "hy": { | |
| "chars": 140, | |
| "bytes": 257, | |
| "tokens": 47, | |
| "words": 20, | |
| "chars_per_token": 2.979, | |
| "bytes_per_token": 5.468, | |
| "tokens_per_word": 2.35, | |
| "byte_fragmentation_ratio": 0.234 | |
| }, | |
| "ja": { | |
| "chars": 70, | |
| "bytes": 210, | |
| "tokens": 44, | |
| "words": 1, | |
| "chars_per_token": 1.591, | |
| "bytes_per_token": 4.773, | |
| "tokens_per_word": 44.0, | |
| "byte_fragmentation_ratio": 0.0 | |
| }, | |
| "javascript": { | |
| "chars": 212, | |
| "bytes": 212, | |
| "tokens": 53, | |
| "words": 31, | |
| "chars_per_token": 4.0, | |
| "bytes_per_token": 4.0, | |
| "tokens_per_word": 1.71, | |
| "byte_fragmentation_ratio": 0.245 | |
| }, | |
| "ka": { | |
| "chars": 126, | |
| "bytes": 346, | |
| "tokens": 47, | |
| "words": 14, | |
| "chars_per_token": 2.681, | |
| "bytes_per_token": 7.362, | |
| "tokens_per_word": 3.357, | |
| "byte_fragmentation_ratio": 0.085 | |
| }, | |
| "ko": { | |
| "chars": 73, | |
| "bytes": 185, | |
| "tokens": 37, | |
| "words": 16, | |
| "chars_per_token": 1.973, | |
| "bytes_per_token": 5.0, | |
| "tokens_per_word": 2.312, | |
| "byte_fragmentation_ratio": 0.054 | |
| }, | |
| "math": { | |
| "chars": 152, | |
| "bytes": 172, | |
| "tokens": 85, | |
| "words": 28, | |
| "chars_per_token": 1.788, | |
| "bytes_per_token": 2.024, | |
| "tokens_per_word": 3.036, | |
| "byte_fragmentation_ratio": 0.776 | |
| }, | |
| "python": { | |
| "chars": 202, | |
| "bytes": 202, | |
| "tokens": 61, | |
| "words": 32, | |
| "chars_per_token": 3.311, | |
| "bytes_per_token": 3.311, | |
| "tokens_per_word": 1.906, | |
| "byte_fragmentation_ratio": 0.492 | |
| }, | |
| "ru": { | |
| "chars": 190, | |
| "bytes": 357, | |
| "tokens": 46, | |
| "words": 23, | |
| "chars_per_token": 4.13, | |
| "bytes_per_token": 7.761, | |
| "tokens_per_word": 2.0, | |
| "byte_fragmentation_ratio": 0.043 | |
| }, | |
| "th": { | |
| "chars": 123, | |
| "bytes": 367, | |
| "tokens": 53, | |
| "words": 2, | |
| "chars_per_token": 2.321, | |
| "bytes_per_token": 6.925, | |
| "tokens_per_word": 26.5, | |
| "byte_fragmentation_ratio": 0.0 | |
| }, | |
| "uk": { | |
| "chars": 157, | |
| "bytes": 291, | |
| "tokens": 42, | |
| "words": 22, | |
| "chars_per_token": 3.738, | |
| "bytes_per_token": 6.929, | |
| "tokens_per_word": 1.909, | |
| "byte_fragmentation_ratio": 0.095 | |
| }, | |
| "vi": { | |
| "chars": 143, | |
| "bytes": 190, | |
| "tokens": 42, | |
| "words": 34, | |
| "chars_per_token": 3.405, | |
| "bytes_per_token": 4.524, | |
| "tokens_per_word": 1.235, | |
| "byte_fragmentation_ratio": 0.167 | |
| }, | |
| "zh": { | |
| "chars": 62, | |
| "bytes": 186, | |
| "tokens": 35, | |
| "words": 1, | |
| "chars_per_token": 1.771, | |
| "bytes_per_token": 5.314, | |
| "tokens_per_word": 35.0, | |
| "byte_fragmentation_ratio": 0.0 | |
| } | |
| } | |
| } |