{
"model": "multilingual_32k",
"vocab_size": 32000,
"bos_id": 1,
"eos_id": 2,
"config": {
"character_coverage": 0.9995,
"model_type": "bpe",
"byte_fallback": true,
"split_digits": true,
"max_sentence_length": 16384,
"input_sentence_size": 10000000
},
"data_sources": {
"en": "allenai/c4 (en)",
"ar": "wikimedia/wikipedia (20231101.ar)",
"he": "wikimedia/wikipedia (20231101.he)",
"fa": "wikimedia/wikipedia (20231101.fa)"
},
"languages": {
"en": {
"num_tokens": 131858,
"num_bytes": 502591,
"num_words": 85508,
"num_chars": 500000,
"bytes_per_token": 3.81,
"tokens_per_word": 1.54
},
"ar": {
"num_tokens": 138572,
"num_bytes": 900643,
"num_words": 81698,
"num_chars": 500000,
"bytes_per_token": 6.5,
"tokens_per_word": 1.7
},
"he": {
"num_tokens": 150214,
"num_bytes": 876334,
"num_words": 81962,
"num_chars": 500000,
"bytes_per_token": 5.83,
"tokens_per_word": 1.83
},
"fa": {
"num_tokens": 129491,
"num_bytes": 902876,
"num_words": 91425,
"num_chars": 500000,
"bytes_per_token": 6.97,
"tokens_per_word": 1.42
}
},
"timestamp": "2026-04-01T14:12:42Z"
}