{
"num_ar_samples": 4998,
"num_en_samples": 5000,
"results": [
{
"name": "Gemma-3-4B",
"vocab_size": 262145,
"ar_fertility": 2.3109,
"ar_chars_per_token": 2.8642,
"en_fertility": 1.1368,
"en_chars_per_token": 2.9107,
"parity": 0.984
},
{
"name": "Fanar-1-9B",
"vocab_size": 128256,
"ar_fertility": 2.2643,
"ar_chars_per_token": 2.8119,
"en_fertility": 1.1412,
"en_chars_per_token": 2.88,
"parity": 0.9764
},
{
"name": "Hala-9B",
"vocab_size": 128256,
"ar_fertility": 2.2643,
"ar_chars_per_token": 2.8119,
"en_fertility": 1.1412,
"en_chars_per_token": 2.88,
"parity": 0.9764
},
{
"name": "Command-R-Arabic",
"vocab_size": 255033,
"ar_fertility": 2.3196,
"ar_chars_per_token": 2.7987,
"en_fertility": 1.1422,
"en_chars_per_token": 2.906,
"parity": 0.9631
},
{
"name": "SARF (Ours)",
"vocab_size": 72195,
"ar_fertility": 1.9778,
"ar_chars_per_token": 2.8319,
"en_fertility": 1.5609,
"en_chars_per_token": 3.1635,
"parity": 0.8952
},
{
"name": "GPT-4o",
"vocab_size": 200019,
"ar_fertility": 2.2489,
"ar_chars_per_token": 3.1108,
"en_fertility": 1.2132,
"en_chars_per_token": 3.4918,
"parity": 0.8909
},
{
"name": "Qwen3-4B",
"vocab_size": 151669,
"ar_fertility": 2.314,
"ar_chars_per_token": 2.5988,
"en_fertility": 1.2247,
"en_chars_per_token": 2.9641,
"parity": 0.8767
},
{
"name": "Qwen3-VL-4B",
"vocab_size": 151669,
"ar_fertility": 2.314,
"ar_chars_per_token": 2.5988,
"en_fertility": 1.2247,
"en_chars_per_token": 2.9641,
"parity": 0.8767
},
{
"name": "Falcon-H1-7B",
"vocab_size": 130049,
"ar_fertility": 2.0829,
"ar_chars_per_token": 3.2722,
"en_fertility": 1.2661,
"en_chars_per_token": 2.8348,
"parity": 1.1543
},
{
"name": "ALLaM-7B",
"vocab_size": 64000,
"ar_fertility": 1.2856,
"ar_chars_per_token": 3.8978,
"en_fertility": 1.1973,
"en_chars_per_token": 2.699,
"parity": 1.4442
},
{
"name": "Mistral-7B-v0.3",
"vocab_size": 32768,
"ar_fertility": 5.1329,
"ar_chars_per_token": 1.1307,
"en_fertility": 1.2185,
"en_chars_per_token": 2.7016,
"parity": 0.4185
},
{
"name": "GPT-4",
"vocab_size": 100277,
"ar_fertility": 4.1107,
"ar_chars_per_token": 1.4303,
"en_fertility": 1.2247,
"en_chars_per_token": 3.4518,
"parity": 0.4144
},
{
"name": "AceGPT-13B",
"vocab_size": 32000,
"ar_fertility": 5.236,
"ar_chars_per_token": 1.1098,
"en_fertility": 1.2368,
"en_chars_per_token": 2.6909,
"parity": 0.4124
}
],
"markdown_table": "| Rank | Tokenizer | Vocab | AR Fertility | AR Chars/Tok | EN Fertility | EN Chars/Tok | Parity |\n|------|-----------|------:|-------------:|-------------:|-------------:|-------------:|-------:|\n| 1 | Gemma-3-4B | 262,145 | 2.311 | 2.864 | 1.137 | 2.911 | 0.9840 |\n| 2 | Fanar-1-9B | 128,256 | 2.264 | 2.812 | 1.141 | 2.880 | 0.9764 |\n| 3 | Hala-9B | 128,256 | 2.264 | 2.812 | 1.141 | 2.880 | 0.9764 |\n| 4 | Command-R-Arabic | 255,033 | 2.320 | 2.799 | 1.142 | 2.906 | 0.9631 |\n| 5 | SARF (Ours) | 72,195 | 1.978 | 2.832 | 1.561 | 3.163 | 0.8952 |\n| 6 | GPT-4o | 200,019 | 2.249 | 3.111 | 1.213 | 3.492 | 0.8909 |\n| 7 | Qwen3-4B | 151,669 | 2.314 | 2.599 | 1.225 | 2.964 | 0.8767 |\n| 8 | Qwen3-VL-4B | 151,669 | 2.314 | 2.599 | 1.225 | 2.964 | 0.8767 |\n| 9 | Falcon-H1-7B | 130,049 | 2.083 | 3.272 | 1.266 | 2.835 | 1.1543 |\n| 10 | ALLaM-7B | 64,000 | 1.286 | 3.898 | 1.197 | 2.699 | 1.4442 |\n| 11 | Mistral-7B-v0.3 | 32,768 | 5.133 | 1.131 | 1.218 | 2.702 | 0.4185 |\n| 12 | GPT-4 | 100,277 | 4.111 | 1.430 | 1.225 | 3.452 | 0.4144 |\n| 13 | AceGPT-13B | 32,000 | 5.236 | 1.110 | 1.237 | 2.691 | 0.4124 |"
}