{ "num_ar_samples": 4998, "num_en_samples": 5000, "results": [ { "name": "Gemma-3-4B", "vocab_size": 262145, "ar_fertility": 2.3109, "ar_chars_per_token": 2.8642, "en_fertility": 1.1368, "en_chars_per_token": 2.9107, "parity": 0.984 }, { "name": "Fanar-1-9B", "vocab_size": 128256, "ar_fertility": 2.2643, "ar_chars_per_token": 2.8119, "en_fertility": 1.1412, "en_chars_per_token": 2.88, "parity": 0.9764 }, { "name": "Hala-9B", "vocab_size": 128256, "ar_fertility": 2.2643, "ar_chars_per_token": 2.8119, "en_fertility": 1.1412, "en_chars_per_token": 2.88, "parity": 0.9764 }, { "name": "Command-R-Arabic", "vocab_size": 255033, "ar_fertility": 2.3196, "ar_chars_per_token": 2.7987, "en_fertility": 1.1422, "en_chars_per_token": 2.906, "parity": 0.9631 }, { "name": "SARF (Ours)", "vocab_size": 72195, "ar_fertility": 1.9778, "ar_chars_per_token": 2.8319, "en_fertility": 1.5609, "en_chars_per_token": 3.1635, "parity": 0.8952 }, { "name": "GPT-4o", "vocab_size": 200019, "ar_fertility": 2.2489, "ar_chars_per_token": 3.1108, "en_fertility": 1.2132, "en_chars_per_token": 3.4918, "parity": 0.8909 }, { "name": "Qwen3-4B", "vocab_size": 151669, "ar_fertility": 2.314, "ar_chars_per_token": 2.5988, "en_fertility": 1.2247, "en_chars_per_token": 2.9641, "parity": 0.8767 }, { "name": "Qwen3-VL-4B", "vocab_size": 151669, "ar_fertility": 2.314, "ar_chars_per_token": 2.5988, "en_fertility": 1.2247, "en_chars_per_token": 2.9641, "parity": 0.8767 }, { "name": "Falcon-H1-7B", "vocab_size": 130049, "ar_fertility": 2.0829, "ar_chars_per_token": 3.2722, "en_fertility": 1.2661, "en_chars_per_token": 2.8348, "parity": 1.1543 }, { "name": "ALLaM-7B", "vocab_size": 64000, "ar_fertility": 1.2856, "ar_chars_per_token": 3.8978, "en_fertility": 1.1973, "en_chars_per_token": 2.699, "parity": 1.4442 }, { "name": "Mistral-7B-v0.3", "vocab_size": 32768, "ar_fertility": 5.1329, "ar_chars_per_token": 1.1307, "en_fertility": 1.2185, "en_chars_per_token": 2.7016, "parity": 0.4185 }, { "name": "GPT-4", "vocab_size": 100277, "ar_fertility": 4.1107, "ar_chars_per_token": 1.4303, "en_fertility": 1.2247, "en_chars_per_token": 3.4518, "parity": 0.4144 }, { "name": "AceGPT-13B", "vocab_size": 32000, "ar_fertility": 5.236, "ar_chars_per_token": 1.1098, "en_fertility": 1.2368, "en_chars_per_token": 2.6909, "parity": 0.4124 } ], "markdown_table": "| Rank | Tokenizer | Vocab | AR Fertility | AR Chars/Tok | EN Fertility | EN Chars/Tok | Parity |\n|------|-----------|------:|-------------:|-------------:|-------------:|-------------:|-------:|\n| 1 | Gemma-3-4B | 262,145 | 2.311 | 2.864 | 1.137 | 2.911 | 0.9840 |\n| 2 | Fanar-1-9B | 128,256 | 2.264 | 2.812 | 1.141 | 2.880 | 0.9764 |\n| 3 | Hala-9B | 128,256 | 2.264 | 2.812 | 1.141 | 2.880 | 0.9764 |\n| 4 | Command-R-Arabic | 255,033 | 2.320 | 2.799 | 1.142 | 2.906 | 0.9631 |\n| 5 | SARF (Ours) | 72,195 | 1.978 | 2.832 | 1.561 | 3.163 | 0.8952 |\n| 6 | GPT-4o | 200,019 | 2.249 | 3.111 | 1.213 | 3.492 | 0.8909 |\n| 7 | Qwen3-4B | 151,669 | 2.314 | 2.599 | 1.225 | 2.964 | 0.8767 |\n| 8 | Qwen3-VL-4B | 151,669 | 2.314 | 2.599 | 1.225 | 2.964 | 0.8767 |\n| 9 | Falcon-H1-7B | 130,049 | 2.083 | 3.272 | 1.266 | 2.835 | 1.1543 |\n| 10 | ALLaM-7B | 64,000 | 1.286 | 3.898 | 1.197 | 2.699 | 1.4442 |\n| 11 | Mistral-7B-v0.3 | 32,768 | 5.133 | 1.131 | 1.218 | 2.702 | 0.4185 |\n| 12 | GPT-4 | 100,277 | 4.111 | 1.430 | 1.225 | 3.452 | 0.4144 |\n| 13 | AceGPT-13B | 32,000 | 5.236 | 1.110 | 1.237 | 2.691 | 0.4124 |" }