{
  "num_ar_samples": 4998,
  "num_en_samples": 5000,
  "results": [
    {
      "name": "Gemma-3-4B",
      "vocab_size": 262145,
      "ar_fertility": 2.3109,
      "ar_chars_per_token": 2.8642,
      "en_fertility": 1.1368,
      "en_chars_per_token": 2.9107,
      "parity": 0.984
    },
    {
      "name": "Fanar-1-9B",
      "vocab_size": 128256,
      "ar_fertility": 2.2643,
      "ar_chars_per_token": 2.8119,
      "en_fertility": 1.1412,
      "en_chars_per_token": 2.88,
      "parity": 0.9764
    },
    {
      "name": "Hala-9B",
      "vocab_size": 128256,
      "ar_fertility": 2.2643,
      "ar_chars_per_token": 2.8119,
      "en_fertility": 1.1412,
      "en_chars_per_token": 2.88,
      "parity": 0.9764
    },
    {
      "name": "Command-R-Arabic",
      "vocab_size": 255033,
      "ar_fertility": 2.3196,
      "ar_chars_per_token": 2.7987,
      "en_fertility": 1.1422,
      "en_chars_per_token": 2.906,
      "parity": 0.9631
    },
    {
      "name": "SARF (Ours)",
      "vocab_size": 72195,
      "ar_fertility": 1.9778,
      "ar_chars_per_token": 2.8319,
      "en_fertility": 1.5609,
      "en_chars_per_token": 3.1635,
      "parity": 0.8952
    },
    {
      "name": "GPT-4o",
      "vocab_size": 200019,
      "ar_fertility": 2.2489,
      "ar_chars_per_token": 3.1108,
      "en_fertility": 1.2132,
      "en_chars_per_token": 3.4918,
      "parity": 0.8909
    },
    {
      "name": "Qwen3-4B",
      "vocab_size": 151669,
      "ar_fertility": 2.314,
      "ar_chars_per_token": 2.5988,
      "en_fertility": 1.2247,
      "en_chars_per_token": 2.9641,
      "parity": 0.8767
    },
    {
      "name": "Qwen3-VL-4B",
      "vocab_size": 151669,
      "ar_fertility": 2.314,
      "ar_chars_per_token": 2.5988,
      "en_fertility": 1.2247,
      "en_chars_per_token": 2.9641,
      "parity": 0.8767
    },
    {
      "name": "Falcon-H1-7B",
      "vocab_size": 130049,
      "ar_fertility": 2.0829,
      "ar_chars_per_token": 3.2722,
      "en_fertility": 1.2661,
      "en_chars_per_token": 2.8348,
      "parity": 1.1543
    },
    {
      "name": "ALLaM-7B",
      "vocab_size": 64000,
      "ar_fertility": 1.2856,
      "ar_chars_per_token": 3.8978,
      "en_fertility": 1.1973,
      "en_chars_per_token": 2.699,
      "parity": 1.4442
    },
    {
      "name": "Mistral-7B-v0.3",
      "vocab_size": 32768,
      "ar_fertility": 5.1329,
      "ar_chars_per_token": 1.1307,
      "en_fertility": 1.2185,
      "en_chars_per_token": 2.7016,
      "parity": 0.4185
    },
    {
      "name": "GPT-4",
      "vocab_size": 100277,
      "ar_fertility": 4.1107,
      "ar_chars_per_token": 1.4303,
      "en_fertility": 1.2247,
      "en_chars_per_token": 3.4518,
      "parity": 0.4144
    },
    {
      "name": "AceGPT-13B",
      "vocab_size": 32000,
      "ar_fertility": 5.236,
      "ar_chars_per_token": 1.1098,
      "en_fertility": 1.2368,
      "en_chars_per_token": 2.6909,
      "parity": 0.4124
    }
  ],
  "markdown_table": "| Rank | Tokenizer | Vocab | AR Fertility | AR Chars/Tok | EN Fertility | EN Chars/Tok | Parity |\n|------|-----------|------:|-------------:|-------------:|-------------:|-------------:|-------:|\n| 1 | Gemma-3-4B | 262,145 | 2.311 | 2.864 | 1.137 | 2.911 | 0.9840 |\n| 2 | Fanar-1-9B | 128,256 | 2.264 | 2.812 | 1.141 | 2.880 | 0.9764 |\n| 3 | Hala-9B | 128,256 | 2.264 | 2.812 | 1.141 | 2.880 | 0.9764 |\n| 4 | Command-R-Arabic | 255,033 | 2.320 | 2.799 | 1.142 | 2.906 | 0.9631 |\n| 5 | SARF (Ours) | 72,195 | 1.978 | 2.832 | 1.561 | 3.163 | 0.8952 |\n| 6 | GPT-4o | 200,019 | 2.249 | 3.111 | 1.213 | 3.492 | 0.8909 |\n| 7 | Qwen3-4B | 151,669 | 2.314 | 2.599 | 1.225 | 2.964 | 0.8767 |\n| 8 | Qwen3-VL-4B | 151,669 | 2.314 | 2.599 | 1.225 | 2.964 | 0.8767 |\n| 9 | Falcon-H1-7B | 130,049 | 2.083 | 3.272 | 1.266 | 2.835 | 1.1543 |\n| 10 | ALLaM-7B | 64,000 | 1.286 | 3.898 | 1.197 | 2.699 | 1.4442 |\n| 11 | Mistral-7B-v0.3 | 32,768 | 5.133 | 1.131 | 1.218 | 2.702 | 0.4185 |\n| 12 | GPT-4 | 100,277 | 4.111 | 1.430 | 1.225 | 3.452 | 0.4144 |\n| 13 | AceGPT-13B | 32,000 | 5.236 | 1.110 | 1.237 | 2.691 | 0.4124 |"
}