{
  "methodology": "lm-evaluation-harness v0.4.11, apply_chat_template=True, limit=200, acc_norm preferred",
  "tasks": [
    "copa_ar",
    "arabic_mt_hellaswag",
    "arabic_leaderboard_arabic_mmlu"
  ],
  "models": [
    {
      "name": "RightNow-Arabic-0.5B-Turbo",
      "params_B": 0.518,
      "copa_ar": 58.4,
      "arabic_mt_hellaswag": 26.0,
      "arabic_leaderboard_arabic_mmlu": 23.2,
      "mean": 35.87,
      "category": "ours"
    },
    {
      "name": "Qwen2.5-0.5B-Instruct",
      "params_B": 0.494,
      "copa_ar": 53.9,
      "arabic_mt_hellaswag": 22.5,
      "arabic_leaderboard_arabic_mmlu": 26.0,
      "mean": 34.13,
      "category": "small"
    },
    {
      "name": "Falcon-H1-0.5B-Instruct",
      "params_B": 0.524,
      "copa_ar": 44.9,
      "arabic_mt_hellaswag": 23.0,
      "arabic_leaderboard_arabic_mmlu": 24.2,
      "mean": 30.7,
      "category": "small"
    },
    {
      "name": "Falcon-H1-1.5B-Instruct",
      "params_B": 1.5,
      "copa_ar": 58.4,
      "arabic_mt_hellaswag": 27.5,
      "arabic_leaderboard_arabic_mmlu": 32.7,
      "mean": 39.53,
      "category": "medium"
    },
    {
      "name": "AceGPT-7B-chat",
      "params_B": 7.0,
      "copa_ar": 69.7,
      "arabic_mt_hellaswag": 27.0,
      "arabic_leaderboard_arabic_mmlu": 35.0,
      "mean": 43.9,
      "category": "large"
    },
    {
      "name": "ALLaM-7B-Instruct",
      "params_B": 7.0,
      "copa_ar": 68.5,
      "arabic_mt_hellaswag": 29.0,
      "arabic_leaderboard_arabic_mmlu": 52.2,
      "mean": 49.9,
      "category": "large"
    },
    {
      "name": "SILMA-9B-Instruct",
      "params_B": 9.0,
      "copa_ar": 69.7,
      "arabic_mt_hellaswag": 38.0,
      "arabic_leaderboard_arabic_mmlu": 52.9,
      "mean": 53.53,
      "category": "xlarge"
    }
  ]
}