Update: Auto-evaluation on Space startup
Browse files
- afcl/app.py +33 -3
afcl/app.py
CHANGED
|
@@ -29,19 +29,49 @@ The **Arabic Function Calling Leaderboard (AFCL)** evaluates Large Language Mode
|
|
| 29 |
**لوحة تقييم استدعاء الدوال بالعربية** تقيّم نماذج اللغة الكبيرة على قدرتها على فهم الاستعلامات العربية وإنشاء استدعاءات الدوال المناسبة.
|
| 30 |
"""
|
| 31 |
|
| 32 |
-
# Models to evaluate
|
| 33 |
MODELS_TO_EVALUATE = [
|
|
|
|
| 34 |
{"model": "Jais-30B-Chat", "model_id": "inceptionai/jais-30b-chat-v3", "organization": "Inception AI"},
|
| 35 |
{"model": "ALLaM-7B-Instruct", "model_id": "sdaia/allam-1-7b-instruct", "organization": "SDAIA"},
|
| 36 |
{"model": "SILMA-9B-Instruct", "model_id": "silma-ai/SILMA-9B-Instruct-v1.0", "organization": "Silma AI"},
|
|
|
|
| 37 |
{"model": "AceGPT-13B-Chat", "model_id": "FreedomIntelligence/AceGPT-13B-chat", "organization": "FreedomIntelligence"},
|
| 38 |
-
{"model": "
|
| 39 |
-
|
|
|
|
|
|
|
|
|
|
| 40 |
{"model": "Qwen2.5-7B-Instruct", "model_id": "Qwen/Qwen2.5-7B-Instruct", "organization": "Alibaba Qwen"},
|
|
|
|
| 41 |
{"model": "Llama-3.1-8B-Instruct", "model_id": "meta-llama/Llama-3.1-8B-Instruct", "organization": "Meta"},
|
|
|
|
| 42 |
{"model": "Gemma-2-9B-IT", "model_id": "google/gemma-2-9b-it", "organization": "Google"},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 43 |
{"model": "Mistral-7B-Instruct", "model_id": "mistralai/Mistral-7B-Instruct-v0.3", "organization": "Mistral AI"},
|
|
|
|
|
|
|
|
|
|
|
|
|
| 44 |
{"model": "Phi-3-Mini-Instruct", "model_id": "microsoft/Phi-3-mini-4k-instruct", "organization": "Microsoft"},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 45 |
]
|
| 46 |
|
| 47 |
# Global state
|
|
|
|
| 29 |
**لوحة تقييم استدعاء الدوال بالعربية** تقيّم نماذج اللغة الكبيرة على قدرتها على فهم الاستعلامات العربية وإنشاء استدعاءات الدوال المناسبة.
|
| 30 |
"""
|
| 31 |
|
| 32 |
+
# All 28 Models to evaluate
|
| 33 |
MODELS_TO_EVALUATE = [
|
| 34 |
+
# Arabic-Native LLMs
|
| 35 |
{"model": "Jais-30B-Chat", "model_id": "inceptionai/jais-30b-chat-v3", "organization": "Inception AI"},
|
| 36 |
{"model": "ALLaM-7B-Instruct", "model_id": "sdaia/allam-1-7b-instruct", "organization": "SDAIA"},
|
| 37 |
{"model": "SILMA-9B-Instruct", "model_id": "silma-ai/SILMA-9B-Instruct-v1.0", "organization": "Silma AI"},
|
| 38 |
+
{"model": "Fanar-Star-1.2B", "model_id": "QatarComputing/fanar-star-1.2b", "organization": "QCRI"},
|
| 39 |
{"model": "AceGPT-13B-Chat", "model_id": "FreedomIntelligence/AceGPT-13B-chat", "organization": "FreedomIntelligence"},
|
| 40 |
+
{"model": "AraGPT2-Mega", "model_id": "aubmindlab/aragpt2-mega", "organization": "AUB MIND Lab"},
|
| 41 |
+
|
| 42 |
+
# Multilingual with strong Arabic
|
| 43 |
+
{"model": "Qwen2.5-72B-Instruct", "model_id": "Qwen/Qwen2.5-72B-Instruct", "organization": "Alibaba Qwen"},
|
| 44 |
+
{"model": "Qwen2.5-32B-Instruct", "model_id": "Qwen/Qwen2.5-32B-Instruct", "organization": "Alibaba Qwen"},
|
| 45 |
{"model": "Qwen2.5-7B-Instruct", "model_id": "Qwen/Qwen2.5-7B-Instruct", "organization": "Alibaba Qwen"},
|
| 46 |
+
{"model": "Llama-3.1-70B-Instruct", "model_id": "meta-llama/Llama-3.1-70B-Instruct", "organization": "Meta"},
|
| 47 |
{"model": "Llama-3.1-8B-Instruct", "model_id": "meta-llama/Llama-3.1-8B-Instruct", "organization": "Meta"},
|
| 48 |
+
{"model": "Gemma-2-27B-IT", "model_id": "google/gemma-2-27b-it", "organization": "Google"},
|
| 49 |
{"model": "Gemma-2-9B-IT", "model_id": "google/gemma-2-9b-it", "organization": "Google"},
|
| 50 |
+
|
| 51 |
+
# Cohere Arabic Models
|
| 52 |
+
{"model": "Aya-Expanse-32B", "model_id": "CohereForAI/aya-expanse-32b", "organization": "Cohere For AI"},
|
| 53 |
+
{"model": "Aya-Expanse-8B", "model_id": "CohereForAI/aya-expanse-8b", "organization": "Cohere For AI"},
|
| 54 |
+
{"model": "c4ai-command-r7b-arabic", "model_id": "CohereForAI/c4ai-command-r7b-arabic-02-2025", "organization": "Cohere For AI"},
|
| 55 |
+
|
| 56 |
+
# Falcon (UAE)
|
| 57 |
+
{"model": "Falcon-180B-Chat", "model_id": "tiiuae/falcon-180B-chat", "organization": "TII UAE"},
|
| 58 |
+
{"model": "Falcon-40B-Instruct", "model_id": "tiiuae/falcon-40b-instruct", "organization": "TII UAE"},
|
| 59 |
+
|
| 60 |
+
# Mistral
|
| 61 |
+
{"model": "Mistral-Large-Instruct", "model_id": "mistralai/Mistral-Large-Instruct-2411", "organization": "Mistral AI"},
|
| 62 |
+
{"model": "Mixtral-8x22B-Instruct", "model_id": "mistralai/Mixtral-8x22B-Instruct-v0.1", "organization": "Mistral AI"},
|
| 63 |
{"model": "Mistral-7B-Instruct", "model_id": "mistralai/Mistral-7B-Instruct-v0.3", "organization": "Mistral AI"},
|
| 64 |
+
|
| 65 |
+
# Others
|
| 66 |
+
{"model": "DeepSeek-V3", "model_id": "deepseek-ai/DeepSeek-V3", "organization": "DeepSeek"},
|
| 67 |
+
{"model": "Phi-4", "model_id": "microsoft/phi-4", "organization": "Microsoft"},
|
| 68 |
{"model": "Phi-3-Mini-Instruct", "model_id": "microsoft/Phi-3-mini-4k-instruct", "organization": "Microsoft"},
|
| 69 |
+
{"model": "BLOOM-176B", "model_id": "bigscience/bloom", "organization": "BigScience"},
|
| 70 |
+
{"model": "BLOOMZ-7B1", "model_id": "bigscience/bloomz-7b1", "organization": "BigScience"},
|
| 71 |
+
|
| 72 |
+
# Arabic Fine-tuned
|
| 73 |
+
{"model": "Arabic-Llama-3.1-8B", "model_id": "Ammar-Arabi/Arabic-Llama-3.1-8B-Instruct", "organization": "Ammar Arabi"},
|
| 74 |
+
{"model": "Llama3-8B-Arabic-Instruct", "model_id": "MahmoudAshraf/Llama3-8B-Arabic-instruct", "organization": "Mahmoud Ashraf"},
|
| 75 |
]
|
| 76 |
|
| 77 |
# Global state
|