HeshamHaroon committed on
Commit
d9d7dd0
·
verified ·
1 Parent(s): 9a35f45

Update: Auto-evaluation on Space startup

Browse files
Files changed (1) hide show
  1. afcl/app.py +326 -125
afcl/app.py CHANGED
@@ -2,8 +2,7 @@
2
  Arabic Function Calling Leaderboard (AFCL)
3
  ==========================================
4
 
5
- A Gradio-based leaderboard that evaluates LLMs on Arabic function calling.
6
- Evaluation runs on HuggingFace Space infrastructure.
7
  """
8
 
9
  import gradio as gr
@@ -13,70 +12,170 @@ import os
13
  import re
14
  import time
15
  import requests
16
- from pathlib import Path
17
  from typing import Dict, List, Optional
18
  from threading import Thread
19
  from datasets import load_dataset
20
- import huggingface_hub
21
 
22
  # Constants
23
- TITLE = "๐Ÿ† Arabic Function Calling Leaderboard"
24
- TITLE_AR = "๐Ÿ† ู„ูˆุญุฉ ุชู‚ูŠูŠู… ุงุณุชุฏุนุงุก ุงู„ุฏูˆุงู„ ุจุงู„ุนุฑุจูŠุฉ"
25
-
26
- DESCRIPTION = """
27
- The **Arabic Function Calling Leaderboard (AFCL)** evaluates Large Language Models on their ability to understand Arabic queries and generate appropriate function calls.
28
-
29
- **ู„ูˆุญุฉ ุชู‚ูŠูŠู… ุงุณุชุฏุนุงุก ุงู„ุฏูˆุงู„ ุจุงู„ุนุฑุจูŠุฉ** ุชู‚ูŠู‘ู… ู†ู…ุงุฐุฌ ุงู„ู„ุบุฉ ุงู„ูƒุจูŠุฑุฉ ุนู„ู‰ ู‚ุฏุฑุชู‡ุง ุนู„ู‰ ูู‡ู… ุงู„ุงุณุชุนู„ุงู…ุงุช ุงู„ุนุฑุจูŠุฉ ูˆุฅู†ุดุงุก ุงุณุชุฏุนุงุกุงุช ุงู„ุฏูˆุงู„ ุงู„ู…ู†ุงุณุจุฉ.
30
- """
31
 
32
  # All 28 Models to evaluate
33
  MODELS_TO_EVALUATE = [
34
  # Arabic-Native LLMs
35
- {"model": "Jais-30B-Chat", "model_id": "inceptionai/jais-30b-chat-v3", "organization": "Inception AI"},
36
- {"model": "ALLaM-7B-Instruct", "model_id": "sdaia/allam-1-7b-instruct", "organization": "SDAIA"},
37
- {"model": "SILMA-9B-Instruct", "model_id": "silma-ai/SILMA-9B-Instruct-v1.0", "organization": "Silma AI"},
38
- {"model": "Fanar-Star-1.2B", "model_id": "QatarComputing/fanar-star-1.2b", "organization": "QCRI"},
39
- {"model": "AceGPT-13B-Chat", "model_id": "FreedomIntelligence/AceGPT-13B-chat", "organization": "FreedomIntelligence"},
40
- {"model": "AraGPT2-Mega", "model_id": "aubmindlab/aragpt2-mega", "organization": "AUB MIND Lab"},
41
 
42
  # Multilingual with strong Arabic
43
- {"model": "Qwen2.5-72B-Instruct", "model_id": "Qwen/Qwen2.5-72B-Instruct", "organization": "Alibaba Qwen"},
44
- {"model": "Qwen2.5-32B-Instruct", "model_id": "Qwen/Qwen2.5-32B-Instruct", "organization": "Alibaba Qwen"},
45
- {"model": "Qwen2.5-7B-Instruct", "model_id": "Qwen/Qwen2.5-7B-Instruct", "organization": "Alibaba Qwen"},
46
- {"model": "Llama-3.1-70B-Instruct", "model_id": "meta-llama/Llama-3.1-70B-Instruct", "organization": "Meta"},
47
- {"model": "Llama-3.1-8B-Instruct", "model_id": "meta-llama/Llama-3.1-8B-Instruct", "organization": "Meta"},
48
- {"model": "Gemma-2-27B-IT", "model_id": "google/gemma-2-27b-it", "organization": "Google"},
49
- {"model": "Gemma-2-9B-IT", "model_id": "google/gemma-2-9b-it", "organization": "Google"},
50
 
51
  # Cohere Arabic Models
52
- {"model": "Aya-Expanse-32B", "model_id": "CohereForAI/aya-expanse-32b", "organization": "Cohere For AI"},
53
- {"model": "Aya-Expanse-8B", "model_id": "CohereForAI/aya-expanse-8b", "organization": "Cohere For AI"},
54
- {"model": "c4ai-command-r7b-arabic", "model_id": "CohereForAI/c4ai-command-r7b-arabic-02-2025", "organization": "Cohere For AI"},
55
 
56
  # Falcon (UAE)
57
- {"model": "Falcon-180B-Chat", "model_id": "tiiuae/falcon-180B-chat", "organization": "TII UAE"},
58
- {"model": "Falcon-40B-Instruct", "model_id": "tiiuae/falcon-40b-instruct", "organization": "TII UAE"},
59
 
60
  # Mistral
61
- {"model": "Mistral-Large-Instruct", "model_id": "mistralai/Mistral-Large-Instruct-2411", "organization": "Mistral AI"},
62
- {"model": "Mixtral-8x22B-Instruct", "model_id": "mistralai/Mixtral-8x22B-Instruct-v0.1", "organization": "Mistral AI"},
63
- {"model": "Mistral-7B-Instruct", "model_id": "mistralai/Mistral-7B-Instruct-v0.3", "organization": "Mistral AI"},
64
 
65
  # Others
66
- {"model": "DeepSeek-V3", "model_id": "deepseek-ai/DeepSeek-V3", "organization": "DeepSeek"},
67
- {"model": "Phi-4", "model_id": "microsoft/phi-4", "organization": "Microsoft"},
68
- {"model": "Phi-3-Mini-Instruct", "model_id": "microsoft/Phi-3-mini-4k-instruct", "organization": "Microsoft"},
69
- {"model": "BLOOM-176B", "model_id": "bigscience/bloom", "organization": "BigScience"},
70
- {"model": "BLOOMZ-7B1", "model_id": "bigscience/bloomz-7b1", "organization": "BigScience"},
71
 
72
  # Arabic Fine-tuned
73
- {"model": "Arabic-Llama-3.1-8B", "model_id": "Ammar-Arabi/Arabic-Llama-3.1-8B-Instruct", "organization": "Ammar Arabi"},
74
- {"model": "Llama3-8B-Arabic-Instruct", "model_id": "MahmoudAshraf/Llama3-8B-Arabic-instruct", "organization": "Mahmoud Ashraf"},
75
  ]
76
 
77
  # Global state
78
  LEADERBOARD_DATA = []
79
- EVALUATION_STATUS = "Not started"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
80
 
81
 
82
  def load_evaluation_dataset():
@@ -101,21 +200,18 @@ def load_evaluation_dataset():
101
 
102
  def create_prompt(query: str, functions: List[Dict]) -> str:
103
  """Create evaluation prompt."""
104
- func_desc = "You are a function calling AI. Given the user query and available functions, respond with a JSON function call.\n\nAvailable functions:\n"
105
  for f in functions:
106
  func_desc += f"- {f.get('name')}: {f.get('description', '')}\n"
107
 
108
  return f"""{func_desc}
109
 
110
- User Query (Arabic): {query}
111
-
112
- Respond ONLY with a JSON object:
113
- {{"name": "function_name", "arguments": {{"param1": "value1"}}}}
114
 
115
- If no function should be called:
116
- {{"name": null, "arguments": {{}}}}
117
 
118
- JSON Response:"""
119
 
120
 
121
  def call_model(model_id: str, prompt: str) -> str:
@@ -124,17 +220,13 @@ def call_model(model_id: str, prompt: str) -> str:
124
  headers = {"Authorization": f"Bearer {token}"}
125
  url = f"https://api-inference.huggingface.co/models/{model_id}"
126
 
127
- payload = {
128
- "inputs": prompt,
129
- "parameters": {"max_new_tokens": 200, "temperature": 0.1}
130
- }
131
 
132
  try:
133
  response = requests.post(url, headers=headers, json=payload, timeout=60)
134
  if response.status_code == 503:
135
  time.sleep(20)
136
  response = requests.post(url, headers=headers, json=payload, timeout=60)
137
-
138
  result = response.json()
139
  if isinstance(result, list) and result:
140
  return result[0].get("generated_text", "")
@@ -197,11 +289,11 @@ def run_evaluation():
197
  """Run full evaluation on all models."""
198
  global LEADERBOARD_DATA, EVALUATION_STATUS
199
 
200
- EVALUATION_STATUS = "Loading dataset..."
201
  samples = load_evaluation_dataset()
202
 
203
  if not samples:
204
- EVALUATION_STATUS = "Failed to load dataset"
205
  return
206
 
207
  results = []
@@ -211,7 +303,8 @@ def run_evaluation():
211
  model_name = model_config['model']
212
  model_id = model_config['model_id']
213
 
214
- EVALUATION_STATUS = f"Evaluating {model_name} ({idx+1}/{total_models})..."
 
215
 
216
  category_scores = {}
217
  category_counts = {}
@@ -228,7 +321,7 @@ def run_evaluation():
228
  except:
229
  pass
230
  category_counts[cat] += 1
231
- time.sleep(0.5) # Rate limiting
232
 
233
  # Calculate scores
234
  scores = {cat: round((category_scores[cat] / category_counts[cat]) * 100, 1)
@@ -243,6 +336,8 @@ def run_evaluation():
243
  "model": model_name,
244
  "model_id": model_id,
245
  "organization": model_config['organization'],
 
 
246
  "overall": round(overall, 1),
247
  "simple": scores.get('simple', 0),
248
  "multiple": scores.get('multiple', 0),
@@ -253,108 +348,214 @@ def run_evaluation():
253
  "status": "completed"
254
  })
255
 
256
- # Sort and rank
257
- results = sorted(results, key=lambda x: x['overall'], reverse=True)
258
- for i, r in enumerate(results, 1):
259
- r['rank'] = i
 
260
 
261
- LEADERBOARD_DATA = results
262
- EVALUATION_STATUS = f"Completed - {len(results)} models evaluated"
263
 
264
 
265
  def get_leaderboard_df():
266
  """Get leaderboard as DataFrame."""
267
  if not LEADERBOARD_DATA:
268
- # Return empty with pending status
269
- data = [{"rank": i+1, "model": m["model"], "organization": m["organization"],
270
- "overall": "-", "status": "โณ Pending"}
271
- for i, m in enumerate(MODELS_TO_EVALUATE)]
 
 
 
 
 
 
 
 
 
 
 
272
  return pd.DataFrame(data)
273
 
274
- df = pd.DataFrame(LEADERBOARD_DATA)
275
- cols = ["rank", "model", "organization", "overall", "simple", "multiple",
276
- "parallel", "parallel_multiple", "irrelevance", "dialect_handling"]
277
- df = df[[c for c in cols if c in df.columns]]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
278
 
279
- # Format percentages
280
- for col in df.columns:
281
- if df[col].dtype in ['float64', 'float32', 'int64']:
282
- if col != 'rank':
283
- df[col] = df[col].apply(lambda x: f"{x:.1f}%")
 
284
 
285
- return df
 
 
 
 
 
 
 
 
 
 
286
 
287
 
288
  def create_app():
289
  """Create the Gradio app."""
290
- with gr.Blocks(title="Arabic FC Leaderboard", theme=gr.themes.Soft()) as app:
291
 
292
- gr.Markdown(f"""
293
- <div style="text-align: center; padding: 20px; background: linear-gradient(135deg, #1a5f2a 0%, #2d8f4a 100%); border-radius: 12px; color: white; margin-bottom: 20px;">
294
- <h1>{TITLE_AR}</h1>
295
- <h2>{TITLE}</h2>
296
- <p>Evaluating LLMs on Arabic Function Calling | ุชู‚ูŠูŠู… ู†ู…ุงุฐุฌ ุงู„ู„ุบุฉ ุนู„ู‰ ุงุณุชุฏุนุงุก ุงู„ุฏูˆุงู„ ุจุงู„ุนุฑุจูŠุฉ</p>
 
 
 
 
 
 
 
 
 
297
  </div>
298
  """)
299
 
300
- gr.Markdown(DESCRIPTION)
301
-
302
  with gr.Row():
303
- gr.Markdown(f"""
304
- <div style="text-align: center; padding: 15px; background: #f5f5f5; border-radius: 8px;">
305
- <div style="font-size: 2rem; font-weight: bold; color: #1a5f2a;">{len(MODELS_TO_EVALUATE)}</div>
306
- <div>Models | ุงู„ู†ู…ุงุฐุฌ</div>
 
 
 
 
 
 
307
  </div>
308
  """)
309
- gr.Markdown("""
310
- <div style="text-align: center; padding: 15px; background: #f5f5f5; border-radius: 8px;">
311
- <div style="font-size: 2rem; font-weight: bold; color: #1a5f2a;">147</div>
312
- <div>Test Samples | ุนูŠู†ุงุช</div>
313
  </div>
314
  """)
315
- gr.Markdown("""
316
- <div style="text-align: center; padding: 15px; background: #f5f5f5; border-radius: 8px;">
317
- <div style="font-size: 2rem; font-weight: bold; color: #1a5f2a;">10</div>
318
- <div>Categories | ุงู„ูุฆุงุช</div>
319
  </div>
320
  """)
321
 
322
- status_text = gr.Markdown(f"**Status:** {EVALUATION_STATUS}")
 
323
 
 
324
  with gr.Tabs():
325
  with gr.TabItem("๐Ÿ† Leaderboard"):
326
- leaderboard_df = gr.DataFrame(
327
  value=get_leaderboard_df(),
328
- interactive=False
 
329
  )
330
 
331
- def refresh_leaderboard():
332
- return get_leaderboard_df(), f"**Status:** {EVALUATION_STATUS}"
333
-
334
- refresh_btn = gr.Button("๐Ÿ”„ Refresh | ุชุญุฏูŠุซ")
335
- refresh_btn.click(refresh_leaderboard, outputs=[leaderboard_df, status_text])
336
-
337
- with gr.TabItem("๐Ÿ“Š About"):
338
- gr.Markdown("""
339
- ## Evaluation Categories
340
-
341
- | Category | Samples | Description |
342
- |----------|---------|-------------|
343
- | Simple | ~20 | Single function call |
344
- | Multiple | ~20 | Select from multiple functions |
345
- | Parallel | ~20 | Multiple calls |
346
- | Parallel Multiple | ~20 | Complex multi-call |
347
- | Irrelevance | ~20 | Should not call |
348
- | Dialect | ~15 | Egyptian/Gulf/Levantine |
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
349
 
350
- ## Dataset
351
- ๐Ÿ“Š [HeshamHaroon/Arabic_Function_Calling](https://huggingface.co/datasets/HeshamHaroon/Arabic_Function_Calling)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
352
  """)
353
 
354
- gr.Markdown("""
355
- ---
356
- <div style="text-align: center; color: #666;">
357
- Built for the Arabic NLP community | ุจูู†ูŠ ู„ู…ุฌุชู…ุน ู…ุนุงู„ุฌุฉ ุงู„ู„ุบุฉ ุงู„ุนุฑุจูŠุฉ
 
 
358
  </div>
359
  """)
360
 
 
2
  Arabic Function Calling Leaderboard (AFCL)
3
  ==========================================
4
 
5
+ Professional leaderboard for evaluating LLMs on Arabic function calling.
 
6
  """
7
 
8
  import gradio as gr
 
12
  import re
13
  import time
14
  import requests
 
15
  from typing import Dict, List, Optional
16
  from threading import Thread
17
  from datasets import load_dataset
 
18
 
19
  # Constants
20
+ TITLE = "Arabic Function Calling Leaderboard"
21
+ TITLE_AR = "ู„ูˆุญุฉ ุชู‚ูŠูŠู… ุงุณุชุฏุนุงุก ุงู„ุฏูˆุงู„ ุจุงู„ุนุฑุจูŠุฉ"
 
 
 
 
 
 
22
 
23
  # All 28 Models to evaluate
24
  MODELS_TO_EVALUATE = [
25
  # Arabic-Native LLMs
26
+ {"model": "Jais-30B-Chat", "model_id": "inceptionai/jais-30b-chat-v3", "organization": "Inception AI", "params": "30B", "type": "Arabic-Native"},
27
+ {"model": "ALLaM-7B-Instruct", "model_id": "sdaia/allam-1-7b-instruct", "organization": "SDAIA", "params": "7B", "type": "Arabic-Native"},
28
+ {"model": "SILMA-9B-Instruct", "model_id": "silma-ai/SILMA-9B-Instruct-v1.0", "organization": "Silma AI", "params": "9B", "type": "Arabic-Native"},
29
+ {"model": "Fanar-Star-1.2B", "model_id": "QatarComputing/fanar-star-1.2b", "organization": "QCRI", "params": "1.2B", "type": "Arabic-Native"},
30
+ {"model": "AceGPT-13B-Chat", "model_id": "FreedomIntelligence/AceGPT-13B-chat", "organization": "FreedomIntelligence", "params": "13B", "type": "Arabic-Native"},
31
+ {"model": "AraGPT2-Mega", "model_id": "aubmindlab/aragpt2-mega", "organization": "AUB MIND Lab", "params": "1.5B", "type": "Arabic-Native"},
32
 
33
  # Multilingual with strong Arabic
34
+ {"model": "Qwen2.5-72B-Instruct", "model_id": "Qwen/Qwen2.5-72B-Instruct", "organization": "Alibaba", "params": "72B", "type": "Multilingual"},
35
+ {"model": "Qwen2.5-32B-Instruct", "model_id": "Qwen/Qwen2.5-32B-Instruct", "organization": "Alibaba", "params": "32B", "type": "Multilingual"},
36
+ {"model": "Qwen2.5-7B-Instruct", "model_id": "Qwen/Qwen2.5-7B-Instruct", "organization": "Alibaba", "params": "7B", "type": "Multilingual"},
37
+ {"model": "Llama-3.1-70B-Instruct", "model_id": "meta-llama/Llama-3.1-70B-Instruct", "organization": "Meta", "params": "70B", "type": "Multilingual"},
38
+ {"model": "Llama-3.1-8B-Instruct", "model_id": "meta-llama/Llama-3.1-8B-Instruct", "organization": "Meta", "params": "8B", "type": "Multilingual"},
39
+ {"model": "Gemma-2-27B-IT", "model_id": "google/gemma-2-27b-it", "organization": "Google", "params": "27B", "type": "Multilingual"},
40
+ {"model": "Gemma-2-9B-IT", "model_id": "google/gemma-2-9b-it", "organization": "Google", "params": "9B", "type": "Multilingual"},
41
 
42
  # Cohere Arabic Models
43
+ {"model": "Aya-Expanse-32B", "model_id": "CohereForAI/aya-expanse-32b", "organization": "Cohere", "params": "32B", "type": "Multilingual"},
44
+ {"model": "Aya-Expanse-8B", "model_id": "CohereForAI/aya-expanse-8b", "organization": "Cohere", "params": "8B", "type": "Multilingual"},
45
+ {"model": "Command-R7B-Arabic", "model_id": "CohereForAI/c4ai-command-r7b-arabic-02-2025", "organization": "Cohere", "params": "7B", "type": "Arabic-Tuned"},
46
 
47
  # Falcon (UAE)
48
+ {"model": "Falcon-180B-Chat", "model_id": "tiiuae/falcon-180B-chat", "organization": "TII UAE", "params": "180B", "type": "Multilingual"},
49
+ {"model": "Falcon-40B-Instruct", "model_id": "tiiuae/falcon-40b-instruct", "organization": "TII UAE", "params": "40B", "type": "Multilingual"},
50
 
51
  # Mistral
52
+ {"model": "Mistral-Large", "model_id": "mistralai/Mistral-Large-Instruct-2411", "organization": "Mistral AI", "params": "123B", "type": "Multilingual"},
53
+ {"model": "Mixtral-8x22B", "model_id": "mistralai/Mixtral-8x22B-Instruct-v0.1", "organization": "Mistral AI", "params": "141B", "type": "Multilingual"},
54
+ {"model": "Mistral-7B-Instruct", "model_id": "mistralai/Mistral-7B-Instruct-v0.3", "organization": "Mistral AI", "params": "7B", "type": "Multilingual"},
55
 
56
  # Others
57
+ {"model": "DeepSeek-V3", "model_id": "deepseek-ai/DeepSeek-V3", "organization": "DeepSeek", "params": "671B", "type": "Multilingual"},
58
+ {"model": "Phi-4", "model_id": "microsoft/phi-4", "organization": "Microsoft", "params": "14B", "type": "Multilingual"},
59
+ {"model": "Phi-3-Mini", "model_id": "microsoft/Phi-3-mini-4k-instruct", "organization": "Microsoft", "params": "3.8B", "type": "Multilingual"},
60
+ {"model": "BLOOM-176B", "model_id": "bigscience/bloom", "organization": "BigScience", "params": "176B", "type": "Multilingual"},
61
+ {"model": "BLOOMZ-7B1", "model_id": "bigscience/bloomz-7b1", "organization": "BigScience", "params": "7B", "type": "Multilingual"},
62
 
63
  # Arabic Fine-tuned
64
+ {"model": "Arabic-Llama-3.1-8B", "model_id": "Ammar-Arabi/Arabic-Llama-3.1-8B-Instruct", "organization": "Community", "params": "8B", "type": "Arabic-Tuned"},
65
+ {"model": "Llama3-8B-Arabic", "model_id": "MahmoudAshraf/Llama3-8B-Arabic-instruct", "organization": "Community", "params": "8B", "type": "Arabic-Tuned"},
66
  ]
67
 
68
  # Global state
69
  LEADERBOARD_DATA = []
70
+ EVALUATION_STATUS = {"current": "Initializing...", "progress": 0, "total": len(MODELS_TO_EVALUATE)}
71
+
72
+ # Custom CSS for professional look
73
+ CUSTOM_CSS = """
74
+ /* Professional Dark Theme */
75
+ .gradio-container {
76
+ background: linear-gradient(135deg, #0f0f1a 0%, #1a1a2e 100%) !important;
77
+ font-family: 'Inter', -apple-system, BlinkMacSystemFont, sans-serif !important;
78
+ }
79
+
80
+ /* Header styling */
81
+ .header-container {
82
+ background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
83
+ border-radius: 16px;
84
+ padding: 32px;
85
+ margin-bottom: 24px;
86
+ box-shadow: 0 20px 40px rgba(102, 126, 234, 0.3);
87
+ }
88
+
89
+ /* Stats cards */
90
+ .stat-card {
91
+ background: rgba(255,255,255,0.05);
92
+ backdrop-filter: blur(10px);
93
+ border: 1px solid rgba(255,255,255,0.1);
94
+ border-radius: 12px;
95
+ padding: 24px;
96
+ text-align: center;
97
+ transition: transform 0.3s ease;
98
+ }
99
+
100
+ .stat-card:hover {
101
+ transform: translateY(-4px);
102
+ }
103
+
104
+ .stat-value {
105
+ font-size: 2.5rem;
106
+ font-weight: 700;
107
+ background: linear-gradient(135deg, #667eea, #764ba2);
108
+ -webkit-background-clip: text;
109
+ -webkit-text-fill-color: transparent;
110
+ }
111
+
112
+ .stat-label {
113
+ color: #a0a0a0;
114
+ font-size: 0.9rem;
115
+ margin-top: 8px;
116
+ }
117
+
118
+ /* Table styling */
119
+ .leaderboard-table {
120
+ background: rgba(255,255,255,0.02) !important;
121
+ border-radius: 12px !important;
122
+ border: 1px solid rgba(255,255,255,0.1) !important;
123
+ }
124
+
125
+ /* Rank badges */
126
+ .rank-1 { color: #ffd700 !important; font-weight: bold; }
127
+ .rank-2 { color: #c0c0c0 !important; font-weight: bold; }
128
+ .rank-3 { color: #cd7f32 !important; font-weight: bold; }
129
+
130
+ /* Progress bar */
131
+ .progress-container {
132
+ background: rgba(255,255,255,0.1);
133
+ border-radius: 8px;
134
+ padding: 16px;
135
+ margin: 16px 0;
136
+ }
137
+
138
+ .progress-bar {
139
+ height: 8px;
140
+ background: linear-gradient(90deg, #667eea, #764ba2);
141
+ border-radius: 4px;
142
+ transition: width 0.5s ease;
143
+ }
144
+
145
+ /* Tabs */
146
+ .tabs {
147
+ border: none !important;
148
+ }
149
+
150
+ .tab-nav {
151
+ background: transparent !important;
152
+ border-bottom: 2px solid rgba(255,255,255,0.1) !important;
153
+ }
154
+
155
+ .tab-nav button {
156
+ color: #a0a0a0 !important;
157
+ font-weight: 500 !important;
158
+ padding: 12px 24px !important;
159
+ }
160
+
161
+ .tab-nav button.selected {
162
+ color: #667eea !important;
163
+ border-bottom: 2px solid #667eea !important;
164
+ }
165
+
166
+ /* Category pills */
167
+ .category-pill {
168
+ display: inline-block;
169
+ padding: 4px 12px;
170
+ border-radius: 20px;
171
+ font-size: 0.75rem;
172
+ font-weight: 500;
173
+ }
174
+
175
+ .cat-arabic { background: #22c55e20; color: #22c55e; }
176
+ .cat-multilingual { background: #3b82f620; color: #3b82f6; }
177
+ .cat-tuned { background: #f59e0b20; color: #f59e0b; }
178
+ """
179
 
180
 
181
  def load_evaluation_dataset():
 
200
 
def create_prompt(query: str, functions: List[Dict]) -> str:
    """Build the evaluation prompt sent to a model.

    Args:
        query: The user query to embed in the prompt.
        functions: Function specs as dicts; only the ``name`` and
            ``description`` keys are read. ``None`` or an empty list
            produces a prompt with an empty function catalogue.

    Returns:
        The prompt text: an instruction header, one ``- name: description``
        line per function, the query, the expected JSON response formats,
        and a trailing ``JSON:`` cue.
    """
    # Build the catalogue in one pass instead of repeated `+=` on a string;
    # `or []` lets a sample without functions through instead of raising.
    catalog = "".join(
        f"- {fn.get('name')}: {fn.get('description', '')}\n"
        for fn in functions or []
    )
    func_desc = (
        "You are a function calling AI. Respond with JSON only.\n\n"
        "Functions:\n" + catalog
    )

    # Doubled braces are literal braces in the f-string (JSON examples).
    return f"""{func_desc}

Query: {query}

Response format: {{"name": "function_name", "arguments": {{"key": "value"}}}}
If no function applies: {{"name": null, "arguments": {{}}}}

JSON:"""
215
 
216
 
217
  def call_model(model_id: str, prompt: str) -> str:
 
220
  headers = {"Authorization": f"Bearer {token}"}
221
  url = f"https://api-inference.huggingface.co/models/{model_id}"
222
 
223
+ payload = {"inputs": prompt, "parameters": {"max_new_tokens": 200, "temperature": 0.1}}
 
 
 
224
 
225
  try:
226
  response = requests.post(url, headers=headers, json=payload, timeout=60)
227
  if response.status_code == 503:
228
  time.sleep(20)
229
  response = requests.post(url, headers=headers, json=payload, timeout=60)
 
230
  result = response.json()
231
  if isinstance(result, list) and result:
232
  return result[0].get("generated_text", "")
 
289
  """Run full evaluation on all models."""
290
  global LEADERBOARD_DATA, EVALUATION_STATUS
291
 
292
+ EVALUATION_STATUS["current"] = "Loading dataset..."
293
  samples = load_evaluation_dataset()
294
 
295
  if not samples:
296
+ EVALUATION_STATUS["current"] = "Failed to load dataset"
297
  return
298
 
299
  results = []
 
303
  model_name = model_config['model']
304
  model_id = model_config['model_id']
305
 
306
+ EVALUATION_STATUS["current"] = f"Evaluating {model_name}..."
307
+ EVALUATION_STATUS["progress"] = idx + 1
308
 
309
  category_scores = {}
310
  category_counts = {}
 
321
  except:
322
  pass
323
  category_counts[cat] += 1
324
+ time.sleep(0.5)
325
 
326
  # Calculate scores
327
  scores = {cat: round((category_scores[cat] / category_counts[cat]) * 100, 1)
 
336
  "model": model_name,
337
  "model_id": model_id,
338
  "organization": model_config['organization'],
339
+ "params": model_config['params'],
340
+ "type": model_config['type'],
341
  "overall": round(overall, 1),
342
  "simple": scores.get('simple', 0),
343
  "multiple": scores.get('multiple', 0),
 
348
  "status": "completed"
349
  })
350
 
351
+ # Update global data after each model
352
+ temp_results = sorted(results, key=lambda x: x['overall'], reverse=True)
353
+ for i, r in enumerate(temp_results, 1):
354
+ r['rank'] = i
355
+ LEADERBOARD_DATA = temp_results
356
 
357
+ EVALUATION_STATUS["current"] = "Evaluation Complete"
358
+ EVALUATION_STATUS["progress"] = total_models
359
 
360
 
def get_leaderboard_df():
    """Return the leaderboard as a pandas DataFrame for display.

    While ``LEADERBOARD_DATA`` is empty, every entry of
    ``MODELS_TO_EVALUATE`` is listed in configuration order with a dash
    placeholder in each score column. Once results exist, one row is
    produced per ``LEADERBOARD_DATA`` entry, with scores rendered as
    percent strings and the top three ranks prefixed with a medal.

    Returns:
        A DataFrame with the columns Rank, Model, Org, Size, Type, Overall,
        Simple, Multiple, Parallel, Irrelevance and Dialect.
    """
    placeholder = "—"
    score_columns = (
        "Overall", "Simple", "Multiple", "Parallel", "Irrelevance", "Dialect",
    )

    if not LEADERBOARD_DATA:
        # Nothing evaluated yet: show the configured models as pending rows.
        pending_rows = [
            {
                "Rank": position,
                "Model": cfg["model"],
                "Org": cfg["organization"],
                "Size": cfg["params"],
                "Type": cfg["type"],
                **{column: placeholder for column in score_columns},
            }
            for position, cfg in enumerate(MODELS_TO_EVALUATE, 1)
        ]
        return pd.DataFrame(pending_rows)

    # Lookup table replaces the nested conditional chain for ranks 1-3.
    medals = {1: "🥇", 2: "🥈", 3: "🥉"}

    rows = []
    for entry in LEADERBOARD_DATA:
        rank = entry["rank"]
        rows.append({
            # Ranks outside the top three stay plain numbers, as before.
            "Rank": f"{medals[rank]} {rank}" if rank in medals else rank,
            "Model": entry["model"],
            "Org": entry["organization"],
            "Size": entry["params"],
            "Type": entry["type"],
            "Overall": f"{entry['overall']}%",
            "Simple": f"{entry['simple']}%",
            "Multiple": f"{entry['multiple']}%",
            "Parallel": f"{entry['parallel']}%",
            "Irrelevance": f"{entry['irrelevance']}%",
            "Dialect": f"{entry['dialect_handling']}%",
        })

    return pd.DataFrame(rows)
398
+
399
 
def get_status_html():
    """Render the current evaluation status as an HTML progress panel.

    Reads the ``current``, ``progress`` and ``total`` keys of the
    module-level ``EVALUATION_STATUS`` dict.

    Returns:
        An HTML string showing the status message, a ``progress/total``
        model counter, and a bar whose width is the completed percentage.
    """
    progress = EVALUATION_STATUS["progress"]
    total = EVALUATION_STATUS["total"]
    current = EVALUATION_STATUS["current"]

    # Guard against a zero total, and keep the bar width within 0-100%
    # so an out-of-range counter cannot overflow the track.
    pct = (progress / total) * 100 if total > 0 else 0
    pct = max(0.0, min(100.0, pct))

    return f"""
    <div style="background: rgba(102,126,234,0.1); border: 1px solid rgba(102,126,234,0.3); border-radius: 12px; padding: 20px; margin: 16px 0;">
        <div style="display: flex; justify-content: space-between; align-items: center; margin-bottom: 12px;">
            <span style="color: #667eea; font-weight: 600;">📊 {current}</span>
            <span style="color: #a0a0a0;">{progress}/{total} models</span>
        </div>
        <div style="background: rgba(255,255,255,0.1); border-radius: 8px; height: 8px; overflow: hidden;">
            <div style="background: linear-gradient(90deg, #667eea, #764ba2); height: 100%; width: {pct:.1f}%; transition: width 0.5s ease;"></div>
        </div>
    </div>
    """
418
 
419
 
420
  def create_app():
421
  """Create the Gradio app."""
 
422
 
423
+ with gr.Blocks(title="AFCL - Arabic Function Calling Leaderboard", css=CUSTOM_CSS, theme=gr.themes.Base()) as app:
424
+
425
+ # Header
426
+ gr.HTML("""
427
+ <div style="background: linear-gradient(135deg, #667eea 0%, #764ba2 100%); border-radius: 16px; padding: 40px; margin-bottom: 24px; text-align: center;">
428
+ <h1 style="color: white; font-size: 2.5rem; margin: 0; font-weight: 700;">
429
+ ๐Ÿ† Arabic Function Calling Leaderboard
430
+ </h1>
431
+ <p style="color: rgba(255,255,255,0.9); font-size: 1.1rem; margin-top: 8px;">
432
+ ู„ูˆุญุฉ ุชู‚ูŠูŠู… ุงุณุชุฏุนุงุก ุงู„ุฏูˆุงู„ ุจุงู„ุนุฑุจูŠุฉ
433
+ </p>
434
+ <p style="color: rgba(255,255,255,0.7); font-size: 0.95rem; margin-top: 16px; max-width: 600px; margin-left: auto; margin-right: auto;">
435
+ Comprehensive benchmark evaluating LLMs on Arabic function calling across 10 categories including dialects
436
+ </p>
437
  </div>
438
  """)
439
 
440
+ # Stats Row
 
441
  with gr.Row():
442
+ gr.HTML(f"""
443
+ <div style="background: rgba(255,255,255,0.03); border: 1px solid rgba(255,255,255,0.1); border-radius: 12px; padding: 24px; text-align: center; flex: 1;">
444
+ <div style="font-size: 2.5rem; font-weight: 700; background: linear-gradient(135deg, #667eea, #764ba2); -webkit-background-clip: text; -webkit-text-fill-color: transparent;">{len(MODELS_TO_EVALUATE)}</div>
445
+ <div style="color: #a0a0a0; font-size: 0.9rem; margin-top: 8px;">Models</div>
446
+ </div>
447
+ """)
448
+ gr.HTML("""
449
+ <div style="background: rgba(255,255,255,0.03); border: 1px solid rgba(255,255,255,0.1); border-radius: 12px; padding: 24px; text-align: center; flex: 1;">
450
+ <div style="font-size: 2.5rem; font-weight: 700; background: linear-gradient(135deg, #22c55e, #16a34a); -webkit-background-clip: text; -webkit-text-fill-color: transparent;">147</div>
451
+ <div style="color: #a0a0a0; font-size: 0.9rem; margin-top: 8px;">Test Samples</div>
452
  </div>
453
  """)
454
+ gr.HTML("""
455
+ <div style="background: rgba(255,255,255,0.03); border: 1px solid rgba(255,255,255,0.1); border-radius: 12px; padding: 24px; text-align: center; flex: 1;">
456
+ <div style="font-size: 2.5rem; font-weight: 700; background: linear-gradient(135deg, #f59e0b, #d97706); -webkit-background-clip: text; -webkit-text-fill-color: transparent;">10</div>
457
+ <div style="color: #a0a0a0; font-size: 0.9rem; margin-top: 8px;">Categories</div>
458
  </div>
459
  """)
460
+ gr.HTML("""
461
+ <div style="background: rgba(255,255,255,0.03); border: 1px solid rgba(255,255,255,0.1); border-radius: 12px; padding: 24px; text-align: center; flex: 1;">
462
+ <div style="font-size: 2.5rem; font-weight: 700; background: linear-gradient(135deg, #ec4899, #be185d); -webkit-background-clip: text; -webkit-text-fill-color: transparent;">3</div>
463
+ <div style="color: #a0a0a0; font-size: 0.9rem; margin-top: 8px;">Dialects</div>
464
  </div>
465
  """)
466
 
467
+ # Status
468
+ status_html = gr.HTML(get_status_html())
469
 
470
+ # Tabs
471
  with gr.Tabs():
472
  with gr.TabItem("๐Ÿ† Leaderboard"):
473
+ leaderboard_table = gr.DataFrame(
474
  value=get_leaderboard_df(),
475
+ interactive=False,
476
+ wrap=True,
477
  )
478
 
479
+ with gr.Row():
480
+ refresh_btn = gr.Button("๐Ÿ”„ Refresh Results", variant="primary", size="lg")
481
+
482
+ def refresh():
483
+ return get_leaderboard_df(), get_status_html()
484
+
485
+ refresh_btn.click(refresh, outputs=[leaderboard_table, status_html])
486
+
487
+ with gr.TabItem("๐Ÿ“Š Categories"):
488
+ gr.HTML("""
489
+ <div style="padding: 24px;">
490
+ <h3 style="color: #667eea; margin-bottom: 24px;">Evaluation Categories</h3>
491
+ <div style="display: grid; grid-template-columns: repeat(auto-fit, minmax(300px, 1fr)); gap: 16px;">
492
+ <div style="background: rgba(255,255,255,0.03); border: 1px solid rgba(255,255,255,0.1); border-radius: 12px; padding: 20px;">
493
+ <h4 style="color: #22c55e; margin: 0;">Simple</h4>
494
+ <p style="color: #a0a0a0; margin: 8px 0 0 0; font-size: 0.9rem;">Single function, single call scenarios</p>
495
+ </div>
496
+ <div style="background: rgba(255,255,255,0.03); border: 1px solid rgba(255,255,255,0.1); border-radius: 12px; padding: 20px;">
497
+ <h4 style="color: #3b82f6; margin: 0;">Multiple</h4>
498
+ <p style="color: #a0a0a0; margin: 8px 0 0 0; font-size: 0.9rem;">Select correct function from 2-4 options</p>
499
+ </div>
500
+ <div style="background: rgba(255,255,255,0.03); border: 1px solid rgba(255,255,255,0.1); border-radius: 12px; padding: 20px;">
501
+ <h4 style="color: #f59e0b; margin: 0;">Parallel</h4>
502
+ <p style="color: #a0a0a0; margin: 8px 0 0 0; font-size: 0.9rem;">Multiple calls of same function</p>
503
+ </div>
504
+ <div style="background: rgba(255,255,255,0.03); border: 1px solid rgba(255,255,255,0.1); border-radius: 12px; padding: 20px;">
505
+ <h4 style="color: #ec4899; margin: 0;">Parallel Multiple</h4>
506
+ <p style="color: #a0a0a0; margin: 8px 0 0 0; font-size: 0.9rem;">Multiple functions, multiple calls</p>
507
+ </div>
508
+ <div style="background: rgba(255,255,255,0.03); border: 1px solid rgba(255,255,255,0.1); border-radius: 12px; padding: 20px;">
509
+ <h4 style="color: #ef4444; margin: 0;">Irrelevance</h4>
510
+ <p style="color: #a0a0a0; margin: 8px 0 0 0; font-size: 0.9rem;">Correctly reject when no function applies</p>
511
+ </div>
512
+ <div style="background: rgba(255,255,255,0.03); border: 1px solid rgba(255,255,255,0.1); border-radius: 12px; padding: 20px;">
513
+ <h4 style="color: #8b5cf6; margin: 0;">Dialect Handling</h4>
514
+ <p style="color: #a0a0a0; margin: 8px 0 0 0; font-size: 0.9rem;">Egyptian ๐Ÿ‡ช๐Ÿ‡ฌ / Gulf ๐Ÿ‡ธ๐Ÿ‡ฆ / Levantine ๐Ÿ‡ฑ๐Ÿ‡ง</p>
515
+ </div>
516
+ </div>
517
+ </div>
518
+ """)
519
 
520
+ with gr.TabItem("๐Ÿ“– About"):
521
+ gr.HTML("""
522
+ <div style="padding: 24px; max-width: 800px;">
523
+ <h3 style="color: #667eea;">About AFCL</h3>
524
+ <p style="color: #c0c0c0; line-height: 1.8;">
525
+ The <strong>Arabic Function Calling Leaderboard (AFCL)</strong> is the first comprehensive benchmark
526
+ for evaluating LLMs on function calling capabilities in Arabic. It tests models across Modern Standard
527
+ Arabic (MSA) and three major dialects: Egyptian, Gulf, and Levantine.
528
+ </p>
529
+
530
+ <h4 style="color: #22c55e; margin-top: 24px;">Dataset</h4>
531
+ <p style="color: #c0c0c0;">
532
+ ๐Ÿ“Š <a href="https://huggingface.co/datasets/HeshamHaroon/Arabic_Function_Calling" style="color: #667eea;">HeshamHaroon/Arabic_Function_Calling</a>
533
+ </p>
534
+
535
+ <h4 style="color: #f59e0b; margin-top: 24px;">Scoring</h4>
536
+ <p style="color: #c0c0c0; line-height: 1.8;">
537
+ Models are scored using AST-based matching with Arabic text normalization.
538
+ The overall score is a weighted average across all categories, with emphasis on
539
+ irrelevance detection and dialect handling.
540
+ </p>
541
+
542
+ <h4 style="color: #ec4899; margin-top: 24px;">Citation</h4>
543
+ <pre style="background: rgba(255,255,255,0.05); padding: 16px; border-radius: 8px; color: #a0a0a0; overflow-x: auto;">
544
+ @misc{afcl2024,
545
+ title={Arabic Function Calling Leaderboard},
546
+ author={Hesham Haroon},
547
+ year={2024},
548
+ url={https://huggingface.co/spaces/HeshamHaroon/Arabic-Function-Calling-Leaderboard}
549
+ }</pre>
550
+ </div>
551
  """)
552
 
553
+ # Footer
554
+ gr.HTML("""
555
+ <div style="text-align: center; padding: 24px; margin-top: 24px; border-top: 1px solid rgba(255,255,255,0.1);">
556
+ <p style="color: #666; font-size: 0.9rem;">
557
+ Built for the Arabic NLP Community | ุจูู†ูŠ ู„ู…ุฌุชู…ุน ู…ุนุงู„ุฌุฉ ุงู„ู„ุบุฉ ุงู„ุนุฑุจูŠุฉ
558
+ </p>
559
  </div>
560
  """)
561