HeshamHaroon committed on
Commit
6add5d0
·
verified ·
1 Parent(s): 6315c7d

Initial release: Arabic Function Calling Leaderboard

Browse files
Files changed (1) hide show
  1. afcl/app.py +77 -84
afcl/app.py CHANGED
@@ -17,10 +17,6 @@ from .data.loader import (
17
  load_leaderboard, save_leaderboard, load_benchmark,
18
  calculate_overall_score, CATEGORY_WEIGHTS
19
  )
20
- from .visualization.charts import (
21
- create_radar_chart, create_bar_chart,
22
- create_category_comparison, create_dialect_breakdown
23
- )
24
 
25
  # Constants
26
  TITLE = "🏆 Arabic Function Calling Leaderboard"
@@ -34,14 +30,14 @@ The **Arabic Function Calling Leaderboard (AFCL)** evaluates Large Language Mode
34
 
35
  # Column definitions
36
  LEADERBOARD_COLUMNS = {
37
- "rank": {"label": "المرتبة", "label_en": "Rank", "type": "number"},
38
  "model": {"label": "النموذج", "label_en": "Model", "type": "str"},
39
  "organization": {"label": "المنظمة", "label_en": "Organization", "type": "str"},
40
  "overall": {"label": "الدقة الكلية", "label_en": "Overall", "type": "number"},
41
  "simple": {"label": "بسيط", "label_en": "Simple", "type": "number"},
42
  "multiple": {"label": "متعدد", "label_en": "Multiple", "type": "number"},
43
  "parallel": {"label": "متوازي", "label_en": "Parallel", "type": "number"},
44
- "parallel_multiple": {"label": "متوازي متعدد", "label_en": "Parallel Multiple", "type": "number"},
45
  "irrelevance": {"label": "اللا صلة", "label_en": "Irrelevance", "type": "number"},
46
  "dialect_handling": {"label": "اللهجات", "label_en": "Dialects", "type": "number"},
47
  "status": {"label": "الحالة", "label_en": "Status", "type": "str"},
@@ -64,11 +60,13 @@ def get_leaderboard_data() -> List[Dict]:
64
 
65
  def format_leaderboard_dataframe(data: List[Dict], use_arabic: bool = True) -> pd.DataFrame:
66
  """Convert leaderboard data to pandas DataFrame."""
 
 
 
67
  df = pd.DataFrame(data)
68
 
69
- # Select columns to display
70
- display_cols = ["rank", "model", "organization", "overall", "simple", "multiple",
71
- "parallel", "parallel_multiple", "irrelevance", "dialect_handling", "status"]
72
  df = df[[c for c in display_cols if c in df.columns]]
73
 
74
  # Rename columns based on language preference
@@ -80,67 +78,54 @@ def format_leaderboard_dataframe(data: List[Dict], use_arabic: bool = True) -> p
80
 
81
  df = df.rename(columns=column_mapping)
82
 
83
- # Format numeric columns (show as percentage, but mark 0.0 as "Pending")
84
  for col in df.columns:
85
  if df[col].dtype in ['float64', 'float32']:
86
- df[col] = df[col].apply(lambda x: "⏳ Pending" if x == 0.0 else f"{x:.1f}%")
87
 
88
  # Format status column
89
  status_col = "الحالة" if use_arabic else "Status"
90
  if status_col in df.columns:
91
- df[status_col] = df[status_col].apply(lambda x: "⏳ قيد التقييم" if x == "pending" else "✅ مكتمل" if use_arabic else "⏳ Pending" if x == "pending" else "✅ Completed")
 
 
 
92
 
93
  return df
94
 
95
 
96
- def create_leaderboard_tab(use_arabic: bool = True):
97
- """Create the main leaderboard tab content."""
98
  data = get_leaderboard_data()
99
- df = format_leaderboard_dataframe(data, use_arabic)
100
 
101
- return gr.DataFrame(
102
- value=df,
103
- interactive=False,
104
- wrap=True,
105
- )
 
 
106
 
 
 
 
107
 
108
- def create_visualization_tab():
109
- """Create the visualization tab with charts."""
110
- data = get_leaderboard_data()
111
 
112
- # Prepare data for charts
113
- model_scores = {
114
- entry["model"]: {k: v for k, v in entry.items() if k not in ["rank", "model"]}
115
- for entry in data
116
- }
117
 
118
- with gr.Row():
119
- with gr.Column():
120
- radar_chart = create_radar_chart(
121
- {k: v for k, v in list(model_scores.items())[:5]},
122
- use_arabic=True,
123
- title="مقارنة النماذج - Category Comparison"
124
- )
125
- gr.Plot(value=radar_chart)
126
-
127
- with gr.Row():
128
- with gr.Column():
129
- bar_chart = create_bar_chart(
130
- data,
131
- metric="overall",
132
- use_arabic=True,
133
- title="أفضل النماذج - Top Models"
134
- )
135
- gr.Plot(value=bar_chart)
136
 
137
- with gr.Row():
138
- category_chart = create_category_comparison(
139
- data,
140
- use_arabic=True,
141
- title="أداء الفئات - Category Performance"
142
- )
143
- gr.Plot(value=category_chart)
 
 
 
144
 
145
 
146
  def create_submit_tab():
@@ -220,14 +205,20 @@ def create_about_tab():
220
 
221
  ## Evaluation Categories | فئات التقييم
222
 
223
- | Category | الفئة | Description | الوصف |
224
- |----------|-------|-------------|-------|
225
- | Simple | بسيط | Single function, single call | دالة واحدة، استدعاء واحد |
226
- | Multiple | متعدد | Select correct function from options | اختيار الدالة الصحيحة من عدة خيارات |
227
- | Parallel | متوازي | Multiple calls of same function | استدعاءات متعددة لنفس الدالة |
228
- | Parallel Multiple | متوازي متعدد | Multiple functions, multiple calls | دوال متعددة، استدعاءات متعددة |
229
- | Irrelevance | اللا صلة | No function should be called | لا يجب استدعاء أي دالة |
230
- | Dialect Handling | اللهجات | Egyptian/Gulf/Levantine queries | استعلامات مصرية/خليجية/شامية |
 
 
 
 
 
 
231
 
232
  ## Scoring Formula | معادلة التقييم
233
 
@@ -245,19 +236,14 @@ def create_about_tab():
245
  - Multi-Turn: 15%
246
  - Native Arabic: 10%
247
 
248
- ## Evaluation Methodology | منهجية التقييم
249
-
250
- 1. **AST-Based Matching**: Function calls are compared using Abstract Syntax Tree matching with Arabic text normalization.
251
-
252
- 2. **Arabic Normalization**: Handles diacritics (tashkeel), alef variants, and Arabic-Indic numerals.
253
-
254
- 3. **Order-Agnostic Parallel Evaluation**: For parallel calls, order doesn't matter - we use bipartite matching.
255
-
256
  ## Dataset | مجموعة البيانات
257
 
258
- - **Total Samples**: 1,470+
 
 
259
  - **Languages**: Arabic (MSA + Dialects) & English
260
- - **Source**: Translated from BFCL with additional dialect variants
 
261
 
262
  ## Citation | الاقتباس
263
 
@@ -269,12 +255,6 @@ def create_about_tab():
269
  url={https://huggingface.co/spaces/HeshamHaroon/Arabic-Function-Calling-Leaderboard}
270
  }
271
  ```
272
-
273
- ## Contact | التواصل
274
-
275
- For questions or contributions, please open an issue on the repository.
276
-
277
- للأسئلة أو المساهمات، يرجى فتح مشكلة في المستودع.
278
  """)
279
 
280
 
@@ -305,11 +285,20 @@ def create_app():
305
 
306
  # Stats row
307
  data = get_leaderboard_data()
 
 
 
308
  with gr.Row():
309
  gr.Markdown(f"""
310
  <div style="text-align: center; padding: 15px; background: #f5f5f5; border-radius: 8px;">
311
  <div style="font-size: 2rem; font-weight: bold; color: #1a5f2a;">{len(data)}</div>
312
- <div style="color: #666;">Models to Evaluate | النماذج للتقييم</div>
 
 
 
 
 
 
313
  </div>
314
  """)
315
  gr.Markdown("""
@@ -318,10 +307,14 @@ def create_app():
318
  <div style="color: #666;">Test Samples | عينات الاختبار</div>
319
  </div>
320
  """)
321
- gr.Markdown("""
322
- <div style="text-align: center; padding: 15px; background: #f5f5f5; border-radius: 8px;">
323
- <div style="font-size: 2rem; font-weight: bold; color: #1a5f2a;">10</div>
324
- <div style="color: #666;">Categories | الفئات</div>
 
 
 
 
325
  </div>
326
  """)
327
 
@@ -335,8 +328,8 @@ def create_app():
335
  wrap=True,
336
  )
337
 
338
- with gr.TabItem("📊 Visualizations | الرسوم البيانية"):
339
- create_visualization_tab()
340
 
341
  with gr.TabItem("📤 Submit | إرسال"):
342
  create_submit_tab()
 
17
  load_leaderboard, save_leaderboard, load_benchmark,
18
  calculate_overall_score, CATEGORY_WEIGHTS
19
  )
 
 
 
 
20
 
21
  # Constants
22
  TITLE = "🏆 Arabic Function Calling Leaderboard"
 
30
 
31
  # Column definitions
32
  LEADERBOARD_COLUMNS = {
33
+ "rank": {"label": "#", "label_en": "#", "type": "number"},
34
  "model": {"label": "النموذج", "label_en": "Model", "type": "str"},
35
  "organization": {"label": "المنظمة", "label_en": "Organization", "type": "str"},
36
  "overall": {"label": "الدقة الكلية", "label_en": "Overall", "type": "number"},
37
  "simple": {"label": "بسيط", "label_en": "Simple", "type": "number"},
38
  "multiple": {"label": "متعدد", "label_en": "Multiple", "type": "number"},
39
  "parallel": {"label": "متوازي", "label_en": "Parallel", "type": "number"},
40
+ "parallel_multiple": {"label": "متوازي متعدد", "label_en": "P. Multiple", "type": "number"},
41
  "irrelevance": {"label": "اللا صلة", "label_en": "Irrelevance", "type": "number"},
42
  "dialect_handling": {"label": "اللهجات", "label_en": "Dialects", "type": "number"},
43
  "status": {"label": "الحالة", "label_en": "Status", "type": "str"},
 
60
 
61
  def format_leaderboard_dataframe(data: List[Dict], use_arabic: bool = True) -> pd.DataFrame:
62
  """Convert leaderboard data to pandas DataFrame."""
63
+ if not data:
64
+ return pd.DataFrame()
65
+
66
  df = pd.DataFrame(data)
67
 
68
+ # Select columns to display (fewer columns for cleaner view)
69
+ display_cols = ["rank", "model", "organization", "overall", "status"]
 
70
  df = df[[c for c in display_cols if c in df.columns]]
71
 
72
  # Rename columns based on language preference
 
78
 
79
  df = df.rename(columns=column_mapping)
80
 
81
+ # Format numeric columns (show as percentage, but mark 0.0 as "-")
82
  for col in df.columns:
83
  if df[col].dtype in ['float64', 'float32']:
84
+ df[col] = df[col].apply(lambda x: "-" if x == 0.0 else f"{x:.1f}%")
85
 
86
  # Format status column
87
  status_col = "الحالة" if use_arabic else "Status"
88
  if status_col in df.columns:
89
+ df[status_col] = df[status_col].apply(
90
+ lambda x: "⏳ قيد الانتظار" if x == "pending" else "✅ مكتمل"
91
+ if use_arabic else "⏳ Pending" if x == "pending" else "✅ Done"
92
+ )
93
 
94
  return df
95
 
96
 
97
+ def create_models_list_tab():
98
+ """Create the models list tab showing all models to be evaluated."""
99
  data = get_leaderboard_data()
 
100
 
101
+ # Group by organization
102
+ orgs = {}
103
+ for entry in data:
104
+ org = entry.get("organization", "Other")
105
+ if org not in orgs:
106
+ orgs[org] = []
107
+ orgs[org].append(entry)
108
 
109
+ # Create markdown content
110
+ md_content = """
111
+ ## 📋 Models Queue | قائمة النماذج للتقييم
112
 
113
+ The following **{total}** models are queued for evaluation on the Arabic Function Calling benchmark:
 
 
114
 
115
+ النماذج التالية (**{total}** نموذج) في قائمة الانتظار للتقييم:
 
 
 
 
116
 
117
+ ---
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
118
 
119
+ """.format(total=len(data))
120
+
121
+ for org, models in sorted(orgs.items()):
122
+ md_content += f"### {org}\n"
123
+ for m in models:
124
+ model_url = m.get("model_url", "#")
125
+ md_content += f"- [{m['model']}]({model_url}) - ⏳ Pending\n"
126
+ md_content += "\n"
127
+
128
+ return gr.Markdown(md_content)
129
 
130
 
131
  def create_submit_tab():
 
205
 
206
  ## Evaluation Categories | فئات التقييم
207
 
208
+ | Category | الفئة | Samples | Description |
209
+ |----------|-------|---------|-------------|
210
+ | Simple | بسيط | 200 | Single function, single call |
211
+ | Multiple | متعدد | 200 | Select correct function from options |
212
+ | Parallel | متوازي | 200 | Multiple calls of same function |
213
+ | Parallel Multiple | متوازي متعدد | 200 | Multiple functions, multiple calls |
214
+ | Irrelevance | اللا صلة | 200 | No function should be called |
215
+ | Dialect Handling | اللهجات | 150 | Egyptian/Gulf/Levantine queries |
216
+ | Java | جافا | 100 | Java API function calls |
217
+ | JavaScript | جافاسكريبت | 50 | JS function calls |
218
+ | REST | REST | 70 | REST API calls |
219
+ | SQL | SQL | 100 | SQL query generation |
220
+
221
+ **Total: 1,470 samples**
222
 
223
  ## Scoring Formula | معادلة التقييم
224
 
 
236
  - Multi-Turn: 15%
237
  - Native Arabic: 10%
238
 
 
 
 
 
 
 
 
 
239
  ## Dataset | مجموعة البيانات
240
 
241
+ 📊 **[HeshamHaroon/Arabic_Function_Calling](https://huggingface.co/datasets/HeshamHaroon/Arabic_Function_Calling)**
242
+
243
+ - **Total Samples**: 1,470
244
  - **Languages**: Arabic (MSA + Dialects) & English
245
+ - **Categories**: 10 evaluation categories
246
+ - **Source**: Translated from BFCL with dialect variants
247
 
248
  ## Citation | الاقتباس
249
 
 
255
  url={https://huggingface.co/spaces/HeshamHaroon/Arabic-Function-Calling-Leaderboard}
256
  }
257
  ```
 
 
 
 
 
 
258
  """)
259
 
260
 
 
285
 
286
  # Stats row
287
  data = get_leaderboard_data()
288
+ evaluated = len([d for d in data if d.get("status") != "pending"])
289
+ pending = len([d for d in data if d.get("status") == "pending"])
290
+
291
  with gr.Row():
292
  gr.Markdown(f"""
293
  <div style="text-align: center; padding: 15px; background: #f5f5f5; border-radius: 8px;">
294
  <div style="font-size: 2rem; font-weight: bold; color: #1a5f2a;">{len(data)}</div>
295
+ <div style="color: #666;">Total Models | إجمالي النماذج</div>
296
+ </div>
297
+ """)
298
+ gr.Markdown(f"""
299
+ <div style="text-align: center; padding: 15px; background: #fff3cd; border-radius: 8px;">
300
+ <div style="font-size: 2rem; font-weight: bold; color: #856404;">{pending}</div>
301
+ <div style="color: #856404;">⏳ Pending | قيد الانتظار</div>
302
  </div>
303
  """)
304
  gr.Markdown("""
 
307
  <div style="color: #666;">Test Samples | عينات الاختبار</div>
308
  </div>
309
  """)
310
+
311
+ # Notice about pending evaluation
312
+ if pending > 0:
313
+ gr.Markdown(f"""
314
+ <div style="padding: 15px; background: #fff3cd; border: 1px solid #ffc107; border-radius: 8px; margin: 15px 0;">
315
+ ⏳ <strong>Evaluation in Progress | التقييم قيد التنفيذ</strong><br>
316
+ {pending} models are waiting to be evaluated. Results will be updated as evaluations complete.<br>
317
+ {pending} نموذج في انتظار التقييم. سيتم تحديث النتائج فور اكتمال التقييم.
318
  </div>
319
  """)
320
 
 
328
  wrap=True,
329
  )
330
 
331
+ with gr.TabItem("📋 Models | النماذج"):
332
+ create_models_list_tab()
333
 
334
  with gr.TabItem("📤 Submit | إرسال"):
335
  create_submit_tab()