Initial release: Arabic Function Calling Leaderboard
Browse files — afcl/app.py (+77 −84)
afcl/app.py
CHANGED
|
@@ -17,10 +17,6 @@ from .data.loader import (
|
|
| 17 |
load_leaderboard, save_leaderboard, load_benchmark,
|
| 18 |
calculate_overall_score, CATEGORY_WEIGHTS
|
| 19 |
)
|
| 20 |
-
from .visualization.charts import (
|
| 21 |
-
create_radar_chart, create_bar_chart,
|
| 22 |
-
create_category_comparison, create_dialect_breakdown
|
| 23 |
-
)
|
| 24 |
|
| 25 |
# Constants
|
| 26 |
TITLE = "🏆 Arabic Function Calling Leaderboard"
|
|
@@ -34,14 +30,14 @@ The **Arabic Function Calling Leaderboard (AFCL)** evaluates Large Language Mode
|
|
| 34 |
|
| 35 |
# Column definitions
|
| 36 |
LEADERBOARD_COLUMNS = {
|
| 37 |
-
"rank": {"label": "
|
| 38 |
"model": {"label": "النموذج", "label_en": "Model", "type": "str"},
|
| 39 |
"organization": {"label": "المنظمة", "label_en": "Organization", "type": "str"},
|
| 40 |
"overall": {"label": "الدقة الكلية", "label_en": "Overall", "type": "number"},
|
| 41 |
"simple": {"label": "بسيط", "label_en": "Simple", "type": "number"},
|
| 42 |
"multiple": {"label": "متعدد", "label_en": "Multiple", "type": "number"},
|
| 43 |
"parallel": {"label": "متوازي", "label_en": "Parallel", "type": "number"},
|
| 44 |
-
"parallel_multiple": {"label": "متوازي متعدد", "label_en": "
|
| 45 |
"irrelevance": {"label": "اللا صلة", "label_en": "Irrelevance", "type": "number"},
|
| 46 |
"dialect_handling": {"label": "اللهجات", "label_en": "Dialects", "type": "number"},
|
| 47 |
"status": {"label": "الحالة", "label_en": "Status", "type": "str"},
|
|
@@ -64,11 +60,13 @@ def get_leaderboard_data() -> List[Dict]:
|
|
| 64 |
|
| 65 |
def format_leaderboard_dataframe(data: List[Dict], use_arabic: bool = True) -> pd.DataFrame:
|
| 66 |
"""Convert leaderboard data to pandas DataFrame."""
|
|
|
|
|
|
|
|
|
|
| 67 |
df = pd.DataFrame(data)
|
| 68 |
|
| 69 |
-
# Select columns to display
|
| 70 |
-
display_cols = ["rank", "model", "organization", "overall", "
|
| 71 |
-
"parallel", "parallel_multiple", "irrelevance", "dialect_handling", "status"]
|
| 72 |
df = df[[c for c in display_cols if c in df.columns]]
|
| 73 |
|
| 74 |
# Rename columns based on language preference
|
|
@@ -80,67 +78,54 @@ def format_leaderboard_dataframe(data: List[Dict], use_arabic: bool = True) -> p
|
|
| 80 |
|
| 81 |
df = df.rename(columns=column_mapping)
|
| 82 |
|
| 83 |
-
# Format numeric columns (show as percentage, but mark 0.0 as "
|
| 84 |
for col in df.columns:
|
| 85 |
if df[col].dtype in ['float64', 'float32']:
|
| 86 |
-
df[col] = df[col].apply(lambda x: "
|
| 87 |
|
| 88 |
# Format status column
|
| 89 |
status_col = "الحالة" if use_arabic else "Status"
|
| 90 |
if status_col in df.columns:
|
| 91 |
-
df[status_col] = df[status_col].apply(
|
|
|
|
|
|
|
|
|
|
| 92 |
|
| 93 |
return df
|
| 94 |
|
| 95 |
|
| 96 |
-
def
|
| 97 |
-
"""Create the
|
| 98 |
data = get_leaderboard_data()
|
| 99 |
-
df = format_leaderboard_dataframe(data, use_arabic)
|
| 100 |
|
| 101 |
-
|
| 102 |
-
|
| 103 |
-
|
| 104 |
-
|
| 105 |
-
|
|
|
|
|
|
|
| 106 |
|
|
|
|
|
|
|
|
|
|
| 107 |
|
| 108 |
-
|
| 109 |
-
"""Create the visualization tab with charts."""
|
| 110 |
-
data = get_leaderboard_data()
|
| 111 |
|
| 112 |
-
|
| 113 |
-
model_scores = {
|
| 114 |
-
entry["model"]: {k: v for k, v in entry.items() if k not in ["rank", "model"]}
|
| 115 |
-
for entry in data
|
| 116 |
-
}
|
| 117 |
|
| 118 |
-
|
| 119 |
-
with gr.Column():
|
| 120 |
-
radar_chart = create_radar_chart(
|
| 121 |
-
{k: v for k, v in list(model_scores.items())[:5]},
|
| 122 |
-
use_arabic=True,
|
| 123 |
-
title="مقارنة النماذج - Category Comparison"
|
| 124 |
-
)
|
| 125 |
-
gr.Plot(value=radar_chart)
|
| 126 |
-
|
| 127 |
-
with gr.Row():
|
| 128 |
-
with gr.Column():
|
| 129 |
-
bar_chart = create_bar_chart(
|
| 130 |
-
data,
|
| 131 |
-
metric="overall",
|
| 132 |
-
use_arabic=True,
|
| 133 |
-
title="أفضل النماذج - Top Models"
|
| 134 |
-
)
|
| 135 |
-
gr.Plot(value=bar_chart)
|
| 136 |
|
| 137 |
-
|
| 138 |
-
|
| 139 |
-
|
| 140 |
-
|
| 141 |
-
|
| 142 |
-
|
| 143 |
-
|
|
|
|
|
|
|
|
|
|
| 144 |
|
| 145 |
|
| 146 |
def create_submit_tab():
|
|
@@ -220,14 +205,20 @@ def create_about_tab():
|
|
| 220 |
|
| 221 |
## Evaluation Categories | فئات التقييم
|
| 222 |
|
| 223 |
-
| Category | الفئة |
|
| 224 |
-
|
| 225 |
-
| Simple | بسيط | Single function, single call |
|
| 226 |
-
| Multiple | متعدد | Select correct function from options |
|
| 227 |
-
| Parallel | متوازي | Multiple calls of same function |
|
| 228 |
-
| Parallel Multiple | متوازي متعدد | Multiple functions, multiple calls |
|
| 229 |
-
| Irrelevance | اللا صلة | No function should be called |
|
| 230 |
-
| Dialect Handling | اللهجات | Egyptian/Gulf/Levantine queries |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 231 |
|
| 232 |
## Scoring Formula | معادلة التقييم
|
| 233 |
|
|
@@ -245,19 +236,14 @@ def create_about_tab():
|
|
| 245 |
- Multi-Turn: 15%
|
| 246 |
- Native Arabic: 10%
|
| 247 |
|
| 248 |
-
## Evaluation Methodology | منهجية التقييم
|
| 249 |
-
|
| 250 |
-
1. **AST-Based Matching**: Function calls are compared using Abstract Syntax Tree matching with Arabic text normalization.
|
| 251 |
-
|
| 252 |
-
2. **Arabic Normalization**: Handles diacritics (tashkeel), alef variants, and Arabic-Indic numerals.
|
| 253 |
-
|
| 254 |
-
3. **Order-Agnostic Parallel Evaluation**: For parallel calls, order doesn't matter - we use bipartite matching.
|
| 255 |
-
|
| 256 |
## Dataset | مجموعة البيانات
|
| 257 |
|
| 258 |
-
|
|
|
|
|
|
|
| 259 |
- **Languages**: Arabic (MSA + Dialects) & English
|
| 260 |
-
- **
|
|
|
|
| 261 |
|
| 262 |
## Citation | الاقتباس
|
| 263 |
|
|
@@ -269,12 +255,6 @@ def create_about_tab():
|
|
| 269 |
url={https://huggingface.co/spaces/HeshamHaroon/Arabic-Function-Calling-Leaderboard}
|
| 270 |
}
|
| 271 |
```
|
| 272 |
-
|
| 273 |
-
## Contact | التواصل
|
| 274 |
-
|
| 275 |
-
For questions or contributions, please open an issue on the repository.
|
| 276 |
-
|
| 277 |
-
للأسئلة أو المساهمات، يرجى فتح مشكلة في المستودع.
|
| 278 |
""")
|
| 279 |
|
| 280 |
|
|
@@ -305,11 +285,20 @@ def create_app():
|
|
| 305 |
|
| 306 |
# Stats row
|
| 307 |
data = get_leaderboard_data()
|
|
|
|
|
|
|
|
|
|
| 308 |
with gr.Row():
|
| 309 |
gr.Markdown(f"""
|
| 310 |
<div style="text-align: center; padding: 15px; background: #f5f5f5; border-radius: 8px;">
|
| 311 |
<div style="font-size: 2rem; font-weight: bold; color: #1a5f2a;">{len(data)}</div>
|
| 312 |
-
<div style="color: #666;">Models
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 313 |
</div>
|
| 314 |
""")
|
| 315 |
gr.Markdown("""
|
|
@@ -318,10 +307,14 @@ def create_app():
|
|
| 318 |
<div style="color: #666;">Test Samples | عينات الاختبار</div>
|
| 319 |
</div>
|
| 320 |
""")
|
| 321 |
-
|
| 322 |
-
|
| 323 |
-
|
| 324 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 325 |
</div>
|
| 326 |
""")
|
| 327 |
|
|
@@ -335,8 +328,8 @@ def create_app():
|
|
| 335 |
wrap=True,
|
| 336 |
)
|
| 337 |
|
| 338 |
-
with gr.TabItem("
|
| 339 |
-
|
| 340 |
|
| 341 |
with gr.TabItem("📤 Submit | إرسال"):
|
| 342 |
create_submit_tab()
|
|
|
|
| 17 |
load_leaderboard, save_leaderboard, load_benchmark,
|
| 18 |
calculate_overall_score, CATEGORY_WEIGHTS
|
| 19 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 20 |
|
| 21 |
# Constants
|
| 22 |
TITLE = "🏆 Arabic Function Calling Leaderboard"
|
|
|
|
| 30 |
|
| 31 |
# Column definitions
|
| 32 |
LEADERBOARD_COLUMNS = {
|
| 33 |
+
"rank": {"label": "#", "label_en": "#", "type": "number"},
|
| 34 |
"model": {"label": "النموذج", "label_en": "Model", "type": "str"},
|
| 35 |
"organization": {"label": "المنظمة", "label_en": "Organization", "type": "str"},
|
| 36 |
"overall": {"label": "الدقة الكلية", "label_en": "Overall", "type": "number"},
|
| 37 |
"simple": {"label": "بسيط", "label_en": "Simple", "type": "number"},
|
| 38 |
"multiple": {"label": "متعدد", "label_en": "Multiple", "type": "number"},
|
| 39 |
"parallel": {"label": "متوازي", "label_en": "Parallel", "type": "number"},
|
| 40 |
+
"parallel_multiple": {"label": "متوازي متعدد", "label_en": "P. Multiple", "type": "number"},
|
| 41 |
"irrelevance": {"label": "اللا صلة", "label_en": "Irrelevance", "type": "number"},
|
| 42 |
"dialect_handling": {"label": "اللهجات", "label_en": "Dialects", "type": "number"},
|
| 43 |
"status": {"label": "الحالة", "label_en": "Status", "type": "str"},
|
|
|
|
| 60 |
|
| 61 |
def format_leaderboard_dataframe(data: List[Dict], use_arabic: bool = True) -> pd.DataFrame:
|
| 62 |
"""Convert leaderboard data to pandas DataFrame."""
|
| 63 |
+
if not data:
|
| 64 |
+
return pd.DataFrame()
|
| 65 |
+
|
| 66 |
df = pd.DataFrame(data)
|
| 67 |
|
| 68 |
+
# Select columns to display (fewer columns for cleaner view)
|
| 69 |
+
display_cols = ["rank", "model", "organization", "overall", "status"]
|
|
|
|
| 70 |
df = df[[c for c in display_cols if c in df.columns]]
|
| 71 |
|
| 72 |
# Rename columns based on language preference
|
|
|
|
| 78 |
|
| 79 |
df = df.rename(columns=column_mapping)
|
| 80 |
|
| 81 |
+
# Format numeric columns (show as percentage, but mark 0.0 as "-")
|
| 82 |
for col in df.columns:
|
| 83 |
if df[col].dtype in ['float64', 'float32']:
|
| 84 |
+
df[col] = df[col].apply(lambda x: "-" if x == 0.0 else f"{x:.1f}%")
|
| 85 |
|
| 86 |
# Format status column
|
| 87 |
status_col = "الحالة" if use_arabic else "Status"
|
| 88 |
if status_col in df.columns:
|
| 89 |
+
# Pick the label pair for the requested language FIRST, then map the status.
# The previous single chained conditional parsed as
#   A if pending else (B if use_arabic else (...))
# so a pending row always received the Arabic label, even with
# use_arabic=False, and the English "Pending" branch was unreachable.
if use_arabic:
    pending_label, done_label = "⏳ قيد الانتظار", "✅ مكتمل"
else:
    pending_label, done_label = "⏳ Pending", "✅ Done"

# Any status other than "pending" is shown as done.
df[status_col] = df[status_col].apply(
    lambda x: pending_label if x == "pending" else done_label
)
|
| 93 |
|
| 94 |
return df
|
| 95 |
|
| 96 |
|
| 97 |
+
def create_models_list_tab():
    """Create the models list tab, grouping leaderboard entries by organization.

    Fetches the entries with ``get_leaderboard_data()`` and renders a single
    markdown document: a bilingual header with the total entry count, then one
    ``###`` section per organization (in sorted order) with one bullet per
    model. Each bullet links to the entry's ``model_url`` (``#`` when missing)
    and ends with a status badge.

    Returns:
        The ``gr.Markdown`` component holding the rendered list.
    """
    data = get_leaderboard_data()

    # Group by organization. `or "Other"` also catches an explicit None/empty
    # value; a None key would make the sorted() call below raise TypeError
    # when compared against string keys.
    orgs = {}
    for entry in data:
        org = entry.get("organization") or "Other"
        orgs.setdefault(org, []).append(entry)

    header = """
## 📋 Models Queue | قائمة النماذج للتقييم

The following **{total}** models are queued for evaluation on the Arabic Function Calling benchmark:

النماذج التالية (**{total}** نموذج) في قائمة الانتظار للتقييم:

---

"""

    # Collect fragments and join once instead of growing a string with `+=`.
    parts = [header.format(total=len(data))]
    for org, models in sorted(orgs.items()):
        parts.append(f"### {org}\n")
        for m in models:
            model_url = m.get("model_url") or "#"
            # Same rule create_app uses for its pending counter: only the
            # literal "pending" status counts as pending. Previously every
            # model was labelled Pending even after it had been evaluated.
            if m.get("status") == "pending":
                badge = "⏳ Pending"
            else:
                badge = "✅ Done"
            parts.append(f"- [{m['model']}]({model_url}) - {badge}\n")
        parts.append("\n")

    return gr.Markdown("".join(parts))
|
| 129 |
|
| 130 |
|
| 131 |
def create_submit_tab():
|
|
|
|
| 205 |
|
| 206 |
## Evaluation Categories | فئات التقييم
|
| 207 |
|
| 208 |
+
| Category | الفئة | Samples | Description |
|
| 209 |
+
|----------|-------|---------|-------------|
|
| 210 |
+
| Simple | بسيط | 200 | Single function, single call |
|
| 211 |
+
| Multiple | متعدد | 200 | Select correct function from options |
|
| 212 |
+
| Parallel | متوازي | 200 | Multiple calls of same function |
|
| 213 |
+
| Parallel Multiple | متوازي متعدد | 200 | Multiple functions, multiple calls |
|
| 214 |
+
| Irrelevance | اللا صلة | 200 | No function should be called |
|
| 215 |
+
| Dialect Handling | اللهجات | 150 | Egyptian/Gulf/Levantine queries |
|
| 216 |
+
| Java | جافا | 100 | Java API function calls |
|
| 217 |
+
| JavaScript | جافاسكريبت | 50 | JS function calls |
|
| 218 |
+
| REST | REST | 70 | REST API calls |
|
| 219 |
+
| SQL | SQL | 100 | SQL query generation |
|
| 220 |
+
|
| 221 |
+
**Total: 1,470 samples**
|
| 222 |
|
| 223 |
## Scoring Formula | معادلة التقييم
|
| 224 |
|
|
|
|
| 236 |
- Multi-Turn: 15%
|
| 237 |
- Native Arabic: 10%
|
| 238 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 239 |
## Dataset | مجموعة البيانات
|
| 240 |
|
| 241 |
+
📊 **[HeshamHaroon/Arabic_Function_Calling](https://huggingface.co/datasets/HeshamHaroon/Arabic_Function_Calling)**
|
| 242 |
+
|
| 243 |
+
- **Total Samples**: 1,470
|
| 244 |
- **Languages**: Arabic (MSA + Dialects) & English
|
| 245 |
+
- **Categories**: 10 evaluation categories
|
| 246 |
+
- **Source**: Translated from BFCL with dialect variants
|
| 247 |
|
| 248 |
## Citation | الاقتباس
|
| 249 |
|
|
|
|
| 255 |
url={https://huggingface.co/spaces/HeshamHaroon/Arabic-Function-Calling-Leaderboard}
|
| 256 |
}
|
| 257 |
```
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 258 |
""")
|
| 259 |
|
| 260 |
|
|
|
|
| 285 |
|
| 286 |
# Stats row
|
| 287 |
data = get_leaderboard_data()
|
| 288 |
+
evaluated = len([d for d in data if d.get("status") != "pending"])
|
| 289 |
+
pending = len([d for d in data if d.get("status") == "pending"])
|
| 290 |
+
|
| 291 |
with gr.Row():
|
| 292 |
gr.Markdown(f"""
|
| 293 |
<div style="text-align: center; padding: 15px; background: #f5f5f5; border-radius: 8px;">
|
| 294 |
<div style="font-size: 2rem; font-weight: bold; color: #1a5f2a;">{len(data)}</div>
|
| 295 |
+
<div style="color: #666;">Total Models | إجمالي النماذج</div>
|
| 296 |
+
</div>
|
| 297 |
+
""")
|
| 298 |
+
gr.Markdown(f"""
|
| 299 |
+
<div style="text-align: center; padding: 15px; background: #fff3cd; border-radius: 8px;">
|
| 300 |
+
<div style="font-size: 2rem; font-weight: bold; color: #856404;">{pending}</div>
|
| 301 |
+
<div style="color: #856404;">⏳ Pending | قيد الانتظار</div>
|
| 302 |
</div>
|
| 303 |
""")
|
| 304 |
gr.Markdown("""
|
|
|
|
| 307 |
<div style="color: #666;">Test Samples | عينات الاختبار</div>
|
| 308 |
</div>
|
| 309 |
""")
|
| 310 |
+
|
| 311 |
+
# Notice about pending evaluation
|
| 312 |
+
if pending > 0:
|
| 313 |
+
gr.Markdown(f"""
|
| 314 |
+
<div style="padding: 15px; background: #fff3cd; border: 1px solid #ffc107; border-radius: 8px; margin: 15px 0;">
|
| 315 |
+
⏳ <strong>Evaluation in Progress | التقييم قيد التنفيذ</strong><br>
|
| 316 |
+
{pending} models are waiting to be evaluated. Results will be updated as evaluations complete.<br>
|
| 317 |
+
{pending} نموذج في انتظار التقييم. سيتم تحديث النتائج فور اكتمال التقييم.
|
| 318 |
</div>
|
| 319 |
""")
|
| 320 |
|
|
|
|
| 328 |
wrap=True,
|
| 329 |
)
|
| 330 |
|
| 331 |
+
with gr.TabItem("📋 Models | النماذج"):
|
| 332 |
+
create_models_list_tab()
|
| 333 |
|
| 334 |
with gr.TabItem("📤 Submit | إرسال"):
|
| 335 |
create_submit_tab()
|