|
|
""" |
|
|
Arabic Function Calling Leaderboard (AFCL) |
|
|
========================================== |
|
|
|
|
|
A Gradio-based leaderboard that evaluates LLMs on Arabic function calling. |
|
|
Evaluation runs on HuggingFace Space infrastructure. |
|
|
""" |
|
|
|
|
|
import json
import os
import re
import time
from threading import Thread
from typing import Dict, List, Optional

import gradio as gr
import pandas as pd
import requests
from datasets import load_dataset


TITLE = "🏆 Arabic Function Calling Leaderboard" |
|
|
TITLE_AR = "🏆 لوحة تقييم استدعاء الدوال بالعربية" |
|
|
|
|
|
DESCRIPTION = """ |
|
|
The **Arabic Function Calling Leaderboard (AFCL)** evaluates Large Language Models on their ability to understand Arabic queries and generate appropriate function calls. |
|
|
|
|
|
**لوحة تقييم استدعاء الدوال بالعربية** تقيّم نماذج اللغة الكبيرة على قدرتها على فهم الاستعلامات العربية وإنشاء استدعاءات الدوال المناسبة. |
|
|
""" |
|
|
|
|
|
|
|
|
MODELS_TO_EVALUATE = [
    {"model": "Jais-30B-Chat", "model_id": "inceptionai/jais-30b-chat-v3", "organization": "Inception AI"},
    {"model": "ALLaM-7B-Instruct", "model_id": "sdaia/allam-1-7b-instruct", "organization": "SDAIA"},
    {"model": "SILMA-9B-Instruct", "model_id": "silma-ai/SILMA-9B-Instruct-v1.0", "organization": "Silma AI"},
    {"model": "AceGPT-13B-Chat", "model_id": "FreedomIntelligence/AceGPT-13B-chat", "organization": "FreedomIntelligence"},
    {"model": "BLOOMZ-7B1", "model_id": "bigscience/bloomz-7b1", "organization": "BigScience"},
    {"model": "Aya-Expanse-8B", "model_id": "CohereForAI/aya-expanse-8b", "organization": "Cohere For AI"},
    {"model": "Qwen2.5-7B-Instruct", "model_id": "Qwen/Qwen2.5-7B-Instruct", "organization": "Alibaba Qwen"},
    {"model": "Llama-3.1-8B-Instruct", "model_id": "meta-llama/Llama-3.1-8B-Instruct", "organization": "Meta"},
    {"model": "Gemma-2-9B-IT", "model_id": "google/gemma-2-9b-it", "organization": "Google"},
    {"model": "Mistral-7B-Instruct", "model_id": "mistralai/Mistral-7B-Instruct-v0.3", "organization": "Mistral AI"},
    {"model": "Phi-3-Mini-Instruct", "model_id": "microsoft/Phi-3-mini-4k-instruct", "organization": "Microsoft"},
]

# Module-level state shared between the background evaluation thread and the Gradio UI.
LEADERBOARD_DATA = []
EVALUATION_STATUS = "Not started"


def load_evaluation_dataset():
    """Load the Arabic function-calling dataset from the HuggingFace Hub."""
    try:
        dataset = load_dataset("HeshamHaroon/Arabic_Function_Calling", split="test")
        samples = []
        for item in dataset:
            # 'functions' and 'ground_truth' are stored as JSON strings in the dataset.
            sample = {
                'id': item['id'],
                'query_ar': item['query_ar'],
                'functions': json.loads(item['functions']) if item['functions'] else [],
                'ground_truth': json.loads(item['ground_truth']) if item['ground_truth'] else None,
                'category': item['category'],
            }
            samples.append(sample)
        return samples
    except Exception as e:
        print(f"Error loading dataset: {e}")
        return []
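
# Each returned sample is a dict shaped roughly like this (illustrative values,
# not taken from the dataset):
# {'id': 17,
#  'query_ar': 'ما حالة الطقس في الرياض؟',
#  'functions': [{'name': 'get_weather', 'description': '...'}],
#  'ground_truth': {'name': 'get_weather', 'arguments': {'city': 'الرياض'}},
#  'category': 'simple'}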
|
|
|
|
|
|
|
|
def create_prompt(query: str, functions: List[Dict]) -> str:
    """Create the evaluation prompt for a single sample."""
    func_desc = ("You are a function calling AI. Given the user query and available "
                 "functions, respond with a JSON function call.\n\nAvailable functions:\n")
    for f in functions:
        func_desc += f"- {f.get('name')}: {f.get('description', '')}\n"
        # Include the parameter schema when the function definition provides one,
        # so the model can produce the expected argument names.
        if f.get('parameters'):
            func_desc += f"  parameters: {json.dumps(f['parameters'], ensure_ascii=False)}\n"

    return f"""{func_desc}
User Query (Arabic): {query}

Respond ONLY with a JSON object:
{{"name": "function_name", "arguments": {{"param1": "value1"}}}}

If no function should be called:
{{"name": null, "arguments": {{}}}}

JSON Response:"""
|
|
|
|
|
|
|
|
def call_model(model_id: str, prompt: str) -> str:
    """Call a model via the HuggingFace Inference API."""
    token = os.getenv("HF_TOKEN", "")
    headers = {"Authorization": f"Bearer {token}"}
    url = f"https://api-inference.huggingface.co/models/{model_id}"

    payload = {
        "inputs": prompt,
        # return_full_text=False so the prompt (which itself contains example
        # JSON) is not echoed back in the completion.
        "parameters": {"max_new_tokens": 200, "temperature": 0.1, "return_full_text": False},
    }

    try:
        response = requests.post(url, headers=headers, json=payload, timeout=60)
        if response.status_code == 503:
            # 503 means the model is still loading on the inference backend;
            # wait for the warm-up and retry once.
            time.sleep(20)
            response = requests.post(url, headers=headers, json=payload, timeout=60)

        result = response.json()
        if isinstance(result, dict) and "error" in result:
            # API error payloads (auth failures, rate limits) carry an "error" key.
            return ""
        if isinstance(result, list) and result:
            return result[0].get("generated_text", "")
        return str(result)
    except Exception:
        return ""
|
|
|
|
|
|
|
|
def parse_response(response: str) -> Optional[Dict]:
    """Parse a function call out of a raw model response."""
    if not response:
        return None

    # Fast path: the whole response is a valid JSON object.
    try:
        obj = json.loads(response.strip())
        if isinstance(obj, dict):
            return obj
    except ValueError:
        pass

    # Fallback: find a JSON object containing "name", allowing one level of
    # nesting so {"name": ..., "arguments": {...}} is matched.
    match = re.search(r'\{(?:[^{}]|\{[^{}]*\})*"name"(?:[^{}]|\{[^{}]*\})*\}', response)
    if match:
        try:
            return json.loads(match.group())
        except ValueError:
            pass

    # Last resort: treat explicit refusals ("لا يمكن" = "cannot") as no call.
    if any(x in response.lower() for x in ['null', 'none', 'لا يمكن']):
        return {"name": None}
    return None
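
# Illustrative behavior (hypothetical strings):
#   parse_response('{"name": "get_weather", "arguments": {"city": "دبي"}}')
#       -> {"name": "get_weather", "arguments": {"city": "دبي"}}
#   parse_response('Sure! {"name": "get_time", "arguments": {}} hope that helps')
#       -> {"name": "get_time", "arguments": {}}   (regex fallback)
#   parse_response('لا يمكن تنفيذ هذا الطلب')
#       -> {"name": None}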
|
|
|
|
|
|
|
|
def evaluate_sample(model_id: str, sample: Dict) -> float:
    """Evaluate a single sample; returns a score in [0, 1]."""
    query = sample.get('query_ar', '')
    functions = sample.get('functions', [])
    category = sample.get('category', '')
    ground_truth = sample.get('ground_truth')

    prompt = create_prompt(query, functions)
    response = call_model(model_id, prompt)
    parsed = parse_response(response)

    # Irrelevance samples are correct when the model declines to call anything.
    if category == 'irrelevance':
        return 1.0 if (parsed is None or parsed.get('name') is None) else 0.0

    if not ground_truth or not parsed:
        return 0.0

    # Ground truth may be a bare call, a {"calls": [...]} wrapper, or a list;
    # score against the first expected call.
    if isinstance(ground_truth, dict):
        expected = ground_truth.get('calls', [ground_truth])[0]
    elif isinstance(ground_truth, list) and ground_truth:
        expected = ground_truth[0]
    else:
        return 0.0
    if not isinstance(expected, dict):
        return 0.0

    # The function name must match (case-insensitive) for any credit.
    if str(parsed.get('name', '')).lower() != str(expected.get('name', '')).lower():
        return 0.0

    # Partial credit: fraction of expected arguments reproduced exactly
    # (string-compared, case-insensitive).
    pred_args = parsed.get('arguments', {}) or {}
    exp_args = expected.get('arguments', {}) or {}
    if not exp_args:
        return 1.0

    matched = sum(1 for k, v in exp_args.items()
                  if str(pred_args.get(k, '')).lower() == str(v).lower())
    return matched / len(exp_args)
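
# Worked example (hypothetical): with expected arguments
# {"city": "جدة", "units": "c"} and prediction {"city": "جدة"} under a matching
# function name, the score is 1/2 = 0.5; a wrong function name scores 0.0
# regardless of the arguments.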
|
|
|
|
|
|
|
|
def run_evaluation():
    """Run the full evaluation across all models (intended for a background thread)."""
    global LEADERBOARD_DATA, EVALUATION_STATUS

    EVALUATION_STATUS = "Loading dataset..."
    samples = load_evaluation_dataset()

    if not samples:
        EVALUATION_STATUS = "Failed to load dataset"
        return

    results = []
    total_models = len(MODELS_TO_EVALUATE)

    for idx, model_config in enumerate(MODELS_TO_EVALUATE):
        model_name = model_config['model']
        model_id = model_config['model_id']

        EVALUATION_STATUS = f"Evaluating {model_name} ({idx+1}/{total_models})..."

        category_scores = {}
        category_counts = {}

        for sample in samples:
            cat = sample.get('category', 'simple')
            if cat not in category_scores:
                category_scores[cat] = 0.0
                category_counts[cat] = 0

            try:
                score = evaluate_sample(model_id, sample)
                category_scores[cat] += score
            except Exception:
                # A failed call simply counts as 0 for this sample.
                pass
            category_counts[cat] += 1
            time.sleep(0.5)  # small delay to avoid hammering the Inference API

        # Per-category accuracy as a percentage.
        scores = {cat: round((category_scores[cat] / category_counts[cat]) * 100, 1)
                  for cat in category_scores if category_counts[cat] > 0}

        # Weighted overall score; dividing by sum(weights.values()) normalizes
        # the weights, so they need not sum to 1.
        weights = {"simple": 0.15, "multiple": 0.10, "parallel": 0.10,
                   "parallel_multiple": 0.10, "irrelevance": 0.15, "dialect_handling": 0.15}
        overall = sum(scores.get(c, 0) * w for c, w in weights.items()) / sum(weights.values())
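
        # Worked example: simple=80, multiple=60, parallel=50, parallel_multiple=40,
        # irrelevance=90, dialect_handling=70 gives
        # (80*0.15 + 60*0.10 + 50*0.10 + 40*0.10 + 90*0.15 + 70*0.15) / 0.75 = 68.0.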
|
|
|
|
|
        results.append({
            "model": model_name,
            "model_id": model_id,
            "organization": model_config['organization'],
            "overall": round(overall, 1),
            "simple": scores.get('simple', 0),
            "multiple": scores.get('multiple', 0),
            "parallel": scores.get('parallel', 0),
            "parallel_multiple": scores.get('parallel_multiple', 0),
            "irrelevance": scores.get('irrelevance', 0),
            "dialect_handling": scores.get('dialect_handling', 0),
            "status": "completed",
        })

    # Rank models by overall score, best first.
    results = sorted(results, key=lambda x: x['overall'], reverse=True)
    for i, r in enumerate(results, 1):
        r['rank'] = i

    LEADERBOARD_DATA = results
    EVALUATION_STATUS = f"Completed - {len(results)} models evaluated"
|
|
|
|
|
|
|
|
def get_leaderboard_df():
    """Return the leaderboard as a DataFrame (placeholder rows until results exist)."""
    if not LEADERBOARD_DATA:
        data = [{"rank": i + 1, "model": m["model"], "organization": m["organization"],
                 "overall": "-", "status": "⏳ Pending"}
                for i, m in enumerate(MODELS_TO_EVALUATE)]
        return pd.DataFrame(data)

    df = pd.DataFrame(LEADERBOARD_DATA)
    cols = ["rank", "model", "organization", "overall", "simple", "multiple",
            "parallel", "parallel_multiple", "irrelevance", "dialect_handling"]
    df = df[[c for c in cols if c in df.columns]]

    # Render numeric score columns as percentages, leaving rank as an integer.
    for col in df.columns:
        if col != 'rank' and df[col].dtype in ('float64', 'float32', 'int64'):
            df[col] = df[col].apply(lambda x: f"{x:.1f}%")

    return df
|
|
|
|
|
|
|
|
def create_app():
    """Create the Gradio app."""
    with gr.Blocks(title="Arabic FC Leaderboard", theme=gr.themes.Soft()) as app:

        gr.Markdown(f"""
        <div style="text-align: center; padding: 20px; background: linear-gradient(135deg, #1a5f2a 0%, #2d8f4a 100%); border-radius: 12px; color: white; margin-bottom: 20px;">
            <h1>{TITLE_AR}</h1>
            <h2>{TITLE}</h2>
            <p>Evaluating LLMs on Arabic Function Calling | تقييم نماذج اللغة على استدعاء الدوال بالعربية</p>
        </div>
        """)

        gr.Markdown(DESCRIPTION)

        with gr.Row():
            gr.Markdown(f"""
            <div style="text-align: center; padding: 15px; background: #f5f5f5; border-radius: 8px;">
                <div style="font-size: 2rem; font-weight: bold; color: #1a5f2a;">{len(MODELS_TO_EVALUATE)}</div>
                <div>Models | النماذج</div>
            </div>
            """)
            gr.Markdown("""
            <div style="text-align: center; padding: 15px; background: #f5f5f5; border-radius: 8px;">
                <div style="font-size: 2rem; font-weight: bold; color: #1a5f2a;">147</div>
                <div>Test Samples | عينات</div>
            </div>
            """)
|
|
gr.Markdown(""" |
|
|
<div style="text-align: center; padding: 15px; background: #f5f5f5; border-radius: 8px;"> |
|
|
<div style="font-size: 2rem; font-weight: bold; color: #1a5f2a;">10</div> |
|
|
<div>Categories | الفئات</div> |
|
|
</div> |
|
|
""") |
|
|
|
|
|
status_text = gr.Markdown(f"**Status:** {EVALUATION_STATUS}") |
|
|
|
|
|
        with gr.Tabs():
            with gr.TabItem("🏆 Leaderboard"):
                leaderboard_df = gr.DataFrame(
                    value=get_leaderboard_df(),
                    interactive=False
                )

                def refresh_leaderboard():
                    return get_leaderboard_df(), f"**Status:** {EVALUATION_STATUS}"

                refresh_btn = gr.Button("🔄 Refresh | تحديث")
                refresh_btn.click(refresh_leaderboard, outputs=[leaderboard_df, status_text])

            with gr.TabItem("📊 About"):
                gr.Markdown("""
                ## Evaluation Categories

                | Category | Samples | Description |
                |----------|---------|-------------|
                | Simple | ~20 | Single function call |
                | Multiple | ~20 | Select the correct function from several candidates |
                | Parallel | ~20 | Several calls generated for one query |
                | Parallel Multiple | ~20 | Several calls selected from several functions |
                | Irrelevance | ~20 | No function should be called |
                | Dialect Handling | ~15 | Queries in Egyptian, Gulf, and Levantine dialects |

                ## Dataset
                📊 [HeshamHaroon/Arabic_Function_Calling](https://huggingface.co/datasets/HeshamHaroon/Arabic_Function_Calling)
                """)

gr.Markdown(""" |
|
|
--- |
|
|
<div style="text-align: center; color: #666;"> |
|
|
Built for the Arabic NLP community | بُني لمجتمع معالجة اللغة العربية |
|
|
</div> |
|
|
""") |
|
|
|
|
|
|
|
|
if not LEADERBOARD_DATA: |
|
|
Thread(target=run_evaluation, daemon=True).start() |
|
|
|
|
|
return app |
|
|
|
|
|
|
|
|
app = create_app()

if __name__ == "__main__":
    app.launch()
|
|
|