"""
Arabic Function Calling Leaderboard (AFCL)
==========================================
A Gradio-based leaderboard that evaluates LLMs on Arabic function calling.
Evaluation runs on HuggingFace Space infrastructure.
"""
import gradio as gr
import pandas as pd
import json
import os
import re
import time
import requests
from pathlib import Path
from typing import Dict, List, Optional
from threading import Thread
from datasets import load_dataset
# Constants
TITLE = "🏆 Arabic Function Calling Leaderboard"
TITLE_AR = "🏆 لوحة تقييم استدعاء الدوال بالعربية"
DESCRIPTION = """
The **Arabic Function Calling Leaderboard (AFCL)** evaluates Large Language Models on their ability to understand Arabic queries and generate appropriate function calls.
**لوحة تقييم استدعاء الدوال بالعربية** تقيّم نماذج اللغة الكبيرة على قدرتها على فهم الاستعلامات العربية وإنشاء استدعاءات الدوال المناسبة.
"""
# Models to evaluate
MODELS_TO_EVALUATE = [
{"model": "Jais-30B-Chat", "model_id": "inceptionai/jais-30b-chat-v3", "organization": "Inception AI"},
{"model": "ALLaM-7B-Instruct", "model_id": "sdaia/allam-1-7b-instruct", "organization": "SDAIA"},
{"model": "SILMA-9B-Instruct", "model_id": "silma-ai/SILMA-9B-Instruct-v1.0", "organization": "Silma AI"},
{"model": "AceGPT-13B-Chat", "model_id": "FreedomIntelligence/AceGPT-13B-chat", "organization": "FreedomIntelligence"},
{"model": "BLOOMZ-7B1", "model_id": "bigscience/bloomz-7b1", "organization": "BigScience"},
{"model": "Aya-Expanse-8B", "model_id": "CohereForAI/aya-expanse-8b", "organization": "Cohere For AI"},
{"model": "Qwen2.5-7B-Instruct", "model_id": "Qwen/Qwen2.5-7B-Instruct", "organization": "Alibaba Qwen"},
{"model": "Llama-3.1-8B-Instruct", "model_id": "meta-llama/Llama-3.1-8B-Instruct", "organization": "Meta"},
{"model": "Gemma-2-9B-IT", "model_id": "google/gemma-2-9b-it", "organization": "Google"},
{"model": "Mistral-7B-Instruct", "model_id": "mistralai/Mistral-7B-Instruct-v0.3", "organization": "Mistral AI"},
{"model": "Phi-3-Mini-Instruct", "model_id": "microsoft/Phi-3-mini-4k-instruct", "organization": "Microsoft"},
]
# Global state
LEADERBOARD_DATA = []
EVALUATION_STATUS = "Not started"
def load_evaluation_dataset():
"""Load the Arabic FC dataset from HuggingFace."""
try:
dataset = load_dataset("HeshamHaroon/Arabic_Function_Calling", split="test")
samples = []
for item in dataset:
sample = {
'id': item['id'],
'query_ar': item['query_ar'],
'functions': json.loads(item['functions']) if item['functions'] else [],
'ground_truth': json.loads(item['ground_truth']) if item['ground_truth'] else None,
'category': item['category'],
}
samples.append(sample)
return samples
except Exception as e:
print(f"Error loading dataset: {e}")
return []
def create_prompt(query: str, functions: List[Dict]) -> str:
    """Create the evaluation prompt for one sample."""
    func_desc = ("You are a function calling AI. Given the user query and available "
                 "functions, respond with a JSON function call.\n\nAvailable functions:\n")
    for f in functions:
        # Include the parameter schema (when present) so the model can see
        # which argument names it is expected to fill in.
        params = json.dumps(f.get('parameters', {}), ensure_ascii=False)
        func_desc += f"- {f.get('name')}: {f.get('description', '')} | parameters: {params}\n"
    return f"""{func_desc}
User Query (Arabic): {query}
Respond ONLY with a JSON object:
{{"name": "function_name", "arguments": {{"param1": "value1"}}}}
If no function should be called:
{{"name": null, "arguments": {{}}}}
JSON Response:"""
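# Illustrative example (hypothetical function and values): given a "get_weather"
# function and the query "ما حالة الطقس في الرياض؟" ("What is the weather in
# Riyadh?"), the expected response is:
#   {"name": "get_weather", "arguments": {"city": "الرياض"}}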
def call_model(model_id: str, prompt: str) -> str:
    """Call a model via the HuggingFace Inference API."""
    token = os.getenv("HF_TOKEN", "")
    headers = {"Authorization": f"Bearer {token}"}
    url = f"https://api-inference.huggingface.co/models/{model_id}"
    payload = {
        "inputs": prompt,
        # return_full_text=False so only the generated continuation comes back,
        # not the prompt echoed in front of it.
        "parameters": {"max_new_tokens": 200, "temperature": 0.1,
                       "return_full_text": False}
    }
    try:
        response = requests.post(url, headers=headers, json=payload, timeout=60)
        if response.status_code == 503:
            # 503 means the model is still loading on the backend; retry once.
            time.sleep(20)
            response = requests.post(url, headers=headers, json=payload, timeout=60)
        result = response.json()
        if isinstance(result, list) and result:
            return result[0].get("generated_text", "")
        return str(result)
    except (requests.RequestException, ValueError):
        return ""
def parse_response(response: str) -> Optional[Dict]:
    """Parse a function call from the model response."""
    if not response:
        return None
    # First try: the whole response is valid JSON.
    try:
        return json.loads(response.strip())
    except json.JSONDecodeError:
        pass
    # Second try: extract a JSON object containing a "name" key. The pattern
    # allows one level of nesting so that calls with an "arguments" object
    # (e.g. {"name": ..., "arguments": {...}}) still match.
    match = re.search(r'\{[^{}]*"name"(?:[^{}]|\{[^{}]*\})*\}', response)
    if match:
        try:
            return json.loads(match.group())
        except json.JSONDecodeError:
            pass
    # Last resort: treat explicit refusals ("null", "none", "لا يمكن" / "cannot")
    # as a deliberate no-call answer.
    if any(x in response.lower() for x in ['null', 'none', 'لا يمكن']):
        return {"name": None}
    return None
def evaluate_sample(model_id: str, sample: Dict) -> float:
    """Evaluate a single sample; returns a score in [0, 1]."""
    query = sample.get('query_ar', '')
    functions = sample.get('functions', [])
    category = sample.get('category', '')
    ground_truth = sample.get('ground_truth')
    prompt = create_prompt(query, functions)
    response = call_model(model_id, prompt)
    parsed = parse_response(response)
    # Irrelevance samples are scored on whether the model correctly declined to call.
    if category == 'irrelevance':
        return 1.0 if (parsed is None or parsed.get('name') is None) else 0.0
    if not ground_truth or not parsed:
        return 0.0
    # Normalize the ground truth to a single expected call: it may be a dict,
    # a dict wrapping a "calls" list, or a bare list of calls.
    if isinstance(ground_truth, dict):
        expected = ground_truth.get('calls', [ground_truth])[0]
    elif isinstance(ground_truth, list) and ground_truth:
        expected = ground_truth[0]
    else:
        return 0.0
    # The function name must match (case-insensitive); otherwise score 0.
    if str(parsed.get('name', '')).lower() != str(expected.get('name', '')).lower():
        return 0.0
    pred_args = parsed.get('arguments', {}) or {}
    exp_args = expected.get('arguments', {}) or {}
    if not exp_args:
        return 1.0
    # Partial credit: fraction of expected arguments matched by string comparison.
    matched = sum(1 for k, v in exp_args.items()
                  if str(pred_args.get(k, '')).lower() == str(v).lower())
    return matched / len(exp_args)
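# Illustrative partial-credit example (hypothetical values): if the expected call
# is {"name": "book_flight", "arguments": {"from": "القاهرة", "to": "دبي"}} and
# the model predicts the correct name but matches only "from", the score is 0.5.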
def run_evaluation():
"""Run full evaluation on all models."""
global LEADERBOARD_DATA, EVALUATION_STATUS
EVALUATION_STATUS = "Loading dataset..."
samples = load_evaluation_dataset()
if not samples:
EVALUATION_STATUS = "Failed to load dataset"
return
results = []
total_models = len(MODELS_TO_EVALUATE)
for idx, model_config in enumerate(MODELS_TO_EVALUATE):
model_name = model_config['model']
model_id = model_config['model_id']
EVALUATION_STATUS = f"Evaluating {model_name} ({idx+1}/{total_models})..."
category_scores = {}
category_counts = {}
for sample in samples:
cat = sample.get('category', 'simple')
if cat not in category_scores:
category_scores[cat] = 0.0
category_counts[cat] = 0
            try:
                score = evaluate_sample(model_id, sample)
                category_scores[cat] += score
            except Exception:
                # A failed sample simply contributes 0 to its category.
                pass
            category_counts[cat] += 1
            time.sleep(0.5)  # Rate limiting between Inference API calls
        # Per-category accuracy as a percentage
        scores = {cat: round((category_scores[cat] / category_counts[cat]) * 100, 1)
                  for cat in category_scores if category_counts[cat] > 0}
        # Weighted overall score; the weights are normalized by their sum,
        # so they need not add up to 1.
        weights = {"simple": 0.15, "multiple": 0.10, "parallel": 0.10,
                   "parallel_multiple": 0.10, "irrelevance": 0.15, "dialect_handling": 0.15}
        overall = sum(scores.get(c, 0) * w for c, w in weights.items()) / sum(weights.values())
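        # Example: with these weights (sum 0.75), a model scoring 90.0 on
        # "simple" and 0 elsewhere gets 90.0 * 0.15 / 0.75 = 18.0 overall.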
results.append({
"model": model_name,
"model_id": model_id,
"organization": model_config['organization'],
"overall": round(overall, 1),
"simple": scores.get('simple', 0),
"multiple": scores.get('multiple', 0),
"parallel": scores.get('parallel', 0),
"parallel_multiple": scores.get('parallel_multiple', 0),
"irrelevance": scores.get('irrelevance', 0),
"dialect_handling": scores.get('dialect_handling', 0),
"status": "completed"
})
# Sort and rank
results = sorted(results, key=lambda x: x['overall'], reverse=True)
for i, r in enumerate(results, 1):
r['rank'] = i
LEADERBOARD_DATA = results
EVALUATION_STATUS = f"Completed - {len(results)} models evaluated"
def get_leaderboard_df():
"""Get leaderboard as DataFrame."""
if not LEADERBOARD_DATA:
# Return empty with pending status
data = [{"rank": i+1, "model": m["model"], "organization": m["organization"],
"overall": "-", "status": "⏳ Pending"}
for i, m in enumerate(MODELS_TO_EVALUATE)]
return pd.DataFrame(data)
df = pd.DataFrame(LEADERBOARD_DATA)
cols = ["rank", "model", "organization", "overall", "simple", "multiple",
"parallel", "parallel_multiple", "irrelevance", "dialect_handling"]
df = df[[c for c in cols if c in df.columns]]
    # Append "%" to the numeric score columns; the integer rank stays as-is.
    for col in df.columns:
        if col != 'rank' and df[col].dtype.kind in 'if':
            df[col] = df[col].apply(lambda x: f"{x:.1f}%")
return df
def create_app():
"""Create the Gradio app."""
with gr.Blocks(title="Arabic FC Leaderboard", theme=gr.themes.Soft()) as app:
gr.Markdown(f"""
<div style="text-align: center; padding: 20px; background: linear-gradient(135deg, #1a5f2a 0%, #2d8f4a 100%); border-radius: 12px; color: white; margin-bottom: 20px;">
<h1>{TITLE_AR}</h1>
<h2>{TITLE}</h2>
<p>Evaluating LLMs on Arabic Function Calling | تقييم نماذج اللغة على استدعاء الدوال بالعربية</p>
</div>
""")
gr.Markdown(DESCRIPTION)
with gr.Row():
gr.Markdown(f"""
<div style="text-align: center; padding: 15px; background: #f5f5f5; border-radius: 8px;">
<div style="font-size: 2rem; font-weight: bold; color: #1a5f2a;">{len(MODELS_TO_EVALUATE)}</div>
<div>Models | النماذج</div>
</div>
""")
gr.Markdown("""
<div style="text-align: center; padding: 15px; background: #f5f5f5; border-radius: 8px;">
<div style="font-size: 2rem; font-weight: bold; color: #1a5f2a;">147</div>
<div>Test Samples | عينات</div>
</div>
""")
gr.Markdown("""
<div style="text-align: center; padding: 15px; background: #f5f5f5; border-radius: 8px;">
<div style="font-size: 2rem; font-weight: bold; color: #1a5f2a;">10</div>
<div>Categories | الفئات</div>
</div>
""")
status_text = gr.Markdown(f"**Status:** {EVALUATION_STATUS}")
with gr.Tabs():
with gr.TabItem("🏆 Leaderboard"):
leaderboard_df = gr.DataFrame(
value=get_leaderboard_df(),
interactive=False
)
def refresh_leaderboard():
return get_leaderboard_df(), f"**Status:** {EVALUATION_STATUS}"
refresh_btn = gr.Button("🔄 Refresh | تحديث")
refresh_btn.click(refresh_leaderboard, outputs=[leaderboard_df, status_text])
with gr.TabItem("📊 About"):
                gr.Markdown("""
                ## Evaluation Categories
                | Category | Samples | Description |
                |----------|---------|-------------|
                | Simple | ~20 | A single call to one provided function |
                | Multiple | ~20 | Pick the right function among several candidates |
                | Parallel | ~20 | Several calls to the same function in one query |
                | Parallel Multiple | ~20 | Several calls across several functions |
                | Irrelevance | ~20 | No function should be called |
                | Dialect | ~15 | Queries in Egyptian, Gulf, and Levantine dialects |
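                ## Scoring
                The predicted function name must match the ground truth (case-insensitive); arguments earn partial credit as the fraction of expected arguments matched. An illustrative correct response (hypothetical function name and value):
                ```json
                {"name": "get_weather", "arguments": {"city": "الرياض"}}
                ```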
## Dataset
📊 [HeshamHaroon/Arabic_Function_Calling](https://huggingface.co/datasets/HeshamHaroon/Arabic_Function_Calling)
""")
gr.Markdown("""
---
<div style="text-align: center; color: #666;">
Built for the Arabic NLP community | بُني لمجتمع معالجة اللغة العربية
</div>
""")
# Start evaluation in background
if not LEADERBOARD_DATA:
Thread(target=run_evaluation, daemon=True).start()
return app
app = create_app()
if __name__ == "__main__":
app.launch()