|
|
""" |
|
|
Arabic Function Calling Leaderboard (AFCL) |
|
|
========================================== |
|
|
|
|
|
Professional leaderboard for evaluating LLMs on Arabic function calling. |
|
|
""" |
|
|
|
|
|
import gradio as gr |
|
|
import pandas as pd |
|
|
import json |
|
|
import os |
|
|
import re |
|
|
import time |
|
|
import requests |
|
|
from typing import Dict, List, Optional |
|
|
from threading import Thread |
|
|
from datasets import load_dataset |
|
|
|
|
|
|
|
|
# Display titles for the leaderboard.
TITLE = "Arabic Function Calling Leaderboard"
# Arabic title, restored from mis-encoded text that had been split across two lines.
TITLE_AR = "لوحة تقييم استدعاء الدوال بالعربية"
|
|
|
|
|
|
|
|
# Models evaluated by run_evaluation(). Each entry carries:
#   model        - display name shown in the leaderboard table
#   model_id     - repository id substituted into the inference URL by call_model()
#   organization - publisher shown in the "Org" column
#   params       - parameter count as a display string
#   type         - "Arabic-Native", "Arabic-Tuned" or "Multilingual"
MODELS_TO_EVALUATE = [
    # Arabic-native models
    {"model": "Jais-30B-Chat", "model_id": "inceptionai/jais-30b-chat-v3", "organization": "Inception AI", "params": "30B", "type": "Arabic-Native"},
    {"model": "ALLaM-7B-Instruct", "model_id": "sdaia/allam-1-7b-instruct", "organization": "SDAIA", "params": "7B", "type": "Arabic-Native"},
    {"model": "SILMA-9B-Instruct", "model_id": "silma-ai/SILMA-9B-Instruct-v1.0", "organization": "Silma AI", "params": "9B", "type": "Arabic-Native"},
    {"model": "Fanar-Star-1.2B", "model_id": "QatarComputing/fanar-star-1.2b", "organization": "QCRI", "params": "1.2B", "type": "Arabic-Native"},
    {"model": "AceGPT-13B-Chat", "model_id": "FreedomIntelligence/AceGPT-13B-chat", "organization": "FreedomIntelligence", "params": "13B", "type": "Arabic-Native"},
    {"model": "AraGPT2-Mega", "model_id": "aubmindlab/aragpt2-mega", "organization": "AUB MIND Lab", "params": "1.5B", "type": "Arabic-Native"},

    # General multilingual models (Qwen, Llama, Gemma)
    {"model": "Qwen2.5-72B-Instruct", "model_id": "Qwen/Qwen2.5-72B-Instruct", "organization": "Alibaba", "params": "72B", "type": "Multilingual"},
    {"model": "Qwen2.5-32B-Instruct", "model_id": "Qwen/Qwen2.5-32B-Instruct", "organization": "Alibaba", "params": "32B", "type": "Multilingual"},
    {"model": "Qwen2.5-7B-Instruct", "model_id": "Qwen/Qwen2.5-7B-Instruct", "organization": "Alibaba", "params": "7B", "type": "Multilingual"},
    {"model": "Llama-3.1-70B-Instruct", "model_id": "meta-llama/Llama-3.1-70B-Instruct", "organization": "Meta", "params": "70B", "type": "Multilingual"},
    {"model": "Llama-3.1-8B-Instruct", "model_id": "meta-llama/Llama-3.1-8B-Instruct", "organization": "Meta", "params": "8B", "type": "Multilingual"},
    {"model": "Gemma-2-27B-IT", "model_id": "google/gemma-2-27b-it", "organization": "Google", "params": "27B", "type": "Multilingual"},
    {"model": "Gemma-2-9B-IT", "model_id": "google/gemma-2-9b-it", "organization": "Google", "params": "9B", "type": "Multilingual"},

    # Cohere models
    {"model": "Aya-Expanse-32B", "model_id": "CohereForAI/aya-expanse-32b", "organization": "Cohere", "params": "32B", "type": "Multilingual"},
    {"model": "Aya-Expanse-8B", "model_id": "CohereForAI/aya-expanse-8b", "organization": "Cohere", "params": "8B", "type": "Multilingual"},
    {"model": "Command-R7B-Arabic", "model_id": "CohereForAI/c4ai-command-r7b-arabic-02-2025", "organization": "Cohere", "params": "7B", "type": "Arabic-Tuned"},

    # TII Falcon models
    {"model": "Falcon-180B-Chat", "model_id": "tiiuae/falcon-180B-chat", "organization": "TII UAE", "params": "180B", "type": "Multilingual"},
    {"model": "Falcon-40B-Instruct", "model_id": "tiiuae/falcon-40b-instruct", "organization": "TII UAE", "params": "40B", "type": "Multilingual"},

    # Mistral AI models
    {"model": "Mistral-Large", "model_id": "mistralai/Mistral-Large-Instruct-2411", "organization": "Mistral AI", "params": "123B", "type": "Multilingual"},
    {"model": "Mixtral-8x22B", "model_id": "mistralai/Mixtral-8x22B-Instruct-v0.1", "organization": "Mistral AI", "params": "141B", "type": "Multilingual"},
    {"model": "Mistral-7B-Instruct", "model_id": "mistralai/Mistral-7B-Instruct-v0.3", "organization": "Mistral AI", "params": "7B", "type": "Multilingual"},

    # Other multilingual models
    {"model": "DeepSeek-V3", "model_id": "deepseek-ai/DeepSeek-V3", "organization": "DeepSeek", "params": "671B", "type": "Multilingual"},
    {"model": "Phi-4", "model_id": "microsoft/phi-4", "organization": "Microsoft", "params": "14B", "type": "Multilingual"},
    {"model": "Phi-3-Mini", "model_id": "microsoft/Phi-3-mini-4k-instruct", "organization": "Microsoft", "params": "3.8B", "type": "Multilingual"},
    {"model": "BLOOM-176B", "model_id": "bigscience/bloom", "organization": "BigScience", "params": "176B", "type": "Multilingual"},
    {"model": "BLOOMZ-7B1", "model_id": "bigscience/bloomz-7b1", "organization": "BigScience", "params": "7B", "type": "Multilingual"},

    # Community Arabic fine-tunes
    {"model": "Arabic-Llama-3.1-8B", "model_id": "Ammar-Arabi/Arabic-Llama-3.1-8B-Instruct", "organization": "Community", "params": "8B", "type": "Arabic-Tuned"},
    {"model": "Llama3-8B-Arabic", "model_id": "MahmoudAshraf/Llama3-8B-Arabic-instruct", "organization": "Community", "params": "8B", "type": "Arabic-Tuned"},
]
|
|
|
|
|
|
|
|
# Ranked result rows displayed by get_leaderboard_df(); reassigned by run_evaluation().
LEADERBOARD_DATA = []
# Progress of the background evaluation; updated by run_evaluation() and
# rendered by get_status_html().
EVALUATION_STATUS = {"current": "Initializing...", "progress": 0, "total": len(MODELS_TO_EVALUATE)}
# Hugging Face dataset repository in which results.json is persisted.
RESULTS_DATASET_REPO = "HeshamHaroon/AFCL-Results"
|
|
|
|
|
|
|
|
def load_cached_results() -> List[Dict]:
    """Load cached evaluation results from the Hugging Face results dataset.

    Downloads ``results.json`` from ``RESULTS_DATASET_REPO`` (authenticated
    with the ``HF_TOKEN`` environment variable when set) and parses it.

    Returns:
        The list stored in ``results.json``, or an empty list when the file
        cannot be downloaded or parsed, or does not contain a JSON list.
    """
    try:
        from huggingface_hub import hf_hub_download

        file_path = hf_hub_download(
            repo_id=RESULTS_DATASET_REPO,
            filename="results.json",
            repo_type="dataset",
            token=os.getenv("HF_TOKEN"),
        )
        with open(file_path, 'r', encoding='utf-8') as f:
            cached = json.load(f)
    except Exception as e:
        # Best effort: a missing cache only means every model is re-evaluated.
        print(f"No cached results found (will evaluate all models): {e}")
        return []

    # The caller iterates the rows as dicts; anything but a list is unusable.
    if not isinstance(cached, list):
        print("No cached results found (will evaluate all models): results.json is not a list")
        return []

    print(f"✅ Loaded {len(cached)} cached results from HuggingFace")
    return cached
|
|
|
|
|
|
|
|
def save_cached_results(results: List[Dict]):
    """Save evaluation results to the Hugging Face results dataset.

    Serialises ``results`` to a temporary JSON file and uploads it as
    ``results.json`` to ``RESULTS_DATASET_REPO``. Failures are reported on
    stdout and never raised, so a persistence problem does not abort the
    evaluation run.

    Args:
        results: Result rows to persist (must be JSON-serialisable).
    """
    try:
        from huggingface_hub import HfApi, create_repo
        import tempfile

        api = HfApi()
        token = os.getenv("HF_TOKEN")

        try:
            create_repo(
                repo_id=RESULTS_DATASET_REPO,
                repo_type="dataset",
                exist_ok=True,
                token=token,
            )
        except Exception as repo_err:
            # Not fatal here: if the repo is really unusable the upload below
            # fails and is reported by the outer handler.
            print(f"Could not create or verify results repo (continuing): {repo_err}")

        temp_path = None
        try:
            with tempfile.NamedTemporaryFile(mode='w', suffix='.json', delete=False, encoding='utf-8') as f:
                temp_path = f.name
                json.dump(results, f, ensure_ascii=False, indent=2)

            api.upload_file(
                path_or_fileobj=temp_path,
                path_in_repo="results.json",
                repo_id=RESULTS_DATASET_REPO,
                repo_type="dataset",
                token=token,
                commit_message=f"Update results ({len(results)} models)",
            )
        finally:
            # Remove the temp file even when serialisation or upload fails.
            if temp_path is not None:
                os.unlink(temp_path)

        print(f"✅ Saved {len(results)} results to HuggingFace dataset")
    except Exception as e:
        print(f"⚠️ Error saving to HuggingFace (results may not persist): {e}")
|
|
|
|
|
|
|
|
# Stylesheet passed to gr.Blocks(css=...) in create_app().
CUSTOM_CSS = """
/* Professional Dark Theme */
.gradio-container {
    background: linear-gradient(135deg, #0f0f1a 0%, #1a1a2e 100%) !important;
    font-family: 'Inter', -apple-system, BlinkMacSystemFont, sans-serif !important;
}

/* Header styling */
.header-container {
    background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
    border-radius: 16px;
    padding: 32px;
    margin-bottom: 24px;
    box-shadow: 0 20px 40px rgba(102, 126, 234, 0.3);
}

/* Stats cards */
.stat-card {
    background: rgba(255,255,255,0.05);
    backdrop-filter: blur(10px);
    border: 1px solid rgba(255,255,255,0.1);
    border-radius: 12px;
    padding: 24px;
    text-align: center;
    transition: transform 0.3s ease;
}

.stat-card:hover {
    transform: translateY(-4px);
}

.stat-value {
    font-size: 2.5rem;
    font-weight: 700;
    background: linear-gradient(135deg, #667eea, #764ba2);
    -webkit-background-clip: text;
    -webkit-text-fill-color: transparent;
}

.stat-label {
    color: #a0a0a0;
    font-size: 0.9rem;
    margin-top: 8px;
}

/* Table styling */
.leaderboard-table {
    background: rgba(255,255,255,0.02) !important;
    border-radius: 12px !important;
    border: 1px solid rgba(255,255,255,0.1) !important;
}

/* Rank badges */
.rank-1 { color: #ffd700 !important; font-weight: bold; }
.rank-2 { color: #c0c0c0 !important; font-weight: bold; }
.rank-3 { color: #cd7f32 !important; font-weight: bold; }

/* Progress bar */
.progress-container {
    background: rgba(255,255,255,0.1);
    border-radius: 8px;
    padding: 16px;
    margin: 16px 0;
}

.progress-bar {
    height: 8px;
    background: linear-gradient(90deg, #667eea, #764ba2);
    border-radius: 4px;
    transition: width 0.5s ease;
}

/* Tabs */
.tabs {
    border: none !important;
}

.tab-nav {
    background: transparent !important;
    border-bottom: 2px solid rgba(255,255,255,0.1) !important;
}

.tab-nav button {
    color: #a0a0a0 !important;
    font-weight: 500 !important;
    padding: 12px 24px !important;
}

.tab-nav button.selected {
    color: #667eea !important;
    border-bottom: 2px solid #667eea !important;
}

/* Category pills */
.category-pill {
    display: inline-block;
    padding: 4px 12px;
    border-radius: 20px;
    font-size: 0.75rem;
    font-weight: 500;
}

.cat-arabic { background: #22c55e20; color: #22c55e; }
.cat-multilingual { background: #3b82f620; color: #3b82f6; }
.cat-tuned { background: #f59e0b20; color: #f59e0b; }
"""
|
|
|
|
|
|
|
|
def load_evaluation_dataset():
    """Load every split of the Arabic function-calling dataset from HuggingFace.

    Each row becomes a dict with ``id``, ``query_ar``, ``functions`` (decoded
    from JSON, ``[]`` when empty), ``ground_truth`` (decoded from JSON,
    ``None`` when empty) and ``category``.

    Returns:
        The list of samples from all splits, or an empty list if loading or
        decoding raises.
    """
    def _decode(raw, fallback):
        # The dataset stores structured fields as JSON text; empty means absent.
        return json.loads(raw) if raw else fallback

    try:
        splits = load_dataset("HeshamHaroon/Arabic_Function_Calling")
        samples = [
            {
                'id': row['id'],
                'query_ar': row['query_ar'],
                'functions': _decode(row['functions'], []),
                'ground_truth': _decode(row['ground_truth'], None),
                'category': row['category'],
            }
            for split_name in splits.keys()
            for row in splits[split_name]
        ]
        print(f"Loaded {len(samples)} total samples from all splits")
        return samples
    except Exception as e:
        print(f"Error loading dataset: {e}")
        return []
|
|
|
|
|
|
|
|
def create_prompt(query: str, functions: List[Dict]) -> str:
    """Create the Arabic evaluation prompt with full function details.

    The prompt consists of a system-style instruction, one section per
    available function (name, description and, when present, its parameters
    with type and required/optional marker), the user query, and the answer
    format the model must follow.

    Args:
        query: The user request (Arabic text).
        functions: Function specs; each may have ``name``, ``description`` and
            ``parameters`` containing ``properties`` and ``required``.

    Returns:
        The complete prompt string.
    """
    # Pieces are collected and joined once instead of repeated string +=.
    parts = ["""أنت مساعد ذكي متخصص في استدعاء الدوال البرمجية. مهمتك هي تحليل طلب المستخدم واختيار الدالة المناسبة مع تحديد المعاملات الصحيحة.

### الدوال المتاحة:

"""]

    for func in functions:
        func_name = func.get('name', '')
        func_desc = func.get('description', 'لا يوجد وصف')
        parts.append(f"**{func_name}**\n")
        parts.append(f"الوصف: {func_desc}\n")

        params = func.get('parameters')
        # Tolerate specs where "parameters" or "properties" is null / not a dict.
        if isinstance(params, dict) and 'properties' in params:
            parts.append("المعاملات:\n")
            required_params = params.get('required') or []
            properties = params['properties'] or {}
            for param_name, param_info in properties.items():
                if not isinstance(param_info, dict):
                    param_info = {}
                param_type = param_info.get('type', 'any')
                param_desc = param_info.get('description', '')
                req_str = " (مطلوب)" if param_name in required_params else " (اختياري)"
                parts.append(f" • {param_name} ({param_type}){req_str}: {param_desc}\n")
        parts.append("\n")

    parts.append(f"""### طلب المستخدم:
{query}

### التعليمات:
1. حلل طلب المستخدم بعناية
2. اختر الدالة المناسبة من القائمة أعلاه
3. استخرج قيم المعاملات من الطلب
4. أجب بصيغة JSON فقط

### صيغة الإجابة:
إذا كانت هناك دالة مناسبة:
{{"name": "اسم_الدالة", "arguments": {{"المعامل1": "القيمة1", "المعامل2": "القيمة2"}}}}

إذا لم تكن هناك دالة مناسبة للطلب:
{{"name": null, "arguments": {{}}}}

### الإجابة (JSON فقط):
""")
    return "".join(parts)
|
|
|
|
|
|
|
|
def call_model(model_id: str, prompt: str) -> str:
    """Call a model through the HuggingFace Inference API.

    Sends a text-generation request authenticated with ``HF_TOKEN``. A 503
    response is retried once after 20 seconds.

    Args:
        model_id: Repository id of the model to query.
        prompt: Full prompt text.

    Returns:
        The generated completion, or an empty string when the request fails,
        the body is not JSON, or the API returns an error payload.
    """
    token = os.getenv("HF_TOKEN", "")
    headers = {"Authorization": f"Bearer {token}"}
    url = f"https://api-inference.huggingface.co/models/{model_id}"

    payload = {
        "inputs": prompt,
        "parameters": {
            "max_new_tokens": 200,
            "temperature": 0.1,
            # Without this the API echoes the prompt in generated_text. The
            # prompt contains example JSON objects, which the response parser
            # would then pick up instead of the model's own answer.
            "return_full_text": False,
        },
    }

    try:
        response = requests.post(url, headers=headers, json=payload, timeout=60)
        if response.status_code == 503:
            time.sleep(20)
            response = requests.post(url, headers=headers, json=payload, timeout=60)
        result = response.json()
    except (requests.RequestException, ValueError) as e:
        # Network failure or non-JSON body: report it and score as "no answer".
        print(f"Inference request failed for {model_id}: {e}")
        return ""

    if isinstance(result, list) and result:
        first = result[0]
        if isinstance(first, dict):
            return first.get("generated_text") or ""
        return ""

    if isinstance(result, dict):
        # API error payloads are not model output and must not be parsed as such.
        if "error" in result:
            return ""
        if "generated_text" in result:
            return str(result["generated_text"])

    return str(result)
|
|
|
|
|
|
|
|
def parse_response(response: str) -> Optional[Dict]:
    """Extract a function call from a model response.

    Strategies are tried in order:

    1. The whole response is a JSON object.
    2. A JSON object with a ``name`` key inside a fenced code block, or
       spanning from the first ``{`` to the last ``}``.
    3. The first JSON object that starts at the first ``{`` (trailing text
       after the object is ignored).
    4. A known English/Arabic "no function applies" phrase, which yields
       ``{"name": None, "arguments": {}}``.

    Args:
        response: Raw model output.

    Returns:
        The parsed call dict, or ``None`` when nothing could be extracted.
    """
    if not response:
        return None

    response = response.strip()

    # 1. Direct parse.
    try:
        data = json.loads(response)
    except ValueError:
        data = None
    if isinstance(data, dict):
        return data

    # 2. Fenced blocks, then a greedy first-brace-to-last-brace span.
    json_patterns = [
        r'```json\s*([\s\S]*?)\s*```',
        r'```\s*([\s\S]*?)\s*```',
        r'(\{[\s\S]*\})',
    ]
    for pattern in json_patterns:
        for match in re.findall(pattern, response):
            try:
                data = json.loads(match.strip())
            except ValueError:
                continue
            if isinstance(data, dict) and 'name' in data:
                return data

    # 3. Let the JSON decoder find the end of the object starting at the first
    #    '{'. Unlike counting braces, this is not confused by braces that
    #    appear inside string values.
    start_idx = response.find('{')
    if start_idx != -1:
        try:
            data, _ = json.JSONDecoder().raw_decode(response, start_idx)
        except ValueError:
            data = None
        if isinstance(data, dict):
            return data

    # 4. Free-text refusals.
    no_call_patterns = [
        'no function', 'cannot', 'لا يمكن', 'لا توجد',
        'null', 'none', 'not applicable', 'غير متاح',
        'لا يوجد', 'no matching', 'no relevant',
    ]
    response_lower = response.lower()
    if any(p in response_lower for p in no_call_patterns):
        return {"name": None, "arguments": {}}

    return None
|
|
|
|
|
|
|
|
def normalize_arabic(text: str) -> str:
    """Normalize Arabic text for comparison.

    Removes diacritics, unifies alef variants, maps ta marbuta to ha and alef
    maksura to ya, then lower-cases and strips surrounding whitespace.

    Args:
        text: Value to normalize; falsy values yield an empty string.

    Returns:
        The normalized string.
    """
    if not text:
        return ""
    text = str(text)

    # Code points are written as escapes so the rules survive any re-encoding
    # of this source file.
    # Strip tashkeel (U+064B..U+065F) and superscript alef (U+0670).
    text = re.sub(r'[\u064B-\u065F\u0670]', '', text)
    # Alef with hamza below / hamza above / madda -> bare alef.
    text = re.sub('[\u0625\u0623\u0622\u0627]', '\u0627', text)
    # Ta marbuta -> ha.
    text = text.replace('\u0629', '\u0647')
    # Alef maksura -> ya.
    text = text.replace('\u0649', '\u064A')

    return text.lower().strip()
|
|
|
|
|
|
|
|
def compare_values(pred_val, exp_val) -> bool:
    """Compare a predicted argument value with the expected one.

    Values match when, after Arabic normalization of their string forms, they
    are equal; or when both convert to equal floats; or when one non-empty
    normalized string contains the other.

    Args:
        pred_val: Value produced by the model.
        exp_val: Ground-truth value.

    Returns:
        ``True`` when the values are considered equivalent.
    """
    pred_str = normalize_arabic(str(pred_val))
    exp_str = normalize_arabic(str(exp_val))

    if pred_str == exp_str:
        return True

    # Numeric equivalence, e.g. "5" vs 5.0.
    try:
        if float(pred_val) == float(exp_val):
            return True
    except (TypeError, ValueError, OverflowError):
        pass

    # Partial match. Both sides must be non-empty: the empty string is a
    # substring of everything, so an empty prediction would otherwise match
    # any expected value.
    if pred_str and exp_str and (pred_str in exp_str or exp_str in pred_str):
        return True

    return False
|
|
|
|
|
|
|
|
def evaluate_sample(model_id: str, sample: Dict) -> float:
    """Evaluate one sample against one model.

    Builds the prompt, queries the model, parses the response and scores it:

    * ``irrelevance`` samples score 1.0 when the model gave a non-empty answer
      that names no function (or could not be parsed as a call), else 0.0.
    * Other samples score 0.0 when the function name does not match
      (substring matches after normalization are accepted); otherwise the
      fraction of expected arguments whose values match. A matching name with
      no usable predicted arguments scores 0.5.

    When the ground truth has a ``calls`` list, only its first call is scored.

    Args:
        model_id: Repository id of the model to query.
        sample: Dict with ``query_ar``, ``functions``, ``category`` and
            ``ground_truth``.

    Returns:
        A score between 0.0 and 1.0.
    """
    query = sample.get('query_ar', '')
    functions = sample.get('functions', [])
    category = sample.get('category', '')
    ground_truth = sample.get('ground_truth')

    prompt = create_prompt(query, functions)
    response = call_model(model_id, prompt)
    parsed = parse_response(response)

    if category == 'irrelevance':
        # No output at all means the model (or the API) failed; that is not a
        # deliberate refusal and must not earn full credit.
        if not response or not response.strip():
            return 0.0
        if parsed is None:
            return 1.0
        if parsed.get('name') is None or parsed.get('name') == 'null':
            return 1.0
        return 0.0

    if not parsed:
        return 0.0

    if not ground_truth:
        return 0.0

    expected = ground_truth
    if isinstance(ground_truth, dict) and 'calls' in ground_truth:
        calls = ground_truth.get('calls') or []
        if calls:
            expected = calls[0]

    # Anything other than a call dict cannot be compared.
    if not isinstance(expected, dict):
        return 0.0

    # A null predicted name is "no call"; do not let str(None) == "None"
    # take part in name matching.
    raw_pred_name = parsed.get('name', '')
    if raw_pred_name is None:
        return 0.0

    pred_name = normalize_arabic(str(raw_pred_name))
    exp_name = normalize_arabic(str(expected.get('name', '')))

    if not pred_name or not exp_name:
        return 0.0

    if pred_name != exp_name:
        # Accept partial name matches in either direction.
        if pred_name not in exp_name and exp_name not in pred_name:
            return 0.0

    pred_args = parsed.get('arguments', {}) or {}
    exp_args = expected.get('arguments', {}) or {}

    if not exp_args:
        return 1.0

    # Right function but no usable arguments (missing, empty or not an object).
    if not pred_args or not isinstance(pred_args, dict):
        return 0.5

    matched = 0
    total = len(exp_args)

    for key, exp_val in exp_args.items():
        # Exact key first.
        if key in pred_args:
            if compare_values(pred_args[key], exp_val):
                matched += 1
                continue

        # Then keys that are equal after Arabic normalization.
        norm_key = normalize_arabic(key)
        for pred_key, pred_val in pred_args.items():
            if normalize_arabic(pred_key) == norm_key:
                if compare_values(pred_val, exp_val):
                    matched += 1
                    break

    return matched / total if total > 0 else 1.0
|
|
|
|
|
|
|
|
def _rank_results(results):
    """Return ``results`` sorted by descending 'overall' with 1-based 'rank' set on each row."""
    ranked = sorted(results, key=lambda x: x['overall'], reverse=True)
    for i, r in enumerate(ranked, 1):
        r['rank'] = i
    return ranked


def run_evaluation():
    """Evaluate every model that has no cached result and publish the ranking.

    Cached rows are loaded first; models whose ``model_id`` is already cached
    are skipped. After each newly evaluated model the results are saved and
    ``LEADERBOARD_DATA`` is refreshed, so partial progress is visible and
    persisted. ``EVALUATION_STATUS`` is updated throughout.
    """
    global LEADERBOARD_DATA, EVALUATION_STATUS

    EVALUATION_STATUS["current"] = "Loading cached results..."
    cached_results = load_cached_results()

    evaluated_models = {r['model_id'] for r in cached_results}
    print(f"Already evaluated: {len(evaluated_models)} models")

    models_to_run = [m for m in MODELS_TO_EVALUATE if m['model_id'] not in evaluated_models]

    if not models_to_run:
        EVALUATION_STATUS["current"] = "All models evaluated (from cache)"
        EVALUATION_STATUS["progress"] = len(MODELS_TO_EVALUATE)
        LEADERBOARD_DATA = _rank_results(cached_results)
        print("All models loaded from cache - no new evaluation needed")
        return

    EVALUATION_STATUS["current"] = f"Loading dataset ({len(models_to_run)} new models to evaluate)..."
    samples = load_evaluation_dataset()

    if not samples:
        EVALUATION_STATUS["current"] = "Failed to load dataset"
        # Still show whatever was cached.
        if cached_results:
            LEADERBOARD_DATA = _rank_results(cached_results)
        return

    results = list(cached_results)
    total_models = len(MODELS_TO_EVALUATE)
    # Count only listed models as done, so progress never exceeds the total
    # when the cache also holds models that are no longer listed.
    already_done = total_models - len(models_to_run)

    weights = {"simple": 0.15, "multiple": 0.10, "parallel": 0.10,
               "parallel_multiple": 0.10, "irrelevance": 0.15, "dialect_handling": 0.15}
    weight_sum = sum(weights.values())

    for idx, model_config in enumerate(models_to_run):
        model_name = model_config['model']
        model_id = model_config['model_id']

        evaluated_count = already_done + idx + 1
        EVALUATION_STATUS["current"] = f"Evaluating {model_name}... ({evaluated_count}/{total_models})"
        EVALUATION_STATUS["progress"] = evaluated_count

        category_scores = {}
        category_counts = {}
        failed_samples = 0

        for sample in samples:
            cat = sample.get('category', 'simple')

            try:
                score = evaluate_sample(model_id, sample)
            except Exception:
                # One broken sample must not abort the model's run; it counts
                # as a zero and is reported once per model below.
                score = 0.0
                failed_samples += 1

            category_scores[cat] = category_scores.get(cat, 0.0) + score
            category_counts[cat] = category_counts.get(cat, 0) + 1
            time.sleep(0.5)  # pause between inference requests

        if failed_samples:
            print(f"{model_name}: {failed_samples} samples raised during evaluation and scored 0")

        # Per-category accuracy in percent.
        scores = {cat: round((category_scores[cat] / category_counts[cat]) * 100, 1)
                  for cat in category_scores if category_counts[cat] > 0}

        # Weighted average; categories with no samples contribute 0.
        overall = sum(scores.get(c, 0) * w for c, w in weights.items()) / weight_sum

        results.append({
            "model": model_name,
            "model_id": model_id,
            "organization": model_config['organization'],
            "params": model_config['params'],
            "type": model_config['type'],
            "overall": round(overall, 1),
            "simple": scores.get('simple', 0),
            "multiple": scores.get('multiple', 0),
            "parallel": scores.get('parallel', 0),
            "parallel_multiple": scores.get('parallel_multiple', 0),
            "irrelevance": scores.get('irrelevance', 0),
            "dialect_handling": scores.get('dialect_handling', 0),
            "status": "completed",
        })

        # Persist and publish after every model so progress survives a restart.
        save_cached_results(results)
        LEADERBOARD_DATA = _rank_results(results)

    EVALUATION_STATUS["current"] = "Evaluation Complete"
    EVALUATION_STATUS["progress"] = total_models
|
|
|
|
|
|
|
|
def get_leaderboard_df():
    """Build the leaderboard table as a pandas DataFrame.

    Before any results exist, every configured model is listed in
    configuration order with placeholder scores. Otherwise one row per result
    is produced, with medal icons on the first three ranks and scores
    formatted as percentages.

    Returns:
        A DataFrame with columns Rank, Model, Org, Size, Type, Overall,
        Simple, Multiple, Parallel, Irrelevance and Dialect.
    """
    # Read the global once: the evaluation thread may rebind it at any time.
    rows_src = LEADERBOARD_DATA

    if not rows_src:
        placeholder = "—"
        data = [
            {
                "Rank": i,
                "Model": m["model"],
                "Org": m["organization"],
                "Size": m["params"],
                "Type": m["type"],
                "Overall": placeholder,
                "Simple": placeholder,
                "Multiple": placeholder,
                "Parallel": placeholder,
                "Irrelevance": placeholder,
                "Dialect": placeholder,
            }
            for i, m in enumerate(MODELS_TO_EVALUATE, 1)
        ]
        return pd.DataFrame(data)

    medals = {1: "🥇", 2: "🥈", 3: "🥉"}
    data = []
    for r in rows_src:
        rank = r['rank']
        data.append({
            "Rank": f"{medals[rank]} {rank}" if rank in medals else rank,
            "Model": r['model'],
            "Org": r['organization'],
            "Size": r['params'],
            "Type": r['type'],
            "Overall": f"{r['overall']}%",
            "Simple": f"{r['simple']}%",
            "Multiple": f"{r['multiple']}%",
            "Parallel": f"{r['parallel']}%",
            "Irrelevance": f"{r['irrelevance']}%",
            "Dialect": f"{r['dialect_handling']}%",
        })

    return pd.DataFrame(data)
|
|
|
|
|
|
|
|
def get_status_html():
    """Render the current evaluation status as an HTML progress panel.

    Reads ``progress``, ``total`` and ``current`` from ``EVALUATION_STATUS``.

    Returns:
        An HTML snippet with the status text, an "n/total models" counter and
        a progress bar.
    """
    progress = EVALUATION_STATUS["progress"]
    total = EVALUATION_STATUS["total"]
    current = EVALUATION_STATUS["current"]
    pct = (progress / total) * 100 if total > 0 else 0
    # Keep the bar inside its track even if progress is out of range.
    pct = max(0, min(100, pct))

    return f"""
    <div style="background: rgba(102,126,234,0.1); border: 1px solid rgba(102,126,234,0.3); border-radius: 12px; padding: 20px; margin: 16px 0;">
        <div style="display: flex; justify-content: space-between; align-items: center; margin-bottom: 12px;">
            <span style="color: #667eea; font-weight: 600;">🔄 {current}</span>
            <span style="color: #a0a0a0;">{progress}/{total} models</span>
        </div>
        <div style="background: rgba(255,255,255,0.1); border-radius: 8px; height: 8px; overflow: hidden;">
            <div style="background: linear-gradient(90deg, #667eea, #764ba2); height: 100%; width: {pct}%; transition: width 0.5s ease;"></div>
        </div>
    </div>
    """
|
|
|
|
|
|
|
|
def create_app():
    """Create the Gradio app.

    Builds the header, the statistic cards, the status panel and the
    Leaderboard / Categories / About tabs. If no leaderboard data is loaded
    yet, a daemon thread running ``run_evaluation`` is started.

    Returns:
        The ``gr.Blocks`` application.
    """
    with gr.Blocks(title="AFCL - Arabic Function Calling Leaderboard", css=CUSTOM_CSS, theme=gr.themes.Base()) as app:

        # Header
        gr.HTML("""
        <div style="background: linear-gradient(135deg, #667eea 0%, #764ba2 100%); border-radius: 16px; padding: 40px; margin-bottom: 24px; text-align: center;">
            <h1 style="color: white; font-size: 2.5rem; margin: 0; font-weight: 700;">
                🏆 Arabic Function Calling Leaderboard
            </h1>
            <p style="color: rgba(255,255,255,0.9); font-size: 1.1rem; margin-top: 8px;">
                لوحة تقييم استدعاء الدوال بالعربية
            </p>
            <p style="color: rgba(255,255,255,0.7); font-size: 0.95rem; margin-top: 16px; max-width: 600px; margin-left: auto; margin-right: auto;">
                Comprehensive benchmark evaluating LLMs on Arabic function calling across 10 categories including dialects
            </p>
        </div>
        """)

        # Statistic cards
        with gr.Row():
            gr.HTML(f"""
            <div style="background: rgba(255,255,255,0.03); border: 1px solid rgba(255,255,255,0.1); border-radius: 12px; padding: 24px; text-align: center; flex: 1;">
                <div style="font-size: 2.5rem; font-weight: 700; background: linear-gradient(135deg, #667eea, #764ba2); -webkit-background-clip: text; -webkit-text-fill-color: transparent;">{len(MODELS_TO_EVALUATE)}</div>
                <div style="color: #a0a0a0; font-size: 0.9rem; margin-top: 8px;">Models</div>
            </div>
            """)
            gr.HTML("""
            <div style="background: rgba(255,255,255,0.03); border: 1px solid rgba(255,255,255,0.1); border-radius: 12px; padding: 24px; text-align: center; flex: 1;">
                <div style="font-size: 2.5rem; font-weight: 700; background: linear-gradient(135deg, #22c55e, #16a34a); -webkit-background-clip: text; -webkit-text-fill-color: transparent;">1,470</div>
                <div style="color: #a0a0a0; font-size: 0.9rem; margin-top: 8px;">Total Samples</div>
            </div>
            """)
            gr.HTML("""
            <div style="background: rgba(255,255,255,0.03); border: 1px solid rgba(255,255,255,0.1); border-radius: 12px; padding: 24px; text-align: center; flex: 1;">
                <div style="font-size: 2.5rem; font-weight: 700; background: linear-gradient(135deg, #f59e0b, #d97706); -webkit-background-clip: text; -webkit-text-fill-color: transparent;">10</div>
                <div style="color: #a0a0a0; font-size: 0.9rem; margin-top: 8px;">Categories</div>
            </div>
            """)
            gr.HTML("""
            <div style="background: rgba(255,255,255,0.03); border: 1px solid rgba(255,255,255,0.1); border-radius: 12px; padding: 24px; text-align: center; flex: 1;">
                <div style="font-size: 2.5rem; font-weight: 700; background: linear-gradient(135deg, #ec4899, #be185d); -webkit-background-clip: text; -webkit-text-fill-color: transparent;">3</div>
                <div style="color: #a0a0a0; font-size: 0.9rem; margin-top: 8px;">Dialects</div>
            </div>
            """)

        # Evaluation status panel, refreshed together with the table.
        status_html = gr.HTML(get_status_html())

        with gr.Tabs():
            with gr.TabItem("🏆 Leaderboard"):
                leaderboard_table = gr.DataFrame(
                    value=get_leaderboard_df(),
                    interactive=False,
                    wrap=True,
                )

                with gr.Row():
                    refresh_btn = gr.Button("🔄 Refresh Results", variant="primary", size="lg")

                def refresh():
                    return get_leaderboard_df(), get_status_html()

                refresh_btn.click(refresh, outputs=[leaderboard_table, status_html])

            with gr.TabItem("📊 Categories"):
                gr.HTML("""
                <div style="padding: 24px;">
                    <h3 style="color: #667eea; margin-bottom: 24px;">Evaluation Categories</h3>
                    <div style="display: grid; grid-template-columns: repeat(auto-fit, minmax(300px, 1fr)); gap: 16px;">
                        <div style="background: rgba(255,255,255,0.03); border: 1px solid rgba(255,255,255,0.1); border-radius: 12px; padding: 20px;">
                            <h4 style="color: #22c55e; margin: 0;">Simple</h4>
                            <p style="color: #a0a0a0; margin: 8px 0 0 0; font-size: 0.9rem;">Single function, single call scenarios</p>
                        </div>
                        <div style="background: rgba(255,255,255,0.03); border: 1px solid rgba(255,255,255,0.1); border-radius: 12px; padding: 20px;">
                            <h4 style="color: #3b82f6; margin: 0;">Multiple</h4>
                            <p style="color: #a0a0a0; margin: 8px 0 0 0; font-size: 0.9rem;">Select correct function from 2-4 options</p>
                        </div>
                        <div style="background: rgba(255,255,255,0.03); border: 1px solid rgba(255,255,255,0.1); border-radius: 12px; padding: 20px;">
                            <h4 style="color: #f59e0b; margin: 0;">Parallel</h4>
                            <p style="color: #a0a0a0; margin: 8px 0 0 0; font-size: 0.9rem;">Multiple calls of same function</p>
                        </div>
                        <div style="background: rgba(255,255,255,0.03); border: 1px solid rgba(255,255,255,0.1); border-radius: 12px; padding: 20px;">
                            <h4 style="color: #ec4899; margin: 0;">Parallel Multiple</h4>
                            <p style="color: #a0a0a0; margin: 8px 0 0 0; font-size: 0.9rem;">Multiple functions, multiple calls</p>
                        </div>
                        <div style="background: rgba(255,255,255,0.03); border: 1px solid rgba(255,255,255,0.1); border-radius: 12px; padding: 20px;">
                            <h4 style="color: #ef4444; margin: 0;">Irrelevance</h4>
                            <p style="color: #a0a0a0; margin: 8px 0 0 0; font-size: 0.9rem;">Correctly reject when no function applies</p>
                        </div>
                        <div style="background: rgba(255,255,255,0.03); border: 1px solid rgba(255,255,255,0.1); border-radius: 12px; padding: 20px;">
                            <h4 style="color: #8b5cf6; margin: 0;">Dialect Handling</h4>
                            <p style="color: #a0a0a0; margin: 8px 0 0 0; font-size: 0.9rem;">Egyptian 🇪🇬 / Gulf 🇸🇦 / Levantine 🇱🇧</p>
                        </div>
                    </div>
                </div>
                """)

            with gr.TabItem("📖 About"):
                gr.HTML("""
                <div style="padding: 24px; max-width: 800px;">
                    <h3 style="color: #667eea;">About AFCL</h3>
                    <p style="color: #c0c0c0; line-height: 1.8;">
                        The <strong>Arabic Function Calling Leaderboard (AFCL)</strong> is the first comprehensive benchmark
                        for evaluating LLMs on function calling capabilities in Arabic. It tests models across Modern Standard
                        Arabic (MSA) and three major dialects: Egyptian, Gulf, and Levantine.
                    </p>

                    <h4 style="color: #22c55e; margin-top: 24px;">Dataset</h4>
                    <p style="color: #c0c0c0;">
                        📊 <a href="https://huggingface.co/datasets/HeshamHaroon/Arabic_Function_Calling" style="color: #667eea;">HeshamHaroon/Arabic_Function_Calling</a>
                    </p>

                    <h4 style="color: #f59e0b; margin-top: 24px;">Scoring</h4>
                    <p style="color: #c0c0c0; line-height: 1.8;">
                        Models are scored using AST-based matching with Arabic text normalization.
                        The overall score is a weighted average across all categories, with emphasis on
                        irrelevance detection and dialect handling.
                    </p>

                    <h4 style="color: #ec4899; margin-top: 24px;">Citation</h4>
                    <pre style="background: rgba(255,255,255,0.05); padding: 16px; border-radius: 8px; color: #a0a0a0; overflow-x: auto;">
@misc{afcl2024,
  title={Arabic Function Calling Leaderboard},
  author={Hesham Haroon},
  year={2024},
  url={https://huggingface.co/spaces/HeshamHaroon/Arabic-Function-Calling-Leaderboard}
}</pre>
                </div>
                """)

        # Footer
        gr.HTML("""
        <div style="text-align: center; padding: 24px; margin-top: 24px; border-top: 1px solid rgba(255,255,255,0.1);">
            <p style="color: #666; font-size: 0.9rem;">
                Built for the Arabic NLP Community | بُني لمجتمع معالجة اللغة العربية
            </p>
        </div>
        """)

    # Start the evaluation in the background so building the UI never blocks on it.
    if not LEADERBOARD_DATA:
        Thread(target=run_evaluation, daemon=True).start()

    return app
|
|
|
|
|
|
|
|
# Built at import time; create_app() also starts the background evaluation
# thread when no leaderboard data is loaded yet.
app = create_app()

# Launch the Gradio server only when this file is executed as a script.
if __name__ == "__main__":
    app.launch()
|
|
|