"""
Arabic Function Calling Leaderboard (AFCL)
==========================================
A Gradio-based leaderboard that evaluates LLMs on Arabic function calling.
Evaluation runs on HuggingFace Space infrastructure.
"""
import gradio as gr
import pandas as pd
import json
import os
import re
import time
import requests
from pathlib import Path
from typing import Dict, List, Optional
from threading import Thread
from datasets import load_dataset
# Constants
TITLE = "🏆 Arabic Function Calling Leaderboard"
TITLE_AR = "🏆 لوحة تقييم استدعاء الدوال بالعربية"
DESCRIPTION = """
The **Arabic Function Calling Leaderboard (AFCL)** evaluates Large Language Models on their ability to understand Arabic queries and generate appropriate function calls.
**لوحة تقييم استدعاء الدوال بالعربية** تقيّم نماذج اللغة الكبيرة على قدرتها على فهم الاستعلامات العربية وإنشاء استدعاءات الدوال المناسبة.
"""
# Models to evaluate
MODELS_TO_EVALUATE = [
{"model": "Jais-30B-Chat", "model_id": "inceptionai/jais-30b-chat-v3", "organization": "Inception AI"},
{"model": "ALLaM-7B-Instruct", "model_id": "sdaia/allam-1-7b-instruct", "organization": "SDAIA"},
{"model": "SILMA-9B-Instruct", "model_id": "silma-ai/SILMA-9B-Instruct-v1.0", "organization": "Silma AI"},
{"model": "AceGPT-13B-Chat", "model_id": "FreedomIntelligence/AceGPT-13B-chat", "organization": "FreedomIntelligence"},
{"model": "BLOOMZ-7B1", "model_id": "bigscience/bloomz-7b1", "organization": "BigScience"},
{"model": "Aya-Expanse-8B", "model_id": "CohereForAI/aya-expanse-8b", "organization": "Cohere For AI"},
{"model": "Qwen2.5-7B-Instruct", "model_id": "Qwen/Qwen2.5-7B-Instruct", "organization": "Alibaba Qwen"},
{"model": "Llama-3.1-8B-Instruct", "model_id": "meta-llama/Llama-3.1-8B-Instruct", "organization": "Meta"},
{"model": "Gemma-2-9B-IT", "model_id": "google/gemma-2-9b-it", "organization": "Google"},
{"model": "Mistral-7B-Instruct", "model_id": "mistralai/Mistral-7B-Instruct-v0.3", "organization": "Mistral AI"},
{"model": "Phi-3-Mini-Instruct", "model_id": "microsoft/Phi-3-mini-4k-instruct", "organization": "Microsoft"},
]
# Global state
LEADERBOARD_DATA = []
EVALUATION_STATUS = "Not started"
def load_evaluation_dataset():
"""Load the Arabic FC dataset from HuggingFace."""
try:
dataset = load_dataset("HeshamHaroon/Arabic_Function_Calling", split="test")
samples = []
for item in dataset:
sample = {
'id': item['id'],
'query_ar': item['query_ar'],
'functions': json.loads(item['functions']) if item['functions'] else [],
'ground_truth': json.loads(item['ground_truth']) if item['ground_truth'] else None,
'category': item['category'],
}
samples.append(sample)
return samples
except Exception as e:
print(f"Error loading dataset: {e}")
return []
def create_prompt(query: str, functions: List[Dict]) -> str:
    """Create the evaluation prompt for one sample."""
    func_desc = ("You are a function calling AI. Given the user query and available "
                 "functions, respond with a JSON function call.\n\nAvailable functions:\n")
    for f in functions:
        # Include the parameter schema (when present) so the model can see
        # which argument names it is expected to fill in.
        params = json.dumps(f.get('parameters', {}), ensure_ascii=False)
        func_desc += f"- {f.get('name')}: {f.get('description', '')} | parameters: {params}\n"
    return f"""{func_desc}
User Query (Arabic): {query}
Respond ONLY with a JSON object:
{{"name": "function_name", "arguments": {{"param1": "value1"}}}}
If no function should be called:
{{"name": null, "arguments": {{}}}}
JSON Response:"""
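# Illustrative example (hypothetical function and values): given a "get_weather"
# function and the query "ما حالة الطقس في الرياض؟" ("What is the weather in
# Riyadh?"), the expected response is:
#   {"name": "get_weather", "arguments": {"city": "الرياض"}}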
def call_model(model_id: str, prompt: str) -> str:
    """Call a model via the HuggingFace Inference API."""
    token = os.getenv("HF_TOKEN", "")
    headers = {"Authorization": f"Bearer {token}"}
    url = f"https://api-inference.huggingface.co/models/{model_id}"
    payload = {
        "inputs": prompt,
        # return_full_text=False so only the generated continuation comes back,
        # not the prompt echoed in front of it.
        "parameters": {"max_new_tokens": 200, "temperature": 0.1,
                       "return_full_text": False}
    }
    try:
        response = requests.post(url, headers=headers, json=payload, timeout=60)
        if response.status_code == 503:
            # 503 means the model is still loading on the backend; retry once.
            time.sleep(20)
            response = requests.post(url, headers=headers, json=payload, timeout=60)
        result = response.json()
        if isinstance(result, list) and result:
            return result[0].get("generated_text", "")
        return str(result)
    except (requests.RequestException, ValueError):
        return ""
def parse_response(response: str) -> Optional[Dict]:
    """Parse a function call from the model response."""
    if not response:
        return None
    # First try: the whole response is valid JSON.
    try:
        return json.loads(response.strip())
    except json.JSONDecodeError:
        pass
    # Second try: extract a JSON object containing a "name" key. The pattern
    # allows one level of nesting so that calls with an "arguments" object
    # (e.g. {"name": ..., "arguments": {...}}) still match.
    match = re.search(r'\{[^{}]*"name"(?:[^{}]|\{[^{}]*\})*\}', response)
    if match:
        try:
            return json.loads(match.group())
        except json.JSONDecodeError:
            pass
    # Last resort: treat explicit refusals ("null", "none", "لا يمكن" / "cannot")
    # as a deliberate no-call answer.
    if any(x in response.lower() for x in ['null', 'none', 'لا يمكن']):
        return {"name": None}
    return None
def evaluate_sample(model_id: str, sample: Dict) -> float:
    """Evaluate a single sample; returns a score in [0, 1]."""
    query = sample.get('query_ar', '')
    functions = sample.get('functions', [])
    category = sample.get('category', '')
    ground_truth = sample.get('ground_truth')
    prompt = create_prompt(query, functions)
    response = call_model(model_id, prompt)
    parsed = parse_response(response)
    # Irrelevance samples are scored on whether the model correctly declined to call.
    if category == 'irrelevance':
        return 1.0 if (parsed is None or parsed.get('name') is None) else 0.0
    if not ground_truth or not parsed:
        return 0.0
    # Normalize the ground truth to a single expected call: it may be a dict,
    # a dict wrapping a "calls" list, or a bare list of calls.
    if isinstance(ground_truth, dict):
        expected = ground_truth.get('calls', [ground_truth])[0]
    elif isinstance(ground_truth, list) and ground_truth:
        expected = ground_truth[0]
    else:
        return 0.0
    # The function name must match (case-insensitive); otherwise score 0.
    if str(parsed.get('name', '')).lower() != str(expected.get('name', '')).lower():
        return 0.0
    pred_args = parsed.get('arguments', {}) or {}
    exp_args = expected.get('arguments', {}) or {}
    if not exp_args:
        return 1.0
    # Partial credit: fraction of expected arguments matched by string comparison.
    matched = sum(1 for k, v in exp_args.items()
                  if str(pred_args.get(k, '')).lower() == str(v).lower())
    return matched / len(exp_args)
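# Illustrative partial-credit example (hypothetical values): if the expected call
# is {"name": "book_flight", "arguments": {"from": "القاهرة", "to": "دبي"}} and
# the model predicts the correct name but matches only "from", the score is 0.5.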
def run_evaluation():
"""Run full evaluation on all models."""
global LEADERBOARD_DATA, EVALUATION_STATUS
EVALUATION_STATUS = "Loading dataset..."
samples = load_evaluation_dataset()
if not samples:
EVALUATION_STATUS = "Failed to load dataset"
return
results = []
total_models = len(MODELS_TO_EVALUATE)
for idx, model_config in enumerate(MODELS_TO_EVALUATE):
model_name = model_config['model']
model_id = model_config['model_id']
EVALUATION_STATUS = f"Evaluating {model_name} ({idx+1}/{total_models})..."
category_scores = {}
category_counts = {}
for sample in samples:
cat = sample.get('category', 'simple')
if cat not in category_scores:
category_scores[cat] = 0.0
category_counts[cat] = 0
            try:
                score = evaluate_sample(model_id, sample)
                category_scores[cat] += score
            except Exception:
                # A failed sample simply contributes 0 to its category.
                pass
            category_counts[cat] += 1
            time.sleep(0.5)  # Rate limiting between Inference API calls
        # Per-category accuracy as a percentage
        scores = {cat: round((category_scores[cat] / category_counts[cat]) * 100, 1)
                  for cat in category_scores if category_counts[cat] > 0}
        # Weighted overall score; the weights are normalized by their sum,
        # so they need not add up to 1.
        weights = {"simple": 0.15, "multiple": 0.10, "parallel": 0.10,
                   "parallel_multiple": 0.10, "irrelevance": 0.15, "dialect_handling": 0.15}
        overall = sum(scores.get(c, 0) * w for c, w in weights.items()) / sum(weights.values())
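        # Example: with these weights (sum 0.75), a model scoring 90.0 on
        # "simple" and 0 elsewhere gets 90.0 * 0.15 / 0.75 = 18.0 overall.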
results.append({
"model": model_name,
"model_id": model_id,
"organization": model_config['organization'],
"overall": round(overall, 1),
"simple": scores.get('simple', 0),
"multiple": scores.get('multiple', 0),
"parallel": scores.get('parallel', 0),
"parallel_multiple": scores.get('parallel_multiple', 0),
"irrelevance": scores.get('irrelevance', 0),
"dialect_handling": scores.get('dialect_handling', 0),
"status": "completed"
})
# Sort and rank
results = sorted(results, key=lambda x: x['overall'], reverse=True)
for i, r in enumerate(results, 1):
r['rank'] = i
LEADERBOARD_DATA = results
EVALUATION_STATUS = f"Completed - {len(results)} models evaluated"
def get_leaderboard_df():
"""Get leaderboard as DataFrame."""
if not LEADERBOARD_DATA:
# Return empty with pending status
data = [{"rank": i+1, "model": m["model"], "organization": m["organization"],
"overall": "-", "status": "⏳ Pending"}
for i, m in enumerate(MODELS_TO_EVALUATE)]
return pd.DataFrame(data)
df = pd.DataFrame(LEADERBOARD_DATA)
cols = ["rank", "model", "organization", "overall", "simple", "multiple",
"parallel", "parallel_multiple", "irrelevance", "dialect_handling"]
df = df[[c for c in cols if c in df.columns]]
    # Append "%" to the numeric score columns; the integer rank stays as-is.
    for col in df.columns:
        if col != 'rank' and df[col].dtype.kind in 'if':
            df[col] = df[col].apply(lambda x: f"{x:.1f}%")
return df
def create_app():
"""Create the Gradio app."""
with gr.Blocks(title="Arabic FC Leaderboard", theme=gr.themes.Soft()) as app:
gr.Markdown(f"""
<div style="text-align: center; padding: 20px; background: linear-gradient(135deg, #1a5f2a 0%, #2d8f4a 100%); border-radius: 12px; color: white; margin-bottom: 20px;">
<h1>{TITLE_AR}</h1>
<h2>{TITLE}</h2>
<p>Evaluating LLMs on Arabic Function Calling | تقييم نماذج اللغة على استدعاء الدوال بالعربية</p>
</div>
""")
gr.Markdown(DESCRIPTION)
with gr.Row():
gr.Markdown(f"""
<div style="text-align: center; padding: 15px; background: #f5f5f5; border-radius: 8px;">
<div style="font-size: 2rem; font-weight: bold; color: #1a5f2a;">{len(MODELS_TO_EVALUATE)}</div>
<div>Models | النماذج</div>
</div>
""")
gr.Markdown("""
<div style="text-align: center; padding: 15px; background: #f5f5f5; border-radius: 8px;">
<div style="font-size: 2rem; font-weight: bold; color: #1a5f2a;">147</div>
<div>Test Samples | عينات</div>
</div>
""")
gr.Markdown("""
<div style="text-align: center; padding: 15px; background: #f5f5f5; border-radius: 8px;">
<div style="font-size: 2rem; font-weight: bold; color: #1a5f2a;">10</div>
<div>Categories | الفئات</div>
</div>
""")
status_text = gr.Markdown(f"**Status:** {EVALUATION_STATUS}")
with gr.Tabs():
with gr.TabItem("🏆 Leaderboard"):
leaderboard_df = gr.DataFrame(
value=get_leaderboard_df(),
interactive=False
)
def refresh_leaderboard():
return get_leaderboard_df(), f"**Status:** {EVALUATION_STATUS}"
refresh_btn = gr.Button("🔄 Refresh | تحديث")
refresh_btn.click(refresh_leaderboard, outputs=[leaderboard_df, status_text])
with gr.TabItem("📊 About"):
                gr.Markdown("""
                ## Evaluation Categories
                | Category | Samples | Description |
                |----------|---------|-------------|
                | Simple | ~20 | A single call to one provided function |
                | Multiple | ~20 | Pick the right function among several candidates |
                | Parallel | ~20 | Several calls to the same function in one query |
                | Parallel Multiple | ~20 | Several calls across several functions |
                | Irrelevance | ~20 | No function should be called |
                | Dialect | ~15 | Queries in Egyptian, Gulf, and Levantine dialects |
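                ## Scoring
                The predicted function name must match the ground truth (case-insensitive); arguments earn partial credit as the fraction of expected arguments matched. An illustrative correct response (hypothetical function name and value):
                ```json
                {"name": "get_weather", "arguments": {"city": "الرياض"}}
                ```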
## Dataset
📊 [HeshamHaroon/Arabic_Function_Calling](https://huggingface.co/datasets/HeshamHaroon/Arabic_Function_Calling)
""")
gr.Markdown("""
---
<div style="text-align: center; color: #666;">
Built for the Arabic NLP community | بُني لمجتمع معالجة اللغة العربية
</div>
""")
# Start evaluation in background
if not LEADERBOARD_DATA:
Thread(target=run_evaluation, daemon=True).start()
return app
app = create_app()
if __name__ == "__main__":
app.launch()