HeshamHaroon's picture
Initial release: Arabic Function Calling Leaderboard
566d03e verified
"""
Data Loader
===========
Load benchmark data and evaluation results.
"""
import json
import os
from typing import Dict, List, Optional
from pathlib import Path
from datasets import load_dataset
# Weight of each category in the overall score.
# The eight positively weighted categories sum to 1.0.
CATEGORY_WEIGHTS = {
    **{
        "simple": 0.15,
        "multiple": 0.10,
        "parallel": 0.10,
        "parallel_multiple": 0.10,
        "irrelevance": 0.15,
        "dialect_handling": 0.15,
        "multi_turn": 0.15,
        "native_arabic": 0.10,
    },
    # Programming categories are listed with weight 0.0, so they contribute
    # nothing to the weighted overall score.
    **dict.fromkeys(("java", "javascript", "rest", "sql"), 0.0),
}
def _decode_json_field(raw, default):
    """Decode a JSON-encoded dataset field, returning ``default`` when it is empty."""
    if not raw:
        return default
    # A field that is already decoded (e.g. a list or dict) is passed through
    # instead of crashing json.loads with a TypeError.
    if isinstance(raw, (str, bytes, bytearray)):
        return json.loads(raw)
    return raw


def load_benchmark(
    dataset_name: str = "HeshamHaroon/Arabic_Function_Calling",
    split: str = "test",
    category: Optional[str] = None
) -> List[Dict]:
    """
    Load benchmark samples from HuggingFace dataset.

    If the dataset cannot be loaded, the error is printed and the samples
    stored under the ``samples`` key of the local
    ``arabic_fc_dataset/checkpoint.json`` file are returned instead. Those
    fallback samples are returned as stored in the file, without the field
    decoding applied to dataset rows.

    Args:
        dataset_name: HuggingFace dataset repository
        split: Dataset split ('train' or 'test')
        category: Optional category filter; ``None`` disables filtering

    Returns:
        List of sample dictionaries

    Raises:
        Exception: The original loading error is re-raised when the local
            fallback file does not exist.
    """
    try:
        dataset = load_dataset(dataset_name, split=split)
    except Exception as e:
        # Best-effort fallback: report the failure, then try the local checkpoint.
        print(f"Error loading dataset: {e}")
        local_path = Path(__file__).parent.parent.parent / "arabic_fc_dataset" / "checkpoint.json"
        if not local_path.exists():
            raise
        with open(local_path, 'r', encoding='utf-8') as f:
            data = json.load(f)
        samples = data.get('samples', [])
        # Same "None means no filter" rule as the main path below.
        if category is not None:
            samples = [s for s in samples if s.get('category') == category]
        return samples

    samples = []
    for item in dataset:
        # Filter before decoding so rows of other categories are never parsed.
        if category is not None and item['category'] != category:
            continue
        samples.append({
            'id': item['id'],
            'query_en': item['query_en'],
            'query_ar': item['query_ar'],
            'functions': _decode_json_field(item['functions'], []),
            'ground_truth': _decode_json_field(item['ground_truth'], None),
            'category': item['category'],
            'source': item.get('source', ''),
            'dialect': item.get('dialect', ''),
        })
    return samples
def load_results(results_dir: str = "data/results") -> Dict[str, Dict]:
    """
    Load evaluation results for all models.

    Every ``*.json`` file directly inside ``results_dir`` is parsed; the file
    name without its extension is used as the model name.

    Args:
        results_dir: Directory containing result JSON files

    Returns:
        Dictionary mapping model names to their results, ordered by file
        path. Empty when ``results_dir`` is not an existing directory.
    """
    results_path = Path(results_dir)
    if not results_path.is_dir():
        return {}

    results = {}
    # glob() order depends on the filesystem; sort so the resulting dict
    # order is the same on every run and platform.
    for file_path in sorted(results_path.glob("*.json")):
        with open(file_path, 'r', encoding='utf-8') as f:
            results[file_path.stem] = json.load(f)
    return results
def load_leaderboard(leaderboard_path: str = "data/leaderboard.json") -> List[Dict]:
    """
    Read the leaderboard file and return its entries, best score first.

    Args:
        leaderboard_path: Path of the leaderboard JSON file

    Returns:
        Entries ordered by descending ``overall`` value (entries without
        one count as 0); an empty list when the file does not exist.
    """
    def _overall(entry):
        return entry.get('overall', 0)

    source = Path(leaderboard_path)
    if source.exists():
        with open(source, 'r', encoding='utf-8') as handle:
            ranked = list(json.load(handle))
        ranked.sort(key=_overall, reverse=True)
        return ranked
    return []
def save_leaderboard(entries: List[Dict], leaderboard_path: str = "data/leaderboard.json"):
    """
    Rank the entries and write them to the leaderboard file as JSON.

    Entries are ordered by descending ``overall`` value (a missing value
    counts as 0) and each one receives a 1-based ``rank`` key. The rank is
    written onto the caller's own dictionaries, not onto copies.

    Args:
        entries: Leaderboard entries to store
        leaderboard_path: Destination file; missing parent directories are created
    """
    target = Path(leaderboard_path)
    target.parent.mkdir(parents=True, exist_ok=True)

    # Highest overall score first.
    ranked = list(entries)
    ranked.sort(key=lambda entry: entry.get('overall', 0), reverse=True)

    # Positions start at 1.
    for position, entry in enumerate(ranked, start=1):
        entry['rank'] = position

    with open(target, 'w', encoding='utf-8') as out:
        json.dump(ranked, out, ensure_ascii=False, indent=2)
def calculate_overall_score(
    category_scores: Dict[str, float],
    weights: Optional[Dict[str, float]] = None
) -> float:
    """
    Calculate weighted overall score from category scores.

    Only categories that have a positive weight and are present in
    ``category_scores`` take part. The result is normalised by the sum of
    the weights actually used, so a missing category does not pull the
    score down.

    Args:
        category_scores: Dictionary mapping category names to scores (0-100)
        weights: Optional mapping of category names to weights; defaults to
            the module-level ``CATEGORY_WEIGHTS``

    Returns:
        Overall weighted score (0-100), or 0.0 when no weighted category
        has a score
    """
    active_weights = CATEGORY_WEIGHTS if weights is None else weights

    # (score, weight) pairs for every category that counts towards the total.
    scored = [
        (category_scores[category], weight)
        for category, weight in active_weights.items()
        if weight > 0 and category in category_scores
    ]

    total_weight = sum(weight for _, weight in scored)
    if total_weight == 0:
        return 0.0
    return sum(score * weight for score, weight in scored) / total_weight
def get_category_stats(samples: List[Dict]) -> Dict[str, int]:
    """
    Count how many samples belong to each category.

    Args:
        samples: Sample dictionaries; a sample without a ``category`` key
            is counted under ``'unknown'``

    Returns:
        Dictionary mapping each category to its number of samples, in
        order of first appearance
    """
    counts = {}
    for entry in samples:
        label = entry.get('category', 'unknown')
        if label in counts:
            counts[label] += 1
        else:
            counts[label] = 1
    return counts