Spaces:

HeshamHaroon
/

Arabic-Function-Calling-Leaderboard

Running

App Files Files Community

Arabic-Function-Calling-Leaderboard / afcl /data /loader.py

HeshamHaroon

Initial release: Arabic Function Calling Leaderboard

566d03e verified 15 days ago

raw

history blame contribute delete

4.75 kB

	"""
	Data Loader
	===========

	Load benchmark data and evaluation results.
	"""

	import json
	import os
	from typing import Dict, List, Optional
	from pathlib import Path
	from datasets import load_dataset


	# Per-category weights used by calculate_overall_score() to build the
	# overall score. The non-zero weights below add up to 1.0.
	CATEGORY_WEIGHTS = {
	"simple": 0.15,
	"multiple": 0.10,
	"parallel": 0.10,
	"parallel_multiple": 0.10,
	"irrelevance": 0.15,
	"dialect_handling": 0.15,
	"multi_turn": 0.15,
	"native_arabic": 0.10,
	# Programming categories: listed with a weight of 0.0, so they do not
	# contribute to the overall score (calculate_overall_score() only counts
	# categories whose weight is > 0).
	"java": 0.0,
	"javascript": 0.0,
	"rest": 0.0,
	"sql": 0.0,
	}


	def load_benchmark(
	    dataset_name: str = "HeshamHaroon/Arabic_Function_Calling",
	    split: str = "test",
	    category: Optional[str] = None
	) -> List[Dict]:
	    """
	    Load benchmark samples from HuggingFace dataset.

	    If the dataset cannot be loaded, the error is printed and the samples
	    stored in the local ``arabic_fc_dataset/checkpoint.json`` file (three
	    directories above this module) are returned instead. Samples from that
	    checkpoint are returned as stored; no JSON decoding is applied to them.

	    Args:
	        dataset_name: HuggingFace dataset repository
	        split: Dataset split ('train' or 'test')
	        category: Optional category filter

	    Returns:
	        List of sample dictionaries

	    Raises:
	        Exception: The original loading error is re-raised when the dataset
	            cannot be loaded and no local checkpoint file exists.
	    """
	    try:
	        dataset = load_dataset(dataset_name, split=split)
	    except Exception as e:
	        print(f"Error loading dataset: {e}")
	        # Fallback to local data
	        local_path = Path(__file__).parent.parent.parent / "arabic_fc_dataset" / "checkpoint.json"
	        if local_path.exists():
	            with open(local_path, 'r', encoding='utf-8') as f:
	                data = json.load(f)
	            samples = data.get('samples', [])
	            if category:
	                samples = [s for s in samples if s.get('category') == category]
	            return samples
	        # No local copy to fall back on: surface the original error.
	        raise

	    samples = []
	    for item in dataset:
	        # Filter before decoding so rows of other categories are never
	        # JSON-parsed only to be thrown away.
	        if category is not None and item['category'] != category:
	            continue
	        samples.append({
	            'id': item['id'],
	            'query_en': item['query_en'],
	            'query_ar': item['query_ar'],
	            'functions': _decode_json_field(item['functions'], []),
	            'ground_truth': _decode_json_field(item['ground_truth'], None),
	            'category': item['category'],
	            'source': item.get('source', ''),
	            'dialect': item.get('dialect', ''),
	        })

	    return samples


	def _decode_json_field(raw, default):
	    """Decode a JSON-encoded dataset field, or return *default* if it is empty."""
	    if not raw:
	        return default
	    if isinstance(raw, (str, bytes, bytearray)):
	        return json.loads(raw)
	    # Already a decoded value (e.g. a list or dict): pass it through.
	    return raw


	def load_results(results_dir: str = "data/results") -> Dict[str, Dict]:
	    """
	    Load evaluation results for all models.

	    Every ``*.json`` file directly inside ``results_dir`` is parsed; the file
	    name without its extension is used as the model name. Files are read in
	    sorted path order so the key order of the returned dictionary is the
	    same on every run and platform.

	    Args:
	        results_dir: Directory containing result JSON files

	    Returns:
	        Dictionary mapping model names to their results. Empty when
	        ``results_dir`` does not exist.

	    Raises:
	        json.JSONDecodeError: If a result file does not contain valid JSON.
	    """
	    results: Dict[str, Dict] = {}
	    results_path = Path(results_dir)

	    if not results_path.exists():
	        return results

	    # glob() yields files in directory-listing order, which varies between
	    # filesystems; sorting makes the result deterministic.
	    for file_path in sorted(results_path.glob("*.json")):
	        with open(file_path, 'r', encoding='utf-8') as f:
	            results[file_path.stem] = json.load(f)

	    return results


	def load_leaderboard(leaderboard_path: str = "data/leaderboard.json") -> List[Dict]:
	    """
	    Read the leaderboard file and return its entries, best score first.

	    Args:
	        leaderboard_path: Path of the leaderboard JSON file

	    Returns:
	        Entries ordered by their 'overall' value, highest first (entries
	        without that key count as 0). Empty list when the file is missing.
	    """
	    source = Path(leaderboard_path)

	    if source.exists():
	        with source.open('r', encoding='utf-8') as handle:
	            entries = json.load(handle)

	        def overall_of(entry):
	            return entry.get('overall', 0)

	        return sorted(entries, key=overall_of, reverse=True)

	    return []


	def save_leaderboard(entries: List[Dict], leaderboard_path: str = "data/leaderboard.json"):
	    """
	    Rank the entries by overall score and write them to a JSON file.

	    Each entry dictionary receives a 1-based 'rank' key (set in place on the
	    dictionaries passed in). Missing parent directories are created.

	    Args:
	        entries: Leaderboard entries; 'overall' defaults to 0 when absent
	        leaderboard_path: Destination JSON file
	    """
	    target = Path(leaderboard_path)
	    target.parent.mkdir(parents=True, exist_ok=True)

	    # Highest overall score first
	    ranked = list(entries)
	    ranked.sort(key=lambda row: row.get('overall', 0), reverse=True)

	    # Number the rows starting from 1
	    position = 0
	    for row in ranked:
	        position += 1
	        row['rank'] = position

	    with target.open('w', encoding='utf-8') as out:
	        json.dump(ranked, out, ensure_ascii=False, indent=2)


	def calculate_overall_score(category_scores: Dict[str, float]) -> float:
	    """
	    Combine per-category scores into one weighted average.

	    Only categories that appear in both CATEGORY_WEIGHTS (with a weight
	    above zero) and ``category_scores`` take part; the result is divided by
	    the sum of the weights actually used.

	    Args:
	        category_scores: Dictionary mapping category names to scores (0-100)

	    Returns:
	        Weighted average of the scores, or 0.0 when no weighted category
	        has a score
	    """
	    numerator = 0
	    denominator = 0

	    for name, share in CATEGORY_WEIGHTS.items():
	        # Ignore categories that have no score or carry no weight
	        if name not in category_scores or not share > 0:
	            continue
	        numerator += category_scores[name] * share
	        denominator += share

	    return numerator / denominator if denominator != 0 else 0.0


	def get_category_stats(samples: List[Dict]) -> Dict[str, int]:
	    """Count how many samples belong to each category.

	    Samples without a 'category' key are counted under 'unknown'.
	    """
	    counts: Dict[str, int] = {}
	    for entry in samples:
	        label = entry.get('category', 'unknown')
	        if label in counts:
	            counts[label] += 1
	        else:
	            counts[label] = 1
	    return counts