|
|
""" |
|
|
Data Loader |
|
|
=========== |
|
|
|
|
|
Load benchmark data and evaluation results. |
|
|
""" |
|
|
|
|
|
import json
import os
from collections import Counter
from pathlib import Path
from typing import Dict, List, Optional

from datasets import load_dataset
|
|
|
|
|
|
|
|
|
|
|
# Weight of each benchmark category in the overall score.
# The non-zero weights nominally add up to 1.0.
CATEGORY_WEIGHTS: Dict[str, float] = {
    "simple": 0.15,
    "multiple": 0.10,
    "parallel": 0.10,
    "parallel_multiple": 0.10,
    "irrelevance": 0.15,
    "dialect_handling": 0.15,
    "multi_turn": 0.15,
    "native_arabic": 0.10,

    # Listed with a weight of zero, so they contribute nothing to a
    # weighted total.
    "java": 0.0,
    "javascript": 0.0,
    "rest": 0.0,
    "sql": 0.0,
}
|
|
|
|
|
|
|
|
def load_benchmark(
    dataset_name: str = "HeshamHaroon/Arabic_Function_Calling",
    split: str = "test",
    category: Optional[str] = None,
) -> List[Dict]:
    """
    Load benchmark samples from HuggingFace dataset.

    If the dataset cannot be loaded, the error is printed and a local
    ``arabic_fc_dataset/checkpoint.json`` file (three directories above this
    module) is used instead when it exists.

    Args:
        dataset_name: HuggingFace dataset repository
        split: Dataset split ('train' or 'test')
        category: Optional category filter; ``None`` disables filtering

    Returns:
        List of sample dictionaries. Samples built from the HuggingFace
        dataset have their ``functions`` and ``ground_truth`` fields decoded
        from JSON; samples read from the local checkpoint are returned as
        stored in that file.

    Raises:
        Exception: The original loading error is re-raised when the dataset
            cannot be loaded and no local checkpoint exists.
    """
    try:
        dataset = load_dataset(dataset_name, split=split)
    except Exception as e:
        print(f"Error loading dataset: {e}")

        local_path = (
            Path(__file__).parent.parent.parent
            / "arabic_fc_dataset"
            / "checkpoint.json"
        )
        if not local_path.exists():
            # No fallback available: propagate the original loading error.
            raise

        with open(local_path, 'r', encoding='utf-8') as f:
            data = json.load(f)
        samples = data.get('samples', [])
        # Same "None means no filter" rule as the main path below, so the
        # result does not depend on which data source was used.
        if category is not None:
            samples = [s for s in samples if s.get('category') == category]
        return samples

    samples = []
    for item in dataset:
        # Filter first so rows of other categories are never JSON-decoded.
        if category is not None and item['category'] != category:
            continue
        samples.append({
            'id': item['id'],
            'query_en': item['query_en'],
            'query_ar': item['query_ar'],
            # Empty/missing JSON strings fall back to neutral defaults.
            'functions': json.loads(item['functions']) if item['functions'] else [],
            'ground_truth': json.loads(item['ground_truth']) if item['ground_truth'] else None,
            'category': item['category'],
            'source': item.get('source', ''),
            'dialect': item.get('dialect', ''),
        })

    return samples
|
|
|
|
|
|
|
|
def load_results(results_dir: str = "data/results") -> Dict[str, Dict]:
    """
    Read every per-model result file found in a directory.

    Each ``*.json`` file directly inside ``results_dir`` is parsed; the file
    name without its extension is used as the model name.

    Args:
        results_dir: Directory containing result JSON files

    Returns:
        Dictionary mapping model names to their parsed results. Empty when
        the directory does not exist.
    """
    root = Path(results_dir)
    if not root.exists():
        return {}

    return {
        result_file.stem: json.loads(result_file.read_text(encoding='utf-8'))
        for result_file in root.glob("*.json")
    }
|
|
|
|
|
|
|
|
def load_leaderboard(leaderboard_path: str = "data/leaderboard.json") -> List[Dict]:
    """
    Read the leaderboard file and return its entries, best score first.

    Args:
        leaderboard_path: Path of the leaderboard JSON file

    Returns:
        List of model entries ordered by descending ``overall`` value
        (entries without one count as 0). Empty when the file does not exist.
    """
    board_file = Path(leaderboard_path)
    if board_file.exists():
        with board_file.open('r', encoding='utf-8') as handle:
            entries = json.load(handle)

        ranked = list(entries)
        ranked.sort(key=lambda entry: entry.get('overall', 0), reverse=True)
        return ranked

    return []
|
|
|
|
|
|
|
|
def save_leaderboard(entries: List[Dict], leaderboard_path: str = "data/leaderboard.json"):
    """
    Rank the entries and write them to the leaderboard file.

    Entries are ordered by descending ``overall`` value (missing values count
    as 0) and each entry dict receives a 1-based ``rank`` key. The dicts
    passed in are updated in place; the ``entries`` list itself is not
    reordered.

    Args:
        entries: Leaderboard entries to store
        leaderboard_path: Destination JSON file; parent directories are
            created if needed
    """
    target = Path(leaderboard_path)
    target.parent.mkdir(parents=True, exist_ok=True)

    # Order a copy so the caller's list keeps its original order.
    ranked = list(entries)
    ranked.sort(key=lambda row: row.get('overall', 0), reverse=True)

    # Ranks start at 1 for the highest score.
    for position, row in enumerate(ranked, start=1):
        row['rank'] = position

    with target.open('w', encoding='utf-8') as handle:
        json.dump(ranked, handle, ensure_ascii=False, indent=2)
|
|
|
|
|
|
|
|
def calculate_overall_score(category_scores: Dict[str, float]) -> float:
    """
    Combine per-category scores into one weighted average.

    Only categories that appear in both ``CATEGORY_WEIGHTS`` (with a positive
    weight) and ``category_scores`` take part; the result is normalised by
    the sum of the weights actually used.

    Args:
        category_scores: Dictionary mapping category names to scores (0-100)

    Returns:
        Overall weighted score (0-100), or 0.0 when no weighted category has
        a score.
    """
    numerator = 0
    denominator = 0

    for name, share in CATEGORY_WEIGHTS.items():
        # Skip categories that were not scored or carry no weight.
        if name not in category_scores or not share > 0:
            continue
        numerator += category_scores[name] * share
        denominator += share

    return numerator / denominator if denominator != 0 else 0.0
|
|
|
|
|
|
|
|
def get_category_stats(samples: List[Dict]) -> Dict[str, int]:
    """
    Get sample counts by category.

    Args:
        samples: Sample dictionaries; a sample without a ``category`` key is
            counted under ``'unknown'``

    Returns:
        Plain dictionary mapping each category to its number of samples, with
        keys in order of first appearance.
    """
    # Counter replaces the hand-written tally; dict() keeps the return value
    # a plain dictionary.
    return dict(Counter(sample.get('category', 'unknown') for sample in samples))
|
|
|