File size: 4,745 Bytes
566d03e |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 |
"""
Data Loader
===========
Load benchmark data and evaluation results.
"""
import json
import os
from typing import Dict, List, Optional
from pathlib import Path
from datasets import load_dataset
# Per-category weights consumed by calculate_overall_score.
# The non-zero weights below add up to 1.0.
CATEGORY_WEIGHTS = {
    "simple": 0.15,
    "multiple": 0.10,
    "parallel": 0.10,
    "parallel_multiple": 0.10,
    "irrelevance": 0.15,
    "dialect_handling": 0.15,
    "multi_turn": 0.15,
    "native_arabic": 0.10,
    # Programming categories: listed here with weight 0.0, so
    # calculate_overall_score (which skips any weight that is not > 0)
    # leaves them out of the overall score entirely.
    "java": 0.0,
    "javascript": 0.0,
    "rest": 0.0,
    "sql": 0.0,
}
def load_benchmark(
    dataset_name: str = "HeshamHaroon/Arabic_Function_Calling",
    split: str = "test",
    category: Optional[str] = None
) -> List[Dict]:
    """
    Fetch benchmark samples, preferring the HuggingFace dataset.

    If the remote load fails, the error is printed and a local
    ``arabic_fc_dataset/checkpoint.json`` (three directories above this
    file) is used instead when it exists.

    Args:
        dataset_name: HuggingFace dataset repository.
        split: Dataset split passed through to ``load_dataset``.
        category: When given, only samples of this category are returned.

    Returns:
        List of sample dictionaries. Samples built from the remote dataset
        have their ``functions`` and ``ground_truth`` fields JSON-decoded
        (empty values become ``[]`` and ``None`` respectively); samples
        from the local checkpoint are returned as stored.

    Raises:
        Exception: The original loading error, re-raised when the local
            checkpoint file does not exist.
    """
    try:
        dataset = load_dataset(dataset_name, split=split)
    except Exception as e:
        print(f"Error loading dataset: {e}")
        # Fall back to the checkpoint shipped alongside the project.
        base_dir = Path(__file__).parent.parent.parent
        checkpoint = base_dir / "arabic_fc_dataset" / "checkpoint.json"
        if not checkpoint.exists():
            raise
        with open(checkpoint, 'r', encoding='utf-8') as handle:
            payload = json.load(handle)
        local_samples = payload.get('samples', [])
        # Truthiness check on purpose: an empty-string category means
        # "no filter" on this path.
        if not category:
            return local_samples
        return [
            entry for entry in local_samples
            if entry.get('category') == category
        ]

    selected = []
    for row in dataset:
        record = {}
        record['id'] = row['id']
        record['query_en'] = row['query_en']
        record['query_ar'] = row['query_ar']
        # Both fields are stored as JSON strings; empty values get defaults.
        raw_functions = row['functions']
        record['functions'] = json.loads(raw_functions) if raw_functions else []
        raw_truth = row['ground_truth']
        record['ground_truth'] = json.loads(raw_truth) if raw_truth else None
        record['category'] = row['category']
        record['source'] = row.get('source', '')
        record['dialect'] = row.get('dialect', '')

        if category is not None and record['category'] != category:
            continue
        selected.append(record)
    return selected
def load_results(results_dir: str = "data/results") -> Dict[str, Dict]:
    """
    Load evaluation results for all models.

    Every regular ``*.json`` file directly inside ``results_dir`` is parsed;
    the file name without its extension is used as the model name.

    Args:
        results_dir: Directory containing result JSON files.

    Returns:
        Dictionary mapping model names to their parsed results, inserted in
        sorted file-path order so the result is deterministic across
        filesystems. Empty when ``results_dir`` is not an existing
        directory.

    Raises:
        json.JSONDecodeError: If a result file is not valid JSON.
    """
    results_path = Path(results_dir)
    if not results_path.is_dir():
        return {}

    results: Dict[str, Dict] = {}
    # Path.glob yields entries in filesystem order, which varies between
    # platforms; sorting keeps the mapping order stable.
    for file_path in sorted(results_path.glob("*.json")):
        # A directory that happens to be named "*.json" cannot be opened.
        if not file_path.is_file():
            continue
        with open(file_path, 'r', encoding='utf-8') as f:
            results[file_path.stem] = json.load(f)
    return results
def load_leaderboard(leaderboard_path: str = "data/leaderboard.json") -> List[Dict]:
    """
    Read the leaderboard file and return its entries ranked by score.

    Args:
        leaderboard_path: Path to the leaderboard JSON file.

    Returns:
        Entries ordered by their 'overall' value, highest first (entries
        without one count as 0). An empty list when the file is missing.
    """
    source = Path(leaderboard_path)
    if not source.exists():
        return []

    def overall_of(entry):
        return entry.get('overall', 0)

    with open(source, 'r', encoding='utf-8') as handle:
        loaded = json.load(handle)

    ranked = list(loaded)
    ranked.sort(key=overall_of, reverse=True)
    return ranked
def save_leaderboard(entries: List[Dict], leaderboard_path: str = "data/leaderboard.json"):
    """
    Rank the entries and write them to the leaderboard file as JSON.

    Entries are ordered by 'overall' (highest first, missing counts as 0)
    and each entry dict receives a 1-based 'rank' key. The dicts passed in
    are updated in place. Missing parent directories are created.

    Args:
        entries: Leaderboard entry dictionaries.
        leaderboard_path: Destination JSON file.
    """
    destination = Path(leaderboard_path)
    destination.parent.mkdir(parents=True, exist_ok=True)

    # Highest overall score first.
    ranked = list(entries)
    ranked.sort(key=lambda row: row.get('overall', 0), reverse=True)

    # Ranks start at 1.
    position = 0
    for row in ranked:
        position += 1
        row['rank'] = position

    with open(destination, 'w', encoding='utf-8') as handle:
        json.dump(ranked, handle, ensure_ascii=False, indent=2)
def calculate_overall_score(category_scores: Dict[str, float]) -> float:
    """
    Combine per-category scores into a single weighted average.

    Only categories that have a positive weight in CATEGORY_WEIGHTS and are
    present in ``category_scores`` take part; the result is normalised by
    the sum of the weights actually used.

    Args:
        category_scores: Dictionary mapping category names to scores (0-100).

    Returns:
        Weighted average of the participating scores, or 0.0 when no
        weighted category is present.
    """
    numerator = 0
    denominator = 0
    for name, weight in CATEGORY_WEIGHTS.items():
        # Skip zero-weight categories and categories without a score.
        if not (weight > 0) or name not in category_scores:
            continue
        numerator += category_scores[name] * weight
        denominator += weight

    return 0.0 if denominator == 0 else numerator / denominator
def get_category_stats(samples: List[Dict]) -> Dict[str, int]:
    """
    Count how many samples fall into each category.

    Args:
        samples: Sample dictionaries; a sample without a 'category' key is
            counted under 'unknown'.

    Returns:
        Dictionary mapping each category to its number of samples, in
        first-seen order.
    """
    counts: Dict[str, int] = {}
    for entry in samples:
        label = entry.get('category', 'unknown')
        if label in counts:
            counts[label] += 1
        else:
            counts[label] = 1
    return counts
|