"""
Data Loader
===========

Load benchmark data and evaluation results.
"""

import json
import os
from pathlib import Path
from typing import Dict, List, Optional

# `datasets` is only needed to pull the benchmark from HuggingFace. Guarding
# the import keeps the local-checkpoint fallback in `load_benchmark` and the
# leaderboard/score helpers usable when the package is not installed.
try:
    from datasets import load_dataset
except ImportError:
    load_dataset = None

# Category weights for overall score
CATEGORY_WEIGHTS = {
    "simple": 0.15,
    "multiple": 0.10,
    "parallel": 0.10,
    "parallel_multiple": 0.10,
    "irrelevance": 0.15,
    "dialect_handling": 0.15,
    "multi_turn": 0.15,
    "native_arabic": 0.10,
    # Programming categories (included in evaluation but lower weight)
    "java": 0.0,
    "javascript": 0.0,
    "rest": 0.0,
    "sql": 0.0,
}


def load_benchmark(
    dataset_name: str = "HeshamHaroon/Arabic_Function_Calling",
    split: str = "test",
    category: Optional[str] = None
) -> List[Dict]:
    """
    Load benchmark samples from HuggingFace dataset.

    If the dataset cannot be loaded for any reason (including the
    ``datasets`` package being unavailable), the error is printed and the
    samples are read from a local ``arabic_fc_dataset/checkpoint.json``
    located three directories above this file, when that file exists.

    Args:
        dataset_name: HuggingFace dataset repository
        split: Dataset split ('train' or 'test')
        category: Optional category filter

    Returns:
        List of sample dictionaries

    Raises:
        Exception: The original loading error is re-raised when the local
            fallback file does not exist.
    """
    try:
        if load_dataset is None:
            raise ImportError(
                "the 'datasets' package is required to load from HuggingFace"
            )
        dataset = load_dataset(dataset_name, split=split)
    except Exception as e:
        print(f"Error loading dataset: {e}")
        # Fallback to local data
        local_path = (
            Path(__file__).parent.parent.parent
            / "arabic_fc_dataset"
            / "checkpoint.json"
        )
        if local_path.exists():
            with open(local_path, 'r', encoding='utf-8') as f:
                data = json.load(f)
            samples = data.get('samples', [])
            if category:
                samples = [s for s in samples if s.get('category') == category]
            return samples
        # No fallback available: propagate the original failure.
        raise

    samples = []
    for item in dataset:
        sample = {
            'id': item['id'],
            'query_en': item['query_en'],
            'query_ar': item['query_ar'],
            # `functions` and `ground_truth` are stored as JSON strings.
            'functions': json.loads(item['functions']) if item['functions'] else [],
            'ground_truth': json.loads(item['ground_truth']) if item['ground_truth'] else None,
            'category': item['category'],
            'source': item.get('source', ''),
            'dialect': item.get('dialect', ''),
        }
        if category is None or sample['category'] == category:
            samples.append(sample)

    return samples


def load_results(results_dir: str = "data/results") -> Dict[str, Dict]:
    """
    Load evaluation results for all models.

    Args:
        results_dir: Directory containing result JSON files

    Returns:
        Dictionary mapping model names (the file name without ``.json``)
        to their results; empty when the directory does not exist
    """
    results = {}
    results_path = Path(results_dir)

    if not results_path.exists():
        return results

    # Sort so the mapping is built in the same order on every filesystem.
    for file_path in sorted(results_path.glob("*.json")):
        with open(file_path, 'r', encoding='utf-8') as f:
            results[file_path.stem] = json.load(f)

    return results


def _overall_key(entry: Dict) -> float:
    """Return the entry's 'overall' score, treating missing or null as 0."""
    return entry.get('overall') or 0


def load_leaderboard(leaderboard_path: str = "data/leaderboard.json") -> List[Dict]:
    """
    Load the current leaderboard rankings.

    Args:
        leaderboard_path: Path of the leaderboard JSON file

    Returns:
        List of model entries sorted by overall score (highest first);
        empty when the file does not exist
    """
    path = Path(leaderboard_path)
    if not path.exists():
        return []

    with open(path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    return sorted(data, key=_overall_key, reverse=True)


def save_leaderboard(
    entries: List[Dict],
    leaderboard_path: str = "data/leaderboard.json"
) -> None:
    """
    Save leaderboard data to file.

    Entries are sorted by overall score (highest first) and each entry dict
    gets a 1-based 'rank' key. The dicts passed in are updated in place; the
    ``entries`` list itself is not reordered.

    Args:
        entries: Model entries to rank and persist
        leaderboard_path: Destination JSON file; parent directories are
            created when missing
    """
    path = Path(leaderboard_path)
    path.parent.mkdir(parents=True, exist_ok=True)

    # Sort by overall score
    sorted_entries = sorted(entries, key=_overall_key, reverse=True)

    # Add ranks
    for i, entry in enumerate(sorted_entries, 1):
        entry['rank'] = i

    with open(path, 'w', encoding='utf-8') as f:
        json.dump(sorted_entries, f, ensure_ascii=False, indent=2)


def calculate_overall_score(category_scores: Dict[str, float]) -> float:
    """
    Calculate weighted overall score from category scores.

    Only categories that have a positive weight in CATEGORY_WEIGHTS and are
    present in ``category_scores`` contribute; the result is normalised by
    the sum of the weights actually used.

    Args:
        category_scores: Dictionary mapping category names to scores (0-100)

    Returns:
        Overall weighted score (0-100), or 0.0 when no weighted category
        is present
    """
    total_weight = 0
    weighted_sum = 0

    for category, weight in CATEGORY_WEIGHTS.items():
        if category in category_scores and weight > 0:
            weighted_sum += category_scores[category] * weight
            total_weight += weight

    if total_weight == 0:
        return 0.0

    return weighted_sum / total_weight


def get_category_stats(samples: List[Dict]) -> Dict[str, int]:
    """Get sample counts by category ('unknown' when a sample has none)."""
    from collections import Counter

    return dict(Counter(sample.get('category', 'unknown') for sample in samples))