| """Evaluation Dataset Loaders for Standard Benchmarks""" | |
| import json | |
| import logging | |
| from pathlib import Path | |
| from typing import Any, Dict, List, Optional | |
| from torch.utils.data import Dataset | |
| logger = logging.getLogger(__name__) | |
class EvaluationDataset(Dataset):
    """Thin ``torch`` Dataset wrapper over a pre-loaded list of samples.

    Samples are returned exactly as stored; the tokenizer and max_length
    are kept only so downstream collators/consumers can reach them.

    Args:
        data: List of sample dicts (one per benchmark example).
        tokenizer: Tokenizer handle held for consumers; unused here.
        max_length: Sequence-length hint for consumers (default 2048).
    """

    def __init__(self, data: List[Dict[str, Any]], tokenizer: Any, max_length: int = 2048):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self) -> int:
        return len(self.data)

    def __getitem__(self, idx: int) -> Dict[str, Any]:
        # No tokenization here — raw sample dicts pass straight through.
        return self.data[idx]
def load_human_eval(split: str = "test") -> List[Dict[str, Any]]:
    """Load the OpenAI HumanEval code-generation benchmark.

    Args:
        split: Dataset split to load (default "test").

    Returns:
        A list of dicts with keys task_id / prompt / canonical_solution /
        test / entry_point, or an empty list if the ``datasets`` package
        or the dataset itself is unavailable (best-effort loading).
    """
    try:
        from datasets import load_dataset

        fields = ("task_id", "prompt", "canonical_solution", "test", "entry_point")
        rows = load_dataset("openai_humaneval", split=split)
        return [{field: row[field] for field in fields} for row in rows]
    except Exception as e:
        # Best-effort: log and fall back to an empty benchmark.
        logger.warning(f"Failed to load HumanEval: {e}")
        return []
def load_mbpp(split: str = "test") -> List[Dict[str, Any]]:
    """Load the MBPP (Mostly Basic Python Problems) dataset.

    Args:
        split: Dataset split to load (default "test").

    Returns:
        A list of dicts with keys task_id / prompt / canonical_solution /
        test / entry_point, or an empty list on failure.

    Bug fix: MBPP rows have no ``entry_point`` column (its tests invoke
    functions directly via ``test_list``), so the old ``ex["entry_point"]``
    raised KeyError on the first row; the broad except swallowed it and
    the loader silently returned [] every time. Defaulting to "" lets the
    data load.
    """
    try:
        from datasets import load_dataset
        ds = load_dataset("mbpp", split=split)
        data = []
        for ex in ds:
            data.append({
                "task_id": ex["task_id"],
                "prompt": ex["text"],
                "canonical_solution": ex["code"],
                "test": ex["test_list"],
                # MBPP provides no entry point; keep the key for interface
                # parity with HumanEval, defaulting to "".
                "entry_point": ex.get("entry_point", ""),
            })
        return data
    except Exception as e:
        # Best-effort: log and fall back to an empty benchmark.
        logger.warning(f"Failed to load MBPP: {e}")
        return []
def load_gsm8k(split: str = "test") -> List[Dict[str, Any]]:
    """Load the GSM8K grade-school math dataset ("main" config).

    Args:
        split: Dataset split to load (default "test").

    Returns:
        A list of question / answer / solution dicts, or an empty list
        if the ``datasets`` package or the dataset is unavailable.
    """
    try:
        from datasets import load_dataset

        rows = load_dataset("gsm8k", "main", split=split)
        return [
            {
                "question": row["question"],
                "answer": row["answer"],
                # GSM8K has no separate solution column; keep "" fallback.
                "solution": row.get("solution", ""),
            }
            for row in rows
        ]
    except Exception as e:
        # Best-effort: log and fall back to an empty benchmark.
        logger.warning(f"Failed to load GSM8K: {e}")
        return []
def load_math(split: str = "test") -> List[Dict[str, Any]]:
    """Load the MATH (Hendrycks et al.) competition-math dataset.

    Args:
        split: Dataset split to load (default "test").

    Returns:
        A list of dicts with keys problem / answer / level / type /
        solution, or an empty list on failure.

    Bug fix: MATH rows carry problem/level/type/solution but typically no
    separate ``answer`` column (the final answer lives inside the
    ``solution`` as \\boxed{...}) — the old ``ex["answer"]`` KeyError'd
    and the broad except made the loader always return []. Use ``.get``
    so rows load; callers extract the boxed answer from ``solution``.
    NOTE(review): "hendrycks_math" normally also needs a subject config
    (e.g. "algebra") — confirm against the deployed datasets hub entry.
    """
    try:
        from datasets import load_dataset
        ds = load_dataset("hendrycks_math", split=split)
        data = []
        for ex in ds:
            data.append({
                "problem": ex["problem"],
                # No dedicated answer column in most MATH exports; default "".
                "answer": ex.get("answer", ""),
                "level": ex["level"],
                "type": ex["type"],
                "solution": ex.get("solution", ""),
            })
        return data
    except Exception as e:
        # Best-effort: log and fall back to an empty benchmark.
        logger.warning(f"Failed to load MATH: {e}")
        return []
def load_truthfulqa(split: str = "validation") -> List[Dict[str, Any]]:
    """Load TruthfulQA (multiple_choice config).

    Args:
        split: Dataset split to load (default "validation" — TruthfulQA
            ships only a validation split).

    Returns:
        A list of dicts with keys question / mc1_choices / mc1_idx /
        mc2_choices / mc2_idx, or an empty list on failure.

    Bug fix: the multiple_choice config exposes ``mc1_targets`` and
    ``mc2_targets`` dicts, each with "choices" and binary "labels" —
    there are no flat mc1_choices/mc1_idx columns. The old direct lookups
    KeyError'd on every row and the broad except made the loader always
    return []. We now derive the flat fields from the targets dicts:
    mc1_idx is the single correct index; mc2_idx is the list of all
    correct indices (mc2 can have several true answers).
    """
    try:
        from datasets import load_dataset
        ds = load_dataset("truthful_qa", "multiple_choice", split=split)
        data = []
        for ex in ds:
            mc1 = ex["mc1_targets"]
            mc2 = ex["mc2_targets"]
            data.append({
                "question": ex["question"],
                "mc1_choices": mc1["choices"],
                # mc1 has exactly one correct answer (label == 1).
                "mc1_idx": mc1["labels"].index(1),
                "mc2_choices": mc2["choices"],
                # mc2 may mark several answers correct; keep all indices.
                "mc2_idx": [i for i, lab in enumerate(mc2["labels"]) if lab == 1],
            })
        return data
    except Exception as e:
        # Best-effort: log and fall back to an empty benchmark.
        logger.warning(f"Failed to load TruthfulQA: {e}")
        return []
def load_emotional_bench(split: str = "test") -> List[Dict[str, Any]]:
    """Load the Hugging Face "emotion" dataset as an emotion benchmark.

    Args:
        split: Dataset split to load (default "test").

    Returns:
        A list of dicts with keys text / label / emotion — "label" and
        "emotion" both carry the same integer class id — or an empty
        list if loading fails.
    """
    try:
        from datasets import load_dataset

        samples = []
        for row in load_dataset("emotion", split=split):
            label = row["label"]
            # Duplicate the class id under both keys for consumer convenience.
            samples.append({"text": row["text"], "label": label, "emotion": label})
        return samples
    except Exception as e:
        # Best-effort: log and fall back to an empty benchmark.
        logger.warning(f"Failed to load emotion dataset: {e}")
        return []
def load_custom_benchmark(filepath: str) -> List[Dict[str, Any]]:
    """Load a custom benchmark from a JSON file.

    Args:
        filepath: Path to a JSON file expected to contain a list of
            sample dicts.

    Returns:
        The parsed JSON content.

    Raises:
        FileNotFoundError: If *filepath* does not exist.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    # Explicit UTF-8 so parsing does not depend on the platform locale
    # (Windows defaults to a legacy codepage otherwise).
    with open(filepath, 'r', encoding='utf-8') as f:
        data = json.load(f)
    return data
def save_benchmark_results(
    results: Dict[str, Any],
    filepath: str,
):
    """Save benchmark results to a JSON file.

    Args:
        results: JSON-serializable mapping of metric names to values.
        filepath: Destination path; overwritten if it already exists.

    Raises:
        TypeError: If *results* contains non-JSON-serializable values.
        OSError: If the file cannot be written.
    """
    # Explicit UTF-8 + ensure_ascii=False keeps non-ASCII result strings
    # (model names, prompts) human-readable instead of \uXXXX-escaped.
    with open(filepath, 'w', encoding='utf-8') as f:
        json.dump(results, f, indent=2, ensure_ascii=False)