"""Evaluation dataset loaders for standard benchmarks.

Each ``load_*`` function fetches one benchmark through the Hugging Face
``datasets`` library and normalises every row into a plain dictionary.
Loading is best-effort: any failure (missing library, network error,
unexpected schema) is logged as a warning and an empty list is returned.
"""

import json
import logging
from pathlib import Path
from typing import Any, Callable, Dict, List, Optional

from torch.utils.data import Dataset

logger = logging.getLogger(__name__)


class EvaluationDataset(Dataset):
    """Map-style dataset over an in-memory list of benchmark samples.

    ``tokenizer`` and ``max_length`` are only stored on the instance; this
    base class returns samples untouched from ``__getitem__``.
    """

    def __init__(self, data: List[Dict[str, Any]], tokenizer: Any, max_length: int = 2048):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self) -> int:
        """Return the number of samples."""
        return len(self.data)

    def __getitem__(self, idx: int) -> Dict[str, Any]:
        """Return the raw sample stored at position ``idx``."""
        return self.data[idx]


def _load_hf_rows(
    label: str,
    convert: Callable[[Dict[str, Any]], Dict[str, Any]],
    path: str,
    *config: str,
    split: str,
) -> List[Dict[str, Any]]:
    """Load ``path`` via ``datasets.load_dataset`` and convert every row.

    Args:
        label: Human-readable benchmark name used in the warning message.
        convert: Maps one raw dataset row to the normalised sample dict.
        path: Dataset identifier passed to ``load_dataset``.
        *config: Optional positional config name(s) for ``load_dataset``.
        split: Dataset split to load.

    Returns:
        The converted rows, or ``[]`` if anything went wrong.
    """
    try:
        # Imported lazily so this module stays importable without `datasets`.
        from datasets import load_dataset

        rows = load_dataset(path, *config, split=split)
        return [convert(row) for row in rows]
    except Exception as exc:  # deliberate best-effort boundary
        logger.warning("Failed to load %s: %s", label, exc)
        return []


def _extract_boxed_answer(solution: str) -> str:
    r"""Return the contents of the last ``\boxed{...}`` in ``solution``.

    Nested braces are balanced. An empty string is returned when there is no
    boxed expression or its braces never close.
    """
    marker = "\\boxed"
    start = solution.rfind(marker)
    if start == -1:
        return ""
    open_idx = start + len(marker)
    while open_idx < len(solution) and solution[open_idx].isspace():
        open_idx += 1
    if open_idx >= len(solution) or solution[open_idx] != "{":
        return ""
    depth = 0
    for pos in range(open_idx, len(solution)):
        char = solution[pos]
        if char == "{":
            depth += 1
        elif char == "}":
            depth -= 1
            if depth == 0:
                return solution[open_idx + 1:pos]
    return ""


def _human_eval_row(ex: Dict[str, Any]) -> Dict[str, Any]:
    """Normalise one HumanEval row."""
    return {
        "task_id": ex["task_id"],
        "prompt": ex["prompt"],
        "canonical_solution": ex["canonical_solution"],
        "test": ex["test"],
        "entry_point": ex["entry_point"],
    }


def _mbpp_row(ex: Dict[str, Any]) -> Dict[str, Any]:
    """Normalise one MBPP row to the HumanEval-style key set."""
    return {
        "task_id": ex["task_id"],
        "prompt": ex["text"],
        "canonical_solution": ex["code"],
        "test": ex["test_list"],
        # The published MBPP schema has no "entry_point" column; indexing it
        # directly raised KeyError and made the whole load return [].
        "entry_point": ex.get("entry_point", ""),
    }


def _gsm8k_row(ex: Dict[str, Any]) -> Dict[str, Any]:
    """Normalise one GSM8K row."""
    return {
        "question": ex["question"],
        "answer": ex["answer"],
        "solution": ex.get("solution", ""),
    }


def _math_row(ex: Dict[str, Any]) -> Dict[str, Any]:
    """Normalise one MATH row, deriving ``answer`` when it is not a column."""
    solution = ex.get("solution", "")
    answer = ex.get("answer")
    if answer is None:
        # MATH rows carry only the worked solution; the final answer is the
        # last \boxed{...} expression inside it.
        answer = _extract_boxed_answer(solution)
    return {
        "problem": ex["problem"],
        "answer": answer,
        "level": ex["level"],
        "type": ex["type"],
        "solution": solution,
    }


def _mc_targets(ex: Dict[str, Any], prefix: str):
    """Return ``(choices, correct_indices)`` for ``<prefix>_targets``."""
    targets = ex[f"{prefix}_targets"]
    correct = [pos for pos, flag in enumerate(targets["labels"]) if flag == 1]
    return targets["choices"], correct


def _truthfulqa_row(ex: Dict[str, Any]) -> Dict[str, Any]:
    """Normalise one TruthfulQA multiple-choice row.

    Accepts rows that already carry the flat ``mc*_choices`` / ``mc*_idx``
    keys; otherwise derives them from the nested ``mc*_targets`` columns.
    ``mc1_idx`` is the index of the single correct choice (``None`` if no
    label is set) and ``mc2_idx`` is the list of all correct indices.
    NOTE(review): confirm this index shape matches what consumers expect.
    """
    flat_keys = ("mc1_choices", "mc1_idx", "mc2_choices", "mc2_idx")
    if all(key in ex for key in flat_keys):
        return {"question": ex["question"], **{key: ex[key] for key in flat_keys}}

    mc1_choices, mc1_correct = _mc_targets(ex, "mc1")
    mc2_choices, mc2_correct = _mc_targets(ex, "mc2")
    return {
        "question": ex["question"],
        "mc1_choices": mc1_choices,
        "mc1_idx": mc1_correct[0] if mc1_correct else None,
        "mc2_choices": mc2_choices,
        "mc2_idx": mc2_correct,
    }


def _emotion_row(ex: Dict[str, Any]) -> Dict[str, Any]:
    """Normalise one row of the ``emotion`` dataset."""
    return {
        "text": ex["text"],
        "label": ex["label"],
        # NOTE(review): "emotion" mirrors the raw label value rather than a
        # class name -- confirm whether a string name was intended.
        "emotion": ex["label"],
    }


def load_human_eval(split: str = "test") -> List[Dict[str, Any]]:
    """Load HumanEval; returns ``[]`` on any failure."""
    return _load_hf_rows("HumanEval", _human_eval_row, "openai_humaneval", split=split)


def load_mbpp(split: str = "test") -> List[Dict[str, Any]]:
    """Load MBPP; returns ``[]`` on any failure.

    ``entry_point`` is ``""`` when the dataset does not provide one.
    """
    return _load_hf_rows("MBPP", _mbpp_row, "mbpp", split=split)


def load_gsm8k(split: str = "test") -> List[Dict[str, Any]]:
    """Load GSM8K (``main`` config); returns ``[]`` on any failure."""
    return _load_hf_rows("GSM8K", _gsm8k_row, "gsm8k", "main", split=split)


def load_math(split: str = "test") -> List[Dict[str, Any]]:
    """Load the MATH dataset; returns ``[]`` on any failure.

    ``answer`` comes from the row when present, otherwise from the last
    boxed expression of the solution.
    """
    # NOTE(review): verify that the "hendrycks_math" id resolves on the Hub.
    return _load_hf_rows("MATH", _math_row, "hendrycks_math", split=split)


def load_truthfulqa(split: str = "validation") -> List[Dict[str, Any]]:
    """Load TruthfulQA (``multiple_choice``); returns ``[]`` on any failure."""
    return _load_hf_rows(
        "TruthfulQA", _truthfulqa_row, "truthful_qa", "multiple_choice", split=split
    )


def load_emotional_bench(split: str = "test") -> List[Dict[str, Any]]:
    """Load the emotional intelligence benchmark; returns ``[]`` on failure."""
    return _load_hf_rows("emotion dataset", _emotion_row, "emotion", split=split)


def load_custom_benchmark(filepath: str) -> List[Dict[str, Any]]:
    """Load a custom benchmark from a JSON file.

    The file is read as UTF-8 and the parsed JSON value is returned as-is.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the content is not valid JSON.
    """
    # Explicit encoding: the platform default is not UTF-8 everywhere.
    with open(filepath, "r", encoding="utf-8") as handle:
        return json.load(handle)


def save_benchmark_results(
    results: Dict[str, Any],
    filepath: str,
):
    """Write ``results`` to ``filepath`` as indented UTF-8 JSON.

    Missing parent directories are created first.
    """
    Path(filepath).parent.mkdir(parents=True, exist_ok=True)
    with open(filepath, "w", encoding="utf-8") as handle:
        json.dump(results, handle, indent=2)