"""Evaluation Dataset Loaders for Standard Benchmarks"""
import json
import logging
from pathlib import Path
from typing import Any, Dict, List, Optional
from torch.utils.data import Dataset
logger = logging.getLogger(__name__)
class EvaluationDataset(Dataset):
    """Thin ``torch`` Dataset wrapper around a list of benchmark samples.

    Samples are returned as-is; tokenization is deferred to the caller,
    which is why the tokenizer and max_length are only stored here.
    """

    def __init__(self, data: List[Dict[str, Any]], tokenizer: Any, max_length: int = 2048):
        # Keep plain references -- no copying or preprocessing at load time.
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self) -> int:
        """Number of samples in the benchmark."""
        return len(self.data)

    def __getitem__(self, idx: int) -> Dict[str, Any]:
        """Return the raw sample dict at position ``idx``."""
        return self.data[idx]
def load_human_eval(split: str = "test") -> List[Dict[str, Any]]:
    """Load the OpenAI HumanEval code-generation benchmark.

    Returns a list of dicts with ``task_id``/``prompt``/``canonical_solution``/
    ``test``/``entry_point`` keys, or an empty list when the dataset cannot
    be fetched (missing ``datasets`` package, no network, bad split).
    """
    try:
        from datasets import load_dataset

        # HumanEval rows already carry exactly the fields we expose,
        # so the normalization is a straight key projection.
        wanted = ("task_id", "prompt", "canonical_solution", "test", "entry_point")
        return [
            {key: ex[key] for key in wanted}
            for ex in load_dataset("openai_humaneval", split=split)
        ]
    except Exception as e:
        logger.warning(f"Failed to load HumanEval: {e}")
        return []
def load_mbpp(split: str = "test") -> List[Dict[str, Any]]:
    """Load the MBPP (Mostly Basic Python Problems) benchmark.

    Returns a list of dicts normalized to the same keys as
    ``load_human_eval`` (``test`` is MBPP's list of assert strings),
    or an empty list when the dataset cannot be fetched.
    """
    try:
        from datasets import load_dataset
        ds = load_dataset("mbpp", split=split)
        data = []
        for ex in ds:
            data.append({
                "task_id": ex["task_id"],
                "prompt": ex["text"],
                "canonical_solution": ex["code"],
                "test": ex["test_list"],
                # BUG FIX: MBPP has no "entry_point" column, so the previous
                # hard ex["entry_point"] raised KeyError on the first row and
                # this loader always returned []. Keep the key for interface
                # parity with load_human_eval, defaulting to "".
                "entry_point": ex.get("entry_point", ""),
            })
        return data
    except Exception as e:
        logger.warning(f"Failed to load MBPP: {e}")
        return []
def load_gsm8k(split: str = "test") -> List[Dict[str, Any]]:
    """Load the GSM8K grade-school math benchmark (``main`` config).

    Returns dicts with ``question``/``answer``/``solution`` keys, or an
    empty list when the dataset cannot be fetched.
    """
    try:
        from datasets import load_dataset

        rows = load_dataset("gsm8k", "main", split=split)
        # "solution" is not part of the core GSM8K schema, hence the .get.
        return [
            {
                "question": row["question"],
                "answer": row["answer"],
                "solution": row.get("solution", ""),
            }
            for row in rows
        ]
    except Exception as e:
        logger.warning(f"Failed to load GSM8K: {e}")
        return []
def load_math(split: str = "test") -> List[Dict[str, Any]]:
    """Load the MATH (Hendrycks et al.) competition-math benchmark.

    Returns dicts with ``problem``/``answer``/``level``/``type``/``solution``
    keys, or an empty list when the dataset cannot be fetched.
    """
    try:
        from datasets import load_dataset
        ds = load_dataset("hendrycks_math", split=split)
        data = []
        for ex in ds:
            data.append({
                "problem": ex["problem"],
                # BUG FIX: the MATH dataset exposes a worked "solution" but no
                # standalone "answer" column -- the previous hard ex["answer"]
                # raised KeyError for every row, so the loader always returned
                # []. Default to "" and keep the key for callers.
                # NOTE(review): consumers likely need to extract the boxed
                # final answer from "solution" -- confirm downstream usage.
                "answer": ex.get("answer", ""),
                "level": ex["level"],
                "type": ex["type"],
                "solution": ex.get("solution", ""),
            })
        return data
    except Exception as e:
        logger.warning(f"Failed to load MATH: {e}")
        return []
def load_truthfulqa(split: str = "validation") -> List[Dict[str, Any]]:
    """Load TruthfulQA (``multiple_choice`` config).

    Returns dicts with the question, the MC1/MC2 answer choices, the index
    of the single correct MC1 choice, and the list of indices of correct
    MC2 choices -- or an empty list when the dataset cannot be fetched.
    """
    try:
        from datasets import load_dataset
        ds = load_dataset("truthful_qa", "multiple_choice", split=split)
        data = []
        for ex in ds:
            # BUG FIX: the HF truthful_qa multiple_choice config stores the
            # options under nested "mc1_targets"/"mc2_targets" dicts, each
            # with "choices" and binary "labels" lists. The flat
            # "mc1_choices"/"mc1_idx" columns read previously do not exist,
            # so every row raised KeyError and this loader always returned [].
            mc1 = ex["mc1_targets"]
            mc2 = ex["mc2_targets"]
            data.append({
                "question": ex["question"],
                "mc1_choices": mc1["choices"],
                # MC1 has exactly one correct choice (label == 1).
                "mc1_idx": mc1["labels"].index(1),
                "mc2_choices": mc2["choices"],
                # MC2 may mark several choices correct; collect all indices.
                "mc2_idx": [i for i, lab in enumerate(mc2["labels"]) if lab == 1],
            })
        return data
    except Exception as e:
        logger.warning(f"Failed to load TruthfulQA: {e}")
        return []
def load_emotional_bench(split: str = "test") -> List[Dict[str, Any]]:
    """Load the HF ``emotion`` text-classification benchmark.

    Returns dicts with the raw text, the integer class label, and (when the
    dataset exposes ClassLabel metadata) the human-readable emotion name --
    or an empty list when the dataset cannot be fetched.
    """
    try:
        from datasets import load_dataset
        ds = load_dataset("emotion", split=split)
        # IMPROVEMENT: "label" is a ClassLabel int; previously "emotion" was
        # just a duplicate of that int. Map it to the class name via the
        # feature metadata, falling back to the raw label if unavailable.
        label_feature = ds.features.get("label") if hasattr(ds, "features") else None
        to_name = getattr(label_feature, "int2str", lambda value: value)
        data = []
        for ex in ds:
            data.append({
                "text": ex["text"],
                "label": ex["label"],
                "emotion": to_name(ex["label"]),
            })
        return data
    except Exception as e:
        logger.warning(f"Failed to load emotion dataset: {e}")
        return []
def load_custom_benchmark(filepath: str) -> List[Dict[str, Any]]:
    """Load a custom benchmark from a JSON file.

    Args:
        filepath: Path to a JSON file containing a list of sample dicts.

    Returns:
        The parsed JSON content.

    Raises:
        OSError: If the file cannot be read.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    # Explicit UTF-8 avoids depending on the platform's locale encoding.
    with open(filepath, 'r', encoding='utf-8') as f:
        return json.load(f)
def save_benchmark_results(
    results: Dict[str, Any],
    filepath: str,
):
    """Save benchmark results to a JSON file.

    Creates missing parent directories (previously a run into a fresh
    output dir crashed with FileNotFoundError) and writes UTF-8 with
    ``ensure_ascii=False`` so non-ASCII model output survives round-trips.

    Args:
        results: JSON-serializable results mapping.
        filepath: Destination path for the JSON file.
    """
    Path(filepath).parent.mkdir(parents=True, exist_ok=True)
    with open(filepath, 'w', encoding='utf-8') as f:
        json.dump(results, f, indent=2, ensure_ascii=False)
|