# Source: Zenith-7b-V1 / tests / evaluation / eval_datasets.py
# Hugging Face upload metadata (Zandy-Wandy): "Upload Zenith-7B model",
# commit 8d18b7c (verified) — kept as comments so the module parses.
"""Evaluation Dataset Loaders for Standard Benchmarks"""
import json
import logging
from pathlib import Path
from typing import Any, Dict, List, Optional
from torch.utils.data import Dataset
logger = logging.getLogger(__name__)
class EvaluationDataset(Dataset):
    """Thin wrapper exposing a list of evaluation samples as a torch Dataset."""

    def __init__(self, data: List[Dict[str, Any]], tokenizer: Any, max_length: int = 2048):
        """Store the samples plus the tokenizer/length settings used downstream."""
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self) -> int:
        """Number of evaluation samples."""
        return len(self.data)

    def __getitem__(self, idx: int) -> Dict[str, Any]:
        """Return the raw sample dict at *idx* (no tokenization applied here)."""
        return self.data[idx]
def load_human_eval(split: str = "test") -> List[Dict[str, Any]]:
    """Load the OpenAI HumanEval code-generation benchmark.

    Args:
        split: Dataset split to load (HumanEval only publishes "test").

    Returns:
        A list of sample dicts with task_id / prompt / canonical_solution /
        test / entry_point keys, or an empty list if loading fails.
    """
    try:
        from datasets import load_dataset

        dataset = load_dataset("openai_humaneval", split=split)
        # All five fields exist verbatim in the HumanEval schema.
        fields = ("task_id", "prompt", "canonical_solution", "test", "entry_point")
        return [{name: example[name] for name in fields} for example in dataset]
    except Exception as e:
        logger.warning(f"Failed to load HumanEval: {e}")
        return []
def load_mbpp(split: str = "test") -> List[Dict[str, Any]]:
    """Load the MBPP (Mostly Basic Python Problems) benchmark.

    Args:
        split: Dataset split to load ("train", "validation", "test", "prompt").

    Returns:
        A list of sample dicts keyed like the HumanEval loader, or an empty
        list if loading fails.
    """
    try:
        from datasets import load_dataset

        ds = load_dataset("mbpp", split=split)
        data = []
        for ex in ds:
            data.append({
                "task_id": ex["task_id"],
                "prompt": ex["text"],
                "canonical_solution": ex["code"],
                "test": ex["test_list"],
                # BUG FIX: MBPP has no 'entry_point' column (its fields are
                # task_id/text/code/test_list/...), so ex["entry_point"] raised
                # KeyError on the first row and the except silently returned []
                # for every call. Default to "" to keep the key in the output.
                "entry_point": ex.get("entry_point", ""),
            })
        return data
    except Exception as e:
        logger.warning(f"Failed to load MBPP: {e}")
        return []
def load_gsm8k(split: str = "test") -> List[Dict[str, Any]]:
    """Load the GSM8K grade-school math benchmark ("main" config).

    Args:
        split: Dataset split to load ("train" or "test").

    Returns:
        A list of question/answer/solution dicts, or an empty list on failure.
    """
    try:
        from datasets import load_dataset

        dataset = load_dataset("gsm8k", "main", split=split)
        return [
            {
                "question": example["question"],
                "answer": example["answer"],
                # GSM8K only ships question/answer; keep a "" fallback for
                # variants that add a separate solution field.
                "solution": example.get("solution", ""),
            }
            for example in dataset
        ]
    except Exception as e:
        logger.warning(f"Failed to load GSM8K: {e}")
        return []
def load_math(split: str = "test") -> List[Dict[str, Any]]:
    """Load the MATH (Hendrycks et al.) competition-math benchmark.

    Args:
        split: Dataset split to load ("train" or "test").

    Returns:
        A list of problem/answer/level/type/solution dicts, or an empty list
        if loading fails.
    """
    try:
        import re

        from datasets import load_dataset

        ds = load_dataset("hendrycks_math", split=split)
        data = []
        for ex in ds:
            solution = ex.get("solution", "")
            # BUG FIX: MATH has no dedicated 'answer' column (fields are
            # problem/level/type/solution), so ex["answer"] raised KeyError
            # and this loader always returned []. Extract the final answer
            # from the last \boxed{...} in the worked solution instead.
            boxed = re.findall(r"\\boxed\{([^{}]*)\}", solution)
            data.append({
                "problem": ex["problem"],
                "answer": ex.get("answer", boxed[-1] if boxed else ""),
                "level": ex["level"],
                "type": ex["type"],
                "solution": solution,
            })
        return data
    except Exception as e:
        logger.warning(f"Failed to load MATH: {e}")
        return []
def load_truthfulqa(split: str = "validation") -> List[Dict[str, Any]]:
    """Load TruthfulQA ("multiple_choice" config).

    The HF schema nests answers under ``mc1_targets`` / ``mc2_targets``, each a
    dict with parallel ``choices`` and binary ``labels`` lists; this loader
    flattens them into choice lists plus correct-answer indices.

    Args:
        split: Dataset split to load (TruthfulQA publishes "validation").

    Returns:
        A list of dicts with question, mc1/mc2 choices and correct indices
        (mc1_idx is a single int, mc2_idx a list of ints since MC2 allows
        multiple correct answers), or an empty list if loading fails.
    """
    try:
        from datasets import load_dataset

        ds = load_dataset("truthful_qa", "multiple_choice", split=split)
        data = []
        for ex in ds:
            # BUG FIX: flat keys like 'mc1_choices' / 'mc1_idx' do not exist
            # in the dataset, so the old code raised KeyError and always
            # returned []. Read the nested *_targets dicts instead.
            mc1 = ex["mc1_targets"]
            mc2 = ex["mc2_targets"]
            data.append({
                "question": ex["question"],
                "mc1_choices": mc1["choices"],
                # MC1 has exactly one correct choice (label == 1).
                "mc1_idx": mc1["labels"].index(1),
                "mc2_choices": mc2["choices"],
                # MC2 may have several correct choices; collect all indices.
                "mc2_idx": [i for i, lab in enumerate(mc2["labels"]) if lab == 1],
            })
        return data
    except Exception as e:
        logger.warning(f"Failed to load TruthfulQA: {e}")
        return []
def load_emotional_bench(split: str = "test") -> List[Dict[str, Any]]:
    """Load the HF "emotion" text-classification set as an EQ benchmark proxy.

    Args:
        split: Dataset split to load ("train", "validation", or "test").

    Returns:
        A list of dicts with the sample text and its integer class label
        (duplicated under both "label" and "emotion"), or [] on failure.
    """
    try:
        from datasets import load_dataset

        dataset = load_dataset("emotion", split=split)
        samples = []
        for example in dataset:
            label = example["label"]
            # "emotion" mirrors the raw label so downstream code can read
            # either key interchangeably.
            samples.append({"text": example["text"], "label": label, "emotion": label})
        return samples
    except Exception as e:
        logger.warning(f"Failed to load emotion dataset: {e}")
        return []
def load_custom_benchmark(filepath: str) -> List[Dict[str, Any]]:
    """Load a custom benchmark from a JSON file.

    Args:
        filepath: Path to a JSON file containing a list of sample dicts.

    Returns:
        The decoded JSON payload.

    Raises:
        OSError: If the file cannot be read.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    # Explicit encoding: the platform default may not be UTF-8 (e.g. Windows),
    # which would corrupt non-ASCII prompts in custom benchmarks.
    with open(filepath, "r", encoding="utf-8") as f:
        return json.load(f)
def save_benchmark_results(
    results: Dict[str, Any],
    filepath: str,
):
    """Save benchmark results to a pretty-printed JSON file.

    Args:
        results: JSON-serializable mapping of benchmark results.
        filepath: Destination path; parent directories are created as needed.

    Raises:
        TypeError: If ``results`` is not JSON-serializable.
        OSError: If the file cannot be written.
    """
    # Create the output directory so a fresh results tree doesn't make the
    # whole evaluation run fail at the very last step.
    Path(filepath).parent.mkdir(parents=True, exist_ok=True)
    # Explicit encoding for portability across platform default encodings.
    with open(filepath, "w", encoding="utf-8") as f:
        json.dump(results, f, indent=2)