File size: 4,831 Bytes
8d18b7c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
"""Evaluation Dataset Loaders for Standard Benchmarks"""

import json
import logging
from pathlib import Path
from typing import Any, Dict, List, Optional

from torch.utils.data import Dataset

logger = logging.getLogger(__name__)


class EvaluationDataset(Dataset):
    """Thin ``Dataset`` wrapper over a pre-loaded list of benchmark samples.

    Args:
        data: List of sample dicts (key schema depends on the benchmark).
        tokenizer: Kept for downstream collators; not used by this class.
        max_length: Sequence-length hint for downstream code; not used here.
    """

    def __init__(self, data: List[Dict[str, Any]], tokenizer: Any, max_length: int = 2048):
        self.data = data              # raw, untokenized benchmark samples
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self) -> int:
        """Number of samples in the benchmark."""
        return len(self.data)

    def __getitem__(self, idx: int) -> Dict[str, Any]:
        """Return the raw sample dict at *idx* (no tokenization applied)."""
        return self.data[idx]


def load_human_eval(split: str = "test") -> List[Dict[str, Any]]:
    """Load the HumanEval code-generation benchmark.

    Returns:
        List of dicts with keys ``task_id``, ``prompt``,
        ``canonical_solution``, ``test``, ``entry_point`` — or ``[]``
        when the dataset cannot be loaded (failure is logged, not raised).
    """
    wanted = ("task_id", "prompt", "canonical_solution", "test", "entry_point")
    try:
        from datasets import load_dataset

        return [
            {key: ex[key] for key in wanted}
            for ex in load_dataset("openai_humaneval", split=split)
        ]
    except Exception as e:
        logger.warning(f"Failed to load HumanEval: {e}")
        return []


def load_mbpp(split: str = "test") -> List[Dict[str, Any]]:
    """Load the MBPP code-generation benchmark.

    Normalizes MBPP columns to the HumanEval-style schema used elsewhere
    in this module: ``text`` -> ``prompt``, ``code`` ->
    ``canonical_solution``, ``test_list`` -> ``test``.

    Returns:
        List of sample dicts, or ``[]`` when loading fails (the failure
        is logged, not raised).
    """
    try:
        from datasets import load_dataset
        ds = load_dataset("mbpp", split=split)
        data = []
        for ex in ds:
            data.append({
                "task_id": ex["task_id"],
                "prompt": ex["text"],
                "canonical_solution": ex["code"],
                "test": ex["test_list"],
                # MBPP rows do not carry an "entry_point" column (unlike
                # HumanEval); the previous hard ex["entry_point"] lookup
                # raised KeyError, which the broad except swallowed, so
                # the loader always fell through to the empty-list path.
                "entry_point": ex.get("entry_point", ""),
            })
        return data
    except Exception as e:
        logger.warning(f"Failed to load MBPP: {e}")
        return []


def load_gsm8k(split: str = "test") -> List[Dict[str, Any]]:
    """Load the GSM8K grade-school math benchmark (``main`` config).

    Returns:
        List of dicts with ``question``, ``answer`` and ``solution``
        keys, or ``[]`` when loading fails (the failure is logged).
    """
    try:
        from datasets import load_dataset

        return [
            {
                "question": ex["question"],
                "answer": ex["answer"],
                # "solution" is optional in the row; default to empty.
                "solution": ex.get("solution", ""),
            }
            for ex in load_dataset("gsm8k", "main", split=split)
        ]
    except Exception as e:
        logger.warning(f"Failed to load GSM8K: {e}")
        return []


def load_math(split: str = "test") -> List[Dict[str, Any]]:
    """Load the MATH (Hendrycks) competition-math benchmark.

    Returns:
        List of dicts with ``problem``, ``answer``, ``level``, ``type``
        and ``solution`` keys, or ``[]`` when loading fails (the failure
        is logged, not raised).
    """
    try:
        from datasets import load_dataset
        # NOTE(review): verify this hub id — MATH is commonly published
        # under per-subject configs; a bare "hendrycks_math" load may
        # require a config name.
        ds = load_dataset("hendrycks_math", split=split)
        data = []
        for ex in ds:
            data.append({
                "problem": ex["problem"],
                # MATH rows ship a worked "solution" string, not a
                # standalone "answer" column; the previous hard
                # ex["answer"] lookup raised KeyError, which the broad
                # except swallowed, silently emptying the benchmark.
                "answer": ex.get("answer", ""),
                "level": ex["level"],
                "type": ex["type"],
                "solution": ex.get("solution", ""),
            })
        return data
    except Exception as e:
        logger.warning(f"Failed to load MATH: {e}")
        return []


def load_truthfulqa(split: str = "validation") -> List[Dict[str, Any]]:
    """Load TruthfulQA (``multiple_choice`` config).

    The hub schema nests options under ``mc1_targets`` / ``mc2_targets``
    dicts of the form ``{"choices": [...], "labels": [0/1, ...]}``; the
    previous flat ``mc1_choices`` / ``mc1_idx`` lookups raised KeyError
    (swallowed by the broad except), so the loader always returned [].
    Output keys are preserved for callers:

    - ``mc1_idx``: index of the single correct MC1 choice (label == 1).
    - ``mc2_idx``: list of indices of all correct MC2 choices (MC2 has
      multiple correct answers by design).

    Returns:
        List of sample dicts, or ``[]`` when loading fails.
    """
    try:
        from datasets import load_dataset
        ds = load_dataset("truthful_qa", "multiple_choice", split=split)
        data = []
        for ex in ds:
            mc1 = ex["mc1_targets"]
            mc2 = ex["mc2_targets"]
            data.append({
                "question": ex["question"],
                "mc1_choices": mc1["choices"],
                "mc1_idx": mc1["labels"].index(1),
                "mc2_choices": mc2["choices"],
                "mc2_idx": [i for i, lab in enumerate(mc2["labels"]) if lab == 1],
            })
        return data
    except Exception as e:
        logger.warning(f"Failed to load TruthfulQA: {e}")
        return []


def load_emotional_bench(split: str = "test") -> List[Dict[str, Any]]:
    """Load the ``emotion`` text-classification set as an EQ benchmark.

    Each sample carries the raw text plus its label under both the
    ``label`` and ``emotion`` keys, so downstream code can use either.

    Returns:
        List of sample dicts, or ``[]`` when loading fails (logged).
    """
    try:
        from datasets import load_dataset

        return [
            {"text": ex["text"], "label": ex["label"], "emotion": ex["label"]}
            for ex in load_dataset("emotion", split=split)
        ]
    except Exception as e:
        logger.warning(f"Failed to load emotion dataset: {e}")
        return []


def load_custom_benchmark(filepath: str) -> List[Dict[str, Any]]:
    """Load a custom benchmark from a JSON file.

    Args:
        filepath: Path to a JSON file whose top-level value is the list
            of sample dicts.

    Returns:
        The decoded JSON content.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    # Explicit encoding so results don't depend on the platform default
    # (open() without encoding uses the locale's preferred encoding).
    with open(filepath, 'r', encoding='utf-8') as f:
        return json.load(f)


def save_benchmark_results(
    results: Dict[str, Any],
    filepath: str,
) -> None:
    """Save benchmark results to a JSON file.

    Args:
        results: JSON-serializable mapping of benchmark results.
        filepath: Destination path; overwritten if it already exists.
    """
    # Explicit encoding mirrors load_custom_benchmark; indent=2 keeps
    # the file human-readable for quick inspection.
    with open(filepath, 'w', encoding='utf-8') as f:
        json.dump(results, f, indent=2)