File size: 14,136 Bytes
5374a2d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
import os 
# import regex
from typing import Union, Any, List
from ..core.logging import logger
from .benchmark import CodingBenchmark 
from ..core.module_utils import extract_code_blocks
from typing import Union, Any, List, Callable
from .lcb_utils.code_generation import (
    CodeGenerationProblem, 
    load_code_generation_dataset
)
from .lcb_utils.test_output_prediction import (
    TestOutputPredictionProblem, 
    load_test_prediction_dataset
)
from .lcb_utils.code_execution import (
    CodeExecutionProblem, 
    load_code_execution_dataset
)
from .lcb_utils.evaluation import (
    codegen_metrics, 
    test_output_metrics,
    code_execution_metrics,
    reliability_guard
)
from .lcb_utils.utils import extract_test_output_code, extract_execution_code


VALID_SCENARIO = ["code_generation", "test_output_prediction", "code_execution"]

class LiveCodeBench(CodingBenchmark):

    """Benchmark class for evaluating LLM capabilities on real-world programming tasks.

    LiveCodeBench provides a framework for evaluating different scenarios of code-related tasks:
    1. Code Generation: generating code from problem descriptions
    2. Test Output Prediction: predicting test outputs given test code
    3. Code Execution: generating code that executes correctly

    The benchmark supports different evaluation modes, metrics, and can be customized
    with various parameters like timeouts, sample dates, and processing options.

    Attributes:
        k: An integer or list of integers specifying which pass@k metrics to compute
        version: Release version of the dataset to use
        num_process: Number of processes to use for evaluation
        start_date: Filter problems to those after this date
        end_date: Filter problems to those before this date
        scenario: Type of programming task to evaluate ("code_generation",
                  "test_output_prediction", or "code_execution")
        use_cot_for_execution: Whether to use chain-of-thought processing for code execution
    """

    def __init__(
        self, 
        path: str = None, 
        mode: str = "all", 
        timeout: int = 60, 
        k: Union[int, list] = 1, 
        num_process: int = 6, 
        scenario: str = "code_generation", 
        version: str = "release_latest", 
        start_date: str = None, 
        end_date: str = None, 
        use_cot_for_execution: bool = False, 
        **kwargs
    ):
        """Initialize the benchmark.

        Args:
            path: Local cache directory for the dataset; defaults to
                ``~/.evoagentx/data/livecodebench``.
            mode: Which splits to load ("train", "dev", "test" or "all").
            timeout: Per-solution execution timeout in seconds.
            k: pass@k value(s) to compute.
            num_process: Number of worker processes for evaluation.
            scenario: One of ``VALID_SCENARIO``.
            version: Dataset release version (code_generation only).
            start_date: Keep only problems released on/after this date.
            end_date: Keep only problems released on/before this date.
            use_cot_for_execution: Enable chain-of-thought extraction for the
                code_execution scenario.

        Raises:
            ValueError: If ``scenario`` is not one of ``VALID_SCENARIO``.
        """
        # Validate eagerly with a real exception: `assert` would be stripped
        # under `python -O`, silently accepting bad scenarios.
        if scenario not in VALID_SCENARIO:
            raise ValueError(f"Invalid scenario: {scenario}. Available choices: {VALID_SCENARIO}.")
        path = os.path.expanduser(path or "~/.evoagentx/data/livecodebench")
        self.k = k 
        self.version = version
        self.num_process = num_process
        self.start_date = start_date
        self.end_date = end_date
        self.scenario = scenario 
        # NOTE(review): this is immediately superseded if the base class sets
        # `self.name` from the `name=` kwarg below — kept for backward compatibility.
        self.name = 'livecodebench'
        self.use_cot_for_execution = use_cot_for_execution
        super().__init__(name=type(self).__name__, path=path, mode=mode, timeout=timeout, **kwargs)

    def _load_data(self):
        """Populate the splits requested by ``self.mode``.

        LiveCodeBench ships no train/dev data, so those splits are None.
        """
        if self.mode == "train" or self.mode == "all":
            self._train_data = None 
        if self.mode == "dev" or self.mode == "all":
            self._dev_data = None 
        if self.mode == "test" or self.mode == "all":
            self._test_data = self._load_test_data()

    def _load_test_data(self):
        """Load the test split for the configured scenario.

        Returns:
            List of problem objects matching ``self.scenario``.

        Raises:
            ValueError: If ``self.scenario`` is invalid (defensive; the
                constructor already validates it).
        """
        if self.scenario == "code_generation":
            logger.info(f"Loading code generation dataset from {self.path} with version {self.version}.")
            data: List[CodeGenerationProblem] = load_code_generation_dataset(
                release_version=self.version, 
                cache_dir=self.path, 
                start_date=self.start_date, 
                end_date=self.end_date
            )
        elif self.scenario == "test_output_prediction":
            logger.info(f"Loading test output prediction dataset from {self.path}.")
            data: List[TestOutputPredictionProblem] = load_test_prediction_dataset(cache_dir=self.path)
        elif self.scenario == "code_execution":
            logger.info(f"Loading code execution dataset from {self.path}.")
            data: List[CodeExecutionProblem] = load_code_execution_dataset(cache_dir=self.path)
        else:
            raise ValueError(f"Invalid scenario: {self.scenario}. Available choices: {VALID_SCENARIO}.")

        return data 

    def _get_id(self, example: Union[CodeGenerationProblem, TestOutputPredictionProblem]) -> str:
        """Return the unique question id of an example."""
        return example.question_id  

    def _get_label(self, example: Union[CodeGenerationProblem, TestOutputPredictionProblem]) -> dict:
        """Return the evaluation sample (test cases / expected outputs) for an example."""
        return example.get_evaluation_sample()

    async def async_evaluate(self, graph: Callable, example: Any) -> float:
        """Generate a solution with ``graph`` and return its pass@1 score.

        Args:
            graph: Async callable taking (problem statement, starter code)
                and returning the generated solution text.
            example: A problem object exposing ``question_content`` and
                ``starter_code``.
        """
        prompt, entry_point = example.question_content, example.starter_code
        solution = await graph(prompt, entry_point)
        label = self._get_label(example)
        metrics = await super().async_evaluate(prediction=solution, label=label)
        return metrics["pass@1"]

    def evaluate(self, prediction: Any, label: Any) -> dict:
        """
        Evaluate the solution code.

        Args:
            prediction (str | List[str]): The solution code(s).
            label (dict | List[dict]): The test cases and expected outputs. 

        Returns:
            dict: The evaluation metrics (pass@k).
        """
        prediction, label = self._check_evaluation_inputs(prediction, label)
        k_list = [self.k] if isinstance(self.k, int) else self.k

        if self.scenario == "code_generation":
            solutions: List[str] = []
            for pred in prediction:
                blocks = extract_code_blocks(pred)
                # Guard against predictions with no fenced code block: fall back
                # to the raw text instead of raising IndexError on blocks[0].
                solutions.append(blocks[0] if blocks else pred)
            metrics, results, metadatas = codegen_metrics(
                samples_list=label,  # label is already a list
                generations_list=[solutions],  # for a single example
                k_list=k_list, 
                num_process_evaluate=self.num_process,
                timeout=self.timeout
            )
            # Kept for post-hoc inspection of the most recent evaluation.
            self.met = metrics
            self.res = results
            self.metadatas = metadatas

        elif self.scenario == "test_output_prediction":
            pred_outputs = [extract_test_output_code(pred) for pred in prediction]
            metrics, results = test_output_metrics(
                samples=label, 
                generations=[pred_outputs], 
                k_list=k_list, 
            )
        elif self.scenario == "code_execution":
            pred_outputs = [extract_execution_code(pred, self.use_cot_for_execution) for pred in prediction]
            metrics, results = code_execution_metrics(
                samples=label, 
                generations=[pred_outputs], 
            )
        else:
            raise ValueError(f"Invalid scenario: {self.scenario}. Available choices: {VALID_SCENARIO}.")

        pass_at_k = {f"pass@{k}": float(metrics[f"pass@{k}"]) for k in k_list}
        return pass_at_k
    
class AFlowLiveCodeBench(CodingBenchmark):

    """AFlow-flavored variant of the LiveCodeBench benchmark.

    Differs from ``LiveCodeBench`` in that generation is prompted with the
    question title (instead of the starter code) and in the helper
    ``extract_test_cases_with_entry_point`` used by AFlow workflows.

    Supported scenarios:
    1. Code Generation: generating code from problem descriptions
    2. Test Output Prediction: predicting test outputs given test code
    3. Code Execution: generating code that executes correctly

    Attributes:
        k: An integer or list of integers specifying which pass@k metrics to compute
        version: Release version of the dataset to use
        num_process: Number of processes to use for evaluation
        start_date: Filter problems to those after this date
        end_date: Filter problems to those before this date
        scenario: Type of programming task to evaluate ("code_generation",
                  "test_output_prediction", or "code_execution")
        use_cot_for_execution: Whether to use chain-of-thought processing for code execution
    """

    def __init__(
        self, 
        path: str = None, 
        mode: str = "all", 
        timeout: int = 60, 
        k: Union[int, list] = 1, 
        num_process: int = 6, 
        scenario: str = "code_generation", 
        version: str = "release_latest", 
        start_date: str = None, 
        end_date: str = None, 
        use_cot_for_execution: bool = False, 
        **kwargs
    ):
        """Initialize the benchmark.

        Args mirror ``LiveCodeBench.__init__``.

        Raises:
            ValueError: If ``scenario`` is not one of ``VALID_SCENARIO``.
        """
        # Validate eagerly with a real exception: `assert` would be stripped
        # under `python -O`, silently accepting bad scenarios.
        if scenario not in VALID_SCENARIO:
            raise ValueError(f"Invalid scenario: {scenario}. Available choices: {VALID_SCENARIO}.")
        path = os.path.expanduser(path or "~/.evoagentx/data/livecodebench")
        self.k = k 
        self.version = version
        self.num_process = num_process
        self.start_date = start_date
        self.end_date = end_date
        self.scenario = scenario 
        self.use_cot_for_execution = use_cot_for_execution
        super().__init__(name=type(self).__name__, path=path, mode=mode, timeout=timeout, **kwargs)

    def _load_data(self):
        """Populate the splits requested by ``self.mode``.

        LiveCodeBench ships no train/dev data, so those splits are None.
        """
        if self.mode == "train" or self.mode == "all":
            self._train_data = None 
        if self.mode == "dev" or self.mode == "all":
            self._dev_data = None 
        if self.mode == "test" or self.mode == "all":
            self._test_data = self._load_test_data()

    def _load_test_data(self):
        """Load the test split for the configured scenario.

        Returns:
            List of problem objects matching ``self.scenario``.

        Raises:
            ValueError: If ``self.scenario`` is invalid (defensive; the
                constructor already validates it).
        """
        if self.scenario == "code_generation":
            logger.info(f"Loading code generation dataset from {self.path} with version {self.version}.")
            data: List[CodeGenerationProblem] = load_code_generation_dataset(
                release_version=self.version, 
                cache_dir=self.path, 
                start_date=self.start_date, 
                end_date=self.end_date
            )
        elif self.scenario == "test_output_prediction":
            logger.info(f"Loading test output prediction dataset from {self.path}.")
            data: List[TestOutputPredictionProblem] = load_test_prediction_dataset(cache_dir=self.path)
        elif self.scenario == "code_execution":
            logger.info(f"Loading code execution dataset from {self.path}.")
            data: List[CodeExecutionProblem] = load_code_execution_dataset(cache_dir=self.path)
        else:
            raise ValueError(f"Invalid scenario: {self.scenario}. Available choices: {VALID_SCENARIO}.")

        return data 

    def _get_id(self, example: Union[CodeGenerationProblem, TestOutputPredictionProblem]) -> str:
        """Return the unique question id of an example."""
        return example.question_id  

    def _get_label(self, example: Union[CodeGenerationProblem, TestOutputPredictionProblem]) -> dict:
        """Return the evaluation sample (test cases / expected outputs) for an example."""
        return example.get_evaluation_sample()

    async def async_evaluate(self, graph: Callable, example: Any) -> float:
        """Generate a solution with ``graph`` and return its pass@1 score.

        Unlike ``LiveCodeBench``, the second argument passed to ``graph`` is
        the question title rather than the starter code.
        """
        prompt, entry_point = example.question_content, example.question_title
        solution = await graph(prompt, entry_point)
        label = self._get_label(example)
        metrics = await super().async_evaluate(prediction=solution, label=label)
        return metrics["pass@1"]

    def extract_test_cases_with_entry_point(self, entry_point: str):
        """Look up the private test cases for a problem by its title.

        Args:
            entry_point: The question title to match.

        Returns:
            The problem's ``private_test_cases`` when a match is found, an
            empty string for known hard-coded edge cases, or None when no
            problem matches.
        """
        # Entry points with intentionally empty test cases.
        hardcoded_cases = {
            "remove_odd": "",
            "replace_spaces": "",
            "snake_to_camel": "",
            "Split": "",
            "swap_List": "",
            "square_Sum": "",
            "sort_sublists": "",
            "unique_sublists": "",
        }
        if entry_point in hardcoded_cases:
            return hardcoded_cases[entry_point]

        # `_dev_data` is always None for this benchmark (see _load_data) and
        # `_test_data` may be unset depending on `mode`; treat None as empty
        # instead of raising TypeError on `None + list`.
        candidates = (self._dev_data or []) + (self._test_data or [])
        for case in candidates:
            if case.question_title == entry_point:
                return case.private_test_cases

        return None

    def evaluate(self, prediction: Any, label: Any) -> dict:
        """
        Evaluate the solution code.

        Args:
            prediction (str | List[str]): The solution code(s).
            label (dict | List[dict]): The test cases and expected outputs. 

        Returns:
            dict: The evaluation metrics (pass@k).
        """
        prediction, label = self._check_evaluation_inputs(prediction, label)
        k_list = [self.k] if isinstance(self.k, int) else self.k

        if self.scenario == "code_generation":
            solutions: List[str] = []
            for pred in prediction:
                blocks = extract_code_blocks(pred)
                # Guard against predictions with no fenced code block: fall back
                # to the raw text instead of raising IndexError on blocks[0].
                solutions.append(blocks[0] if blocks else pred)
            metrics, results, metadatas = codegen_metrics(
                samples_list=label,  # label is already a list
                generations_list=[solutions],  # for a single example
                k_list=k_list, 
                num_process_evaluate=self.num_process,
                timeout=self.timeout
            )

        elif self.scenario == "test_output_prediction":
            pred_outputs = [extract_test_output_code(pred) for pred in prediction]
            metrics, results = test_output_metrics(
                samples=label, 
                generations=[pred_outputs], 
                k_list=k_list, 
            )
        elif self.scenario == "code_execution":
            pred_outputs = [extract_execution_code(pred, self.use_cot_for_execution) for pred in prediction]
            metrics, results = code_execution_metrics(
                samples=label, 
                generations=[pred_outputs], 
            )
        else:
            raise ValueError(f"Invalid scenario: {self.scenario}. Available choices: {VALID_SCENARIO}.")

        pass_at_k = {f"pass@{k}": float(metrics[f"pass@{k}"]) for k in k_list}
        return pass_at_k