import os
from typing import Any, Callable, List, Union

from ..core.logging import logger
from ..core.module_utils import extract_code_blocks
from .benchmark import CodingBenchmark
from .lcb_utils.code_generation import (
    CodeGenerationProblem,
    load_code_generation_dataset,
)
from .lcb_utils.test_output_prediction import (
    TestOutputPredictionProblem,
    load_test_prediction_dataset,
)
from .lcb_utils.code_execution import (
    CodeExecutionProblem,
    load_code_execution_dataset,
)
from .lcb_utils.evaluation import (
    codegen_metrics,
    test_output_metrics,
    code_execution_metrics,
    reliability_guard,  # noqa: F401 -- kept importable for optional sandboxed evaluation
)
from .lcb_utils.utils import extract_execution_code, extract_test_output_code

# The three task types this benchmark can evaluate.
VALID_SCENARIO = ["code_generation", "test_output_prediction", "code_execution"]


class LiveCodeBench(CodingBenchmark):
    """Benchmark class for evaluating LLM capabilities on real-world programming tasks.

    LiveCodeBench provides a framework for evaluating different scenarios of
    code-related tasks:

    1. Code Generation: generating code from problem descriptions
    2. Test Output Prediction: predicting test outputs given test code
    3. Code Execution: generating code that executes correctly

    The benchmark supports different evaluation modes, metrics, and can be
    customized with various parameters like timeouts, sample dates, and
    processing options.

    Attributes:
        k: An integer or list of integers specifying which pass@k metrics to compute
        version: Release version of the dataset to use
        num_process: Number of processes to use for evaluation
        start_date: Filter problems to those after this date
        end_date: Filter problems to those before this date
        scenario: Type of programming task to evaluate ("code_generation",
            "test_output_prediction", or "code_execution")
        use_cot_for_execution: Whether to use chain-of-thought processing for code execution
    """

    def __init__(
        self,
        path: str = None,
        mode: str = "all",
        timeout: int = 60,
        k: Union[int, list] = 1,
        num_process: int = 6,
        scenario: str = "code_generation",
        version: str = "release_latest",
        start_date: str = None,
        end_date: str = None,
        use_cot_for_execution: bool = False,
        **kwargs
    ):
        # Default to the shared on-disk cache when no explicit path is given.
        path = os.path.expanduser(path or "~/.evoagentx/data/livecodebench")
        self.k = k
        self.version = version
        self.num_process = num_process
        self.start_date = start_date
        self.end_date = end_date
        self.scenario = scenario
        # NOTE(review): super().__init__ below is passed name=type(self).__name__,
        # which likely overwrites this value -- confirm which one callers rely on.
        self.name = 'livecodebench'
        self.use_cot_for_execution = use_cot_for_execution
        # Explicit raise instead of `assert` so the check survives `python -O`,
        # mirroring the ValueError raised in _load_test_data.
        if scenario not in VALID_SCENARIO:
            raise ValueError(f"Invalid scenario: {scenario}. Available choices: {VALID_SCENARIO}.")
        super().__init__(name=type(self).__name__, path=path, mode=mode, timeout=timeout, **kwargs)

    def _load_data(self):
        """Populate the requested splits; only the test split is backed by real data."""
        if self.mode in ("train", "all"):
            self._train_data = None
        if self.mode in ("dev", "all"):
            self._dev_data = None
        if self.mode in ("test", "all"):
            self._test_data = self._load_test_data()

    def _load_test_data(self):
        """Load the dataset matching ``self.scenario``.

        Returns:
            A list of problem objects for the configured scenario.

        Raises:
            ValueError: If ``self.scenario`` is not one of VALID_SCENARIO.
        """
        if self.scenario == "code_generation":
            logger.info(f"Loading code generation dataset from {self.path} with version {self.version}.")
            data: List[CodeGenerationProblem] = load_code_generation_dataset(
                release_version=self.version,
                cache_dir=self.path,
                start_date=self.start_date,
                end_date=self.end_date
            )
        elif self.scenario == "test_output_prediction":
            logger.info(f"Loading test output prediction dataset from {self.path}.")
            data: List[TestOutputPredictionProblem] = load_test_prediction_dataset(cache_dir=self.path)
        elif self.scenario == "code_execution":
            logger.info(f"Loading code execution dataset from {self.path}.")
            data: List[CodeExecutionProblem] = load_code_execution_dataset(cache_dir=self.path)
        else:
            raise ValueError(f"Invalid scenario: {self.scenario}. Available choices: {VALID_SCENARIO}.")
        return data

    def _get_id(self, example: Union[CodeGenerationProblem, TestOutputPredictionProblem]) -> str:
        """Return the unique question id of an example."""
        return example.question_id

    def _get_label(self, example: Union[CodeGenerationProblem, TestOutputPredictionProblem]) -> dict:
        """Return the evaluation sample (test cases / expected outputs) for an example."""
        return example.get_evaluation_sample()

    async def async_evaluate(self, graph: Callable, example: Any) -> float:
        """Generate a solution for ``example`` with ``graph`` and return its pass@1 score."""
        prompt, entry_point = example.question_content, example.starter_code
        solution = await graph(prompt, entry_point)
        label = self._get_label(example)
        metrics = await super().async_evaluate(prediction=solution, label=label)
        return metrics["pass@1"]

    def evaluate(self, prediction: Any, label: Any) -> dict:
        """Evaluate the solution code.

        Args:
            prediction (str | List[str]): The solution code(s).
            label (dict | List[dict]): The test cases and expected outputs.

        Returns:
            dict: The evaluation metrics (pass@k).
        """
        prediction, label = self._check_evaluation_inputs(prediction, label)
        # An int k means a single pass@k; a list means several.
        k_list = [self.k] if isinstance(self.k, int) else self.k
        if self.scenario == "code_generation":
            # NOTE(review): extract_code_blocks(...)[0] raises IndexError when a
            # prediction contains no fenced code block -- confirm upstream guarantees one.
            solutions: List[str] = [extract_code_blocks(pred)[0] for pred in prediction]
            metrics, results, metadatas = codegen_metrics(
                samples_list=label,  # label is already a list
                generations_list=[solutions],  # for a single example.
                k_list=k_list,
                num_process_evaluate=self.num_process,
                timeout=self.timeout
            )
            # Keep the raw evaluator outputs on the instance for post-hoc inspection.
            self.met = metrics
            self.res = results
            self.metadatas = metadatas
        elif self.scenario == "test_output_prediction":
            pred_outputs = [extract_test_output_code(pred) for pred in prediction]
            metrics, results = test_output_metrics(
                samples=label,
                generations=[pred_outputs],
                k_list=k_list,
            )
        elif self.scenario == "code_execution":
            pred_outputs = [extract_execution_code(pred, self.use_cot_for_execution) for pred in prediction]
            metrics, results = code_execution_metrics(
                samples=label,
                generations=[pred_outputs],
            )
        else:
            raise ValueError(f"Invalid scenario: {self.scenario}. Available choices: {VALID_SCENARIO}.")
        pass_at_k = {f"pass@{k}": float(metrics[f"pass@{k}"]) for k in k_list}
        return pass_at_k


class AFlowLiveCodeBench(CodingBenchmark):
    """AFlow-oriented variant of the LiveCodeBench benchmark.

    Behaves like :class:`LiveCodeBench` but passes the question *title* (rather
    than the starter code) as the entry point to the workflow graph, and exposes
    a lookup from entry point to private test cases.

    Attributes:
        k: An integer or list of integers specifying which pass@k metrics to compute
        version: Release version of the dataset to use
        num_process: Number of processes to use for evaluation
        start_date: Filter problems to those after this date
        end_date: Filter problems to those before this date
        scenario: Type of programming task to evaluate ("code_generation",
            "test_output_prediction", or "code_execution")
        use_cot_for_execution: Whether to use chain-of-thought processing for code execution
    """

    def __init__(
        self,
        path: str = None,
        mode: str = "all",
        timeout: int = 60,
        k: Union[int, list] = 1,
        num_process: int = 6,
        scenario: str = "code_generation",
        version: str = "release_latest",
        start_date: str = None,
        end_date: str = None,
        use_cot_for_execution: bool = False,
        **kwargs
    ):
        # Default to the shared on-disk cache when no explicit path is given.
        path = os.path.expanduser(path or "~/.evoagentx/data/livecodebench")
        self.k = k
        self.version = version
        self.num_process = num_process
        self.start_date = start_date
        self.end_date = end_date
        self.scenario = scenario
        self.use_cot_for_execution = use_cot_for_execution
        # Explicit raise instead of `assert` so the check survives `python -O`.
        if scenario not in VALID_SCENARIO:
            raise ValueError(f"Invalid scenario: {scenario}. Available choices: {VALID_SCENARIO}.")
        super().__init__(name=type(self).__name__, path=path, mode=mode, timeout=timeout, **kwargs)

    def _load_data(self):
        """Populate the requested splits; only the test split is backed by real data."""
        if self.mode in ("train", "all"):
            self._train_data = None
        if self.mode in ("dev", "all"):
            self._dev_data = None
        if self.mode in ("test", "all"):
            self._test_data = self._load_test_data()

    def _load_test_data(self):
        """Load the dataset matching ``self.scenario``.

        Returns:
            A list of problem objects for the configured scenario.

        Raises:
            ValueError: If ``self.scenario`` is not one of VALID_SCENARIO.
        """
        if self.scenario == "code_generation":
            logger.info(f"Loading code generation dataset from {self.path} with version {self.version}.")
            data: List[CodeGenerationProblem] = load_code_generation_dataset(
                release_version=self.version,
                cache_dir=self.path,
                start_date=self.start_date,
                end_date=self.end_date
            )
        elif self.scenario == "test_output_prediction":
            logger.info(f"Loading test output prediction dataset from {self.path}.")
            data: List[TestOutputPredictionProblem] = load_test_prediction_dataset(cache_dir=self.path)
        elif self.scenario == "code_execution":
            logger.info(f"Loading code execution dataset from {self.path}.")
            data: List[CodeExecutionProblem] = load_code_execution_dataset(cache_dir=self.path)
        else:
            raise ValueError(f"Invalid scenario: {self.scenario}. Available choices: {VALID_SCENARIO}.")
        return data

    def _get_id(self, example: Union[CodeGenerationProblem, TestOutputPredictionProblem]) -> str:
        """Return the unique question id of an example."""
        return example.question_id

    def _get_label(self, example: Union[CodeGenerationProblem, TestOutputPredictionProblem]) -> dict:
        """Return the evaluation sample (test cases / expected outputs) for an example."""
        return example.get_evaluation_sample()

    async def async_evaluate(self, graph: Callable, example: Any) -> float:
        """Generate a solution with ``graph`` (entry point = question title) and return pass@1."""
        prompt, entry_point = example.question_content, example.question_title
        solution = await graph(prompt, entry_point)
        label = self._get_label(example)
        metrics = await super().async_evaluate(prediction=solution, label=label)
        return metrics["pass@1"]

    def extract_test_cases_with_entry_point(self, entry_point: str):
        """Look up the private test cases associated with ``entry_point``.

        A handful of entry points are hardcoded to empty test cases; otherwise
        the loaded dev/test splits are searched by question title.

        Args:
            entry_point: The question title identifying the problem.

        Returns:
            The problem's private test cases, "" for hardcoded entry points,
            or None when no matching problem is found.
        """
        hardcoded_cases = {
            "remove_odd": "",
            "replace_spaces": "",
            "snake_to_camel": "",
            "Split": "",
            "swap_List": "",
            "square_Sum": "",
            "sort_sublists": "",
            "unique_sublists": "",
        }
        if entry_point in hardcoded_cases:
            return hardcoded_cases[entry_point]
        # _load_data sets unloaded splits to None; guard both operands so the
        # concatenation cannot raise TypeError (None + list).
        for case in (self._dev_data or []) + (self._test_data or []):
            if case.question_title == entry_point:
                return case.private_test_cases
        return None

    def evaluate(self, prediction: Any, label: Any) -> dict:
        """Evaluate the solution code.

        Args:
            prediction (str | List[str]): The solution code(s).
            label (dict | List[dict]): The test cases and expected outputs.

        Returns:
            dict: The evaluation metrics (pass@k).
        """
        prediction, label = self._check_evaluation_inputs(prediction, label)
        # An int k means a single pass@k; a list means several.
        k_list = [self.k] if isinstance(self.k, int) else self.k
        if self.scenario == "code_generation":
            # NOTE(review): extract_code_blocks(...)[0] raises IndexError when a
            # prediction contains no fenced code block -- confirm upstream guarantees one.
            solutions: List[str] = [extract_code_blocks(pred)[0] for pred in prediction]
            metrics, results, metadatas = codegen_metrics(
                samples_list=label,  # label is already a list
                generations_list=[solutions],  # for a single example.
                k_list=k_list,
                num_process_evaluate=self.num_process,
                timeout=self.timeout
            )
        elif self.scenario == "test_output_prediction":
            pred_outputs = [extract_test_output_code(pred) for pred in prediction]
            metrics, results = test_output_metrics(
                samples=label,
                generations=[pred_outputs],
                k_list=k_list,
            )
        elif self.scenario == "code_execution":
            pred_outputs = [extract_execution_code(pred, self.use_cot_for_execution) for pred in prediction]
            metrics, results = code_execution_metrics(
                samples=label,
                generations=[pred_outputs],
            )
        else:
            raise ValueError(f"Invalid scenario: {self.scenario}. Available choices: {VALID_SCENARIO}.")
        pass_at_k = {f"pass@{k}": float(metrics[f"pass@{k}"]) for k in k_list}
        return pass_at_k