import os
import gzip
import shutil
from typing import Union, Any, Callable

from .benchmark import CodingBenchmark
from ..core.logging import logger
from ..utils.utils import download_file
from ..core.module_utils import load_json
from ..utils.aflow_utils.data_utils import AFLOW_DATASET_FILES_MAP, download_aflow_benchmark_data


def download_raw_humaneval_data(save_folder: str):
    url = "https://raw.githubusercontent.com/openai/human-eval/master/data/HumanEval.jsonl.gz"
    logger.info(f"Downloading HumanEval data from {url} ...")
    save_file_path = os.path.join(save_folder, "HumanEval.jsonl.gz")
    download_file(url=url, save_file=save_file_path)
    # Decompress the archive to HumanEval.jsonl and remove the .gz file afterwards.
    with gzip.open(save_file_path, "rb") as f_in, open(os.path.join(save_folder, "HumanEval.jsonl"), "wb") as f_out:
        shutil.copyfileobj(f_in, f_out)
    if os.path.exists(save_file_path):
        os.remove(save_file_path)


def load_humaneval_data(data_path: str):
    data = load_json(data_path, type="jsonl")
    # Rewrite the HumanEval/115 prompt so that its docstring is well-formed.
    for example in data:
        if example["task_id"] == "HumanEval/115":
            example["prompt"] = "import math\n" + example["prompt"].replace("import math", "")
    return data


class HumanEvalPLUS(CodingBenchmark):
    """Benchmark class for evaluating code generation on HumanEval.

    HumanEval is a collection of Python programming problems designed to test a model's
    ability to generate functionally correct code from natural language descriptions.
    This class handles loading the dataset, evaluating solutions, and computing metrics
    such as pass@k.

    Each HumanEval example has the following structure:
    {
        "task_id": "HumanEval/0",
        "prompt": "from typing import List\n\ndef func_name(*args, **kwargs) -> return_type:\n    "function description"\n\n",
        "entry_point": "func_name",
        "canonical_solution": "canonical solution (code)",
        "test": "METADATA = {xxx}\n\n\ndef check(candidate):\n    assert candidate(inputs) == output\n"
    }

    Attributes:
        k: An integer or list of integers specifying which pass@k metrics to compute.
    """

    def __init__(self, path: str = None, mode: str = "all", timeout: int = 60, k: Union[int, list] = 1, **kwargs):
        path = os.path.expanduser(path or "~/.evoagentx/data/humaneval")
        self.k = k
        self.name = "humaneval"
        super().__init__(name=type(self).__name__, path=path, mode=mode, timeout=timeout, **kwargs)

    def _load_data(self):
        # Look for the HumanEval+ data file under the benchmark path.
        data_path = os.path.join(self.path, "humanevalplus_info.jsonl")
        if not os.path.exists(data_path):
            download_raw_humaneval_data(self.path)
        # HumanEval provides no train/dev splits, so only test data is loaded.
        if self.mode == "train" or self.mode == "all":
            self._train_data = None
        if self.mode == "dev" or self.mode == "all":
            self._dev_data = None
        if self.mode == "test" or self.mode == "all":
            self._test_data = load_humaneval_data(data_path)

    def _get_label(self, example: Any):
        # Return the unit test code and metadata needed to check a solution.
        return {
            "task_id": example["task_id"],
            "canonical_solution": example["canonical_solution"],
            "test": example["test"],
            "entry_point": example["entry_point"]
        }

    def _get_id(self, example: Any):
        return example["task_id"]

    def handle_special_cases(self, task_id: str, solution: str, test: str):
        """Handle special cases for HumanEval."""
        if task_id == "HumanEval/50":
            # HumanEval/50 relies on the helper `encode_shift`, which is not part of the prompt.
            solution = (
                '\n\ndef encode_shift(s: str):\n    """\n    returns encoded string by shifting every character by 5 in the alphabet.\n    """\n    return "".join([chr(((ord(ch) + 5 - ord("a")) % 26) + ord("a")) for ch in s])\n\n\n'
                + solution
            )
            return solution, test
        return super().handle_special_cases(task_id=task_id, solution=solution, test=test)

    def evaluate(self, prediction: Any, label: Any) -> dict:
        """Evaluate the solution code.

        Args:
            prediction (str | List[str]): The solution code(s).
            label (dict | List[dict]): The unit test code(s).

        Returns:
            dict: The evaluation metrics (pass@k).
        """
        prediction, label = self._check_evaluation_inputs(prediction, label)

        results = []
        for solution in prediction:
            solution_states = []
            for label_data in label:
                task_id = label_data["task_id"]
                prompt = self.get_example_by_id(task_id)["prompt"]
                unit_test = label_data["test"]
                entry_point = label_data["entry_point"]
                state, message = self.check_solution(
                    task_id=task_id,
                    solution=prompt + solution,
                    test=unit_test,
                    entry_point=entry_point
                )
                # Record the first line of the checker message for this task.
                self.error_list[task_id] = message.split('\n')[0]
                if state != self.SUCCESS:
                    break
                solution_states.append(state)
            results.append(len(solution_states) == len(label) and all(state == self.SUCCESS for state in solution_states))

        k_list = [self.k] if isinstance(self.k, int) else self.k
        pass_at_k = self.compute_pass_at_k(results, k_list)
        return pass_at_k


class AFlowHumanEvalPLUS(HumanEvalPLUS):
    """AFlow-specific implementation of the HumanEval benchmark."""

    def __init__(self, path: str = None, mode: str = "all", timeout: int = 60, k: Union[int, list] = 1, **kwargs):
        path = os.path.expanduser(path or "~/.evoagentx/data/aflow/humaneval")
        super().__init__(path=path, mode=mode, timeout=timeout, k=k, **kwargs)

    def _load_data_from_file(self, file_name: str):
        if file_name is None:
            return None
        file_path = os.path.join(self.path, file_name)
        if not os.path.exists(file_path):
            download_aflow_benchmark_data(dataset="humaneval", save_folder=self.path)
        return load_json(path=file_path, type="jsonl")

    def _load_data(self):
        # File name of the HumanEval+ data, resolved relative to self.path by _load_data_from_file.
        data_path = "humanevalplus_info.jsonl"
        if self.mode == "train" or self.mode == "all":
            logger.info(f"Loading train data from {AFLOW_DATASET_FILES_MAP['humaneval']['train']}")
            self._train_data = self._load_data_from_file(file_name=AFLOW_DATASET_FILES_MAP["humaneval"]["train"])
        if self.mode == "dev" or self.mode == "all":
            logger.info(f"Loading dev data from {AFLOW_DATASET_FILES_MAP['humaneval']['dev']}")
            self._dev_data = self._load_data_from_file(file_name=AFLOW_DATASET_FILES_MAP["humaneval"]["dev"])
        if self.mode == "test" or self.mode == "all":
            logger.info(f"Loading test data from {data_path}")
            self._test_data = self._load_data_from_file(file_name=data_path)
        # Load test cases used by extract_test_cases_with_entry_point.
        self._test_cases = self._load_data_from_file(data_path)

    def extract_test_cases_with_entry_point(self, entry_point: str):
        """Extract test cases with the given entry point."""
        hardcoded_cases = {
            "find_zero": "",
            "decode_cyclic": "",
            "decode_shift": "",
            "by_length": "",
            "add": "",
            "triangle_area": "",
            "correct_bracketing": "",
            "solve": "",
            "sum_squares": "",
            "starts_one_ends": "",
        }
        if entry_point in hardcoded_cases:
            return hardcoded_cases[entry_point]
        for case in self._test_cases:
            if case["entry_point"] == entry_point:
                return case["test"]
        return None

    async def async_evaluate(self, graph: Callable, example: Any) -> float:
        # Generate a solution with the provided (async) workflow, then score it with pass@1.
        prompt, entry_point = example["prompt"], example["entry_point"]
        solution = await graph(prompt, entry_point)
        label = self._get_label(example)
        metrics = await super().async_evaluate(prediction=solution, label=label)
        return metrics["pass@1"]