|
|
import os |
|
|
import gzip |
|
|
import shutil |
|
|
from typing import Union, Any, Callable |
|
|
from .benchmark import CodingBenchmark |
|
|
from ..core.logging import logger |
|
|
from ..utils.utils import download_file |
|
|
from ..core.module_utils import load_json |
|
|
from ..utils.aflow_utils.data_utils import AFLOW_DATASET_FILES_MAP, download_aflow_benchmark_data |
|
|
|
|
|
|
|
|
def download_raw_humaneval_data(save_folder: str):
    """Download the original HumanEval problems and unpack them into ``save_folder``.

    The gzip archive is fetched from the OpenAI ``human-eval`` GitHub
    repository, decompressed to ``HumanEval.jsonl`` inside ``save_folder`` and
    the archive is deleted afterwards.

    Args:
        save_folder: Directory that receives ``HumanEval.jsonl``.

    Raises:
        Any exception raised by ``download_file``, ``gzip.open``, ``open`` or
        ``shutil.copyfileobj`` is propagated unchanged; the archive is still
        removed in that case.
    """
    url = "https://raw.githubusercontent.com/openai/human-eval/master/data/HumanEval.jsonl.gz"
    logger.info(f"Downloading HumanEval data from {url} ...")

    save_file_path = os.path.join(save_folder, "HumanEval.jsonl.gz")
    jsonl_path = os.path.join(save_folder, "HumanEval.jsonl")

    try:
        download_file(url=url, save_file=save_file_path)
        with gzip.open(save_file_path, "rb") as f_in, open(jsonl_path, "wb") as f_out:
            shutil.copyfileobj(f_in, f_out)
    finally:
        # Clean up on every exit path: a partial or corrupt archive must not
        # survive a failed download/decompression.
        if os.path.exists(save_file_path):
            os.remove(save_file_path)
|
|
|
|
|
|
|
|
def load_humaneval_data(data_path: str):
    """Read HumanEval records from a JSONL file and patch one known prompt.

    For the record whose ``task_id`` is ``"HumanEval/115"`` every occurrence
    of ``import math`` is removed from the prompt and a single ``import math``
    line is placed at the very top instead. All other records are returned
    untouched.

    Args:
        data_path: Path of the JSONL file handed to ``load_json``.

    Returns:
        The records returned by ``load_json``, with the patch applied in place.
    """
    records = load_json(data_path, type="jsonl")

    for record in records:
        if record["task_id"] != "HumanEval/115":
            continue
        # Strip the import from wherever it sits in the prompt, then re-add it
        # as the first line.
        prompt_without_import = record["prompt"].replace("import math", "")
        record["prompt"] = "import math\n" + prompt_without_import

    return records
|
|
|
|
|
|
|
|
class HumanEvalPLUS(CodingBenchmark):
    """Benchmark class for evaluating code generation on HumanEval-style problems.

    Each example is a dict read from a JSONL file. The code in this class
    relies on the following keys:

        task_id:            identifier such as "HumanEval/0"
        prompt:             function signature plus docstring to be completed
        entry_point:        name of the function under test
        canonical_solution: reference implementation (code)
        test:               source of a ``check(candidate)`` function holding the asserts

    Only the test split is populated; the train and dev splits are set to
    ``None``.

    Attributes:
        k: An integer or list of integers specifying which pass@k metrics to compute.
    """

    def __init__(self, path: str = None, mode: str = "all", timeout: int = 60, k: Union[int, list] = 1, **kwargs):
        """Create the benchmark.

        Args:
            path: Data directory; defaults to ``~/.evoagentx/data/humaneval``.
            mode: One of "train", "dev", "test" or "all"; selects which splits
                ``_load_data`` fills in.
            timeout: Forwarded to the base class.
            k: Value(s) of k for pass@k.
            **kwargs: Forwarded to the base class.
        """
        path = os.path.expanduser(path or "~/.evoagentx/data/humaneval")
        # Both attributes are assigned before the base initializer runs, as in
        # the original code (presumably because the base class may use them
        # during construction - confirm against CodingBenchmark).
        self.k = k
        self.name = "humaneval"
        super().__init__(name=type(self).__name__, path=path, mode=mode, timeout=timeout, **kwargs)

    def _load_data(self):
        """Fill the split attributes selected by ``self.mode``.

        The data file is resolved in this order:

        1. the legacy absolute location used by earlier versions of this class,
        2. ``humanevalplus_info.jsonl`` inside ``self.path``,
        3. the original ``HumanEval.jsonl`` inside ``self.path``, downloaded
           first when it is not there yet.

        Previously a missing legacy file triggered the download but the
        (still missing) legacy file was loaded afterwards, so the downloaded
        data was never used.
        """
        legacy_path = "/home/tl688/pitl688/selfevolve/EvoAgentX/humanevalplus_info.jsonl"
        local_plus_path = os.path.join(self.path, "humanevalplus_info.jsonl")
        fallback_path = os.path.join(self.path, "HumanEval.jsonl")

        if os.path.exists(legacy_path):
            data_path = legacy_path
        elif os.path.exists(local_plus_path):
            data_path = local_plus_path
        else:
            # NOTE: this falls back to the original HumanEval problems, not
            # the "plus" variant; the download helper logs the download.
            if not os.path.exists(fallback_path):
                download_raw_humaneval_data(self.path)
            data_path = fallback_path

        if self.mode in ("train", "all"):
            self._train_data = None
        if self.mode in ("dev", "all"):
            self._dev_data = None
        if self.mode in ("test", "all"):
            self._test_data = load_humaneval_data(data_path)

    def _get_label(self, example: Any):
        """Return the fields of ``example`` that are needed to check a solution."""
        return {
            "task_id": example["task_id"],
            "canonical_solution": example["canonical_solution"],
            "test": example["test"],
            "entry_point": example["entry_point"],
        }

    def _get_id(self, example: Any):
        """Return the ``task_id`` of ``example``."""
        return example["task_id"]

    def handle_special_cases(self, task_id: str, solution: str, test: str) -> tuple:
        """Handle special cases for HumanEval.

        For ``HumanEval/50`` an ``encode_shift`` helper definition is
        prepended to the solution before it is returned. Every other task is
        delegated to the base class.

        Returns:
            A ``(solution, test)`` pair.
        """
        if task_id == "HumanEval/50":
            solution = (
                '\n\ndef encode_shift(s: str):\n """\n returns encoded string by shifting every character by 5 in the alphabet.\n """\n return "".join([chr(((ord(ch) + 5 - ord("a")) % 26) + ord("a")) for ch in s])\n\n\n'
                + solution
            )
            return solution, test

        return super().handle_special_cases(task_id=task_id, solution=solution, test=test)

    def evaluate(self, prediction: Any, label: Any) -> dict:
        """Evaluate the solution code.

        Every solution is run against every label. A solution counts as
        passed only if all of its checks return ``self.SUCCESS``; checking
        stops at the first failure. The first line of each check's message is
        stored in ``self.error_list`` under the task id.

        Args:
            prediction (str | List[str]): The solution code(s).
            label (dict | List[dict]): The unit test code(s).

        Returns:
            dict: The evaluation metrics (pass@k).
        """
        prediction, label = self._check_evaluation_inputs(prediction, label)

        results = []
        for solution in prediction:
            passed_all = True
            for label_data in label:
                task_id = label_data["task_id"]
                prompt = self.get_example_by_id(task_id)["prompt"]
                state, message = self.check_solution(
                    task_id=task_id,
                    solution=prompt + solution,
                    test=label_data["test"],
                    entry_point=label_data["entry_point"],
                )
                # Record the message before the early exit so that failures
                # are captured too, and nothing is read after an empty loop.
                self.error_list[task_id] = message.split("\n")[0]
                if state != self.SUCCESS:
                    passed_all = False
                    break
            results.append(passed_all)

        k_list = [self.k] if isinstance(self.k, int) else self.k
        return self.compute_pass_at_k(results, k_list)
|
|
|
|
|
class AFlowHumanEvalPLUS(HumanEvalPLUS):
    """AFlow-specific implementation of HumanEval benchmark.

    Train and dev splits come from the files named in
    ``AFLOW_DATASET_FILES_MAP["humaneval"]``; the test split and the test-case
    lookup table are read from the same resolved file (see ``_load_data``).
    """

    def __init__(self, path: str = None, mode: str = "all", timeout: int = 60, k: Union[int, list] = 1, **kwargs):
        """Create the benchmark; ``path`` defaults to ``~/.evoagentx/data/aflow/humaneval``."""
        path = os.path.expanduser(path or "~/.evoagentx/data/aflow/humaneval")
        super().__init__(path=path, mode=mode, timeout=timeout, k=k, **kwargs)

    def _load_data_from_file(self, file_name: str):
        """Load one JSONL file, downloading the AFlow data first if it is missing.

        Args:
            file_name: File name relative to ``self.path``. An absolute path
                is used as-is, because ``os.path.join`` discards ``self.path``
                when the second component is absolute. ``None`` yields ``None``.

        Returns:
            The result of ``load_json`` for the file, or ``None``.
        """
        if file_name is None:
            return None

        file_path = os.path.join(self.path, file_name)
        if not os.path.exists(file_path):
            download_aflow_benchmark_data(dataset="humaneval", save_folder=self.path)

        return load_json(path=file_path, type="jsonl")

    def _load_data(self):
        """Fill the split attributes selected by ``self.mode`` and the test-case table.

        The file used for the test split and for ``self._test_cases`` is
        resolved in this order:

        1. the legacy absolute location used by earlier versions of this class,
        2. ``humanevalplus_info.jsonl`` inside ``self.path``,
        3. the AFlow HumanEval test file from ``AFLOW_DATASET_FILES_MAP``.

        Previously a missing legacy file could not be recovered: the AFlow
        download cannot create a file at that absolute location.
        """
        legacy_path = "/home/tl688/pitl688/selfevolve/EvoAgentX/humanevalplus_info.jsonl"
        plus_file_name = "humanevalplus_info.jsonl"

        if os.path.exists(legacy_path):
            test_file = legacy_path
        elif os.path.exists(os.path.join(self.path, plus_file_name)):
            test_file = plus_file_name
        else:
            test_file = AFLOW_DATASET_FILES_MAP["humaneval"]["test"]
            logger.warning(
                f"HumanEval+ data file not found; falling back to the AFlow HumanEval test file {test_file}"
            )

        if self.mode in ("train", "all"):
            logger.info(f"Loading train data from {AFLOW_DATASET_FILES_MAP['humaneval']['train']}")
            self._train_data = self._load_data_from_file(file_name=AFLOW_DATASET_FILES_MAP["humaneval"]["train"])
        if self.mode in ("dev", "all"):
            logger.info(f"Loading dev data from {AFLOW_DATASET_FILES_MAP['humaneval']['dev']}")
            self._dev_data = self._load_data_from_file(file_name=AFLOW_DATASET_FILES_MAP["humaneval"]["dev"])
        if self.mode in ("test", "all"):
            # Log the file that is really read (the old message named the
            # AFlow test file while a different file was loaded).
            logger.info(f"Loading test data from {test_file}")
            self._test_data = self._load_data_from_file(file_name=test_file)

        # Loaded for every mode: the lookup table behind
        # extract_test_cases_with_entry_point.
        self._test_cases = self._load_data_from_file(test_file)

    def extract_test_cases_with_entry_point(self, entry_point: str):
        """Extract test cases with the given entry point.

        Returns:
            An empty string for the entry points listed in ``hardcoded_cases``;
            otherwise the ``"test"`` field of the first loaded record whose
            ``"entry_point"`` matches, or ``None`` when nothing matches.
        """
        hardcoded_cases = {
            "find_zero": "",
            "decode_cyclic": "",
            "decode_shift": "",
            "by_length": "",
            "add": "",
            "triangle_area": "",
            "correct_bracketing": "",
            "solve": "",
            "sum_squares": "",
            "starts_one_ends": "",
        }
        if entry_point in hardcoded_cases:
            return hardcoded_cases[entry_point]

        # ``_test_cases`` is None when the loader was given no file name.
        for case in self._test_cases or []:
            if case["entry_point"] == entry_point:
                return case["test"]

        return None

    async def async_evaluate(self, graph: Callable, example: Any) -> float:
        """Run ``graph`` on one example and return its pass@1 score.

        Args:
            graph: Awaitable callable invoked as ``graph(prompt, entry_point)``
                that returns the generated solution.
            example: A benchmark record with at least ``"prompt"`` and
                ``"entry_point"`` plus the fields read by ``_get_label``.

        Returns:
            The ``"pass@1"`` entry of the metrics from the parent
            ``async_evaluate`` (so ``k`` must include 1).
        """
        prompt, entry_point = example["prompt"], example["entry_point"]
        solution = await graph(prompt, entry_point)
        label = self._get_label(example)
        metrics = await super().async_evaluate(prediction=solution, label=label)
        return metrics["pass@1"]
|
|
|
|
|
|