import os
import gzip
import shutil
from typing import Union, Any, Callable
from .benchmark import CodingBenchmark
from ..core.logging import logger
from ..utils.utils import download_file
from ..core.module_utils import load_json
from ..utils.aflow_utils.data_utils import AFLOW_DATASET_FILES_MAP, download_aflow_benchmark_data
def download_raw_humaneval_data(save_folder: str):
url = "https://raw.githubusercontent.com/openai/human-eval/master/data/HumanEval.jsonl.gz"
logger.info(f"Downloading HumanEval data from {url} ...")
save_file_path = os.path.join(save_folder, "HumanEval.jsonl.gz")
download_file(url=url, save_file=save_file_path)
with gzip.open(save_file_path, "rb") as f_in, open(os.path.join(save_folder, "HumanEval.jsonl"), "wb") as f_out:
shutil.copyfileobj(f_in, f_out)
if os.path.exists(save_file_path):
os.remove(save_file_path)
def load_humaneval_data(data_path: str):
data = load_json(data_path, type="jsonl")
    # Fix the HumanEval/115 prompt by moving "import math" to the top so its docstring is well-formed
for example in data:
if example["task_id"] == "HumanEval/115":
example["prompt"] = "import math\n" + example["prompt"].replace("import math", "")
return data
class HumanEvalPLUS(CodingBenchmark):
    """Benchmark class for evaluating code generation on HumanEval / HumanEval+.
HumanEval is a collection of Python programming problems designed to test
a model's ability to generate functionally correct code from natural language
descriptions. This class handles loading the dataset, evaluating solutions,
and computing metrics such as pass@k.
    Each HumanEval example has the following structure:
    {
        "task_id": "HumanEval/0",
        "prompt": "from typing import List\n\ndef func_name(*args, **kwargs) -> return_type:\n    "function description"\n\n",
"entry_point": "func_name",
"canonical_solution": "canonical solution (code)",
"test": "METADATA = {xxx}\n\n\ndef check(candidate):\n assert candidate(inputs) == output\n"
}
Attributes:
k: An integer or list of integers specifying which pass@k metrics to compute
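
    Example (illustrative sketch; assumes the test split is loaded on construction):
        >>> benchmark = HumanEvalPLUS(mode="test", k=1)
        >>> example = benchmark._test_data[0]
        >>> label = benchmark._get_label(example)
        >>> benchmark.evaluate(prediction=example["canonical_solution"], label=label)
        {'pass@1': 1.0}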
"""
def __init__(self, path: str = None, mode: str = "all", timeout: int = 60, k: Union[int, list] = 1, **kwargs):
path = os.path.expanduser(path or "~/.evoagentx/data/humaneval")
self.k = k
self.name = "humaneval"
super().__init__(name=type(self).__name__, path=path, mode=mode, timeout=timeout, **kwargs)
def _load_data(self):
        # NOTE: hard-coded local path to the HumanEval+ info file; if it is missing,
        # the raw HumanEval data is downloaded into self.path as a fallback
        data_path = "/home/tl688/pitl688/selfevolve/EvoAgentX/humanevalplus_info.jsonl"
        if not os.path.exists(data_path):
            download_raw_humaneval_data(self.path)
# load data
if self.mode == "train" or self.mode == "all":
self._train_data = None
if self.mode == "dev" or self.mode == "all":
self._dev_data = None
if self.mode == "test" or self.mode == "all":
self._test_data = load_humaneval_data(data_path)
def _get_label(self, example: Any):
# return the unit test code
return {
"task_id": example["task_id"],
"canonical_solution": example["canonical_solution"],
"test": example["test"],
"entry_point": example["entry_point"]
}
def _get_id(self, example: Any):
return example["task_id"]
    def handle_special_cases(self, task_id: str, solution: str, test: str) -> tuple:
        """
        Handle special cases for HumanEval. For HumanEval/50, the `encode_shift` helper
        defined in the prompt is prepended to the solution so the generated `decode_shift`
        implementation can call it during testing.
        """
if task_id == "HumanEval/50":
solution = (
'\n\ndef encode_shift(s: str):\n """\n returns encoded string by shifting every character by 5 in the alphabet.\n """\n return "".join([chr(((ord(ch) + 5 - ord("a")) % 26) + ord("a")) for ch in s])\n\n\n'
+ solution
)
return solution, test
return super().handle_special_cases(task_id=task_id, solution=solution, test=test)
def evaluate(self, prediction: Any, label: Any) -> dict:
"""
Evaluate the solution code.
Args:
prediction (str | List[str]): The solution code(s).
label (dict | List[dict]): The unit test code(s).
Returns:
dict: The evaluation metrics (pass@k).
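
        Example (illustrative; `benchmark` is a loaded instance and `solution_code` a
        hypothetical completion of the stored prompt):
            >>> label = benchmark._get_label(benchmark.get_example_by_id("HumanEval/0"))
            >>> benchmark.evaluate(prediction=solution_code, label=label)
            {'pass@1': 1.0}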
"""
prediction, label = self._check_evaluation_inputs(prediction, label)
results = []
for solution in prediction:
solution_states = []
for label_data in label:
task_id = label_data["task_id"]
prompt = self.get_example_by_id(task_id)["prompt"]
unit_test = label_data["test"]
entry_point = label_data["entry_point"]
state, message = self.check_solution(
task_id=task_id,
solution=prompt + solution,
test=unit_test,
entry_point=entry_point
)
                if state != self.SUCCESS:
                    # record the first line of the failure message for this task
                    self.error_list[task_id] = message.split('\n')[0]
                    break
                solution_states.append(state)
results.append(len(solution_states)==len(label) and all(state==self.SUCCESS for state in solution_states))
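        # pass@k follows the unbiased estimator from the HumanEval paper (Chen et al., 2021):
        # with n sampled solutions of which c pass, pass@k = 1 - C(n - c, k) / C(n, k).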
k_list = [self.k] if isinstance(self.k, int) else self.k
pass_at_k = self.compute_pass_at_k(results, k_list)
return pass_at_k
class AFlowHumanEvalPLUS(HumanEvalPLUS):
"""
    AFlow-specific implementation of the HumanEval benchmark.
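
    Example (illustrative sketch; `my_workflow` is a hypothetical async code-generation
    callable, awaited from inside an async context):
        >>> benchmark = AFlowHumanEvalPLUS(mode="test", k=1)
        >>> example = benchmark._test_data[0]
        >>> score = await benchmark.async_evaluate(graph=my_workflow, example=example)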
"""
def __init__(self, path: str = None, mode: str = "all", timeout: int = 60, k: Union[int, list] = 1, **kwargs):
path = os.path.expanduser(path or "~/.evoagentx/data/aflow/humaneval")
super().__init__(path=path, mode=mode, timeout=timeout, k=k, **kwargs)
def _load_data_from_file(self, file_name: str):
if file_name is None:
return None
file_path = os.path.join(self.path, file_name)
if not os.path.exists(file_path):
download_aflow_benchmark_data(dataset="humaneval", save_folder=self.path)
return load_json(path=file_path, type="jsonl")
def _load_data(self):
        # NOTE: hard-coded local path to the HumanEval+ info file used for the test split and test cases
        data_path = "/home/tl688/pitl688/selfevolve/EvoAgentX/humanevalplus_info.jsonl"
if self.mode == "train" or self.mode == "all":
logger.info(f"Loading train data from {AFLOW_DATASET_FILES_MAP['humaneval']['train']}")
self._train_data = self._load_data_from_file(file_name=AFLOW_DATASET_FILES_MAP["humaneval"]["train"])
if self.mode == "dev" or self.mode == "all":
logger.info(f"Loading dev data from {AFLOW_DATASET_FILES_MAP['humaneval']['dev']}")
self._dev_data = self._load_data_from_file(file_name=AFLOW_DATASET_FILES_MAP["humaneval"]["dev"])
        if self.mode == "test" or self.mode == "all":
            logger.info(f"Loading test data from {data_path}")
            # data_path is absolute, so os.path.join inside _load_data_from_file leaves it unchanged
            self._test_data = self._load_data_from_file(file_name=data_path)
            # load test cases from the same HumanEval+ info file
            self._test_cases = self._load_data_from_file(file_name=data_path)
    def extract_test_cases_with_entry_point(self, entry_point: str):
        """
        Extract the test cases for the given entry point. Entry points listed in
        `hardcoded_cases` are mapped to empty test strings; otherwise, the matching
        entry from the loaded test cases is returned (None if no match is found).
        """
hardcoded_cases = {
"find_zero": "",
"decode_cyclic": "",
"decode_shift": "",
"by_length": "",
"add": "",
"triangle_area": "",
"correct_bracketing": "",
"solve": "",
"sum_squares": "",
"starts_one_ends": "",
}
if entry_point in hardcoded_cases:
return hardcoded_cases[entry_point]
for case in self._test_cases:
if case["entry_point"] == entry_point:
return case["test"]
return None
    async def async_evaluate(self, graph: Callable, example: Any) -> float:
        """
        Generate a solution for `example` by calling `graph(prompt, entry_point)` and
        return the resulting pass@1 score.
        """
        # generate solution
        prompt, entry_point = example["prompt"], example["entry_point"]
solution = await graph(prompt, entry_point)
label = self._get_label(example)
metrics = await super().async_evaluate(prediction=solution, label=label)
return metrics["pass@1"]