# MBPP / MBPP+ benchmark loaders and evaluators (HumanEval-compatible format).
import os
import regex
from typing import Union, Any, List, Callable
from ..core.logging import logger
from .benchmark import CodingBenchmark
from ..utils.utils import download_file
from ..core.module_utils import load_json
from ..utils.aflow_utils.data_utils import AFLOW_DATASET_FILES_MAP, download_aflow_benchmark_data
def download_raw_mbpp_data(name: str, save_folder: str):
    """Download the raw sanitized MBPP dataset into *save_folder* as *name*."""
    url = "https://raw.githubusercontent.com/google-research/google-research/master/mbpp/sanitized-mbpp.json"
    logger.info(f"Downloading MBPP data from: {url}")
    target_file = os.path.join(save_folder, name)
    download_file(url=url, save_file=target_file)
def load_mbpp_data(data_path: str):
    """
    Load MBPP data from the given path and convert each example to the
    HumanEval format (``prompt`` / ``entry_point`` / ``canonical_solution`` /
    ``test``). The original natural-language prompt is preserved under
    ``original_prompt``.

    Args:
        data_path: Path to the sanitized MBPP JSON file.

    Returns:
        The list of examples, mutated in place with the HumanEval-style fields.
    """
    def extract_func_name(func_header: str) -> str:
        """Return the function name from a ``def ...(`` header, or None."""
        func_name_pattern = r"def\s+([a-zA-Z_]\w*)\s*\("
        match = regex.search(func_name_pattern, func_header)
        if match:
            return match.group(1)
        return None

    def extract_func_header(code: str, test_list: List[str]) -> str:
        """Return the import lines of *code* plus the header of the function
        that every assert in *test_list* references."""
        lines = code.split("\n")
        # Everything before the first ``def`` is treated as import/setup code.
        imports = []
        for line in lines:
            if line.startswith("def "):
                break
            imports.append(line)
        defs = [line for line in lines if line.startswith("def ")]
        # Pick the first def whose name appears in every test assertion.
        func_head = None
        for header in defs:
            func_name = extract_func_name(header)
            if func_name is None:
                continue
            if all(func_name in test for test in test_list):
                func_head = header
                break
        if func_head is None:
            # BUG FIX: previously fell through and concatenated None into the
            # result, raising TypeError. Fall back to the first def header (or
            # empty string) so conversion can still proceed.
            logger.warning(f"No function header found for {code}")
            func_head = defs[0] if defs else ""
        return ("\n".join(imports) + "\n\n" + func_head).strip()

    data = load_json(data_path, type="json")
    for example in data:
        original_prompt = example["prompt"]
        code = example["code"]
        test_list = [assert_str.strip() for assert_str in example["test_list"]]
        func_header = extract_func_header(code, test_list)
        if example["task_id"] == 56:
            # Task 56 defines a function named ``check``, which collides with
            # the HumanEval-style ``check(candidate)`` test driver; rename it.
            func_header = func_header.replace("check", "check_answer")
            code = code.replace("check", "check_answer")
            test_list = [test.replace("check", "check_answer") for test in test_list]
        prompt = example["prompt"] + "\n\n" + func_header + "\n"
        canonical_solution = code
        # Wrap the raw asserts in a HumanEval-style ``check`` driver.
        test = "def check(candidate):\n    " + "\n    ".join(test_list) + "\n"
        entry_point = extract_func_name(func_header)
        example["prompt"] = prompt
        example["entry_point"] = entry_point
        example["canonical_solution"] = canonical_solution
        example["test"] = test
        example["original_prompt"] = original_prompt
    return data
class MBPPPLUS(CodingBenchmark):
    """Benchmark class for evaluating code generation on the MBPP dataset.

    MBPP (Mostly Basic Python Programming) is a collection of Python programming
    problems designed to test a model's ability to generate functionally correct
    code from natural language descriptions. This class handles loading the dataset,
    evaluating solutions, and computing metrics such as pass@k.

    The original MBPP format is transformed to be compatible with the HumanEval
    benchmark format, allowing for consistent evaluation infrastructure.

    Each MBPP example has the following structure:
    {
        "task_id" (int): 2,
        "prompt" (str): "Write a function to find the shared elements from the given two lists.",
        "code" (str): "def similar_elements(test_tup1, test_tup2):\\n  res = tuple(set(test_tup1) & set(test_tup2))\\n  return (res) ",
        "test_imports": []
        "test_list" (List[str]): ['assert set(similar_elements((3, 4, 5, 6),(5, 7, 4, 10))) == set((4, 5))', ...]
    }

    Attributes:
        k: An integer or list of integers specifying which pass@k metrics to compute
    """

    # File holding the MBPP+ test split and its test cases; resolved relative
    # to ``self.path`` by ``_load_data_from_file``. (Previously this was a
    # hard-coded absolute path on a developer's machine.)
    MBPP_PLUS_FILE = "mbppplus_info.jsonl"

    def __init__(self, path: str = None, mode: str = "all", timeout: int = 60, k: Union[int, list] = 1, **kwargs):
        """Initialize the benchmark; *path* defaults to ``~/.evoagentx/data/mbpp``."""
        path = os.path.expanduser(path or "~/.evoagentx/data/mbpp")
        self.k = k
        self.name = "mbpp"
        super().__init__(name=type(self).__name__, path=path, mode=mode, timeout=timeout, **kwargs)

    def _load_data_from_file(self, file_name: str):
        """Load a JSONL file from ``self.path``, downloading the benchmark data first if absent."""
        if file_name is None:
            return None
        file_path = os.path.join(self.path, file_name)
        if not os.path.exists(file_path):
            download_aflow_benchmark_data(dataset="mbpp", save_folder=self.path)
        return load_json(path=file_path, type="jsonl")

    def _load_data(self):
        """Populate the train/dev/test splits according to ``self.mode``."""
        if self.mode in ("train", "all"):
            logger.info(f"Loading train data from {AFLOW_DATASET_FILES_MAP['mbpp']['train']}")
            self._train_data = self._load_data_from_file(file_name=AFLOW_DATASET_FILES_MAP["mbpp"]["train"])
        if self.mode in ("dev", "all"):
            logger.info(f"Loading dev data from {AFLOW_DATASET_FILES_MAP['mbpp']['dev']}")
            self._dev_data = self._load_data_from_file(file_name=AFLOW_DATASET_FILES_MAP["mbpp"]["dev"])
        if self.mode in ("test", "all"):
            logger.info(f"Loading test data from {AFLOW_DATASET_FILES_MAP['mbpp']['test']}")
            # BUG FIX: the test split previously pointed at an absolute path on
            # a developer's machine; resolve the MBPP+ file under ``self.path``.
            self._test_data = self._load_data_from_file(self.MBPP_PLUS_FILE)
            # The same file also provides the public test cases.
            self._test_cases = self._load_data_from_file(self.MBPP_PLUS_FILE)

    def _get_id(self, example: Any) -> Any:
        """Return the example's unique task id."""
        return example["task_id"]

    def _get_label(self, example: Any) -> Any:
        """Return the evaluation label: unit test code plus metadata."""
        return {
            "task_id": example["task_id"],
            "canonical_solution": example["code"],
            "test": example["test"],
            "entry_point": example["entry_point"]
        }

    def evaluate(self, prediction: Any, label: Any) -> dict:
        """
        Evaluate the solution code.

        Args:
            prediction (str | List[str]): The solution code(s).
            label (dict | List[dict]): The unit test code(s).

        Returns:
            dict: The evaluation metrics (pass@k).
        """
        prediction, label = self._check_evaluation_inputs(prediction, label)
        results = []
        for solution in prediction:
            solution_states = []
            for label_data in label:
                state, _message = self.check_solution_plus(
                    task_id=label_data["task_id"],
                    solution=solution,
                    test=label_data["test"],
                    entry_point=label_data["entry_point"]
                )
                # Stop early on the first failing label for this solution.
                if state != self.SUCCESS:
                    break
                solution_states.append(state)
            # A solution passes only if every label succeeded.
            results.append(len(solution_states) == len(label) and all(s == self.SUCCESS for s in solution_states))
        k_list = [self.k] if isinstance(self.k, int) else self.k
        pass_at_k = self.compute_pass_at_k(results, k_list)
        return pass_at_k
class AFlowMBPPPLUS(MBPPPLUS):
    """
    AFlow-specific implementation of the MBPP+ benchmark.

    Adds per-task error tracking (``self.error_list``) and an async evaluation
    entry point used by AFlow workflow graphs.
    """

    def __init__(self, path: str = None, mode: str = "all", timeout: int = 60, k: Union[int, list] = 1, **kwargs):
        """Initialize the benchmark; *path* defaults to ``~/.evoagentx/data/aflow/mbpp``."""
        path = os.path.expanduser(path or "~/.evoagentx/data/aflow/mbpp")
        super().__init__(path=path, mode=mode, timeout=timeout, k=k, **kwargs)
        # BUG FIX: ``evaluate`` writes to ``self.error_list`` but it was never
        # initialized here; create it unless the base class already did.
        if not hasattr(self, "error_list"):
            self.error_list = {}

    # NOTE: ``_load_data_from_file`` is inherited from MBPPPLUS; the previous
    # byte-identical override was removed.

    def _load_data(self):
        """Populate the train/dev/test splits according to ``self.mode``."""
        if self.mode in ("train", "all"):
            logger.info(f"Loading train data from {AFLOW_DATASET_FILES_MAP['mbpp']['train']}")
            self._train_data = self._load_data_from_file(file_name=AFLOW_DATASET_FILES_MAP["mbpp"]["train"])
        if self.mode in ("dev", "all"):
            logger.info(f"Loading dev data from {AFLOW_DATASET_FILES_MAP['mbpp']['dev']}")
            self._dev_data = self._load_data_from_file(file_name=AFLOW_DATASET_FILES_MAP["mbpp"]["dev"])
        if self.mode in ("test", "all"):
            logger.info(f"Loading test data from {AFLOW_DATASET_FILES_MAP['mbpp']['test']}")
            # BUG FIX: previously a hard-coded absolute developer path; resolve
            # the MBPP+ file (test data and public test cases) under ``self.path``.
            self._test_data = self._load_data_from_file("mbppplus_info.jsonl")
            self._test_cases = self._load_data_from_file("mbppplus_info.jsonl")

    def _get_label(self, example: Any):
        """Return the evaluation label: unit test code plus metadata."""
        return {
            "task_id": example["task_id"],
            "canonical_solution": example["code"],
            "test": example["test"],
            "entry_point": example["entry_point"]
        }

    def extract_test_cases_with_entry_point(self, entry_point: str):
        """Return the public test cases for *entry_point*, or None if unknown.

        A few entry points are intentionally mapped to empty test strings.
        """
        hardcoded_cases = {
            "remove_odd": "",
            "replace_spaces": "",
            "snake_to_camel": "",
            "Split": "",
            "swap_List": "",
            "square_Sum": "",
            "sort_sublists": "",
            "unique_sublists": "",
        }
        if entry_point in hardcoded_cases:
            return hardcoded_cases[entry_point]
        for case in self._test_cases:
            if case["entry_point"] == entry_point:
                return case["test"]
        return None

    async def async_evaluate(self, graph: Callable, example: Any) -> float:
        """Generate a solution with *graph* for *example* and return its pass@1."""
        prompt, entry_point = example["prompt"], example["entry_point"]
        solution = await graph(prompt, entry_point)
        label = self._get_label(example)
        metrics = await super().async_evaluate(prediction=solution, label=label)
        return metrics["pass@1"]

    def evaluate(self, prediction: Any, label: Any) -> dict:
        """
        Evaluate the solution code, recording the first line of each task's
        checker message in ``self.error_list``.

        Args:
            prediction (str | List[str]): The solution code(s).
            label (dict | List[dict]): The unit test code(s).

        Returns:
            dict: The evaluation metrics (pass@k).
        """
        prediction, label = self._check_evaluation_inputs(prediction, label)
        results = []
        for solution in prediction:
            solution_states = []
            for label_data in label:
                task_id = label_data["task_id"]
                state, message = self.check_solution_plus(
                    task_id=task_id,
                    solution=solution,
                    test=label_data["test"],
                    entry_point=label_data["entry_point"],
                    use_entrypoint_as_input=False
                )
                # BUG FIX: record the message BEFORE breaking, so failures
                # (the interesting case for an error list) are captured too;
                # previously only successful runs were recorded.
                self.error_list[task_id] = message.split('\n')[0]
                if state != self.SUCCESS:
                    break
                solution_states.append(state)
            results.append(len(solution_states) == len(label) and all(s == self.SUCCESS for s in solution_states))
        k_list = [self.k] if isinstance(self.k, int) else self.k
        pass_at_k = self.compute_pass_at_k(results, k_list)
        return pass_at_k

    def evaluate_mbpp_witherror(self, prediction: Any, label: Any) -> dict:
        """
        Evaluate the solution code with the task prompt prepended to each
        solution, collecting checker messages along the way.

        Args:
            prediction (str | List[str]): The solution code(s).
            label (dict | List[dict]): The unit test code(s).

        Returns:
            dict: The evaluation metrics (pass@k).
        """
        prediction, label = self._check_evaluation_inputs(prediction, label)
        results = []
        error_list = []
        for solution in prediction:
            solution_states = []
            for label_data in label:
                task_id = label_data["task_id"]
                prompt = self.get_example_by_id(task_id)["prompt"]
                state, message = self.check_solution_plus(
                    task_id=task_id,
                    solution=prompt + "\n" + solution,
                    test=label_data["test"],
                    entry_point=label_data["entry_point"],
                    use_entrypoint_as_input=False
                )
                # Replaced debug ``print`` calls with lazy logger.debug output.
                logger.debug("task %s state=%s message=%s", task_id, state, message)
                # BUG FIX: record the message before breaking so failures are
                # kept in ``error_list`` (previously only successes were).
                error_list.append(message)
                if state != self.SUCCESS:
                    break
                solution_states.append(state)
            results.append(len(solution_states) == len(label) and all(s == self.SUCCESS for s in solution_states))
        k_list = [self.k] if isinstance(self.k, int) else self.k
        pass_at_k = self.compute_pass_at_k(results, k_list)
        return pass_at_k