|
|
import os
import re
from typing import Union, Any, List, Callable

import regex

from ..core.logging import logger
from ..core.module_utils import load_json
from .benchmark import CodingBenchmark
from ..utils.aflow_utils.data_utils import AFLOW_DATASET_FILES_MAP, download_aflow_benchmark_data
from ..utils.utils import download_file
|
|
|
|
|
|
|
|
def download_raw_mbpp_data(name: str, save_folder: str):
    """Download the sanitized MBPP dataset and save it as *name* in *save_folder*.

    Args:
        name: File name to store the downloaded JSON under.
        save_folder: Destination directory for the download.
    """
    source_url = (
        "https://raw.githubusercontent.com/google-research/google-research/"
        "master/mbpp/sanitized-mbpp.json"
    )
    logger.info(f"Downloading MBPP data from: {source_url}")
    target_path = os.path.join(save_folder, name)
    download_file(url=source_url, save_file=target_path)
|
|
|
|
|
|
|
|
def load_mbpp_data(data_path: str): |
|
|
|
|
|
""" |
|
|
load MBPP data from the given path and convert to HumanEval format |
|
|
""" |
|
|
|
|
|
def extract_func_name(func_header: str) -> str: |
|
|
func_name_pattern = r"def\s+([a-zA-Z_]\w*)\s*\(" |
|
|
match = regex.search(func_name_pattern, func_header) |
|
|
if match: |
|
|
return match.group(1) |
|
|
else: |
|
|
return None |
|
|
|
|
|
def extract_func_header(code: str, test_list: List[str]) -> str: |
|
|
lines = code.split("\n") |
|
|
imports, defs = [], [] |
|
|
for line in lines: |
|
|
if line.startswith("def "): |
|
|
break |
|
|
imports.append(line) |
|
|
for line in lines: |
|
|
if line.startswith("def "): |
|
|
defs.append(line) |
|
|
func_head = None |
|
|
for header in defs: |
|
|
func_name = extract_func_name(header) |
|
|
if func_name is None: |
|
|
continue |
|
|
if all(func_name in test for test in test_list): |
|
|
func_head = header |
|
|
break |
|
|
if func_head is None: |
|
|
logger.warning(f"No function header found for {code}") |
|
|
return ("\n".join(imports) + "\n\n" + func_head).strip() |
|
|
|
|
|
data = load_json(data_path, type="json") |
|
|
|
|
|
for example in data: |
|
|
original_prompt = example["prompt"] |
|
|
code = example["code"] |
|
|
test_list = [assert_str.strip() for assert_str in example["test_list"]] |
|
|
func_header = extract_func_header(code, test_list) |
|
|
|
|
|
if example["task_id"] == 56: |
|
|
|
|
|
func_header = func_header.replace("check", "check_answer") |
|
|
code = code.replace("check", "check_answer") |
|
|
test_list = [test.replace("check", "check_answer") for test in test_list] |
|
|
|
|
|
prompt = example["prompt"] + "\n\n" + func_header + "\n" |
|
|
canonical_solution = code |
|
|
test = "def check(candidate):\n " + "\n ".join(test_list) + "\n" |
|
|
entry_point = extract_func_name(func_header) |
|
|
|
|
|
example["prompt"] = prompt |
|
|
example["entry_point"] = entry_point |
|
|
example["canonical_solution"] = canonical_solution |
|
|
example["test"] = test |
|
|
example["original_prompt"] = original_prompt |
|
|
|
|
|
return data |
|
|
|
|
|
|
|
|
class MBPPPLUS(CodingBenchmark):

    """Benchmark class for evaluating code generation on the MBPP dataset.

    MBPP (Mostly Basic Python Programming) is a collection of Python programming
    problems designed to test a model's ability to generate functionally correct
    code from natural language descriptions. This class handles loading the dataset,
    evaluating solutions, and computing metrics such as pass@k.

    The original MBPP format is transformed to be compatible with the HumanEval
    benchmark format, allowing for consistent evaluation infrastructure.

    Each MBPP example has the following structure:
    {
        "task_id" (int): 2,
        "prompt" (str): "Write a function to find the shared elements from the given two lists.",
        "code" (str): "def similar_elements(test_tup1, test_tup2):\n  res = tuple(set(test_tup1) & set(test_tup2))\n  return (res) ",
        "test_imports": []
        "test_list" (List[str]): ['assert set(similar_elements((3, 4, 5, 6),(5, 7, 4, 10))) == set((4, 5))', 'assert set(similar_elements((1, 2, 3, 4),(5, 4, 3, 7))) == set((3, 4))', 'assert set(similar_elements((11, 12, 14, 13),(17, 15, 14, 13))) == set((13, 14))']
    }

    Attributes:
        k: An integer or list of integers specifying which pass@k metrics to compute
    """

    # HACK: machine-specific absolute path left over from development. The
    # MBPP+ test split and test cases are read from this file, NOT from the
    # AFlow test file that the log message in _load_data mentions.
    # TODO: make this configurable / ship the file with the benchmark data.
    _TEST_INFO_FILE = "/home/tl688/pitl688/selfevolve/EvoAgentX/mbppplus_info.jsonl"

    def __init__(self, path: str = None, mode: str = "all", timeout: int = 60, k: Union[int, list] = 1, **kwargs):
        """Initialize the benchmark.

        Args:
            path: Folder holding the benchmark data; defaults to ``~/.evoagentx/data/mbpp``.
            mode: Which split(s) to load: "train", "dev", "test" or "all".
            timeout: Per-solution execution timeout in seconds.
            k: pass@k value(s) to compute.
        """
        path = os.path.expanduser(path or "~/.evoagentx/data/mbpp")
        self.k = k
        # NOTE(review): super().__init__ below also receives
        # name=type(self).__name__ — presumably the base class stores it and
        # overwrites this attribute; confirm against CodingBenchmark.
        self.name = "mbpp"
        super().__init__(name=type(self).__name__, path=path, mode=mode, timeout=timeout, **kwargs)

    def _load_data_from_file(self, file_name: str):
        """Load a JSONL data file, downloading the AFlow MBPP bundle when missing.

        Absolute ``file_name`` values are honored as-is, since ``os.path.join``
        discards ``self.path`` when the second argument is absolute.

        Returns:
            The parsed examples, or None when ``file_name`` is None.
        """
        if file_name is None:
            return None
        file_path = os.path.join(self.path, file_name)
        if not os.path.exists(file_path):
            download_aflow_benchmark_data(dataset="mbpp", save_folder=self.path)
        return load_json(path=file_path, type="jsonl")

    def _load_data(self):
        """Populate the split(s) selected by ``self.mode`` plus the MBPP+ test cases."""
        if self.mode == "train" or self.mode == "all":
            logger.info(f"Loading train data from {AFLOW_DATASET_FILES_MAP['mbpp']['train']}")
            self._train_data = self._load_data_from_file(file_name=AFLOW_DATASET_FILES_MAP["mbpp"]["train"])
        if self.mode == "dev" or self.mode == "all":
            logger.info(f"Loading dev data from {AFLOW_DATASET_FILES_MAP['mbpp']['dev']}")
            self._dev_data = self._load_data_from_file(file_name=AFLOW_DATASET_FILES_MAP["mbpp"]["dev"])
        if self.mode == "test" or self.mode == "all":
            # NOTE(review): the log line mentions the AFlow test file, but the
            # data is actually read from _TEST_INFO_FILE (MBPP+ info).
            logger.info(f"Loading test data from {AFLOW_DATASET_FILES_MAP['mbpp']['test']}")
            self._test_data = self._load_data_from_file(self._TEST_INFO_FILE)
        # Test cases are needed for evaluation regardless of mode. Loaded as a
        # separate call (as before) so _test_cases never aliases _test_data.
        self._test_cases = self._load_data_from_file(self._TEST_INFO_FILE)

    def _get_id(self, example: Any) -> Any:
        """Return the unique identifier of *example*."""
        return example["task_id"]

    def _get_label(self, example: Any) -> Any:
        """Return the evaluation label (canonical solution, tests, entry point)."""
        return {
            "task_id": example["task_id"],
            "canonical_solution": example["code"],
            "test": example["test"],
            "entry_point": example["entry_point"]
        }

    def evaluate(self, prediction: Any, label: Any) -> dict:
        """
        Evaluate the solution code.

        Args:
            prediction (str | List[str]): The solution code(s).
            label (dict | List[dict]): The unit test code(s).

        Returns:
            dict: The evaluation metrics (pass@k).
        """
        prediction, label = self._check_evaluation_inputs(prediction, label)

        results = []
        for solution in prediction:
            solution_states = []
            for label_data in label:
                # (An unused get_example_by_id prompt lookup was removed here.)
                state, _message = self.check_solution_plus(
                    task_id=label_data["task_id"],
                    solution=solution,
                    test=label_data["test"],
                    entry_point=label_data["entry_point"]
                )
                # First failure disqualifies this solution; skip remaining labels.
                if state != self.SUCCESS:
                    break
                solution_states.append(state)
            # Passed only when every label was attempted and succeeded.
            results.append(len(solution_states) == len(label) and all(state == self.SUCCESS for state in solution_states))

        k_list = [self.k] if isinstance(self.k, int) else self.k
        return self.compute_pass_at_k(results, k_list)
|
|
|
|
|
|
|
|
|
|
|
class AFlowMBPPPLUS(MBPPPLUS):

    """
    AFlow-specific implementation of MBPP benchmark.

    Data loading (``_load_data_from_file``, ``_load_data``) and labeling
    (``_get_label``) behave exactly as in ``MBPPPLUS`` and are inherited; the
    previous byte-identical overrides were removed.
    """

    def __init__(self, path: str = None, mode: str = "all", timeout: int = 60, k: Union[int, list] = 1, **kwargs):
        """Initialize with the AFlow-specific default data folder.

        Args:
            path: Data folder; defaults to ``~/.evoagentx/data/aflow/mbpp``.
            mode: Which split(s) to load: "train", "dev", "test" or "all".
            timeout: Per-solution execution timeout in seconds.
            k: pass@k value(s) to compute.
        """
        path = os.path.expanduser(path or "~/.evoagentx/data/aflow/mbpp")
        super().__init__(path=path, mode=mode, timeout=timeout, k=k, **kwargs)

    def extract_test_cases_with_entry_point(self, entry_point: str):
        """Return the MBPP+ test code associated with *entry_point*.

        NOTE(review): a few entry points are hardcoded to an empty test
        string — presumably their MBPP+ tests are unusable; confirm why they
        were blanked before relying on their scores.

        Returns:
            The test code string, "" for the hardcoded entry points, or None
            when no matching case exists in ``self._test_cases``.
        """
        hardcoded_cases = {
            "remove_odd": "",
            "replace_spaces": "",
            "snake_to_camel": "",
            "Split": "",
            "swap_List": "",
            "square_Sum": "",
            "sort_sublists": "",
            "unique_sublists": "",
        }
        if entry_point in hardcoded_cases:
            return hardcoded_cases[entry_point]

        for case in self._test_cases:
            if case["entry_point"] == entry_point:
                return case["test"]

        return None

    async def async_evaluate(self, graph: Callable, example: Any) -> float:
        """Generate a solution for *example* via *graph* and return its pass@1.

        Args:
            graph: Awaitable callable taking (prompt, entry_point) and
                returning solution code.
            example: A benchmark example in HumanEval-converted format.
        """
        prompt, entry_point = example["prompt"], example["entry_point"]
        solution = await graph(prompt, entry_point)
        label = self._get_label(example)
        metrics = await super().async_evaluate(prediction=solution, label=label)
        return metrics["pass@1"]

    def evaluate(self, prediction: Any, label: Any) -> dict:
        """
        Evaluate the solution code.

        Args:
            prediction (str | List[str]): The solution code(s).
            label (dict | List[dict]): The unit test code(s).

        Returns:
            dict: The evaluation metrics (pass@k).
        """
        prediction, label = self._check_evaluation_inputs(prediction, label)

        results = []
        for solution in prediction:
            solution_states = []
            for label_data in label:
                task_id = label_data["task_id"]
                # (An unused get_example_by_id prompt lookup was removed here.)
                state, message = self.check_solution_plus(
                    task_id=task_id,
                    solution=solution,
                    test=label_data["test"],
                    entry_point=label_data["entry_point"],
                    use_entrypoint_as_input=False
                )
                # Record the checker's first message line BEFORE the early
                # exit so that failing solutions leave a trace too (the
                # previous ordering only recorded messages for passing tasks).
                self.error_list[task_id] = message.split('\n')[0]
                if state != self.SUCCESS:
                    break
                solution_states.append(state)
            # Passed only when every label was attempted and succeeded.
            results.append(len(solution_states) == len(label) and all(state == self.SUCCESS for state in solution_states))

        k_list = [self.k] if isinstance(self.k, int) else self.k
        return self.compute_pass_at_k(results, k_list)

    def evaluate_mbpp_witherror(self, prediction: Any, label: Any) -> dict:
        """
        Evaluate the solution code, prepending each task's prompt to the
        solution before running the checker.

        Args:
            prediction (str | List[str]): The solution code(s).
            label (dict | List[dict]): The unit test code(s).

        Returns:
            dict: The evaluation metrics (pass@k).
        """
        prediction, label = self._check_evaluation_inputs(prediction, label)

        results = []
        # NOTE(review): collected but never returned or stored — TODO expose
        # these messages (e.g. via self.error_list) or drop the collection.
        error_list = []
        for solution in prediction:
            solution_states = []
            for label_data in label:
                task_id = label_data["task_id"]
                prompt = self.get_example_by_id(task_id)["prompt"]
                state, message = self.check_solution_plus(
                    task_id=task_id,
                    solution=prompt + "\n" + solution,
                    test=label_data["test"],
                    entry_point=label_data["entry_point"],
                    use_entrypoint_as_input=False
                )
                # Leftover debug print() calls replaced with lazy logger calls.
                logger.debug("check_solution_plus state: %s", state)
                logger.debug("check_solution_plus message: %s", message)
                # Capture the message before the early exit so failures are
                # recorded as well.
                error_list.append(message)
                if state != self.SUCCESS:
                    break
                solution_states.append(state)
            results.append(len(solution_states) == len(label) and all(state == self.SUCCESS for state in solution_states))

        k_list = [self.k] if isinstance(self.k, int) else self.k
        return self.compute_pass_at_k(results, k_list)
|
|
|