import os
from typing import Union, Any, List, Callable

from ..core.logging import logger
from .benchmark import CodingBenchmark
from ..core.module_utils import extract_code_blocks
from .lcb_utils.code_generation import (
    CodeGenerationProblem,
    load_code_generation_dataset
)
from .lcb_utils.test_output_prediction import (
    TestOutputPredictionProblem,
    load_test_prediction_dataset
)
from .lcb_utils.code_execution import (
    CodeExecutionProblem,
    load_code_execution_dataset
)
from .lcb_utils.evaluation import (
    codegen_metrics,
    test_output_metrics,
    code_execution_metrics,
    reliability_guard
)
from .lcb_utils.utils import extract_test_output_code, extract_execution_code


VALID_SCENARIO = ["code_generation", "test_output_prediction", "code_execution"]


class LiveCodeBench(CodingBenchmark):
    """Benchmark class for evaluating LLM capabilities on real-world programming tasks.

    LiveCodeBench provides a framework for evaluating different scenarios of code-related tasks:

    1. Code Generation: generating code from problem descriptions
    2. Test Output Prediction: predicting test outputs given test code
    3. Code Execution: predicting the output of executing given code

    The benchmark supports different evaluation modes and metrics, and can be customized
    with parameters such as timeouts, sample date ranges, and processing options.

    Attributes:
        k: An integer or list of integers specifying which pass@k metrics to compute.
        version: Release version of the dataset to use.
        num_process: Number of processes to use for evaluation.
        start_date: Filter problems to those after this date.
        end_date: Filter problems to those before this date.
        scenario: Type of programming task to evaluate ("code_generation",
            "test_output_prediction", or "code_execution").
        use_cot_for_execution: Whether to use chain-of-thought processing for code execution.
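
    Example:
        A minimal, illustrative sketch of single-problem evaluation. It assumes the
        dataset is available locally, that the base class loads the test split into
        ``_test_data``, and that ``run_my_model`` is a user-provided function (not
        part of this module) returning the model's raw answer as a string::

            benchmark = LiveCodeBench(scenario="code_generation", mode="test", k=1)
            problem = benchmark._test_data[0]
            label = benchmark._get_label(problem)
            answer = run_my_model(problem.question_content)  # should contain a fenced code block
            print(benchmark.evaluate(prediction=answer, label=label))  # e.g. {"pass@1": 0.0}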
    """

    def __init__(
        self,
        path: str = None,
        mode: str = "all",
        timeout: int = 60,
        k: Union[int, list] = 1,
        num_process: int = 6,
        scenario: str = "code_generation",
        version: str = "release_latest",
        start_date: str = None,
        end_date: str = None,
        use_cot_for_execution: bool = False,
        **kwargs
    ):
        path = os.path.expanduser(path or "~/.evoagentx/data/livecodebench")
        self.k = k
        self.version = version
        self.num_process = num_process
        self.start_date = start_date
        self.end_date = end_date
        self.scenario = scenario
        self.name = 'livecodebench'
        self.use_cot_for_execution = use_cot_for_execution
        assert scenario in VALID_SCENARIO, f"Invalid scenario: {scenario}. Available choices: {VALID_SCENARIO}."
        super().__init__(name=type(self).__name__, path=path, mode=mode, timeout=timeout, **kwargs)

    def _load_data(self):
        if self.mode == "train" or self.mode == "all":
            self._train_data = None
        if self.mode == "dev" or self.mode == "all":
            self._dev_data = None
        if self.mode == "test" or self.mode == "all":
            self._test_data = self._load_test_data()

    def _load_test_data(self):
        if self.scenario == "code_generation":
            logger.info(f"Loading code generation dataset from {self.path} with version {self.version}.")
            data: List[CodeGenerationProblem] = load_code_generation_dataset(
                release_version=self.version,
                cache_dir=self.path,
                start_date=self.start_date,
                end_date=self.end_date
            )
        elif self.scenario == "test_output_prediction":
            logger.info(f"Loading test output prediction dataset from {self.path}.")
            data: List[TestOutputPredictionProblem] = load_test_prediction_dataset(cache_dir=self.path)
        elif self.scenario == "code_execution":
            logger.info(f"Loading code execution dataset from {self.path}.")
            data: List[CodeExecutionProblem] = load_code_execution_dataset(cache_dir=self.path)
        else:
            raise ValueError(f"Invalid scenario: {self.scenario}. Available choices: {VALID_SCENARIO}.")

        return data

    def _get_id(self, example: Union[CodeGenerationProblem, TestOutputPredictionProblem]) -> str:
        return example.question_id

    def _get_label(self, example: Union[CodeGenerationProblem, TestOutputPredictionProblem]) -> dict:
        return example.get_evaluation_sample()

    async def async_evaluate(self, graph: Callable, example: Any) -> float:
        """Generate a solution for ``example`` with ``graph`` and return its pass@1 score."""
        prompt, entry_point = example.question_content, example.starter_code
        solution = await graph(prompt, entry_point)
        label = self._get_label(example)
        metrics = await super().async_evaluate(prediction=solution, label=label)
        return metrics["pass@1"]

    def evaluate(self, prediction: Any, label: Any) -> dict:
        """
        Evaluate the solution code.

        Args:
            prediction (str | List[str]): The solution code(s).
            label (dict | List[dict]): The test cases and expected outputs.

        Returns:
            dict: The evaluation metrics (pass@k).
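
        Example:
            Illustrative only: with ``k=[1, 5]`` the returned dictionary has the
            form ``{"pass@1": ..., "pass@5": ...}``, where each value is the
            corresponding pass@k score as a float.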
        """
        prediction, label = self._check_evaluation_inputs(prediction, label)
        k_list = [self.k] if isinstance(self.k, int) else self.k

        if self.scenario == "code_generation":
            solutions: List[str] = [extract_code_blocks(pred)[0] for pred in prediction]
            metrics, results, metadatas = codegen_metrics(
                samples_list=label,
                generations_list=[solutions],
                k_list=k_list,
                num_process_evaluate=self.num_process,
                timeout=self.timeout
            )
            self.met = metrics
            self.res = results
            self.metadatas = metadatas
        elif self.scenario == "test_output_prediction":
            pred_outputs = [extract_test_output_code(pred) for pred in prediction]
            metrics, results = test_output_metrics(
                samples=label,
                generations=[pred_outputs],
                k_list=k_list,
            )
        elif self.scenario == "code_execution":
            pred_outputs = [extract_execution_code(pred, self.use_cot_for_execution) for pred in prediction]
            metrics, results = code_execution_metrics(
                samples=label,
                generations=[pred_outputs],
            )
        else:
            raise ValueError(f"Invalid scenario: {self.scenario}. Available choices: {VALID_SCENARIO}.")

        pass_at_k = {f"pass@{k}": float(metrics[f"pass@{k}"]) for k in k_list}
        return pass_at_k


class AFlowLiveCodeBench(CodingBenchmark):
    """AFlow variant of LiveCodeBench for evaluating LLM capabilities on real-world programming tasks.

    It covers the same scenarios of code-related tasks:

    1. Code Generation: generating code from problem descriptions
    2. Test Output Prediction: predicting test outputs given test code
    3. Code Execution: predicting the output of executing given code

    Compared with ``LiveCodeBench``, ``async_evaluate`` passes the problem's
    ``question_title`` (rather than its starter code) to the workflow graph, and
    ``extract_test_cases_with_entry_point`` looks up a problem's private test cases
    by that title.

    The benchmark supports different evaluation modes and metrics, and can be customized
    with parameters such as timeouts, sample date ranges, and processing options.

    Attributes:
        k: An integer or list of integers specifying which pass@k metrics to compute.
        version: Release version of the dataset to use.
        num_process: Number of processes to use for evaluation.
        start_date: Filter problems to those after this date.
        end_date: Filter problems to those before this date.
        scenario: Type of programming task to evaluate ("code_generation",
            "test_output_prediction", or "code_execution").
        use_cot_for_execution: Whether to use chain-of-thought processing for code execution.
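
    Example:
        Illustrative sketch of a ``graph`` callable compatible with
        ``async_evaluate``; ``call_llm`` is a hypothetical async helper standing in
        for whatever workflow or model you actually use::

            async def graph(problem: str, entry_point: str) -> str:
                prompt = problem + " Entry point: " + entry_point
                return await call_llm(prompt)  # must return the raw model answer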
    """

    def __init__(
        self,
        path: str = None,
        mode: str = "all",
        timeout: int = 60,
        k: Union[int, list] = 1,
        num_process: int = 6,
        scenario: str = "code_generation",
        version: str = "release_latest",
        start_date: str = None,
        end_date: str = None,
        use_cot_for_execution: bool = False,
        **kwargs
    ):
        path = os.path.expanduser(path or "~/.evoagentx/data/livecodebench")
        self.k = k
        self.version = version
        self.num_process = num_process
        self.start_date = start_date
        self.end_date = end_date
        self.scenario = scenario
        self.use_cot_for_execution = use_cot_for_execution
        assert scenario in VALID_SCENARIO, f"Invalid scenario: {scenario}. Available choices: {VALID_SCENARIO}."
        super().__init__(name=type(self).__name__, path=path, mode=mode, timeout=timeout, **kwargs)

    def _load_data(self):
        if self.mode == "train" or self.mode == "all":
            self._train_data = None
        if self.mode == "dev" or self.mode == "all":
            self._dev_data = None
        if self.mode == "test" or self.mode == "all":
            self._test_data = self._load_test_data()

    def _load_test_data(self):
        if self.scenario == "code_generation":
            logger.info(f"Loading code generation dataset from {self.path} with version {self.version}.")
            data: List[CodeGenerationProblem] = load_code_generation_dataset(
                release_version=self.version,
                cache_dir=self.path,
                start_date=self.start_date,
                end_date=self.end_date
            )
        elif self.scenario == "test_output_prediction":
            logger.info(f"Loading test output prediction dataset from {self.path}.")
            data: List[TestOutputPredictionProblem] = load_test_prediction_dataset(cache_dir=self.path)
        elif self.scenario == "code_execution":
            logger.info(f"Loading code execution dataset from {self.path}.")
            data: List[CodeExecutionProblem] = load_code_execution_dataset(cache_dir=self.path)
        else:
            raise ValueError(f"Invalid scenario: {self.scenario}. Available choices: {VALID_SCENARIO}.")

        return data

    def _get_id(self, example: Union[CodeGenerationProblem, TestOutputPredictionProblem]) -> str:
        return example.question_id

    def _get_label(self, example: Union[CodeGenerationProblem, TestOutputPredictionProblem]) -> dict:
        return example.get_evaluation_sample()

    async def async_evaluate(self, graph: Callable, example: Any) -> float:
        """Generate a solution for ``example`` with ``graph``, passing its ``question_title``
        as the entry point, and return the pass@1 score."""
        prompt, entry_point = example.question_content, example.question_title
        solution = await graph(prompt, entry_point)
        label = self._get_label(example)
        metrics = await super().async_evaluate(prediction=solution, label=label)
        return metrics["pass@1"]

    def extract_test_cases_with_entry_point(self, entry_point: str):
        """Return the private test cases of the problem whose ``question_title`` matches
        ``entry_point``, or ``None`` if no such problem is loaded. A few entry points are
        hardcoded to empty test cases.
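
        Example:
            Illustrative only; ``max_sub_array`` is a hypothetical entry point and
            the test split must have been loaded (``mode="test"`` or ``mode="all"``)::

                benchmark = AFlowLiveCodeBench(mode="test")
                cases = benchmark.extract_test_cases_with_entry_point("max_sub_array")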
        """
        hardcoded_cases = {
            "remove_odd": "",
            "replace_spaces": "",
            "snake_to_camel": "",
            "Split": "",
            "swap_List": "",
            "square_Sum": "",
            "sort_sublists": "",
            "unique_sublists": "",
        }
        if entry_point in hardcoded_cases:
            return hardcoded_cases[entry_point]

        # The dev split is never populated by _load_data, so guard against None here.
        for case in (self._dev_data or []) + (self._test_data or []):
            logger.debug(f"Matching entry point {entry_point!r} against problem title {case.question_title!r}")
            if case.question_title == entry_point:
                return case.private_test_cases

        return None

    def evaluate(self, prediction: Any, label: Any) -> dict:
        """
        Evaluate the solution code.

        Args:
            prediction (str | List[str]): The solution code(s).
            label (dict | List[dict]): The test cases and expected outputs.

        Returns:
            dict: The evaluation metrics (pass@k).
        """
        prediction, label = self._check_evaluation_inputs(prediction, label)
        k_list = [self.k] if isinstance(self.k, int) else self.k

        if self.scenario == "code_generation":
            solutions: List[str] = [extract_code_blocks(pred)[0] for pred in prediction]
            metrics, results, metadatas = codegen_metrics(
                samples_list=label,
                generations_list=[solutions],
                k_list=k_list,
                num_process_evaluate=self.num_process,
                timeout=self.timeout
            )
        elif self.scenario == "test_output_prediction":
            pred_outputs = [extract_test_output_code(pred) for pred in prediction]
            metrics, results = test_output_metrics(
                samples=label,
                generations=[pred_outputs],
                k_list=k_list,
            )
        elif self.scenario == "code_execution":
            pred_outputs = [extract_execution_code(pred, self.use_cot_for_execution) for pred in prediction]
            metrics, results = code_execution_metrics(
                samples=label,
                generations=[pred_outputs],
            )
        else:
            raise ValueError(f"Invalid scenario: {self.scenario}. Available choices: {VALID_SCENARIO}.")

        pass_at_k = {f"pass@{k}": float(metrics[f"pass@{k}"]) for k in k_list}
        return pass_at_k