import os
from typing import Union, Any, List, Callable

from ..core.logging import logger
from .benchmark import CodingBenchmark
from ..core.module_utils import extract_code_blocks
from .lcb_utils.code_generation import (
    CodeGenerationProblem,
    load_code_generation_dataset
)
from .lcb_utils.test_output_prediction import (
    TestOutputPredictionProblem,
    load_test_prediction_dataset
)
from .lcb_utils.code_execution import (
    CodeExecutionProblem,
    load_code_execution_dataset
)
from .lcb_utils.evaluation import (
    codegen_metrics,
    test_output_metrics,
    code_execution_metrics,
    reliability_guard
)
from .lcb_utils.utils import extract_test_output_code, extract_execution_code


VALID_SCENARIO = ["code_generation", "test_output_prediction", "code_execution"]


class LiveCodeBench(CodingBenchmark):
    """Benchmark class for evaluating LLM capabilities on real-world programming tasks.

    LiveCodeBench provides a framework for evaluating different scenarios of code-related tasks:

    1. Code Generation: generating code from problem descriptions
    2. Test Output Prediction: predicting test outputs given test code
    3. Code Execution: predicting the output of executing given code

    The benchmark supports different evaluation modes and metrics, and can be customized
    with parameters such as timeouts, sample date ranges, and processing options.

    Attributes:
        k: An integer or list of integers specifying which pass@k metrics to compute.
        version: Release version of the dataset to use.
        num_process: Number of processes to use for evaluation.
        start_date: Filter problems to those after this date.
        end_date: Filter problems to those before this date.
        scenario: Type of programming task to evaluate ("code_generation",
            "test_output_prediction", or "code_execution").
        use_cot_for_execution: Whether to use chain-of-thought processing for code execution.
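
    Example:
        A minimal, illustrative sketch of single-problem evaluation. It assumes the
        dataset is available locally, that the base class loads the test split into
        ``_test_data``, and that ``run_my_model`` is a user-provided function (not
        part of this module) returning the model's raw answer as a string::

            benchmark = LiveCodeBench(scenario="code_generation", mode="test", k=1)
            problem = benchmark._test_data[0]
            label = benchmark._get_label(problem)
            answer = run_my_model(problem.question_content)  # should contain a fenced code block
            print(benchmark.evaluate(prediction=answer, label=label))  # e.g. {"pass@1": 0.0}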
    """

    def __init__(
        self,
        path: str = None,
        mode: str = "all",
        timeout: int = 60,
        k: Union[int, list] = 1,
        num_process: int = 6,
        scenario: str = "code_generation",
        version: str = "release_latest",
        start_date: str = None,
        end_date: str = None,
        use_cot_for_execution: bool = False,
        **kwargs
    ):
        path = os.path.expanduser(path or "~/.evoagentx/data/livecodebench")
        self.k = k
        self.version = version
        self.num_process = num_process
        self.start_date = start_date
        self.end_date = end_date
        self.scenario = scenario
        self.name = 'livecodebench'
        self.use_cot_for_execution = use_cot_for_execution
        assert scenario in VALID_SCENARIO, f"Invalid scenario: {scenario}. Available choices: {VALID_SCENARIO}."
        super().__init__(name=type(self).__name__, path=path, mode=mode, timeout=timeout, **kwargs)

    def _load_data(self):
        if self.mode == "train" or self.mode == "all":
            self._train_data = None
        if self.mode == "dev" or self.mode == "all":
            self._dev_data = None
        if self.mode == "test" or self.mode == "all":
            self._test_data = self._load_test_data()

    def _load_test_data(self):
        if self.scenario == "code_generation":
            logger.info(f"Loading code generation dataset from {self.path} with version {self.version}.")
            data: List[CodeGenerationProblem] = load_code_generation_dataset(
                release_version=self.version,
                cache_dir=self.path,
                start_date=self.start_date,
                end_date=self.end_date
            )
        elif self.scenario == "test_output_prediction":
            logger.info(f"Loading test output prediction dataset from {self.path}.")
            data: List[TestOutputPredictionProblem] = load_test_prediction_dataset(cache_dir=self.path)
        elif self.scenario == "code_execution":
            logger.info(f"Loading code execution dataset from {self.path}.")
            data: List[CodeExecutionProblem] = load_code_execution_dataset(cache_dir=self.path)
        else:
            raise ValueError(f"Invalid scenario: {self.scenario}. Available choices: {VALID_SCENARIO}.")

        return data

    def _get_id(self, example: Union[CodeGenerationProblem, TestOutputPredictionProblem]) -> str:
        return example.question_id

    def _get_label(self, example: Union[CodeGenerationProblem, TestOutputPredictionProblem]) -> dict:
        return example.get_evaluation_sample()

    async def async_evaluate(self, graph: Callable, example: Any) -> float:
        """Generate a solution for ``example`` with ``graph`` and return its pass@1 score."""
        prompt, entry_point = example.question_content, example.starter_code
        solution = await graph(prompt, entry_point)
        label = self._get_label(example)
        metrics = await super().async_evaluate(prediction=solution, label=label)
        return metrics["pass@1"]

    def evaluate(self, prediction: Any, label: Any) -> dict:
        """
        Evaluate the solution code.

        Args:
            prediction (str | List[str]): The solution code(s).
            label (dict | List[dict]): The test cases and expected outputs.

        Returns:
            dict: The evaluation metrics (pass@k).
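
        Example:
            Illustrative only: with ``k=[1, 5]`` the returned dictionary has the
            form ``{"pass@1": ..., "pass@5": ...}``, where each value is the
            corresponding pass@k score as a float.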
        """
        prediction, label = self._check_evaluation_inputs(prediction, label)
        k_list = [self.k] if isinstance(self.k, int) else self.k

        if self.scenario == "code_generation":
            solutions: List[str] = [extract_code_blocks(pred)[0] for pred in prediction]
            metrics, results, metadatas = codegen_metrics(
                samples_list=label,
                generations_list=[solutions],
                k_list=k_list,
                num_process_evaluate=self.num_process,
                timeout=self.timeout
            )
            self.met = metrics
            self.res = results
            self.metadatas = metadatas
        elif self.scenario == "test_output_prediction":
            pred_outputs = [extract_test_output_code(pred) for pred in prediction]
            metrics, results = test_output_metrics(
                samples=label,
                generations=[pred_outputs],
                k_list=k_list,
            )
        elif self.scenario == "code_execution":
            pred_outputs = [extract_execution_code(pred, self.use_cot_for_execution) for pred in prediction]
            metrics, results = code_execution_metrics(
                samples=label,
                generations=[pred_outputs],
            )
        else:
            raise ValueError(f"Invalid scenario: {self.scenario}. Available choices: {VALID_SCENARIO}.")

        pass_at_k = {f"pass@{k}": float(metrics[f"pass@{k}"]) for k in k_list}
        return pass_at_k


class AFlowLiveCodeBench(CodingBenchmark):
    """AFlow variant of LiveCodeBench for evaluating LLM capabilities on real-world programming tasks.

    It covers the same scenarios of code-related tasks:

    1. Code Generation: generating code from problem descriptions
    2. Test Output Prediction: predicting test outputs given test code
    3. Code Execution: predicting the output of executing given code

    Compared with ``LiveCodeBench``, ``async_evaluate`` passes the problem's
    ``question_title`` (rather than its starter code) to the workflow graph, and
    ``extract_test_cases_with_entry_point`` looks up a problem's private test cases
    by that title.

    The benchmark supports different evaluation modes and metrics, and can be customized
    with parameters such as timeouts, sample date ranges, and processing options.

    Attributes:
        k: An integer or list of integers specifying which pass@k metrics to compute.
        version: Release version of the dataset to use.
        num_process: Number of processes to use for evaluation.
        start_date: Filter problems to those after this date.
        end_date: Filter problems to those before this date.
        scenario: Type of programming task to evaluate ("code_generation",
            "test_output_prediction", or "code_execution").
        use_cot_for_execution: Whether to use chain-of-thought processing for code execution.
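
    Example:
        Illustrative sketch of a ``graph`` callable compatible with
        ``async_evaluate``; ``call_llm`` is a hypothetical async helper standing in
        for whatever workflow or model you actually use::

            async def graph(problem: str, entry_point: str) -> str:
                prompt = problem + " Entry point: " + entry_point
                return await call_llm(prompt)  # must return the raw model answer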
    """

    def __init__(
        self,
        path: str = None,
        mode: str = "all",
        timeout: int = 60,
        k: Union[int, list] = 1,
        num_process: int = 6,
        scenario: str = "code_generation",
        version: str = "release_latest",
        start_date: str = None,
        end_date: str = None,
        use_cot_for_execution: bool = False,
        **kwargs
    ):
        path = os.path.expanduser(path or "~/.evoagentx/data/livecodebench")
        self.k = k
        self.version = version
        self.num_process = num_process
        self.start_date = start_date
        self.end_date = end_date
        self.scenario = scenario
        self.use_cot_for_execution = use_cot_for_execution
        assert scenario in VALID_SCENARIO, f"Invalid scenario: {scenario}. Available choices: {VALID_SCENARIO}."
        super().__init__(name=type(self).__name__, path=path, mode=mode, timeout=timeout, **kwargs)

    def _load_data(self):
        if self.mode == "train" or self.mode == "all":
            self._train_data = None
        if self.mode == "dev" or self.mode == "all":
            self._dev_data = None
        if self.mode == "test" or self.mode == "all":
            self._test_data = self._load_test_data()

    def _load_test_data(self):
        if self.scenario == "code_generation":
            logger.info(f"Loading code generation dataset from {self.path} with version {self.version}.")
            data: List[CodeGenerationProblem] = load_code_generation_dataset(
                release_version=self.version,
                cache_dir=self.path,
                start_date=self.start_date,
                end_date=self.end_date
            )
        elif self.scenario == "test_output_prediction":
            logger.info(f"Loading test output prediction dataset from {self.path}.")
            data: List[TestOutputPredictionProblem] = load_test_prediction_dataset(cache_dir=self.path)
        elif self.scenario == "code_execution":
            logger.info(f"Loading code execution dataset from {self.path}.")
            data: List[CodeExecutionProblem] = load_code_execution_dataset(cache_dir=self.path)
        else:
            raise ValueError(f"Invalid scenario: {self.scenario}. Available choices: {VALID_SCENARIO}.")

        return data

    def _get_id(self, example: Union[CodeGenerationProblem, TestOutputPredictionProblem]) -> str:
        return example.question_id

    def _get_label(self, example: Union[CodeGenerationProblem, TestOutputPredictionProblem]) -> dict:
        return example.get_evaluation_sample()

    async def async_evaluate(self, graph: Callable, example: Any) -> float:
        """Generate a solution for ``example`` with ``graph``, passing its ``question_title``
        as the entry point, and return the pass@1 score."""
        prompt, entry_point = example.question_content, example.question_title
        solution = await graph(prompt, entry_point)
        label = self._get_label(example)
        metrics = await super().async_evaluate(prediction=solution, label=label)
        return metrics["pass@1"]

    def extract_test_cases_with_entry_point(self, entry_point: str):
        """Return the private test cases of the problem whose ``question_title`` matches
        ``entry_point``, or ``None`` if no such problem is loaded. A few entry points are
        hardcoded to empty test cases.
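
        Example:
            Illustrative only; ``max_sub_array`` is a hypothetical entry point and
            the test split must have been loaded (``mode="test"`` or ``mode="all"``)::

                benchmark = AFlowLiveCodeBench(mode="test")
                cases = benchmark.extract_test_cases_with_entry_point("max_sub_array")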
        """
        hardcoded_cases = {
            "remove_odd": "",
            "replace_spaces": "",
            "snake_to_camel": "",
            "Split": "",
            "swap_List": "",
            "square_Sum": "",
            "sort_sublists": "",
            "unique_sublists": "",
        }
        if entry_point in hardcoded_cases:
            return hardcoded_cases[entry_point]

        # The dev split is never populated by _load_data, so guard against None here.
        for case in (self._dev_data or []) + (self._test_data or []):
            logger.debug(f"Matching entry point {entry_point!r} against problem title {case.question_title!r}")
            if case.question_title == entry_point:
                return case.private_test_cases

        return None

    def evaluate(self, prediction: Any, label: Any) -> dict:
        """
        Evaluate the solution code.

        Args:
            prediction (str | List[str]): The solution code(s).
            label (dict | List[dict]): The test cases and expected outputs.

        Returns:
            dict: The evaluation metrics (pass@k).
        """
        prediction, label = self._check_evaluation_inputs(prediction, label)
        k_list = [self.k] if isinstance(self.k, int) else self.k

        if self.scenario == "code_generation":
            solutions: List[str] = [extract_code_blocks(pred)[0] for pred in prediction]
            metrics, results, metadatas = codegen_metrics(
                samples_list=label,
                generations_list=[solutions],
                k_list=k_list,
                num_process_evaluate=self.num_process,
                timeout=self.timeout
            )
        elif self.scenario == "test_output_prediction":
            pred_outputs = [extract_test_output_code(pred) for pred in prediction]
            metrics, results = test_output_metrics(
                samples=label,
                generations=[pred_outputs],
                k_list=k_list,
            )
        elif self.scenario == "code_execution":
            pred_outputs = [extract_execution_code(pred, self.use_cot_for_execution) for pred in prediction]
            metrics, results = code_execution_metrics(
                samples=label,
                generations=[pred_outputs],
            )
        else:
            raise ValueError(f"Invalid scenario: {self.scenario}. Available choices: {VALID_SCENARIO}.")

        pass_at_k = {f"pass@{k}": float(metrics[f"pass@{k}"]) for k in k_list}
        return pass_at_k