File size: 14,136 Bytes

5374a2d

import os 
# import regex
from typing import Union, Any, List
from ..core.logging import logger
from .benchmark import CodingBenchmark 
from ..core.module_utils import extract_code_blocks
from typing import Union, Any, List, Callable
from .lcb_utils.code_generation import (
    CodeGenerationProblem, 
    load_code_generation_dataset
)
from .lcb_utils.test_output_prediction import (
    TestOutputPredictionProblem, 
    load_test_prediction_dataset
)
from .lcb_utils.code_execution import (
    CodeExecutionProblem, 
    load_code_execution_dataset
)
from .lcb_utils.evaluation import (
    codegen_metrics, 
    test_output_metrics,
    code_execution_metrics,
    reliability_guard
)
from .lcb_utils.utils import extract_test_output_code, extract_execution_code


VALID_SCENARIO = ["code_generation", "test_output_prediction", "code_execution"]

class LiveCodeBench(CodingBenchmark):

    """Benchmark class for evaluating LLM capabilities on real-world programming tasks.
    
    LiveCodeBench provides a framework for evaluating different scenarios of code-related tasks:
    1. Code Generation: generating code from problem descriptions
    2. Test Output Prediction: predicting test outputs given test code
    3. Code Execution: generating code that executes correctly
    
    The benchmark supports different evaluation modes, metrics, and can be customized
    with various parameters like timeouts, sample dates, and processing options.
    
    Attributes:
        k: An integer or list of integers specifying which pass@k metrics to compute
        version: Release version of the dataset to use
        num_process: Number of processes to use for evaluation
        start_date: Filter problems to those after this date
        end_date: Filter problems to those before this date
        scenario: Type of programming task to evaluate ("code_generation", 
                  "test_output_prediction", or "code_execution")
        use_cot_for_execution: Whether to use chain-of-thought processing for code execution
    """

    def __init__(
        self, 
        path: str = None, 
        mode: str = "all", 
        timeout: int = 60, 
        k: Union[int, list] = 1, 
        num_process: int = 6, 
        scenario: str = "code_generation", 
        version: str = "release_latest", 
        start_date: str = None, 
        end_date: str = None, 
        use_cot_for_execution: bool = False, 
        **kwargs
    ):
        path = os.path.expanduser(path or "~/.evoagentx/data/livecodebench")
        self.k = k 
        self.version = version
        self.num_process = num_process
        self.start_date = start_date
        self.end_date = end_date
        self.scenario = scenario 
        self.name = 'livecodebench'
        self.use_cot_for_execution = use_cot_for_execution
        assert scenario in VALID_SCENARIO, f"Invalid scenario: {scenario}. Available choices: {VALID_SCENARIO}." 
        super().__init__(name=type(self).__name__, path=path, mode=mode, timeout=timeout, **kwargs)
    
    def _load_data(self):
        if self.mode == "train" or self.mode == "all":
            self._train_data = None 
        if self.mode == "dev" or self.mode == "all":
            self._dev_data = None 
        if self.mode == "test" or self.mode == "all":
            self._test_data = self._load_test_data()
    
    def _load_test_data(self):

        if self.scenario == "code_generation":
            logger.info(f"Loading code generation dataset from {self.path} with version {self.version}.")
            data: List[CodeGenerationProblem] = load_code_generation_dataset(
                release_version=self.version, 
                cache_dir=self.path, 
                start_date=self.start_date, 
                end_date=self.end_date
            )
        elif self.scenario == "test_output_prediction":
            logger.info(f"Loading test output prediction dataset from {self.path}.")
            data: List[TestOutputPredictionProblem] = load_test_prediction_dataset(cache_dir=self.path)
        elif self.scenario == "code_execution":
            logger.info(f"Loading code execution dataset from {self.path}.")
            data: List[CodeExecutionProblem] = load_code_execution_dataset(cache_dir=self.path)
        else:
            raise ValueError(f"Invalid scenario: {self.scenario}. Available choices: {VALID_SCENARIO}.")

        return data 
    
    def _get_id(self, example: Union[CodeGenerationProblem, TestOutputPredictionProblem]) -> str:
        return example.question_id  
    
    def _get_label(self, example: Union[CodeGenerationProblem, TestOutputPredictionProblem]) -> dict:
        return example.get_evaluation_sample()
    
    async def async_evaluate(self, graph: Callable, example: Any) -> float:

        # generate solution 
        prompt, entry_point = example.question_content, example.starter_code
        solution = await graph(prompt, entry_point)
        label = self._get_label(example)
        metrics = await super().async_evaluate(prediction=solution, label=label)
        return metrics["pass@1"]
    
    
    def evaluate(self, prediction: Any, label: Any) -> dict:
        """
        Evaluate the solution code.

        Args:
            prediction (str | List[str]): The solution code(s).
            label (dict | List[dict]): The test cases and expected outputs. 

        Returns:
            dict: The evaluation metrics (pass@k).
        """
        
#         print("pred", prediction)
#         print("label", label)
#         reliability_guard(8*1024*1024*1024)
        prediction, label = self._check_evaluation_inputs(prediction, label)
        k_list = [self.k] if isinstance(self.k, int) else self.k

        if self.scenario == "code_generation":
            solutions: List[str] = [extract_code_blocks(pred)[0] for pred in prediction]
#             print(solutions)
#             print(label)
            metrics, results, metadatas = codegen_metrics(
                samples_list=label, # label is already a list 
                generations_list=[solutions], # for a single example. 
                k_list=k_list, 
                num_process_evaluate=self.num_process,
                timeout=self.timeout
            )
    
            self.met = metrics
            self.res = results
            self.metadatas = metadatas
            
        elif self.scenario == "test_output_prediction":
            pred_outputs = [extract_test_output_code(pred) for pred in prediction]
            metrics, results = test_output_metrics(
                samples=label, 
                generations=[pred_outputs], 
                k_list=k_list, 
            )
        elif self.scenario == "code_execution":
            pred_outputs = [extract_execution_code(pred, self.use_cot_for_execution) for pred in prediction]
            metrics, results = code_execution_metrics(
                samples=label, 
                generations=[pred_outputs], 
            )
        else:
            raise ValueError(f"Invalid scenario: {self.scenario}. Available choices: {VALID_SCENARIO}.")
        
        pass_at_k = {f"pass@{k}": float(metrics[f"pass@{k}"]) for k in k_list}
        return pass_at_k
    
class AFlowLiveCodeBench(CodingBenchmark):

    """Benchmark class for evaluating LLM capabilities on real-world programming tasks.
    
    LiveCodeBench provides a framework for evaluating different scenarios of code-related tasks:
    1. Code Generation: generating code from problem descriptions
    2. Test Output Prediction: predicting test outputs given test code
    3. Code Execution: generating code that executes correctly
    
    The benchmark supports different evaluation modes, metrics, and can be customized
    with various parameters like timeouts, sample dates, and processing options.
    
    Attributes:
        k: An integer or list of integers specifying which pass@k metrics to compute
        version: Release version of the dataset to use
        num_process: Number of processes to use for evaluation
        start_date: Filter problems to those after this date
        end_date: Filter problems to those before this date
        scenario: Type of programming task to evaluate ("code_generation", 
                  "test_output_prediction", or "code_execution")
        use_cot_for_execution: Whether to use chain-of-thought processing for code execution
    """

    def __init__(
        self, 
        path: str = None, 
        mode: str = "all", 
        timeout: int = 60, 
        k: Union[int, list] = 1, 
        num_process: int = 6, 
        scenario: str = "code_generation", 
        version: str = "release_latest", 
        start_date: str = None, 
        end_date: str = None, 
        use_cot_for_execution: bool = False, 
        **kwargs
    ):
        path = os.path.expanduser(path or "~/.evoagentx/data/livecodebench")
        self.k = k 
        self.version = version
        self.num_process = num_process
        self.start_date = start_date
        self.end_date = end_date
        self.scenario = scenario 
        self.use_cot_for_execution = use_cot_for_execution
        assert scenario in VALID_SCENARIO, f"Invalid scenario: {scenario}. Available choices: {VALID_SCENARIO}." 
        super().__init__(name=type(self).__name__, path=path, mode=mode, timeout=timeout, **kwargs)
    
    def _load_data(self):
        if self.mode == "train" or self.mode == "all":
            self._train_data = None 
        if self.mode == "dev" or self.mode == "all":
            self._dev_data = None 
        if self.mode == "test" or self.mode == "all":
            self._test_data = self._load_test_data()
    
    def _load_test_data(self):

        if self.scenario == "code_generation":
            logger.info(f"Loading code generation dataset from {self.path} with version {self.version}.")
            data: List[CodeGenerationProblem] = load_code_generation_dataset(
                release_version=self.version, 
                cache_dir=self.path, 
                start_date=self.start_date, 
                end_date=self.end_date
            )
        elif self.scenario == "test_output_prediction":
            logger.info(f"Loading test output prediction dataset from {self.path}.")
            data: List[TestOutputPredictionProblem] = load_test_prediction_dataset(cache_dir=self.path)
        elif self.scenario == "code_execution":
            logger.info(f"Loading code execution dataset from {self.path}.")
            data: List[CodeExecutionProblem] = load_code_execution_dataset(cache_dir=self.path)
        else:
            raise ValueError(f"Invalid scenario: {self.scenario}. Available choices: {VALID_SCENARIO}.")

        return data 
    
    def _get_id(self, example: Union[CodeGenerationProblem, TestOutputPredictionProblem]) -> str:
        return example.question_id  
    
    def _get_label(self, example: Union[CodeGenerationProblem, TestOutputPredictionProblem]) -> dict:
        return example.get_evaluation_sample()
    
    async def async_evaluate(self, graph: Callable, example: Any) -> float:

        # generate solution 
        prompt, entry_point = example.question_content, example.question_title
        solution = await graph(prompt, entry_point)
        label = self._get_label(example)
        metrics = await super().async_evaluate(prediction=solution, label=label)
        return metrics["pass@1"]
    
    def extract_test_cases_with_entry_point(self, entry_point: str):

        hardcoded_cases = {
            "remove_odd": "",
            "replace_spaces": "",
            "snake_to_camel": "",
            "Split": "",
            "swap_List": "",
            "square_Sum": "",
            "sort_sublists": "",
            "unique_sublists": "",
        }
        if entry_point in hardcoded_cases:
            return hardcoded_cases[entry_point]
        
        for case in self._dev_data + self._test_data:
            print(entry_point)
            print(case.question_title)
            if case.question_title == entry_point:
                return case.private_test_cases
        
        return None
    
    
    def evaluate(self, prediction: Any, label: Any) -> dict:
        """
        Evaluate the solution code.

        Args:
            prediction (str | List[str]): The solution code(s).
            label (dict | List[dict]): The test cases and expected outputs. 

        Returns:
            dict: The evaluation metrics (pass@k).
        """
        
#         print("pred", prediction)
#         print("label", label)
#         reliability_guard(8*1024*1024*1024)
        prediction, label = self._check_evaluation_inputs(prediction, label)
        k_list = [self.k] if isinstance(self.k, int) else self.k

        if self.scenario == "code_generation":
            solutions: List[str] = [extract_code_blocks(pred)[0] for pred in prediction]
#             print(solutions)
#             print(label)
            metrics, results, metadatas = codegen_metrics(
                samples_list=label, # label is already a list 
                generations_list=[solutions], # for a single example. 
                k_list=k_list, 
                num_process_evaluate=self.num_process,
                timeout=self.timeout
            )
            
        elif self.scenario == "test_output_prediction":
            pred_outputs = [extract_test_output_code(pred) for pred in prediction]
            metrics, results = test_output_metrics(
                samples=label, 
                generations=[pred_outputs], 
                k_list=k_list, 
            )
        elif self.scenario == "code_execution":
            pred_outputs = [extract_execution_code(pred, self.use_cot_for_execution) for pred in prediction]
            metrics, results = code_execution_metrics(
                samples=label, 
                generations=[pred_outputs], 
            )
        else:
            raise ValueError(f"Invalid scenario: {self.scenario}. Available choices: {VALID_SCENARIO}.")
        
        pass_at_k = {f"pass@{k}": float(metrics[f"pass@{k}"]) for k in k_list}
        return pass_at_k