Spaces:
Sleeping
Sleeping
| diff --git a/evaluation/gpqa/README.md b/evaluation/gpqa/README.md | |
| index 150aa16..9f0160a 100644 | |
| --- a/evaluation/gpqa/README.md | |
| +++ b/evaluation/gpqa/README.md | |
| Implements the evaluation of agents on the GPQA benchmark introduced in [GPQA: A Graduate-Level Google-Proof Q&A Benchmark](https://arxiv.org/abs/2311.12022). | |
| This code implements the evaluation of agents on the GPQA Benchmark with Open Book setting. | |
| -- The benchmark consists of 448 high-quality and extremely difficult multiple-choice questions in the domains of biology, physics, and chemistry. The questions are intentionally designed to be "Google-proof," meaning that even highly skilled non-expert validators achieve only 34% accuracy despite unrestricted access to the web. | |
| +- The benchmark consists of 448 high-quality and extremely difficult multiple-choice questions in the domains of biology, physics, and chemistry. The questions are intentionally designed to be "Google-proof," meaning that even highly skilled non-expert validators achieve only 34% accuracy despite unrestricted access to the web. | |
| - Even experts in the corresponding domains achieve only 65% accuracy. | |
| - State-of-the-art AI systems achieve only 39% accuracy on this challenging dataset. | |
| Further references: | |
| - https://github.com/idavidrein/gpqa | |
| ## TODOs | |
| +- [X] Complete full benchmark evaluation | |
| +- [X] Fix intermittent `BrowserException: Failed to start browser environment` error | |
| - [ ] Add support for other agents (currently only tested on `CodeActAgent`) | |
| -- [ ] Complete full benchmark evaluation | |
| -- [ ] Fix intermittent `BrowserException: Failed to start browser environment` error | |
| ## Setup Environment | |
| You can replace `model_config_name` with any model you set up in `config.toml`. | |
| ## Benchmark Evaluation Results | |
| -- [] TODO: Finish the evaluation run across the entire benchmark and compile results | |
| +Please refer to https://huggingface.co/spaces/OpenDevin/evaluation for the latest evaluation results and evaluation logs. | |
| diff --git a/evaluation/gpqa/run_infer.py b/evaluation/gpqa/run_infer.py | |
| index 2152a9e..16d9c98 100644 | |
| --- a/evaluation/gpqa/run_infer.py | |
| +++ b/evaluation/gpqa/run_infer.py | |
| Further references: | |
| - https://arxiv.org/pdf/2311.12022 | |
| - https://paperswithcode.com/dataset/gpqa | |
| - https://github.com/idavidrein/gpqa | |
| - | |
| -TODOs: | |
| -- Add evaluation on other Agent classes (e.g., MonologueAgent) | |
| -- Batch inference and evaluation of agents on the GPQA Benchmark. | |
| """ | |
| import asyncio | |
| from opendevin.core.config import config, get_llm_config_arg, get_parser | |
| from opendevin.core.logger import get_console_handler | |
| from opendevin.core.logger import opendevin_logger as logger | |
| from opendevin.core.main import main | |
| -from opendevin.events.action import MessageAction | |
| +from opendevin.events.action import AgentFinishAction, MessageAction | |
| from opendevin.events.serialization.event import event_to_dict | |
| def codeact_user_response(state: State) -> str: | |
| msg = ( | |
| 'Please continue working on the task on whatever approach you think is suitable.\n' | |
| 'Feel free to use all tools for calculations and solving the problem, and web-search for finding relevant facts during the process if needed\n' | |
| - 'If you think you have reliably finished solving the problem, first generate a message reporting the final concise answer to the user. Once that is done, please run the following command: <execute_bash> exit </execute_bash>.\n' | |
| - 'IMPORTANT: YOU SHOULD NEVER ASK FOR HUMAN HELP TO SOLVE THIS TASK.\n' | |
| + 'If you have finished reporting the answer in the expected format, (and only once that is done), please run the following command to submit: <execute_bash> exit </execute_bash>.\n' | |
| + """Again you are being told a million times to first report the answer in the requested format (see again below for reference) before exiting. DO NOT EXIT WITHOUT REPORTING THE ANSWER FIRST. | |
| + \n\nThat is, when you have decided on the answer report in the following format: | |
| + <<FINAL_ANSWER|| | |
| + <insert correct answer here, must be one of A, B, C, D> (Please dont use any additional characters. Just the letter of the correct answer (A/B/C/D).) | |
| + ||FINAL_ANSWER>> | |
| + <execute_bash> exit </execute_bash> | |
| + """ | |
| + '\n\nIMPORTANT: YOU SHOULD NEVER ASK FOR HUMAN HELP TO SOLVE THIS TASK.\n' | |
| ) | |
| - if state.history: | |
| - user_msgs = [ | |
| - action | |
| - for action, _ in state.history | |
| - if isinstance(action, MessageAction) and action.source == 'user' | |
| - ] | |
| - if len(user_msgs) >= 2: | |
| - # let the agent know that it can give up when it has tried 3 times | |
| - return ( | |
| - msg | |
| - + 'If you want to give up, just generate a final answer message to the user and in the next turn --> run: <execute_bash> exit </execute_bash>.\n' | |
| - ) | |
| return msg | |
| def parse_final_answer(final_answer: str) -> str: | |
| <insert correct answer here> | |
| ||FINAL_ANSWER>> | |
| """ | |
| + # to do this first extract the part enclosed in the format <<FINAL_ANSWER|| ... ||FINAL_ANSWER>> | |
| pattern = re.compile(r'<<FINAL_ANSWER\|\|(.*?)\|\|FINAL_ANSWER>>', re.DOTALL) | |
| match = pattern.search(final_answer) | |
| - if match: | |
| - return match.group(1).strip() | |
| - else: | |
| - return 'No final answer found in the provided string.' | |
| + # and then strip it, remove any leading/trailing spaces line breaks etc. | |
| + answer = match.group(1).strip() | |
| + # finally capitalize it | |
| + answer = answer.upper() | |
| + # and then return A, B, C, D depending on whether the answer A, B, C, D is found in the final answer | |
| + for letter in ['A', 'B', 'C', 'D']: | |
| + if letter in answer: | |
| + return letter | |
| def compare_answers(predicted_answer, ground_truth): | |
| def get_test_result(model_output, ground_truth): | |
| Implements the evaluation logic for GPQA | |
| Checks if the output of a given instance is correct (as per the ground truth) | |
| """ | |
| - # parse the final answer from model output | |
| - predicted_answer = parse_final_answer(model_output) | |
| + try: | |
| + # parse the final answer from model output | |
| + predicted_answer = parse_final_answer(model_output) | |
| + except Exception as e: | |
| + # Log the exception | |
| + print(f'An error occurred: {e}\n defaulting to random guess ...') | |
| + # choose a random answer if the model output is not in the correct format | |
| + predicted_answer = random.choice(['A', 'B', 'C', 'D']) | |
| + logger.info('#############################################') | |
| + logger.info(f'Predicted answer: {predicted_answer}') | |
| + logger.info(f'Ground truth answer: {ground_truth}') | |
| + logger.info('#############################################') | |
| # check if the model output matches the ground truth | |
| result = compare_answers(predicted_answer, ground_truth) | |
| def process_instance( | |
| config.workspace_base = workspace_mount_path | |
| config.workspace_mount_path = workspace_mount_path | |
| - # workspace_mount_path = os.path.join(config.workspace_mount_path, '_eval_workspace') | |
| - # workspace_mount_path = os.path.abspath(workspace_mount_path) | |
| - # # create process-specific workspace dir | |
| - # # if `not skip_workspace_mount` - we will create a workspace directory for EACH process | |
| - # # so that different agent don't interfere with each other. | |
| - # if not skip_workspace_mount: | |
| - # workspace_mount_path = os.path.join(workspace_mount_path, str(os.getpid())) | |
| - # pathlib.Path(workspace_mount_path).mkdir(parents=True, exist_ok=True) | |
| - | |
| # Setup the logger properly, so you can run multi-processing to parallize the evaluation | |
| if reset_logger: | |
| # Set up logger | |
| def process_instance( | |
| # ======= Run the agent on the instance ======= | |
| # Prepare instruction for the agent using suggested format in gpqa codebase | |
| + # browsing_instruction = """- You should try using the browser to find relevant information to answer the question if required. | |
| + # 1. for instance to look up the atomic number of carbon, you can use: | |
| + # <execute_browse> | |
| + # goto("https://www.google.com/search?q=atomic+number+of+carbon") | |
| + # </execute_browse> | |
| + # 2. similarly for looking up "What is the product of benzene diazotization followed by reaction with anisole?" | |
| + # <execute_browse> | |
| + # goto("https://www.google.com/search?q=product+of+benzene+diazotization+followed+by+reaction+with+anisole") | |
| + # </execute_browse> | |
| + # """ | |
| + | |
| instruction = f""" | |
| What is the correct answer to this question:\n | |
| {instance['question']}\n | |
| def process_instance( | |
| <insert correct answer here, must be one of A, B, C, D> (Please dont use any additional characters. Just the letter of the correct answer (A/B/C/D).) | |
| ||FINAL_ANSWER>> | |
| + | |
| Additional Instructions: | |
| + - Do not try to solve the question in a single step. Break it down into smaller steps. | |
| + | |
| - You should ONLY interact with the environment provided to you AND NEVER ASK FOR HUMAN HELP. | |
| + | |
| + - SUPER IMPORTANT: When you have reported the answer to the user in the requested format, (and only once that is done) in the next turn, please run the following command: <execute_bash> exit </execute_bash>. | |
| + - Again you are being told a million times to first report the answer in the requested format (see again below for reference) before exiting. DO NOT EXIT WITHOUT REPORTING THE ANSWER FIRST. | |
| + That is, when you have decided on the answer report in the following format: | |
| + | |
| + <<FINAL_ANSWER|| | |
| + <insert correct answer here, must be one of A, B, C, D> (Please dont use any additional characters. Just the letter of the correct answer (A/B/C/D).) | |
| + ||FINAL_ANSWER>> | |
| + <execute_bash> exit </execute_bash> | |
| + | |
| + | |
| + Again do not quit without reporting the answer first. | |
| + Ok now its time to start solving the question. Good luck! | |
| """ | |
| # NOTE: You can actually set slightly different instruction for different agents | |
| - instruction += AGENT_CLS_TO_INST_SUFFIX.get(agent_class, '') | |
| + # instruction += AGENT_CLS_TO_INST_SUFFIX.get(agent_class, '') | |
| # Here's how you can run the agent (similar to the `main` function) and get the final task state | |
| state: State = asyncio.run( | |
| def process_instance( | |
| # ======= Attempt to evaluate the agent's edits ======= | |
| # get the final message from the state history (default to None if not found) | |
| - final_message = next( | |
| - ( | |
| - act.content | |
| - for act in reversed(state.history) | |
| - if isinstance(act, MessageAction) | |
| - ), | |
| - None, | |
| - ) | |
| + for action, _ in reversed(state.history): | |
| + if ( | |
| + isinstance(action, AgentFinishAction) | |
| + and action.source != 'user' | |
| + and '<<FINAL_ANSWER||' in action.thought | |
| + ): | |
| + final_message = action.thought | |
| + break | |
| + elif ( | |
| + isinstance(action, MessageAction) | |
| + and action.source != 'user' | |
| + and '<<FINAL_ANSWER||' in action.content | |
| + ): | |
| + final_message = action.content | |
| + break | |
| + else: | |
| + final_message = None | |
| + | |
| + logger.info('#############################################') | |
| logger.info(f'Final message generated by the agent: {final_message}') | |
| + logger.info('#############################################') | |
| test_result = get_test_result(final_message, instance.correct_solution) | |
| + logger.info('#############################################') | |
| + logger.info(f'Test result: {test_result}') | |
| + logger.info('#############################################') | |
| # If you are working on some simpler benchmark that only evaluates the final model output (e.g., in a MessageAction) | |
| # You can simply get the LAST `MessageAction` from the returned `state.history` and parse it for evaluation. | |
| if state is None: | |
| raise ValueError('State should not be None.') | |
| + metrics = state.metrics.get() if state.metrics else None | |
| # Save the output | |
| output = { | |
| def process_instance( | |
| 'instance_id': instance.instance_id, | |
| 'instruction': instruction, | |
| 'metadata': metadata, | |
| + 'metrics': metrics, | |
| 'history': [ | |
| (event_to_dict(action), event_to_dict(obs)) for action, obs in state.history | |
| ], | |
| 'error': state.error if state and state.error else None, | |
| - 'test_result': test_result, | |
| + 'test_result': {'result': test_result}, | |
| } | |
| config.workspace_mount_path = old_workspace_mount_path | |
| if __name__ == '__main__': | |
| '--data-split', | |
| type=str, | |
| choices=['gpqa_main', 'gpqa_diamond', 'gpqa_experts', 'gpqa_extended'], | |
| - default='gpqa_diamond', | |
| + default='gpqa_extended', | |
| help='data split to evaluate, eg. gpqa_diamond', | |
| ) | |
| + # add start index to the args | |
| + parser.add_argument( | |
| + '--start-index', | |
| + type=int, | |
| + default=0, | |
| + help='start index to evaluate the dataset', | |
| + ) | |
| args, _ = parser.parse_known_args() | |
| # NOTE: It is preferable to load datasets from huggingface datasets and perform post-processing | |
| if __name__ == '__main__': | |
| eval_note += '_N_' + args.eval_note | |
| eval_output_dir = os.path.join( | |
| args.eval_output_dir, | |
| - 'gpqa', | |
| + args.data_split, # one of 'gpqa_main', 'gpqa_diamond', 'gpqa_experts', 'gpqa_extended' | |
| agent_class, | |
| model_name + '_maxiter_' + str(max_iterations) + eval_note, | |
| ) | |
| if __name__ == '__main__': | |
| # LIMIT EVALUATION | |
| eval_n_limit = args.eval_n_limit # NOTE: This is useful for debugging and testing using a smaller subset of the dataset | |
| if eval_n_limit: | |
| - # start_index = 20 | |
| - # gpqa_dataset = gpqa_dataset.iloc[start_index:] | |
| + if args.start_index != 0: | |
| + logger.info( | |
| + f'Using start index: {args.start_index}. This should be used with eval_n_limit to limit the evaluation to a subset of the dataset for debugging.' | |
| + ) | |
| + gpqa_dataset = gpqa_dataset.iloc[args.start_index :] | |
| gpqa_dataset = gpqa_dataset.head(eval_n_limit) | |
| logger.info(f'Limiting evaluation to first {eval_n_limit} instances.') | |
| diff --git a/evaluation/gpqa/scripts/run_infer.sh b/evaluation/gpqa/scripts/run_infer.sh | |
| index 182fd10..408b2e5 100755 | |
| --- a/evaluation/gpqa/scripts/run_infer.sh | |
| +++ b/evaluation/gpqa/scripts/run_infer.sh | |
| #!/bin/bash | |
| MODEL_CONFIG=$1 | |
| -EVAL_LIMIT=$2 | |
| -DATA_SPLIT=$3 | |
| -AGENT=$4 | |
| +DATA_SPLIT=$2 | |
| +EVAL_LIMIT=$3 | |
| +START_IDX=$4 | |
| +AGENT=$5 | |
| if [ -z "$AGENT" ]; then | |
| echo "Agent not specified, use default CodeActAgent ..." | |
| fi | |
| # NOTE: if data split is not provided, use the default value 'gpqa_diamond' | |
| if [ -z "$DATA_SPLIT" ]; then | |
| - echo "Data split not specified, using default gpqa_diamond ..." | |
| DATA_SPLIT="gpqa_diamond" | |
| + echo "Data split not specified, using default 'gpqa_diamond' ..." | |
| +fi | |
| + | |
| +# NOTE: if start index is not provided, use the default value 0 | |
| +if [ -z "$START_IDX" ]; then | |
| + echo "Start index not specified, using default 0 ..." | |
| + START_IDX=0 | |
| fi | |
| # IMPORTANT: Because Agent's prompt changes fairly often in the rapidly evolving codebase of OpenDevin | |
| COMMAND="poetry run python evaluation/gpqa/run_infer.py \ | |
| --llm-config $MODEL_CONFIG \ | |
| --max-iterations 10 \ | |
| --max-chars 10000000 \ | |
| - --eval-num-workers 1 \ | |
| + --eval-num-workers 8 \ | |
| --data-split $DATA_SPLIT \ | |
| + --start-index $START_IDX \ | |
| --eval-note $AGENT_VERSION" | |
| if [ -n "$EVAL_LIMIT" ]; then | |
| -- | |
| 2.25.1 | |
| diff --git a/agenthub/codeact_agent/codeact_agent.py b/agenthub/codeact_agent/codeact_agent.py | |
| index 8bbc9fb..b63a0dc 100644 | |
| --- a/agenthub/codeact_agent/codeact_agent.py | |
| +++ b/agenthub/codeact_agent/codeact_agent.py | |
| from opendevin.runtime.plugins import ( | |
| JupyterRequirement, | |
| PluginRequirement, | |
| ) | |
| +from opendevin.core.logger import opendevin_logger as logger | |
| -ENABLE_GITHUB = True | |
| +ENABLE_GITHUB = False | |
| def parse_response(response) -> str: | |
| class CodeActAgent(Agent): | |
| ] | |
| jupyter_kernel_init_code: str = 'from agentskills import *' | |
| - system_message: str = ( | |
| + system_message_large: str = ( | |
| f'{SYSTEM_PREFIX}\n{GITHUB_MESSAGE}\n\n{COMMAND_DOCS}\n\n{SYSTEM_SUFFIX}' | |
| if ENABLE_GITHUB | |
| else f'{SYSTEM_PREFIX}\n\n{COMMAND_DOCS}\n\n{SYSTEM_SUFFIX}' | |
| ) | |
| + # alternate system message with much less information to avoid overwhelming the agent | |
| + system_message: str = f"{SYSTEM_PREFIX}" | |
| + | |
| def __init__( | |
| self, | |
| llm: LLM, | |
| class CodeActAgent(Agent): | |
| ], | |
| temperature=0.0, | |
| ) | |
| + logger.info("################################################") | |
| + logger.info(f'LLM response: {response}') | |
| + logger.info("################################################") | |
| action_str: str = parse_response(response) | |
| state.num_of_chars += sum( | |
| class CodeActAgent(Agent): | |
| command_group = bash_command.group(1).strip() | |
| if command_group.strip() == 'exit': | |
| - return AgentFinishAction() | |
| + return AgentFinishAction(thought=thought) | |
| return CmdRunAction(command=command_group, thought=thought) | |
| elif python_code := re.search( | |
| r'<execute_ipython>(.*?)</execute_ipython>', action_str, re.DOTALL | |
| diff --git a/evaluation/gpqa/run_infer.py b/evaluation/gpqa/run_infer.py | |
| index 16d9c98..c06b1ad 100644 | |
| --- a/evaluation/gpqa/run_infer.py | |
| +++ b/evaluation/gpqa/run_infer.py | |
| def process_instance( | |
| ||FINAL_ANSWER>> | |
| <execute_bash> exit </execute_bash> | |
| - | |
| - Again do not quit without reporting the answer first. | |
| Ok now its time to start solving the question. Good luck! | |
| """ | |
| diff --git a/opendevin/core/main.py b/opendevin/core/main.py | |
| index 76df3a9..cf15ff3 100644 | |
| --- a/opendevin/core/main.py | |
| +++ b/opendevin/core/main.py | |
| async def main( | |
| AgentCls: Type[Agent] = Agent.get_cls(args.agent_cls) | |
| agent = AgentCls(llm=llm) | |
| + logger.info("################################################") | |
| + logger.info(f"Running agent: {args.agent_cls}\n\n {agent.system_message}") | |
| + logger.info("################################################") | |
| + | |
| event_stream = EventStream('main') | |
| controller = AgentController( | |
| agent=agent, | |