| import asyncio
|
| import json
|
| import os
|
| import tempfile
|
| from typing import Any
|
|
|
| import pandas as pd
|
| import toml
|
| from datasets import load_dataset
|
|
|
| import openhands.agenthub
|
| from evaluation.benchmarks.swe_bench.resource.mapping import (
|
| get_instance_resource_factor,
|
| )
|
| from evaluation.utils.shared import (
|
| EvalException,
|
| EvalMetadata,
|
| EvalOutput,
|
| assert_and_raise,
|
| codeact_user_response,
|
| get_metrics,
|
| is_fatal_evaluation_error,
|
| make_metadata,
|
| prepare_dataset,
|
| reset_logger_for_multiprocessing,
|
| run_evaluation,
|
| update_llm_config_for_completions_logging,
|
| )
|
| from openhands.controller.state.state import State
|
| from openhands.core.config import (
|
| AgentConfig,
|
| AppConfig,
|
| SandboxConfig,
|
| get_llm_config_arg,
|
| get_parser,
|
| )
|
| from openhands.core.logger import openhands_logger as logger
|
| from openhands.core.main import create_runtime, run_controller
|
| from openhands.events.action import CmdRunAction, MessageAction
|
| from openhands.events.observation import CmdOutputObservation, ErrorObservation
|
| from openhands.events.serialization.event import event_to_dict
|
| from openhands.runtime.base import Runtime
|
| from openhands.utils.async_utils import call_async_from_sync
|
| from openhands.utils.shutdown_listener import sleep_if_should_continue
|
|
|
def _env_flag(name: str, default: str) -> bool:
    """Return True when env var *name* (or *default* if unset) equals 'true', ignoring case."""
    return os.getenv(name, default).lower() == 'true'


# Boolean switches read once from the environment at import time.
# USE_INSTANCE_IMAGE selects the per-instance container image in get_config and
# the per-instance setup path in initialize_runtime.
# RUN_WITH_BROWSING is forwarded to the agent config and appends an extra note
# to the instruction built by get_instruction.
USE_HINT_TEXT = _env_flag('USE_HINT_TEXT', 'false')
USE_INSTANCE_IMAGE = _env_flag('USE_INSTANCE_IMAGE', 'true')
RUN_WITH_BROWSING = _env_flag('RUN_WITH_BROWSING', 'false')

# Maps an agent class name to the function passed to run_controller as
# fake_user_response_fn.
AGENT_CLS_TO_FAKE_USER_RESPONSE_FN = {'CodeActAgent': codeact_user_response}
|
|
|
|
|
| def _get_swebench_workspace_dir_name(instance: pd.Series) -> str:
|
| return f'{instance.repo}__{instance.version}'.replace('/', '__')
|
|
|
|
|
def get_instruction(instance: pd.Series, metadata: EvalMetadata):
    """Build the initial user prompt for one SWE-bench instance.

    The prompt names the uploaded repository directory, embeds
    ``instance.problem_statement`` as the PR description, and lists the steps
    the agent should follow. When RUN_WITH_BROWSING is set, a trailing
    <IMPORTANT!> note is appended. *metadata* is accepted but not used here.
    """
    repo_dir = _get_swebench_workspace_dir_name(instance)

    sections = [
        '<uploaded_files>\n',
        f'/workspace/{repo_dir}\n',
        '</uploaded_files>\n',
        f"I've uploaded a python code repository in the directory {repo_dir}. Consider the following PR description:\n\n",
        '<pr_description>\n',
        f'{instance.problem_statement}\n',
        '</pr_description>\n\n',
        'Can you help me implement the necessary changes to the repository so that the requirements specified in the <pr_description> are met?\n',
        "I've already taken care of all changes to any of the test files described in the <pr_description>. This means you DON'T have to modify the testing logic or any of the tests in any way!\n",
        'Your task is to make the minimal changes to non-tests files in the /workspace directory to ensure the <pr_description> is satisfied.\n',
        'Follow these steps to resolve the issue:\n',
        '1. As a first step, it might be a good idea to explore the repo to familiarize yourself with its structure.\n',
        '2. Create a script to reproduce the error and execute it with `python <filename.py>` using the BashTool, to confirm the error\n',
        '3. Edit the sourcecode of the repo to resolve the issue\n',
        '4. Rerun your reproduce script and confirm that the error is fixed!\n',
        '5. Think about edgecases and make sure your fix handles them as well\n',
        "Your thinking should be thorough and so it's fine if it's very long.\n",
    ]

    if RUN_WITH_BROWSING:
        sections.extend(
            [
                '<IMPORTANT!>\n',
                'You SHOULD NEVER attempt to browse the web. ',
                '</IMPORTANT!>\n',
            ]
        )

    return ''.join(sections)
|
|
|
|
|
|
|
# Prefix prepended to every per-instance image name by get_instance_docker_image;
# overridable through the EVAL_DOCKER_IMAGE_PREFIX environment variable.
DOCKER_IMAGE_PREFIX = os.getenv(
    'EVAL_DOCKER_IMAGE_PREFIX',
    'docker.io/xingyaoww/',
)
logger.info(f'Using docker image prefix: {DOCKER_IMAGE_PREFIX}')
|
|
|
|
|
def get_instance_docker_image(instance_id: str) -> str:
    """Return the lower-cased container image name for *instance_id*.

    The image name is ``sweb.eval.x86_64.<instance_id>`` with every '__'
    rewritten to '_s_', joined to DOCKER_IMAGE_PREFIX by exactly one '/'.
    """
    registry_prefix = DOCKER_IMAGE_PREFIX.rstrip('/')
    image_name = f'sweb.eval.x86_64.{instance_id}'.replace('__', '_s_')
    return f'{registry_prefix}/{image_name}'.lower()
|
|
|
|
|
def get_config(
    instance: pd.Series,
    metadata: EvalMetadata,
) -> AppConfig:
    """Assemble the AppConfig used to evaluate one instance.

    The sandbox image is the per-instance image when USE_INSTANCE_IMAGE is set,
    otherwise a fixed shared SWE-bench image. The LLM config from *metadata* is
    passed through update_llm_config_for_completions_logging, and the agent
    config disables Jupyter and the LLM editor while browsing follows
    RUN_WITH_BROWSING.
    """
    SWE_BENCH_CONTAINER_IMAGE = 'ghcr.io/opendevin/eval-swe-bench:full-v1.2.1'

    if not USE_INSTANCE_IMAGE:
        base_container_image = SWE_BENCH_CONTAINER_IMAGE
        logger.info(f'Using swe-bench container image: {base_container_image}')
    else:
        base_container_image = get_instance_docker_image(instance['instance_id'])
        logger.info(
            f'Using instance container image: {base_container_image}. '
            f'Please make sure this image exists. '
            f'Submit an issue on https://github.com/All-Hands-AI/OpenHands if you run into any issues.'
        )

    # Sandbox settings; the remote-runtime values are read from the environment.
    sandbox_config = SandboxConfig(
        base_container_image=base_container_image,
        enable_auto_lint=True,
        use_host_network=False,
        timeout=300,
        platform='linux/amd64',
        api_key=os.environ.get('ALLHANDS_API_KEY', None),
        remote_runtime_api_url=os.environ.get('SANDBOX_REMOTE_RUNTIME_API_URL'),
        keep_runtime_alive=False,
        remote_runtime_init_timeout=3600,
        remote_runtime_resource_factor=get_instance_resource_factor(
            dataset_name=metadata.dataset,
            instance_id=instance['instance_id'],
        ),
    )

    config = AppConfig(
        default_agent=metadata.agent_class,
        run_as_openhands=False,
        max_iterations=metadata.max_iterations,
        runtime=os.environ.get('RUNTIME', 'docker'),
        sandbox=sandbox_config,
        # No host workspace is mounted into the sandbox.
        workspace_base=None,
        workspace_mount_path=None,
    )

    llm_config = update_llm_config_for_completions_logging(
        metadata.llm_config, metadata.eval_output_dir, instance['instance_id']
    )
    config.set_llm_config(llm_config)

    config.set_agent_config(
        AgentConfig(
            codeact_enable_jupyter=False,
            codeact_enable_browsing=RUN_WITH_BROWSING,
            codeact_enable_llm_editor=False,
            condenser=metadata.condenser_config,
        )
    )
    return config
|
|
|
|
|
def _run_setup_command(
    runtime: Runtime, command: str, timeout: int = 600
) -> CmdOutputObservation | ErrorObservation:
    """Run one shell command in the sandbox, logging the action and its observation."""
    action = CmdRunAction(command=command)
    action.set_hard_timeout(timeout)
    logger.info(action, extra={'msg_type': 'ACTION'})
    obs = runtime.run_action(action)
    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
    return obs


def _command_succeeded(obs) -> bool:
    """Return True only for a CmdOutputObservation whose exit code is 0."""
    # The isinstance guard matters: an ErrorObservation is handled as a
    # failure instead of having its (absent) exit_code read.
    return isinstance(obs, CmdOutputObservation) and obs.exit_code == 0


def initialize_runtime(
    runtime: Runtime,
    instance: pd.Series,
):
    """Initialize the runtime for the agent.

    This function is called before the runtime is used to run the agent.

    Steps, each checked with assert_and_raise:
      1. Append SWE_INSTANCE_ID, PIP_CACHE_DIR and a pager-less git alias to ~/.bashrc.
      2. Export USER.
      3. With USE_INSTANCE_IMAGE: copy the instance JSON and
         ``instance_swe_entry.sh`` into /swe_util and source them; otherwise
         source /swe_util/swe_entry.sh.
      4. cd into the instance workspace, hard-reset git, and remove all remotes.
      5. Verify that ``which python`` resolves to a path containing 'testbed'.

    Args:
        runtime: Sandbox runtime the commands are executed in.
        instance: SWE-bench instance row (a ``dict`` is also accepted when
            serialising the instance JSON).
    """
    logger.info('-' * 30)
    logger.info('BEGIN Runtime Initialization Fn')
    logger.info('-' * 30)
    workspace_dir_name = _get_swebench_workspace_dir_name(instance)

    # Persist environment settings for later shells.
    obs = _run_setup_command(
        runtime,
        f"""echo 'export SWE_INSTANCE_ID={instance['instance_id']}' >> ~/.bashrc && echo 'export PIP_CACHE_DIR=~/.cache/pip' >> ~/.bashrc && echo "alias git='git --no-pager'" >> ~/.bashrc""",
    )
    assert_and_raise(
        _command_succeeded(obs), f'Failed to export SWE_INSTANCE_ID: {str(obs)}'
    )

    obs = _run_setup_command(runtime, """export USER=$(whoami); echo USER=${USER} """)
    assert_and_raise(_command_succeeded(obs), f'Failed to export USER: {str(obs)}')

    if USE_INSTANCE_IMAGE:
        script_dir = os.path.dirname(__file__)

        obs = _run_setup_command(runtime, 'mkdir -p /swe_util/eval_data/instances')
        assert_and_raise(
            _command_succeeded(obs),
            f'Failed to create /swe_util/eval_data/instances: {str(obs)}',
        )

        swe_instance_json_name = 'swe-bench-instance.json'
        with tempfile.TemporaryDirectory() as temp_dir:
            # Write the instance under its final file name so copy_to keeps it.
            temp_file_path = os.path.join(temp_dir, swe_instance_json_name)
            with open(temp_file_path, 'w') as f:
                if not isinstance(instance, dict):
                    json.dump([instance.to_dict()], f)
                else:
                    json.dump([instance], f)

            # Copy while the temporary directory still exists.
            runtime.copy_to(temp_file_path, '/swe_util/eval_data/instances/')

        runtime.copy_to(
            str(os.path.join(script_dir, 'scripts/setup/instance_swe_entry.sh')),
            '/swe_util/',
        )

        obs = _run_setup_command(runtime, 'cat ~/.bashrc')
        assert_and_raise(
            _command_succeeded(obs), f'Failed to cat ~/.bashrc: {str(obs)}'
        )

        obs = _run_setup_command(runtime, 'source ~/.bashrc')
        if isinstance(obs, ErrorObservation):
            logger.error(f'Failed to source ~/.bashrc: {str(obs)}')
        assert_and_raise(
            _command_succeeded(obs), f'Failed to source ~/.bashrc: {str(obs)}'
        )

        obs = _run_setup_command(runtime, 'source /swe_util/instance_swe_entry.sh')
        assert_and_raise(
            _command_succeeded(obs),
            f'Failed to source /swe_util/instance_swe_entry.sh: {str(obs)}',
        )
    else:
        # The shared-image entry script is given a longer hard timeout.
        obs = _run_setup_command(
            runtime, 'source /swe_util/swe_entry.sh', timeout=1800
        )
        assert_and_raise(
            _command_succeeded(obs),
            f'Failed to source /swe_util/swe_entry.sh: {str(obs)}',
        )

    obs = _run_setup_command(runtime, f'cd /workspace/{workspace_dir_name}')
    assert_and_raise(
        _command_succeeded(obs),
        f'Failed to cd to /workspace/{workspace_dir_name}: {str(obs)}',
    )

    obs = _run_setup_command(runtime, 'git reset --hard')
    assert_and_raise(
        _command_succeeded(obs), f'Failed to git reset --hard: {str(obs)}'
    )

    obs = _run_setup_command(
        runtime,
        'for remote_name in $(git remote); do git remote remove "${remote_name}"; done',
    )
    assert_and_raise(
        _command_succeeded(obs), f'Failed to remove git remotes: {str(obs)}'
    )

    obs = _run_setup_command(runtime, 'which python')
    assert_and_raise(
        _command_succeeded(obs) and 'testbed' in obs.content,
        f'Expected to find python interpreter from testbed, but got: {str(obs)}',
    )

    logger.info('-' * 30)
    logger.info('END Runtime Initialization Fn')
    logger.info('-' * 30)
|
|
|
|
|
def complete_runtime(
    runtime: Runtime,
    instance: pd.Series,
) -> dict[str, Any]:
    """Collect the workspace changes as a git patch.

    process_instance calls this after the agent has run. It changes into the
    instance workspace, disables the git pager, stages everything, and diffs
    the index against ``instance["base_commit"]``, retrying the diff up to
    five times. If you need to do something in the sandbox to get the
    correctness metric after the agent has run, modify this function.

    Returns:
        ``{'git_patch': <stripped diff text>}``.
    """
    logger.info('-' * 30)
    logger.info('BEGIN Runtime Completion Fn')
    logger.info('-' * 30)
    obs: CmdOutputObservation
    workspace_dir_name = _get_swebench_workspace_dir_name(instance)

    # (command, text inserted after "Failed to " in the failure message)
    preparation_steps = [
        (
            f'cd /workspace/{workspace_dir_name}',
            f'cd to /workspace/{workspace_dir_name}',
        ),
        (
            'git config --global core.pager ""',
            'git config --global core.pager ""',
        ),
        ('git add -A', 'git add -A'),
    ]
    for command, what in preparation_steps:
        action = CmdRunAction(command=command)
        action.set_hard_timeout(600)
        logger.info(action, extra={'msg_type': 'ACTION'})
        obs = runtime.run_action(action)
        logger.info(obs, extra={'msg_type': 'OBSERVATION'})
        assert_and_raise(
            isinstance(obs, CmdOutputObservation) and obs.exit_code == 0,
            f'Failed to {what}: {str(obs)}',
        )

    git_patch = None
    for attempt in range(5):
        action = CmdRunAction(
            command=f'git diff --no-color --cached {instance["base_commit"]}'
        )
        action.set_hard_timeout(max(300 + 100 * attempt, 600))
        logger.info(action, extra={'msg_type': 'ACTION'})
        obs = runtime.run_action(action)
        logger.info(obs, extra={'msg_type': 'OBSERVATION'})

        if isinstance(obs, CmdOutputObservation):
            if obs.exit_code == 0:
                git_patch = obs.content.strip()
                break
            logger.info('Failed to get git diff, retrying...')
            sleep_if_should_continue(10)
        elif isinstance(obs, ErrorObservation):
            logger.error(f'Error occurred: {obs.content}. Retrying...')
            sleep_if_should_continue(10)
        else:
            assert_and_raise(False, f'Unexpected observation type: {str(obs)}')

    assert_and_raise(git_patch is not None, 'Failed to get git diff (None)')

    logger.info('-' * 30)
    logger.info('END Runtime Completion Fn')
    logger.info('-' * 30)
    return {'git_patch': git_patch}
|
|
|
|
|
def process_instance(
    instance: pd.Series,
    metadata: EvalMetadata,
    reset_logger: bool = True,
    runtime_failure_count: int = 0,
) -> EvalOutput:
    """Run the agent on one SWE-bench instance and package the result.

    Args:
        instance: The SWE-bench instance row to evaluate.
        metadata: Evaluation metadata (agent class, LLM config, output dir, ...).
        reset_logger: When True, redirect logging to a per-instance file under
            ``<eval_output_dir>/infer_logs``.
        runtime_failure_count: Number of earlier runtime failures for this
            instance; each one doubles the remote runtime resource factor,
            capped at 8.

    Returns:
        An EvalOutput holding the git patch, event history, metrics and the
        last error (if any).

    Raises:
        ValueError: If the controller returns no state.
        EvalException: If the final state carries a fatal evaluation error.
    """
    config = get_config(instance, metadata)

    if reset_logger:
        log_dir = os.path.join(metadata.eval_output_dir, 'infer_logs')
        reset_logger_for_multiprocessing(logger, instance.instance_id, log_dir)
    else:
        logger.info(f'Starting evaluation for instance {instance.instance_id}.')

    # Give a previously failed instance more resources on each retry.
    if runtime_failure_count > 0:
        config.sandbox.remote_runtime_resource_factor = min(
            config.sandbox.remote_runtime_resource_factor * (2**runtime_failure_count),
            8,
        )
        logger.warning(
            f'This is the {runtime_failure_count + 1}th attempt for instance {instance.instance_id}, setting resource factor to {config.sandbox.remote_runtime_resource_factor}'
        )
    runtime = create_runtime(config)
    call_async_from_sync(runtime.connect)

    try:
        initialize_runtime(runtime, instance)

        instruction = get_instruction(instance, metadata)

        state: State | None = asyncio.run(
            run_controller(
                config=config,
                initial_user_action=MessageAction(content=instruction),
                runtime=runtime,
                fake_user_response_fn=AGENT_CLS_TO_FAKE_USER_RESPONSE_FN[
                    metadata.agent_class
                ],
            )
        )

        # Check for a missing state before touching any of its attributes, so
        # the failure is the intended ValueError and not an AttributeError.
        if state is None:
            raise ValueError('State should not be None.')

        if is_fatal_evaluation_error(state.last_error):
            raise EvalException('Fatal error detected: ' + state.last_error)

        return_val = complete_runtime(runtime, instance)
        git_patch = return_val['git_patch']
        logger.info(
            f'Got git diff for instance {instance.instance_id}:\n--------\n{git_patch}\n--------'
        )
    finally:
        # Always release the sandbox, including on failure.
        runtime.close()

    test_result = {
        'git_patch': git_patch,
    }

    histories = [event_to_dict(event) for event in state.history]
    metrics = get_metrics(state)

    output = EvalOutput(
        instance_id=instance.instance_id,
        instruction=instruction,
        instance=instance.to_dict(),
        test_result=test_result,
        metadata=metadata,
        history=histories,
        metrics=metrics,
        error=state.last_error if state.last_error else None,
    )
    return output
|
|
|
|
|
def filter_dataset(dataset: pd.DataFrame, filter_column: str) -> pd.DataFrame:
    """Restrict *dataset* to the instances selected for this run.

    Precedence:
      1. If a ``config.toml`` next to this file defines ``selected_ids``, keep
         only rows whose *filter_column* value is in that list.
      2. Otherwise, if the ``SKIP_IDS`` environment variable lists any ids
         (comma-separated), drop those rows.
      3. Otherwise return *dataset* unchanged.

    Args:
        dataset: Full benchmark table.
        filter_column: Name of the column holding the instance ids.

    Returns:
        The filtered DataFrame, or *dataset* itself when no filter applies.
    """
    file_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'config.toml')
    if os.path.exists(file_path):
        with open(file_path, 'r') as file:
            data = toml.load(file)
        if 'selected_ids' in data:
            selected_ids = data['selected_ids']
            logger.info(
                f'Filtering {len(selected_ids)} tasks from "selected_ids"...'
            )
            subset = dataset[dataset[filter_column].isin(selected_ids)]
            logger.info(f'Retained {subset.shape[0]} tasks after filtering')
            return subset

    # ''.split(',') yields [''], so blank entries must be discarded; otherwise
    # an unset or empty SKIP_IDS would still count as one id to skip.
    skip_ids = [
        skip_id.strip()
        for skip_id in os.environ.get('SKIP_IDS', '').split(',')
        if skip_id.strip()
    ]
    if skip_ids:
        logger.info(f'Filtering {len(skip_ids)} tasks from "SKIP_IDS"...')
        return dataset[~dataset[filter_column].isin(skip_ids)]
    return dataset
|
|
|
|
|
if __name__ == '__main__':
    # Script entry point: parse arguments, load and filter the dataset, build
    # the evaluation metadata, and run process_instance over every instance.
    parser = get_parser()
    parser.add_argument(
        '--dataset',
        type=str,
        default='princeton-nlp/SWE-bench',
        help='data set to evaluate on, either full-test or lite-test',
    )
    parser.add_argument(
        '--split',
        type=str,
        default='test',
        help='split to evaluate on',
    )
    args, _ = parser.parse_known_args()

    dataset = load_dataset(args.dataset, split=args.split)
    swe_bench_tests = filter_dataset(dataset.to_pandas(), 'instance_id')
    logger.info(
        f'Loaded dataset {args.dataset} with split {args.split}: {len(swe_bench_tests)} tasks'
    )

    llm_config = None
    if args.llm_config:
        llm_config = get_llm_config_arg(args.llm_config)

    # Validate before configuring: setting attributes on a missing config
    # would raise AttributeError and hide the message below.
    if llm_config is None:
        raise ValueError(f'Could not find LLM config: --llm_config {args.llm_config}')

    llm_config.log_completions = True
    llm_config.modify_params = False

    details = {}
    # Result unused; presumably this call validates the agent class name.
    # NOTE(review): confirm against Agent.get_cls.
    _agent_cls = openhands.agenthub.Agent.get_cls(args.agent_cls)

    dataset_description = (
        args.dataset.replace('/', '__') + '-' + args.split.replace('/', '__')
    )
    metadata = make_metadata(
        llm_config,
        dataset_description,
        args.agent_cls,
        args.max_iterations,
        args.eval_note,
        args.eval_output_dir,
        details=details,
    )

    output_file = os.path.join(metadata.eval_output_dir, 'output.jsonl')
    print(f'### OUTPUT FILE: {output_file} ###')
    instances = prepare_dataset(swe_bench_tests, output_file, args.eval_n_limit)

    # Convert both test-list columns to strings when the first PASS_TO_PASS
    # entry is not already a string.
    if len(instances) > 0 and not isinstance(
        instances['PASS_TO_PASS'][instances['PASS_TO_PASS'].index[0]], str
    ):
        for col in ['PASS_TO_PASS', 'FAIL_TO_PASS']:
            instances[col] = instances[col].apply(lambda x: str(x))

    run_evaluation(
        instances,
        metadata,
        output_file,
        args.eval_num_workers,
        process_instance,
        timeout_seconds=120 * 60,
        max_retries=5,
    )
|
|
|