| | """Script to run end-to-end evaluation on the benchmark. |
| | Utils and basic architecture credit to https://github.com/web-arena-x/webarena/blob/main/run.py. |
| | """ |
| |
|
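# Example invocation (illustrative; the script name and flag values below are
# assumptions -- adjust them to your deployment):
#
#   python run_autoglm_v_parallel.py \
#       --provider_name docker \
#       --model autoglm-os \
#       --test_all_meta_path evaluation_examples/test_nogdrive.json \
#       --num_workers 20
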
import argparse
import json
import logging
import os
import sys
from multiprocessing import Pool

import backoff
import requests
from tqdm import tqdm

sys.path.insert(0, os.path.join(os.path.dirname(__file__), "../.."))

import lib_run_single
from run_autoglm_v import DesktopEnv, get_unfinished, get_result
from mm_agents.autoglm_v import AutoGLMAgent

logger = logging.getLogger("desktopenv.experiment")


def config() -> argparse.Namespace:
    parser = argparse.ArgumentParser(description="Run end-to-end evaluation on the benchmark")

    # Environment config
    parser.add_argument("--path_to_vm", type=str)
    parser.add_argument(
        "--provider_name",
        type=str,
        default="docker",
        help="Virtualization provider (vmware, docker, aws, azure, gcp, virtualbox)",
    )
    parser.add_argument("--headless", action="store_true", default=True, help="Run on a headless machine")
    parser.add_argument("--action_space", type=str, default="autoglm_computer_use", help="Action type")
    parser.add_argument(
        "--observation_type",
        choices=["screenshot", "a11y_tree", "screenshot_a11y_tree", "som"],
        default="a11y_tree",
        help="Observation type",
    )
    parser.add_argument("--screen_width", type=int, default=1920)
    parser.add_argument("--screen_height", type=int, default=1080)
    parser.add_argument("--sleep_after_execution", type=float, default=1.0)
    parser.add_argument("--max_steps", type=int, default=30)

    # Agent config
    parser.add_argument("--max_trajectory_length", type=int, default=3)
    parser.add_argument("--test_config_base_dir", type=str, default="evaluation_examples/examples")

    # Model config
    parser.add_argument("--model", type=str, default="autoglm-os")
    parser.add_argument("--temperature", type=float, default=0.4)
    parser.add_argument("--top_p", type=float, default=0.5)
    parser.add_argument("--max_tokens", type=int, default=2048)
    parser.add_argument("--stop_token", type=str, default=None)
    parser.add_argument("--image_width", type=int, default=1280)
    parser.add_argument("--image_height", type=int, default=720)

    # Example config
    parser.add_argument("--domain", type=str, default="all")
    parser.add_argument("--test_all_meta_path", type=str, default="evaluation_examples/test_nogdrive.json")

    # AWS config
    parser.add_argument("--region", type=str, default="us-east-1", help="AWS region for the VM")
    parser.add_argument("--client_password", type=str, default="", help="Client password")

    # Logging
    parser.add_argument("--result_dir", type=str, default="./results")

    # Parallelism
    parser.add_argument("--num_workers", type=int, default=20, help="Number of parallel workers")

    args = parser.parse_args()
    return args


def _worker_run(task):
    domain, example_id, args = task
    logger = logging.getLogger("desktopenv.experiment")
    try:
        config_file = os.path.join(args.test_config_base_dir, f"{domain}/{example_id}.json")
        with open(config_file, "r", encoding="utf-8") as f:
            example = json.load(f)
        instruction = example["instruction"]

        # Retry transient failures; requests.post raises subclasses of
        # requests.RequestException (connection errors, timeouts, HTTP errors).
        @backoff.on_exception(backoff.constant, requests.RequestException, interval=0.1)
        def call_llm(messages):
            logger.info("Calling LLM...")

            data = {
                "model": args.model,
                "messages": messages,
                "max_tokens": args.max_tokens,
                "temperature": args.temperature,
                "top_p": args.top_p,
                "skip_special_tokens": False,
                "stream": False,
                "include_stop_str_in_output": True,
                "stop": ["<|user|>", "<|observation|>", "</answer>"],
            }
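            # Note: "skip_special_tokens" and "include_stop_str_in_output" are not
            # part of the standard OpenAI chat-completions schema; they are extensions
            # understood by vLLM-style OpenAI-compatible servers, and a stock OpenAI
            # endpoint may reject or ignore them.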

            headers = {
                "Content-Type": "application/json",
                "Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY', '')}",
            }

            base_url = os.environ.get("OPENAI_BASE_URL", "https://api.openai.com/v1")
            url = f"{base_url}/chat/completions"

            response = requests.post(url, json=data, headers=headers, timeout=60.0)
            response.raise_for_status()

            result = response.json()
            logger.info("LLM called successfully.")
            return result["choices"][0]["message"]["content"]
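        # Each worker process provisions its own environment and agent, so
        # parallel tasks stay isolated from one another.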
        env = DesktopEnv(
            provider_name=args.provider_name,
            region=args.region,
            client_password=args.client_password,
            path_to_vm=args.path_to_vm,
            action_space=args.action_space,
            screen_size=(args.screen_width, args.screen_height),
            headless=args.headless,
            os_type="Ubuntu",
            require_a11y_tree=args.observation_type in ["a11y_tree", "screenshot_a11y_tree", "som"],
        )
        agent = AutoGLMAgent(
            action_space=args.action_space,
            observation_type=args.observation_type,
            screen_size=(args.screen_width, args.screen_height),
            image_size=(args.image_width, args.image_height),
            max_trajectory_length=args.max_trajectory_length,
            client_password=args.client_password,
            gen_func=call_llm,
        )

        example_result_dir = os.path.join(
            args.result_dir,
            args.action_space,
            args.observation_type,
            args.model,
            domain,
            example_id,
        )
        os.makedirs(example_result_dir, exist_ok=True)
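
        # Per-example artifacts (traj.jsonl, recording.mp4, result.txt) are
        # collected under example_result_dir.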
        local_scores = []
        try:
            lib_run_single.run_single_example_autoglm(
                agent,
                env,
                example,
                args.max_steps,
                instruction,
                args,
                example_result_dir,
                local_scores,
            )
        except Exception as e:
            logger.error(f"[Parallel task error] {domain}/{example_id}: {e}")
            if hasattr(env, "controller") and env.controller is not None:
                try:
                    env.controller.end_recording(os.path.join(example_result_dir, "recording.mp4"))
                except Exception:
                    pass
            with open(os.path.join(example_result_dir, "traj.jsonl"), "a") as f:
                f.write(json.dumps({"Error": f"Exception in {domain}/{example_id}: {str(e)}"}) + "\n")
        finally:
            try:
                env.close()
            except Exception:
                pass

        # result.txt holds either the string "true" or a numeric score; a missing
        # or unreadable file counts as a failure (0.0).
        score = None
        result_path = os.path.join(example_result_dir, "result.txt")
        if os.path.exists(result_path):
            try:
                with open(result_path, "r") as rf:
                    res = rf.read().strip()
                if res.lower() == "true":
                    score = 1.0
                else:
                    score = float(res)
            except Exception:
                score = 0.0
        else:
            score = 0.0
        logger.info(f"[Finish] {domain}/{example_id} score={score}")
        return (domain, example_id, score)
    except Exception as e:
        logger.error(f"[Initialization failed] {domain}/{example_id}: {e}")
        return (domain, example_id, 0.0)


def test_parallel(args: argparse.Namespace, test_all_meta: dict):
    tasks = []
    for domain in test_all_meta:
        for example_id in test_all_meta[domain]:
            tasks.append((domain, example_id, args))
    if not tasks:
        logger.info("No pending tasks")
        return
    logger.info(f"Starting parallel execution: {args.num_workers} processes, {len(tasks)} tasks total")

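    # imap_unordered yields each result as soon as its worker finishes, so the
    # progress bar advances in completion order rather than submission order.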
    results = []
    with Pool(processes=args.num_workers) as pool:
        for res in tqdm(pool.imap_unordered(_worker_run, tasks), total=len(tasks), desc="Parallel execution"):
            results.append(res)

    scores = [s for (_, _, s) in results if s is not None]
    if scores:
        avg = sum(scores) / len(scores)
        logger.info(f"Parallel execution completed. Average score: {avg}")
    else:
        logger.info("No scores obtained.")


| | if __name__ == "__main__": |
| | |
| | os.environ["TOKENIZERS_PARALLELISM"] = "false" |
| | args = config() |
| | if args.client_password == "": |
| | if args.provider_name == "aws": |
| | args.client_password = "osworld-public-evaluation" |
| | else: |
| | args.client_password = "password" |
| | else: |
| | args.client_password = args.client_password |
| |
|
    # Save the run configuration alongside the results.
    path_to_args = os.path.join(
        args.result_dir,
        args.action_space,
        args.observation_type,
        args.model,
        "args.json",
    )
    os.makedirs(os.path.dirname(path_to_args), exist_ok=True)
    with open(path_to_args, "w", encoding="utf-8") as f:
        json.dump(vars(args), f, indent=4)

    with open(args.test_all_meta_path, "r", encoding="utf-8") as f:
        test_all_meta = json.load(f)

    if args.domain != "all":
        test_all_meta = {args.domain: test_all_meta[args.domain]}

    test_file_list = get_unfinished(
        args.action_space,
        args.model,
        args.observation_type,
        args.result_dir,
        test_all_meta,
    )
    left_info = ""
    for domain in test_file_list:
        left_info += f"{domain}: {len(test_file_list[domain])}\n"
    logger.info(f"Remaining tasks:\n{left_info}")

    get_result(
        args.action_space,
        args.model,
        args.observation_type,
        args.result_dir,
        test_all_meta,
    )
    test_parallel(args, test_file_list)
|