"""Entry point for running or evaluating Magentic-UI and baseline systems on
benchmarks, optionally with a simulated user."""

import json
import yaml
import argparse
import os
import datetime
from typing import Optional, Dict, Any, Callable

from magentic_ui.eval.core import run_evaluate_benchmark_func, evaluate_benchmark_func
from systems.magentic_ui_sim_user_system import MagenticUISimUserSystem
from magentic_ui.eval.systems import LLMSystem
from magentic_ui.eval.benchmarks import WebVoyagerBenchmark
from magentic_ui.eval.benchmark import Benchmark
from autogen_core.models import ChatCompletionClient


def save_experiment_args(args: argparse.Namespace, system_name: str) -> None:
    """
    Save experiment arguments to a timestamped JSON file.

    Args:
        args (argparse.Namespace): The arguments namespace containing experiment parameters.
        system_name (str): The name of the system being evaluated.
    """
    timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
    filename = f"args_{timestamp}.json"

    # Output path: <current_dir>/runs/<system_name>/<dataset>/<split>/<run_id>/args_<timestamp>.json
    save_dir = os.path.join(
        args.current_dir,
        "runs",
        system_name,
        args.dataset,
        args.split or "all_benchmark",
        str(args.run_id),
    )
    os.makedirs(save_dir, exist_ok=True)

    # Copy the namespace into a plain dict so it can be augmented and serialized.
    args_dict = vars(args).copy()

    # If an endpoint config file was provided, record the per-agent client
    # configurations alongside the CLI arguments for reproducibility.
    if args.config and os.path.exists(args.config):
        config_contents = load_config(args.config)
        if config_contents is not None:
            client_keys = [
                "orchestrator_client",
                "web_surfer_client",
                "coder_client",
                "file_surfer_client",
                "user_proxy_client",
            ]
            args_dict["client_configs"] = {
                k: config_contents.get(k) for k in client_keys if k in config_contents
            }
            args_dict["config_path"] = os.path.abspath(args.config)

    filepath = os.path.join(save_dir, filename)
    with open(filepath, "w") as f:
        json.dump(args_dict, f, indent=4)

    print(f"Experiment args saved to {filepath}")

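# The file passed via --config is expected to map agent names to model-client
# settings. A purely illustrative sketch (the provider/model values are placeholders,
# not a confirmed schema):
#
#   orchestrator_client:
#     provider: OpenAIChatCompletionClient
#     config:
#       model: gpt-4o-2024-08-06
#   web_surfer_client: { ... }
#   coder_client: { ... }
#   file_surfer_client: { ... }
#   user_proxy_client: { ... }
#   model_client: { ... }  # used only when --system-type LLM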
def load_config(config_path: Optional[str]) -> Optional[Dict[str, Any]]:
    """
    Load a configuration from a YAML or JSON file.

    Args:
        config_path (Optional[str]): Path to the configuration file (YAML or JSON).

    Returns:
        Optional[Dict[str, Any]]: The loaded configuration as a dictionary, or None
            if no path was given or the YAML file is empty.
    """
    if config_path is None:
        return None

    with open(config_path, "r") as f:
        if config_path.endswith((".yml", ".yaml")):
            config = yaml.safe_load(f)
            return config if config else None
        else:
            return json.load(f)

def run_system_evaluation(
    args: argparse.Namespace,
    system_constructor: Any,
    system_name: str,
    config: Optional[Dict[str, Any]] = None,
) -> None:
    """
    Shared helper that runs or evaluates a benchmark for the given system, so the
    'run' and 'eval' modes do not duplicate setup code.

    Args:
        args (argparse.Namespace): The arguments namespace containing experiment parameters.
        system_constructor (Any): The system instance or constructor to evaluate.
        system_name (str): The name of the system being evaluated.
        config (Optional[Dict[str, Any]]): Optional configuration dictionary (not used
            directly here; accepted so callers can pass it uniformly).
    """
    benchmark_constructor: Optional[Callable[..., Benchmark]] = None
    if args.dataset == "WebVoyager":
        # WebVoyager is scored with a GPT-based evaluator, so build a model client
        # and a benchmark constructor that wires it in.
        client = ChatCompletionClient.load_component(
            {
                "provider": "OpenAIChatCompletionClient",
                "config": {
                    "model": "gpt-4o-2024-08-06",
                },
                "max_retries": 10,
            }
        )

        def create_benchmark(data_dir="WebVoyager", name="WebVoyager"):
            benchmark = WebVoyagerBenchmark(
                data_dir=data_dir,
                eval_method="gpt_eval",
                model_client=client,
            )
            return benchmark

        benchmark_constructor = create_benchmark

    if args.mode == "eval":
        # Evaluate existing runs only.
        evaluate_benchmark_func(
            benchmark_name=args.dataset,
            benchmark_constructor=benchmark_constructor,
            system_name=system_name,
            parallel=args.parallel,
            benchmark_dir=args.current_dir,
            runs_dir=args.current_dir,
            split=args.split,
            run_id=args.run_id,
            system_constructor=system_constructor,
            redo_eval=args.redo_eval,
        )
    else:
        # Run the benchmark and evaluate, optionally on a subsample of the tasks.
        run_evaluate_benchmark_func(
            benchmark_name=args.dataset,
            benchmark_constructor=benchmark_constructor,
            system_name=system_name,
            parallel=args.parallel,
            benchmark_dir=args.current_dir,
            runs_dir=args.current_dir,
            split=args.split,
            run_id=args.run_id,
            system_constructor=system_constructor,
            subsample=args.subsample if args.subsample < 1 else None,
            redo_eval=args.redo_eval,
        )

def run_system_sim_user(args: argparse.Namespace, system_name: str) -> None:
    """
    Build the system under test and run the evaluation: either a plain LLM baseline
    (LLMSystem) or the Magentic-UI system with a simulated user (MagenticUISimUserSystem),
    depending on the system name.

    Args:
        args (argparse.Namespace): The arguments namespace containing experiment parameters.
        system_name (str): The name of the system being evaluated.
    """
    config = load_config(args.config)

    if system_name == "LLM":
        system = LLMSystem(
            system_name=system_name,
            endpoint_config=config.get("model_client") if config else None,
        )
    else:
        system = MagenticUISimUserSystem(
            simulated_user_type=args.simulated_user_type,
            endpoint_config_orch=config.get("orchestrator_client") if config else None,
            endpoint_config_websurfer=config.get("web_surfer_client") if config else None,
            endpoint_config_coder=config.get("coder_client") if config else None,
            endpoint_config_file_surfer=config.get("file_surfer_client")
            if config
            else None,
            endpoint_config_user_proxy=config.get("user_proxy_client") if config else None,
            web_surfer_only=args.web_surfer_only,
            how_helpful_user_proxy=args.how_helpful_user_proxy,
            dataset_name=args.dataset,
        )

    run_system_evaluation(args, system, system_name, config)

def main() -> None:
    """
    Main entry point for running or evaluating the Magentic-UI system on benchmarks.
    Parses command-line arguments and dispatches to the appropriate system runner.
    """
    parser = argparse.ArgumentParser(
        description="Run or evaluate Magentic-UI system on benchmarks"
    )
    parser.add_argument(
        "--mode",
        choices=["run", "eval"],
        default="run",
        help="Mode to run: 'run' for running benchmarks, 'eval' for evaluation",
    )
    parser.add_argument(
        "--current-dir", default=os.getcwd(), help="Current working directory"
    )
    parser.add_argument("--split", default="validation-1", help="Dataset split to use")
    parser.add_argument("--dataset", default="Gaia", help="Dataset name")
    parser.add_argument(
        "--config", required=False, help="Path to endpoint configuration file for LLMs"
    )
    parser.add_argument(
        "--run-id", type=int, default=1, help="Run ID for the experiment"
    )
    parser.add_argument(
        "--parallel", type=int, default=1, help="Number of parallel processes to use"
    )
    parser.add_argument(
        "--subsample",
        type=float,
        default=1,
        help="Subsample ratio for the dataset (only used in run mode)",
    )
    parser.add_argument(
        "--simulated-user-type",
        type=str,
        default="none",
        help="Type of simulated user (co-planning, co-execution, co-planning-and-execution, dummy, none)",
    )
    parser.add_argument(
        "--how-helpful-user-proxy",
        type=str,
        default="soft",
        help="How helpful the user proxy should be (strict, soft, no_hints)",
    )
    parser.add_argument(
        "--user-messages-data",
        type=str,
        help="Path to user messages data CSV file",
    )
    parser.add_argument(
        "--system-type",
        type=str,
        default="MagenticUI",
        choices=["MagenticUI", "magentic-ui-sim-user", "LLM"],
        help="Type of system to run",
    )
    parser.add_argument(
        "--web-surfer-only",
        action="store_true",
        default=False,
        help="Run only the web surfer agent",
    )
    parser.add_argument(
        "--redo-eval",
        action="store_true",
        default=False,
        help="Redo evaluation even if results exist (default: False)",
    )

    args = parser.parse_args()

    # Derive a descriptive system name that encodes the simulated-user settings,
    # so different configurations are saved under different runs directories.
    system_name = args.system_type
    if args.simulated_user_type != "none":
        system_name += f"_{args.simulated_user_type}_{args.how_helpful_user_proxy}"
    if args.web_surfer_only:
        system_name += "_web_surfer_only"

    # Record the experiment arguments before starting, for reproducibility.
    save_experiment_args(args, system_name)

    run_system_sim_user(args, system_name)

if __name__ == "__main__":
    main()