File size: 2,431 Bytes
6e38ce1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 | from autogen_core.models import ChatCompletionClient
from systems import MagenticUIAutonomousSystem
from systems.magentic_one_system import MagenticOneSystem
from magentic_ui.eval.benchmarks import WebVoyagerBenchmark
import os
def test_magentic_ui_system():
    """Run one WebVoyager task through ``MagenticUIAutonomousSystem``.

    Builds the system with the same OpenAI client configuration for every
    endpoint, downloads and loads the WebVoyager dataset, asks the system
    for an answer to the ``Allrecipes--0`` task, and scores that answer
    with the benchmark's evaluator.

    Nothing is asserted: the task, the answer, and the score are only
    printed, so this is a manual smoke check rather than a pass/fail test.
    NOTE(review): presumably needs OpenAI credentials and network access
    for the dataset download and the model calls -- confirm.
    """
    task_id = "Allrecipes--0"
    output_dir = "test_output_magentic_ui"

    client_config = dict(
        provider="OpenAIChatCompletionClient",
        config=dict(model="gpt-4o-2024-08-06"),
        max_retries=10,
    )

    # Every endpoint receives the very same configuration object.
    endpoint_configs = dict.fromkeys(
        (
            "endpoint_config_orch",
            "endpoint_config_websurfer",
            "endpoint_config_coder",
            "endpoint_config_file_surfer",
        ),
        client_config,
    )
    ui_system = MagenticUIAutonomousSystem(
        **endpoint_configs,
        use_local_browser=True,
        web_surfer_only=True,
    )

    # The evaluator gets its own model client built from the same config.
    eval_client = ChatCompletionClient.load_component(client_config)
    web_voyager = WebVoyagerBenchmark(
        data_dir="WebVoyager",
        eval_method="gpt_eval",
        model_client=eval_client,
    )
    web_voyager.download_dataset()
    web_voyager.load_dataset()

    task = web_voyager.tasks[task_id]
    print(task)

    # The output directory is created up front, before the system is run.
    os.makedirs(output_dir, exist_ok=True)
    candidate = ui_system.get_answer(
        task_id=task_id, task=task, output_dir=output_dir
    )
    print(candidate)

    print(web_voyager.evaluator(task, candidate))
def test_magentic_one_system():
    """Run one WebVoyager task through ``MagenticOneSystem``.

    Builds the system from a single OpenAI client configuration, downloads
    and loads the WebVoyager dataset, asks the system for an answer to the
    ``Allrecipes--0`` task, and scores that answer with the benchmark's
    evaluator.

    Nothing is asserted: the task, the answer, and the score are only
    printed, so this is a manual smoke check rather than a pass/fail test.
    NOTE(review): presumably needs OpenAI credentials and network access
    for the dataset download and the model calls -- confirm.
    """
    task_id = "Allrecipes--0"
    output_dir = "test_output_magentic_one"

    client_config = dict(
        provider="OpenAIChatCompletionClient",
        config=dict(model="gpt-4o-2024-08-06"),
        max_retries=10,
    )

    one_system = MagenticOneSystem(
        model_client_config=client_config, web_surfer_only=True
    )

    # The evaluator gets its own model client built from the same config.
    eval_client = ChatCompletionClient.load_component(client_config)
    web_voyager = WebVoyagerBenchmark(
        data_dir="WebVoyager",
        eval_method="gpt_eval",
        model_client=eval_client,
    )
    web_voyager.download_dataset()
    web_voyager.load_dataset()

    task = web_voyager.tasks[task_id]
    print(task)

    # The output directory is created up front, before the system is run.
    os.makedirs(output_dir, exist_ok=True)
    candidate = one_system.get_answer(
        task_id=task_id, task=task, output_dir=output_dir
    )
    print(candidate)

    print(web_voyager.evaluator(task, candidate))
# Script entry point: executing this file directly runs only the
# Magentic-One check; test_magentic_ui_system is not invoked here.
if __name__ == "__main__":
    test_magentic_one_system()
|