File size: 2,431 Bytes
6e38ce1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
from autogen_core.models import ChatCompletionClient
from systems import MagenticUIAutonomousSystem
from systems.magentic_one_system import MagenticOneSystem
from magentic_ui.eval.benchmarks import WebVoyagerBenchmark
import os

def test_magentic_ui_system():
    """Run one WebVoyager task through ``MagenticUIAutonomousSystem``.

    Builds the system and a ``WebVoyagerBenchmark`` from a single shared
    client configuration, calls ``download_dataset()`` and
    ``load_dataset()``, asks the system for an answer to the
    ``Allrecipes--0`` task, and prints the task, the answer and the value
    returned by the benchmark's evaluator. Nothing is asserted; the
    results are only printed.
    """
    task_id = "Allrecipes--0"
    output_dir = "test_output_magentic_ui"

    # The very same dict object is handed to every agent endpoint and to
    # the client used for evaluation.
    client_config = dict(
        provider="OpenAIChatCompletionClient",
        config=dict(model="gpt-4o-2024-08-06"),
        max_retries=10,
    )
    endpoint_kwargs = {
        f"endpoint_config_{role}": client_config
        for role in ("orch", "websurfer", "coder", "file_surfer")
    }

    system = MagenticUIAutonomousSystem(
        **endpoint_kwargs,
        use_local_browser=True,
        web_surfer_only=True,
    )

    eval_client = ChatCompletionClient.load_component(client_config)
    benchmark = WebVoyagerBenchmark(
        data_dir="WebVoyager",
        eval_method="gpt_eval",
        model_client=eval_client,
    )
    benchmark.download_dataset()
    benchmark.load_dataset()

    task = benchmark.tasks[task_id]
    print(task)

    # get_answer is given this directory, so create it up front.
    os.makedirs(output_dir, exist_ok=True)
    answer = system.get_answer(task_id=task_id, task=task, output_dir=output_dir)
    print(answer)

    print(benchmark.evaluator(task, answer))


def test_magentic_one_system():
    """Run one WebVoyager task through ``MagenticOneSystem``.

    Builds the system and a ``WebVoyagerBenchmark`` from a single shared
    client configuration, calls ``download_dataset()`` and
    ``load_dataset()``, asks the system for an answer to the
    ``Allrecipes--0`` task, and prints the task, the answer and the value
    returned by the benchmark's evaluator. Nothing is asserted; the
    results are only printed.
    """
    task_id = "Allrecipes--0"
    output_dir = "test_output_magentic_one"

    # The same dict object configures both the system under test and the
    # client used for evaluation.
    client_config = dict(
        provider="OpenAIChatCompletionClient",
        config=dict(model="gpt-4o-2024-08-06"),
        max_retries=10,
    )

    system = MagenticOneSystem(
        model_client_config=client_config,
        web_surfer_only=True,
    )

    eval_client = ChatCompletionClient.load_component(client_config)
    benchmark = WebVoyagerBenchmark(
        data_dir="WebVoyager",
        eval_method="gpt_eval",
        model_client=eval_client,
    )
    benchmark.download_dataset()
    benchmark.load_dataset()

    task = benchmark.tasks[task_id]
    print(task)

    # get_answer is given this directory, so create it up front.
    os.makedirs(output_dir, exist_ok=True)
    answer = system.get_answer(task_id=task_id, task=task, output_dir=output_dir)
    print(answer)

    print(benchmark.evaluator(task, answer))


# Script entry point: running this file directly executes only the
# Magentic-One check; test_magentic_ui_system is not invoked from here.
if __name__ == "__main__":
    test_magentic_one_system()