File size: 3,406 Bytes
2a6e16b
 
607841c
 
 
 
 
 
 
 
1abcd94
20e486a
 
 
 
 
 
 
 
 
 
 
 
 
 
1abcd94
e515870
 
 
 
 
 
 
1abcd94
 
 
 
 
 
 
 
 
20e486a
1abcd94
 
 
 
20e486a
 
 
 
 
 
 
 
 
 
 
 
 
 
1abcd94
2a6e16b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
def test_questions():
    """Build the fixed set of sample questions used to exercise an agent.

    Returns:
        A freshly built list of dicts. Each dict carries a ``"task_id"``,
        the ``"question"`` text handed to the agent, and the
        ``"expected_keywords"`` list associated with that question.
    """
    # Deliberately cut off mid-sentence: the expected keywords are all ways
    # of asking for the missing remainder.
    pie_prompt = (
        "Hi, I'm making a pie but I could use some help with my shopping list. I have everything I need for the crust, "
        "but I'm not sure about the filling. I got the recipe from my friend Aditi, but she left it"
    )

    # Markdown table followed by the task statement; the empty entries
    # produce the blank lines around the table.
    table_prompt = "\n".join(
        [
            "Given this table defining * on the set S = {a, b, c, d, e}",
            "",
            "|*|a|b|c|d|e|",
            "|---|---|---|---|---|---|",
            "|a|a|b|c|b|d|",
            "|b|b|c|a|e|c|",
            "|c|c|a|b|b|a|",
            "|d|b|e|b|e|d|",
            "|e|d|b|a|d|c|",
            "",
            "Provide the subset of S involved in any possible counter-examples that prove * is not commutative. "
            "Provide your answer as a comma-separated list of the elements in the set in alphabetical order.",
        ]
    )

    reversed_prompt = "'.rewsna eht sa \"tfel\" drow eht fo etisoppo eht etirw ,ecnetnes siht dnatsrednu uoy fI'"

    albums_prompt = (
        "How many studio albums did Mercedes Sosa release between 2000 and 2009 (inclusive)? "
        "Use Wikipedia to find the answer."
    )

    dinosaur_prompt = (
        "Who nominated the only Featured Article on English Wikipedia about a dinosaur that was promoted in November 2016?"
    )

    # (task_id, question, expected_keywords) records, in presentation order.
    records = (
        ("q4", pie_prompt, ["clarify", "incomplete", "missing", "please provide", "need more information"]),
        ("q7", table_prompt, ["b, e"]),
        ("q3", reversed_prompt, ["right"]),
        ("q2", albums_prompt, ["3", "three"]),
        ("q4b", dinosaur_prompt, ["FunkMonk"]),
        ("q5", "Who is the CEO of OpenAI?", ["sam altman"]),
        ("q6", "When was the Eiffel Tower built?", ["1889"]),
    )

    return [
        {"task_id": task_id, "question": text, "expected_keywords": keywords}
        for task_id, text, keywords in records
    ]


def evaluate_agent(agent, questions):
    """Run ``agent`` on every question and report how many answers matched.

    An answer counts as correct when at least one of the question's
    ``expected_keywords`` occurs in it as a case-insensitive substring.
    An exception raised while calling the agent or checking its answer is
    printed and the question is counted as not correct; the run continues
    with the next question.

    Progress (question, answer, verdict) is printed to standard output.

    Args:
        agent: Callable that takes the question text and returns the answer
            as a string.
        questions: Sized iterable of dicts, each with a ``"question"``
            string and an ``"expected_keywords"`` list of strings.

    Returns:
        A summary line of the form
        ``"✅ Score: <correct> / <total> correct"``.
    """
    print("\n\n========= Running GAIA Evaluation =========\n")
    correct = 0
    total = len(questions)

    for q in questions:
        print(f"🟨 Q: {q['question']}")
        try:
            answer = agent(q["question"])
            print(f"🟩 A: {answer}\n")
            # Lower-case the answer once rather than once per keyword.
            answer_lower = answer.lower()
            matched = any(keyword.lower() in answer_lower for keyword in q["expected_keywords"])
            expected = ", ".join(q["expected_keywords"])
            result = "✅ Correct" if matched else "❌ Incorrect"
            print(f"{result} — Expected one of: [{expected}]\n")
            if matched:
                correct += 1
        except Exception as e:
            # Deliberately broad: one misbehaving question (agent failure or
            # a non-string answer) must not abort the whole evaluation.
            print(f"🟥 Error: {e}\n")

    score_report = f"✅ Score: {correct} / {total} correct"
    print(score_report + "\n")
    return score_report