alisamak committed on
Commit
2a6e16b
·
verified ·
1 Parent(s): be9fa7f

Create evaluate_agent.py

Browse files
Files changed (1) hide show
  1. evaluate_agent.py +51 -0
evaluate_agent.py ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
def test_questions():
    """Return the hard-coded sample questions with their expected keywords.

    Returns:
        A new list of dicts on every call. Each dict has the keys
        ``task_id`` (str), ``question`` (str) and ``expected_keywords``
        (list of str, any one of which counts as a correct answer).
    """
    return [
        {
            "task_id": "q2",
            "question": (
                "How many studio albums did Mercedes Sosa release between 2000 and 2009 (inclusive)? "
                "Use Wikipedia to find the answer."
            ),
            "expected_keywords": ["2", "two"],
        },
        {
            "task_id": "q3",
            "question": (
                "In the video https://www.youtube.com/watch?v=L1vXCYZAYYM, what is the highest number of bird species "
                "to be on camera simultaneously?"
            ),
            "expected_keywords": ["14", "fourteen"],
        },
    ]
20
+
21
+
22
def evaluate_agent(agent, questions):
    """Run ``agent`` on every question and report how many answers match.

    An answer counts as correct when at least one of the question's
    ``expected_keywords`` occurs in it as a whole word, ignoring case.
    Progress and per-question results are printed to stdout.

    Args:
        agent: Callable that takes the question text and returns an answer.
            Non-string answers are converted with ``str()`` before matching.
        questions: Iterable of dicts, each with a ``question`` string and an
            ``expected_keywords`` list.

    Returns:
        A summary string of the form ``"✅ Score: <correct> / <total> correct"``.

    Raises:
        KeyError: If a question dict lacks ``question`` or
            ``expected_keywords``.
    """
    import re  # function-scope import keeps this block self-contained

    questions = list(questions)  # allow any iterable, not only sized sequences
    print("\n\n========= Running GAIA Evaluation =========\n")
    correct = 0
    total = len(questions)

    for item in questions:
        question = item["question"]
        keywords = item["expected_keywords"]
        print(f"🟨 Q: {question}")

        # Only the agent call is guarded: a failing agent is reported and
        # scored as a miss, while errors in the scoring logic still surface.
        try:
            answer = agent(question)
        except Exception as exc:
            print(f"🟥 Error: {exc}\n")
            continue

        print(f"🟩 A: {answer}\n")
        answer_text = "" if answer is None else str(answer)

        # Whole-word matching, so that a keyword like "2" is not satisfied
        # by "2009" or "12" and "two" is not satisfied by "network".
        matched = any(
            re.search(
                rf"(?<!\w){re.escape(str(keyword))}(?!\w)",
                answer_text,
                flags=re.IGNORECASE,
            )
            for keyword in keywords
        )

        expected = ", ".join(str(keyword) for keyword in keywords)
        result = "✅ Correct" if matched else "❌ Incorrect"
        print(f"{result} — Expected one of: [{expected}]\n")
        if matched:
            correct += 1

    score_report = f"✅ Score: {correct} / {total} correct"
    print(score_report + "\n")
    return score_report