Final_Assignment_Template_alisa

Runtime error

App Files Files Community

alisamak committed on May 2, 2025

Commit

2a6e16b

verified ·

1 Parent(s): be9fa7f

Create evaluate_agent.py

Browse files

Files changed (1) hide show

evaluate_agent.py +51 -0

evaluate_agent.py ADDED Viewed

	@@ -0,0 +1,51 @@

def test_questions():
    """Return the built-in sample questions used to exercise an agent.

    Each entry is a dict with three keys, in this order:

    * ``"task_id"`` - short identifier of the question.
    * ``"question"`` - the prompt text handed to the agent.
    * ``"expected_keywords"`` - list of strings; an answer counts as
      correct when it contains any one of them.

    A new list (with new dicts and keyword lists) is built on every call,
    so callers may mutate the result freely.
    """
    # NOTE(review): the expected keywords are hand-entered here; verify them
    # against the benchmark's reference answers before trusting the score.
    albums_prompt = (
        "How many studio albums did Mercedes Sosa release between 2000 and 2009 (inclusive)? "
        "Use Wikipedia to find the answer."
    )
    birds_prompt = (
        "In the video https://www.youtube.com/watch?v=L1vXCYZAYYM, what is the highest number of bird species "
        "to be on camera simultaneously?"
    )

    # (task id, prompt, accepted keywords) - one row per sample question.
    rows = (
        ("q2", albums_prompt, ["2", "two"]),
        ("q3", birds_prompt, ["14", "fourteen"]),
    )

    return [
        {"task_id": task_id, "question": prompt, "expected_keywords": keywords}
        for task_id, prompt, keywords in rows
    ]
def _answer_matches(answer, keywords):
    """Return True when *answer* contains any of *keywords* as a whole token.

    The comparison is case-insensitive. A keyword only matches when it is not
    directly surrounded by other word characters, so ``"2"`` matches
    ``"The answer is 2."`` but not ``"12"`` or ``"2005"``, and ``"two"`` does
    not match inside ``"network"``.

    Non-string answers are converted with ``str()`` (``None`` becomes the
    empty string) so that an agent returning e.g. the integer ``2`` can still
    be scored. Empty keywords are ignored because they would match anything.
    """
    import re  # local import: the file's top-level import block is not visible here

    text = "" if answer is None else str(answer)
    for keyword in keywords:
        needle = str(keyword)
        if not needle:
            continue
        # Lookarounds instead of \b so keywords that start or end with a
        # non-word character (e.g. "$5") are still handled sensibly.
        pattern = r"(?<!\w)" + re.escape(needle) + r"(?!\w)"
        if re.search(pattern, text, flags=re.IGNORECASE):
            return True
    return False


def evaluate_agent(agent, questions):
    """Run *agent* on every question, print a per-question report, and score it.

    Args:
        agent: Callable invoked as ``agent(question_text)``; its return value
            is treated as the answer.
        questions: Sequence of dicts, each providing ``"question"`` (the
            prompt text) and ``"expected_keywords"`` (an iterable of accepted
            keywords).

    Returns:
        The summary string ``"✅ Score: <correct> / <total> correct"``, where
        ``total`` is ``len(questions)``.

    An answer is counted as correct when it contains at least one expected
    keyword as a whole token (case-insensitive). Any exception raised by the
    agent is printed and the question is counted as not correct; evaluation
    then continues with the next question.
    """
    print("\n\n========= Running GAIA Evaluation =========\n")
    correct = 0
    total = len(questions)

    for q in questions:
        print(f"🟨 Q: {q['question']}")

        # Only the agent call is guarded: a failing agent must not abort the
        # whole run, but scoring problems should not be reported as agent errors.
        try:
            answer = agent(q["question"])
        except Exception as e:
            print(f"🟥 Error: {e}\n")
            continue

        print(f"🟩 A: {answer}\n")
        matched = _answer_matches(answer, q["expected_keywords"])
        expected = ", ".join(str(keyword) for keyword in q["expected_keywords"])
        result = "✅ Correct" if matched else "❌ Incorrect"
        print(f"{result} — Expected one of: [{expected}]\n")
        if matched:
            correct += 1

    score_report = f"✅ Score: {correct} / {total} correct"
    print(score_report + "\n")
    return score_report