Final_Assignment_Template_alisa

Runtime error

App Files Files Community

alisamak committed on Apr 29, 2025

Commit

ac0b6fe

verified ·

1 Parent(s): 9bdf55c

Update LG_agent.py

Browse files

Files changed (1) hide show

LG_agent.py +64 -0

LG_agent.py CHANGED Viewed

@@ -70,3 +70,67 @@ class BasicAgent:
             return final_message.content if hasattr(final_message, "content") else "No final message."
         else:
             return "No response."

             return final_message.content if hasattr(final_message, "content") else "No final message."
         else:
             return "No response."
+# 6. Local GAIA-style test set
+# Each entry is a dict with:
+#   "task_id"           - short identifier for the question
+#   "question"          - prompt text handed to the agent
+#   "expected_keywords" - an answer is counted as correct when it mentions
+#                         at least one of these (case-insensitive)
+test_questions = [
+    {
+        "task_id": "q1",
+        "question": (
+            "I'm making a grocery list for my mom, who's a strict botanist. "
+            "Here's the list: milk, eggs, sweet potatoes, basil, green beans, peanuts. "
+            "Give me only the botanical vegetables, comma-separated, alphabetically."
+        ),
+        # Green beans and peanuts are legume fruits/seeds botanically, and milk
+        # and eggs are not plants, so only basil (leaf) and sweet potatoes
+        # (root) qualify as vegetables.
+        "expected_keywords": ["basil", "sweet potatoes"],
+    },
+    {
+        "task_id": "q2",
+        "question": (
+            "How many studio albums did Mercedes Sosa release between 2000 and 2009 (inclusive)? "
+            "Use Wikipedia to find the answer."
+        ),
+        # Reference answer for this GAIA question is 3.
+        "expected_keywords": ["3", "three"],
+    },
+    {
+        "task_id": "q3",
+        "question": (
+            "What was the name of the dinosaur featured in the only Featured Article promoted in November 2016?"
+        ),
+        # The dinosaur Featured Article promoted on English Wikipedia in
+        # November 2016 is Giganotosaurus.
+        "expected_keywords": ["Giganotosaurus"],
+    },
+    {
+        "task_id": "q4",
+        "question": (
+            "Write the opposite of the word 'left' as the answer to this sentence."
+        ),
+        "expected_keywords": ["right"],
+    },
+    {
+        "task_id": "q5",
+        "question": (
+            "Given this table defining * on S = {a,b,c}, with a*c = b and c*a = c, is * commutative?"
+        ),
+        # a*c != c*a, so the operation is not commutative.
+        "expected_keywords": ["not commutative"],
+    },
+]
+# 7. Evaluation function
+def evaluate_agent(agent: BasicAgent, questions):
+    """Run ``agent`` on each question and print a keyword-based score.
+
+    Args:
+        agent: Callable invoked as ``agent(question_text)``.
+        questions: Iterable of dicts, each with a ``"question"`` string and an
+            ``"expected_keywords"`` list of strings.
+
+    Returns:
+        The number of questions whose answer mentioned at least one expected
+        keyword as a whole word (case-insensitive). Questions for which the
+        agent raised are reported and counted as incorrect.
+    """
+    import re  # local import so this block does not depend on file-level imports
+
+    print("\n\n========= Running GAIA Evaluation =========\n")
+    correct = 0
+    total = 0  # counted manually so any iterable works, not only sized ones
+    for q in questions:
+        total += 1
+        print(f"🟨 Q: {q['question']}")
+        # Keep the try body down to the agent call: grading problems must not
+        # be misreported as agent failures.
+        try:
+            answer = agent(q["question"])
+        except Exception as e:
+            print(f"🟥 Error: {e}\n")
+            continue
+        print(f"🟩 A: {answer}\n")
+        # The agent may return None or a non-string object; normalise to text.
+        text = "" if answer is None else str(answer).lower()
+        # Whole-word match: plain substring checks would accept "2005" for the
+        # keyword "5" or "copyright" for "right".
+        if any(
+            re.search(rf"(?<!\w){re.escape(keyword.lower())}(?!\w)", text)
+            for keyword in q["expected_keywords"]
+        ):
+            correct += 1
+    print(f"✅ Score: {correct} / {total} correct\n")
+    return correct
+if __name__ == "__main__":
+    # Script entry point: build the agent and score it on the local test set.
+    agent = BasicAgent()
+    evaluate_agent(questions=test_questions, agent=agent)