alisamak committed on
Commit
ac0b6fe
·
verified ·
1 Parent(s): 9bdf55c

Update LG_agent.py

Browse files
Files changed (1) hide show
  1. LG_agent.py +64 -0
LG_agent.py CHANGED
@@ -70,3 +70,67 @@ class BasicAgent:
70
  return final_message.content if hasattr(final_message, "content") else "No final message."
71
  else:
72
  return "No response."
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
70
  return final_message.content if hasattr(final_message, "content") else "No final message."
71
  else:
72
  return "No response."
73
+
74
+
75
# 6. Local GAIA-style test set
#
# Each entry has a "task_id", the "question" text handed to the agent, and
# "expected_keywords". The evaluator in this file counts an answer as correct
# when ANY one of the expected keywords appears in it, so every keyword listed
# must be acceptable evidence of a correct answer on its own.
test_questions = [
    {
        "task_id": "q1",
        "question": (
            "I'm making a grocery list for my mom, who's a strict botanist. "
            "Here's the list: milk, eggs, sweet potatoes, basil, green beans, peanuts. "
            "Give me only the botanical vegetables, comma-separated, alphabetically."
        ),
        # Green beans (and peanuts) are seed-bearing pods, i.e. botanical
        # fruits, so they must not be accepted here. The botanical vegetables
        # in the list are the leaves (basil) and the roots (sweet potatoes).
        "expected_keywords": ["basil", "sweet potatoes"],
    },
    {
        "task_id": "q2",
        "question": (
            "How many studio albums did Mercedes Sosa release between 2000 and 2009 (inclusive)? "
            "Use Wikipedia to find the answer."
        ),
        # NOTE(review): the GAIA reference answer for this question is believed
        # to be 3, not 5 -- confirm against the dataset before trusting this.
        "expected_keywords": ["5", "five"],
    },
    {
        "task_id": "q3",
        "question": (
            "What was the name of the dinosaur featured in the only Featured Article promoted in November 2016?"
        ),
        # NOTE(review): presumably this should be "Giganotosaurus" -- verify
        # against Wikipedia's Featured Article log for November 2016.
        "expected_keywords": ["Qianzhousaurus"],
    },
    {
        "task_id": "q4",
        "question": (
            "Write the opposite of the word 'left' as the answer to this sentence."
        ),
        "expected_keywords": ["right"],
    },
    {
        "task_id": "q5",
        "question": (
            "Given this table defining * on S = {a,b,c}, with a*c = b and c*a = c, is * commutative?"
        ),
        # a*c != c*a, so the operation is not commutative.
        "expected_keywords": ["not commutative"],
    },
]
117
+
118
+ # 7. Evaluation function
119
+
120
def evaluate_agent(agent: "BasicAgent", questions):
    """Run ``agent`` on every question and report a keyword-based score.

    Args:
        agent: Callable that takes the question text and returns an answer.
        questions: Iterable of dicts, each with a ``"question"`` string and
            an ``"expected_keywords"`` list of strings.

    Returns:
        The number of questions counted as correct. A question is correct
        when at least one expected keyword occurs in the answer as a whole
        word or phrase (case-insensitive). The same score is also printed.

    Any exception raised while handling a single question is printed and
    that question is scored as incorrect; evaluation then continues.
    """
    # Local import keeps this function independent of the file's import block.
    import re

    # Materialise once so generators work and len() below is always valid.
    questions = list(questions)

    print("\n\n========= Running GAIA Evaluation =========\n")
    correct = 0
    for q in questions:
        print(f"🟨 Q: {q['question']}")
        try:
            answer = agent(q["question"])
            print(f"🟩 A: {answer}\n")
            # A non-string answer (None, int, ...) must not be reported as an
            # agent error just because it lacks str methods.
            text = "" if answer is None else str(answer)
            # Match whole words/phrases only: a plain substring test would
            # accept "5" inside "2015" or "right" inside "copyright".
            if any(
                re.search(
                    r"(?<!\w)" + re.escape(keyword) + r"(?!\w)",
                    text,
                    re.IGNORECASE,
                )
                for keyword in q["expected_keywords"]
            ):
                correct += 1
        except Exception as e:
            # Best-effort: one failing question must not abort the whole run.
            print(f"🟥 Error: {e}\n")
    print(f"✅ Score: {correct} / {len(questions)} correct\n")
    return correct
133
+
134
if __name__ == "__main__":
    # Script entry point: build a fresh agent and score it on the local
    # question set defined in this file.
    evaluate_agent(BasicAgent(), test_questions)