# Final_Assignment_Template_alisa / evaluate_agent.py
# alisamak's picture
# Update evaluate_agent.py
# 20e486a verified
def test_questions():
    """Build the fixed set of sample questions used to evaluate an agent.

    NOTE: despite the ``test_`` prefix this is a data provider, not a test
    case; it takes no arguments and returns data.

    Returns:
        A newly built list of dicts, each holding the keys ``"task_id"``,
        ``"question"`` and ``"expected_keywords"`` (a list of strings), in
        that order.
    """

    def make_case(task_id, question, keywords):
        # Every case shares the same three keys, inserted in the same order.
        return {
            "task_id": task_id,
            "question": question,
            "expected_keywords": keywords,
        }

    # This prompt ends mid-sentence; the expected keywords suggest the agent
    # is supposed to ask for the missing part rather than answer.
    pie_prompt = (
        "Hi, I'm making a pie but I could use some help with my shopping list. I have everything I need for the crust, "
        "but I'm not sure about the filling. I got the recipe from my friend Aditi, but she left it"
    )

    # Markdown operation table followed by the actual task statement.
    commutativity_prompt = (
        "Given this table defining * on the set S = {a, b, c, d, e}\n\n"
        "|*|a|b|c|d|e|\n"
        "|---|---|---|---|---|---|\n"
        "|a|a|b|c|b|d|\n"
        "|b|b|c|a|e|c|\n"
        "|c|c|a|b|b|a|\n"
        "|d|b|e|b|e|d|\n"
        "|e|d|b|a|d|c|\n\n"
        "Provide the subset of S involved in any possible counter-examples that prove * is not commutative. "
        "Provide your answer as a comma-separated list of the elements in the set in alphabetical order."
    )

    # The sentence is written backwards, wrapped in literal single quotes.
    reversed_prompt = (
        "'.rewsna eht sa \"tfel\" drow eht fo etisoppo eht etirw ,ecnetnes siht dnatsrednu uoy fI'"
    )

    albums_prompt = (
        "How many studio albums did Mercedes Sosa release between 2000 and 2009 (inclusive)? "
        "Use Wikipedia to find the answer."
    )

    dinosaur_prompt = (
        "Who nominated the only Featured Article on English Wikipedia about a dinosaur that was promoted in November 2016?"
    )

    cases = []
    cases.append(
        make_case(
            "q4",
            pie_prompt,
            ["clarify", "incomplete", "missing", "please provide", "need more information"],
        )
    )
    cases.append(make_case("q7", commutativity_prompt, ["b, e"]))
    cases.append(make_case("q3", reversed_prompt, ["right"]))
    cases.append(make_case("q2", albums_prompt, ["3", "three"]))
    cases.append(make_case("q4b", dinosaur_prompt, ["FunkMonk"]))
    cases.append(make_case("q5", "Who is the CEO of OpenAI?", ["sam altman"]))
    cases.append(make_case("q6", "When was the Eiffel Tower built?", ["1889"]))
    return cases
def evaluate_agent(agent, questions):
    """Run ``agent`` on every question and report how many answers matched.

    An answer counts as correct when at least one of the question's
    ``expected_keywords`` occurs in it as a case-insensitive substring.
    Each question, answer and verdict is printed to stdout as it is processed.

    Args:
        agent: Callable invoked as ``agent(question_text)``. Its result is
            converted with ``str()`` before matching, so non-string answers
            (e.g. the integer ``3``) are scored instead of being reported as
            errors; ``None`` is treated as an empty answer.
        questions: Sized collection of dicts, each providing a ``"question"``
            string and an ``"expected_keywords"`` list.

    Returns:
        A summary string of the form ``"✅ Score: <correct> / <total> correct"``.
        Questions for which the agent raised count as incorrect.
    """
    print("\n\n========= Running GAIA Evaluation =========\n")
    correct = 0
    total = len(questions)

    for q in questions:
        print(f"🟨 Q: {q['question']}")

        # Only the agent call is guarded: a failing agent must not abort the
        # run, whereas a malformed question dict should still surface.
        try:
            answer = agent(q["question"])
        except Exception as e:
            print(f"🟥 Error: {e}\n")
            continue

        print(f"🟩 A: {answer}\n")

        # Normalise once so matching cannot fail on non-string answers.
        answer_text = "" if answer is None else str(answer).lower()

        # NOTE: this is plain substring matching, so a short keyword such as
        # "3" also matches inside longer tokens like "2003".
        matched = any(
            str(keyword).lower() in answer_text
            for keyword in q["expected_keywords"]
        )

        expected = ", ".join(str(keyword) for keyword in q["expected_keywords"])
        result = "✅ Correct" if matched else "❌ Incorrect"
        print(f"{result} — Expected one of: [{expected}]\n")

        if matched:
            correct += 1

    score_report = f"✅ Score: {correct} / {total} correct"
    print(score_report + "\n")
    return score_report