Final_Assignment_Template_alisa

Runtime error

App Files Files Community

Final_Assignment_Template_alisa / evaluate_agent.py

alisamak

Update evaluate_agent.py

20e486a verified 8 months ago

raw

history blame contribute delete

3.41 kB

	def test_questions() -> list:
	    """Return the built-in sample questions used to exercise an agent.

	    Each entry is a dict with three keys:

	    - ``task_id``: short identifier of the question.
	    - ``question``: the prompt text handed to the agent.
	    - ``expected_keywords``: candidate strings for an acceptable answer
	      (``evaluate_agent`` in this file checks them as case-insensitive
	      substrings of the agent's answer).

	    A fresh list is built on every call.
	    """
	    return [
	        {
	            # The prompt stops mid-sentence; the keywords look for a
	            # request for clarification rather than a factual answer.
	            "task_id": "q4",
	            "question": (
	                "Hi, I'm making a pie but I could use some help with my shopping list. I have everything I need for the crust, "
	                "but I'm not sure about the filling. I got the recipe from my friend Aditi, but she left it"
	            ),
	            "expected_keywords": ["clarify", "incomplete", "missing", "please provide", "need more information"],
	        },
	        {
	            "task_id": "q7",
	            "question": (
	                "Given this table defining * on the set S = {a, b, c, d, e}\n\n"
	                # Markdown table rows use plain pipes: a backslash before
	                # "|" is not a valid Python escape and would end up in the
	                # prompt as a literal backslash.
	                "|*|a|b|c|d|e|\n"
	                "|---|---|---|---|---|---|\n"
	                "|a|a|b|c|b|d|\n"
	                "|b|b|c|a|e|c|\n"
	                "|c|c|a|b|b|a|\n"
	                "|d|b|e|b|e|d|\n"
	                "|e|d|b|a|d|c|\n\n"
	                "Provide the subset of S involved in any possible counter-examples that prove * is not commutative. "
	                "Provide your answer as a comma-separated list of the elements in the set in alphabetical order."
	            ),
	            "expected_keywords": ["b, e"],
	        },
	        {
	            # The prompt is an English sentence written in reversed
	            # character order.
	            "task_id": "q3",
	            "question": (
	                "'.rewsna eht sa \"tfel\" drow eht fo etisoppo eht etirw ,ecnetnes siht dnatsrednu uoy fI'"
	            ),
	            "expected_keywords": ["right"],
	        },
	        {
	            "task_id": "q2",
	            "question": (
	                "How many studio albums did Mercedes Sosa release between 2000 and 2009 (inclusive)? "
	                "Use Wikipedia to find the answer."
	            ),
	            "expected_keywords": ["3", "three"],
	        },
	        {
	            "task_id": "q4b",
	            "question": (
	                "Who nominated the only Featured Article on English Wikipedia about a dinosaur that was promoted in November 2016?"
	            ),
	            "expected_keywords": ["FunkMonk"],
	        },
	        {
	            "task_id": "q5",
	            "question": (
	                "Who is the CEO of OpenAI?"
	            ),
	            "expected_keywords": ["sam altman"],
	        },
	        {
	            "task_id": "q6",
	            "question": (
	                "When was the Eiffel Tower built?"
	            ),
	            "expected_keywords": ["1889"],
	        },
	    ]


	def evaluate_agent(agent, questions) -> str:
	    """Run ``agent`` on every question and report how many answers matched.

	    An answer counts as correct when at least one of the question's
	    ``expected_keywords`` occurs in it as a case-insensitive substring.
	    The question, the answer and the verdict are printed to stdout for
	    each item, followed by the final score line.

	    Args:
	        agent: Callable invoked as ``agent(question_text)``. Its return
	            value is converted with ``str()`` before matching, so
	            non-string answers (for example an ``int``) can match;
	            ``None`` is treated as an empty answer.
	        questions: Sized collection of dicts, each with a ``"question"``
	            string and an ``"expected_keywords"`` list of strings.

	    Returns:
	        The summary line ``"✅ Score: <correct> / <total> correct"``.

	    Raises:
	        KeyError: If a question dict has no ``"question"`` key.
	    """
	    print("\n\n========= Running GAIA Evaluation =========\n")
	    correct = 0
	    total = len(questions)

	    for q in questions:
	        print(f"🟨 Q: {q['question']}")
	        try:
	            answer = agent(q["question"])
	            print(f"🟩 A: {answer}\n")
	            # Normalise once per question: lower-casing is loop-invariant
	            # across keywords, and str() keeps non-string answers from
	            # raising on ``.lower()``.
	            answer_text = ("" if answer is None else str(answer)).lower()
	            matched = any(keyword.lower() in answer_text for keyword in q["expected_keywords"])
	            expected = ", ".join(q["expected_keywords"])
	            verdict = "✅ Correct" if matched else "❌ Incorrect"
	            print(f"{verdict} — Expected one of: [{expected}]\n")
	            if matched:
	                correct += 1
	        except Exception as e:
	            # Best-effort evaluation: a failing agent call (or a malformed
	            # question entry) is reported and scored as a miss so the
	            # remaining questions still run.
	            print(f"🟥 Error: {e}\n")

	    score_report = f"✅ Score: {correct} / {total} correct"
	    print(score_report + "\n")
	    return score_report