# GAIA_Evaluation / test_cases.py
# Author: denven (commit bd62c9a)
# Leverage LangGraph & LLM models to implement a GAIA question-answering agent.
import functools
import time
from typing import Dict, Any

from langchain_core.messages import AnyMessage, HumanMessage

from agent import build_graph
from questions import QuestionsAPI
# timeit decorator
def timeit(func):
    """Decorator that prints the wall-clock duration of each call to *func*.

    The wrapped function's return value is passed through unchanged.
    """
    # functools.wraps preserves func.__name__/__doc__ on the wrapper, so the
    # printed name (and any later introspection) refers to the real function.
    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        # perf_counter is monotonic and high-resolution, unlike time.time(),
        # which can jump backwards on system clock adjustments.
        start_time = time.perf_counter()
        result = func(*args, **kwargs)
        end_time = time.perf_counter()
        print(f"{func.__name__}() completed in {end_time - start_time:.2f} seconds.")
        return result
    return wrapper
@timeit
def answer_question(provider, question):
    """Build the agent graph for *provider*, run it on *question*, and
    pretty-print every message in the resulting conversation.

    Returns nothing; output goes to stdout via ``pretty_print``.
    """
    # A fresh graph is constructed per call so each question starts clean.
    graph = build_graph(provider=provider)
    initial_messages: list[AnyMessage] = [HumanMessage(content=question)]
    final_state = graph.invoke({"messages": initial_messages})
    for message in final_state["messages"]:
        message.pretty_print()
# Test by checking the FINAL ANSWER of the full 20 questions
def test_agent_qa(provider: str = "deepseek"):
question: str = "When was a picture of St. Thomas Aquinas first added to the Wikipedia page on the Principle of double effect?"
gaia_api = QuestionsAPI()
try:
questions = gaia_api.get_questions(level=1, limit=20)
print(f"Retrieved {len(questions)} questions")
# Get a specific question to answer
for index, sub in enumerate(questions):
task_id = sub.get("task_id", "unknown")
question = sub.get("question", "No question provided")
file_name = sub.get("file_name", "")
print(f"Start to answer question {index+1}: {task_id}, {question}")
answer_question(provider=provider, question=question)
except Exception as e:
print(f"Error: {e}")
finally:
gaia_api.close()
# Run the full evaluation only when executed as a script, not on import.
if __name__ == "__main__":
    test_agent_qa("groq")