# GAIA_Evaluation / test_cases.py
# Author: denven (commit bd62c9a)
# Leverage LangGraph & LLM models to implement a GAIA question-answering agent.
import functools
import time
from typing import Dict, Any

from langchain_core.messages import AnyMessage, HumanMessage

from agent import build_graph
from questions import QuestionsAPI
# timeit decorator
def timeit(func):
    """Decorator that prints the wall-clock duration of each call to *func*.

    The wrapped function's return value is passed through unchanged.
    """
    # functools.wraps preserves func.__name__/__doc__ on the wrapper, so the
    # printed name (and any later introspection) refers to the real function.
    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        # perf_counter is monotonic and high-resolution, unlike time.time(),
        # which can jump backwards on system clock adjustments.
        start_time = time.perf_counter()
        result = func(*args, **kwargs)
        end_time = time.perf_counter()
        print(f"{func.__name__}() completed in {end_time - start_time:.2f} seconds.")
        return result
    return wrapper
@timeit
def answer_question(provider, question):
    """Build the agent graph for *provider*, run it on *question*, and
    pretty-print every message in the resulting conversation.

    Returns nothing; output goes to stdout via ``pretty_print``.
    """
    # A fresh graph is constructed per call so each question starts clean.
    graph = build_graph(provider=provider)
    initial_messages: list[AnyMessage] = [HumanMessage(content=question)]
    final_state = graph.invoke({"messages": initial_messages})
    for message in final_state["messages"]:
        message.pretty_print()
# Test by checking the FINAL ANSWER of the full 20 questions
def test_agent_qa(provider: str = "deepseek"):
question: str = "When was a picture of St. Thomas Aquinas first added to the Wikipedia page on the Principle of double effect?"
gaia_api = QuestionsAPI()
try:
questions = gaia_api.get_questions(level=1, limit=20)
print(f"Retrieved {len(questions)} questions")
# Get a specific question to answer
for index, sub in enumerate(questions):
task_id = sub.get("task_id", "unknown")
question = sub.get("question", "No question provided")
file_name = sub.get("file_name", "")
print(f"Start to answer question {index+1}: {task_id}, {question}")
answer_question(provider=provider, question=question)
except Exception as e:
print(f"Error: {e}")
finally:
gaia_api.close()
# Run the full evaluation only when executed as a script, not on import.
if __name__ == "__main__":
    test_agent_qa("groq")