"""Smoke-test the agent graph on a single question from the scoring API.

Fetches the first question from the scoring service, looks up its expected
answer in the GAIA validation metadata, runs the agent graph on the question,
and prints whether the agent's answer matches (case-insensitive, ignoring
surrounding whitespace).
"""

import os

import pyarrow.parquet as pq
import requests
from dotenv import load_dotenv
from huggingface_hub import hf_hub_download
from langchain_core.messages import HumanMessage

from agent import build_graph

# NOTE(review): .env is loaded after `agent` is imported (same as before);
# if `agent` reads environment variables at import time, confirm that this
# ordering is intended.
load_dotenv(override=True)

DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"

# Upper bound for the HTTP call so a stalled server cannot hang the script.
REQUEST_TIMEOUT_SECONDS = 60

# Initialize agent
graph = build_graph()

# Fetch 1 question
resp = requests.get(
    f"{DEFAULT_API_URL}/questions", timeout=REQUEST_TIMEOUT_SECONDS
)
# Fail loudly on an HTTP error instead of trying to parse an error body.
resp.raise_for_status()
questions = resp.json()[:1]
if not questions:
    raise SystemExit(f"No questions returned by {DEFAULT_API_URL}/questions")

# Load ground truth
token = os.getenv("HF_TOKEN") or os.getenv("HUGGINGFACEHUB_API_TOKEN")
path = hf_hub_download(
    repo_id='gaia-benchmark/GAIA',
    filename='2023/validation/metadata.parquet',
    repo_type='dataset',
    token=token,
)
df = pq.read_table(path).to_pandas()
answer_map = dict(zip(df['task_id'], df['Final answer']))

# Test
q = questions[0]
task_id = q['task_id']
question = q['question']
ground_truth = answer_map.get(task_id, "NOT FOUND")

print(f"Question: {question[:100]}...")
print(f"Ground Truth: {ground_truth}")
print("-" * 40)

result = graph.invoke({"messages": [HumanMessage(content=question)]})
answer = result['messages'][-1].content

print(f"Agent Answer: {answer}")
print("-" * 40)

# Coerce both sides to str before normalizing: the message content is not
# guaranteed to be a plain string, and the ground truth comes from a
# DataFrame column.
is_correct = str(answer).strip().lower() == str(ground_truth).strip().lower()
print(f"Correct: {is_correct}")