| import os |
| import requests |
| import re |
| from langchain_core.messages import HumanMessage |
| from agent import build_graph |
| from huggingface_hub import hf_hub_download |
| import pyarrow.parquet as pq |
| from dotenv import load_dotenv |
|
|
| load_dotenv(override=True) |
|
|
| DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space" |
|
|
def extract_answer(content) -> str:
    """Return the final answer text found in a model message's content.

    Args:
        content: The ``content`` of a chat message. Usually a string, but
            chat models may also return a list of content blocks (plain
            strings and/or ``{"type": "text", "text": ...}`` dicts).

    Returns:
        The text following the first ``FINAL ANSWER:`` marker
        (case-insensitive) up to the end of that line, stripped. If there is
        no marker, the whole text stripped. Content that carries no text at
        all is returned as ``str(content)``.
    """
    if isinstance(content, list):
        # Flatten block-style content into one string so the marker search
        # below sees the actual text instead of the list's repr.
        text_parts = [
            block if isinstance(block, str) else block["text"]
            for block in content
            if isinstance(block, str)
            or (
                isinstance(block, dict)
                and block.get("type") == "text"
                and isinstance(block.get("text"), str)
            )
        ]
        if text_parts:
            content = "".join(text_parts)

    if isinstance(content, str):
        match = re.search(r'FINAL ANSWER:\s*(.+?)(?:\n|$)', content, re.IGNORECASE)
        if match:
            return match.group(1).strip()
        return content.strip()

    # Non-text content (or a list without any text blocks): fall back to repr.
    return str(content)
|
|
# Build the agent graph once; every question below is run through it.
graph = build_graph()

# Fetch the question list from the scoring service. The timeout keeps the
# script from hanging forever on a stalled connection, and raise_for_status()
# stops an HTTP error payload from being treated as the question list.
resp = requests.get(f"{DEFAULT_API_URL}/questions", timeout=30)
resp.raise_for_status()
questions = resp.json()
|
|
# Download the GAIA validation metadata and index the reference answers by
# task id. Either token environment variable may be set; HF_TOKEN wins.
token = os.getenv("HF_TOKEN") or os.getenv("HUGGINGFACEHUB_API_TOKEN")
path = hf_hub_download(
    repo_id='gaia-benchmark/GAIA',
    filename='2023/validation/metadata.parquet',
    repo_type='dataset',
    token=token,
)
df = pq.read_table(path).to_pandas()
answer_map = {
    tid: expected
    for tid, expected in zip(df['task_id'], df['Final answer'])
}
|
|
| |
# Upper bound on how many questions are evaluated in one run.
MAX_QUESTIONS = 20

# Run the agent on each question and print a one-line verdict against the
# reference answer. Slicing (rather than indexing 0..19) avoids an IndexError
# when the service returns fewer than MAX_QUESTIONS questions.
for i, q in enumerate(questions[:MAX_QUESTIONS]):
    task_id = q['task_id']
    question = q['question']
    ground_truth = answer_map.get(task_id, "NOT FOUND")
    # NOTE(review): q may carry a 'file_name' attachment reference; it is not
    # forwarded to the agent here, only the question text is.

    result = graph.invoke({"messages": [HumanMessage(content=question)]})
    answer_raw = result['messages'][-1].content
    answer = extract_answer(answer_raw)

    # Exact match, ignoring case and surrounding whitespace.
    is_correct = answer.strip().lower() == str(ground_truth).strip().lower()
    status = "OK" if is_correct else "FAIL"
    print(f"[Q{i+1:2d}] {status} | GT: {str(ground_truth)[:20]} | Ans: {answer[:20]}")
|
|