File size: 1,596 Bytes
b70c4a4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
import os
import requests
import re
from langchain_core.messages import HumanMessage
from agent import build_graph
from huggingface_hub import hf_hub_download
import pyarrow.parquet as pq
from dotenv import load_dotenv

load_dotenv(override=True)

DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"

def extract_answer(content) -> str:
    """Pull the final answer out of a model response.

    Args:
        content: The ``content`` of a chat message. Either a plain string or
            a list of content blocks (bare strings and/or dicts shaped like
            ``{"type": "text", "text": "..."}``). Any other value is
            stringified.

    Returns:
        The text that follows the first ``FINAL ANSWER:`` marker (matched
        case-insensitively) up to the next newline, stripped. If there is no
        marker, the whole text stripped. If no text can be taken from
        ``content``, ``str(content)``.
    """
    if isinstance(content, list):
        # Block-style content: keep only the textual parts and join them so
        # the marker can be found; non-text blocks (tool calls, images, ...)
        # are ignored.
        text_parts = []
        for block in content:
            if isinstance(block, str):
                text_parts.append(block)
            elif (
                isinstance(block, dict)
                and block.get("type") == "text"
                and isinstance(block.get("text"), str)
            ):
                text_parts.append(block["text"])
        if text_parts:
            content = "".join(text_parts)

    if isinstance(content, str):
        match = re.search(r'FINAL ANSWER:\s*(.+?)(?:\n|$)', content, re.IGNORECASE)
        if match:
            return match.group(1).strip()
        return content.strip()
    return str(content)

# Build the agent graph once; it is invoked for each question further down.
graph = build_graph()

# Fetch the question list from the scoring API. A timeout keeps the script
# from hanging forever on an unresponsive server, and raise_for_status()
# surfaces HTTP errors here instead of as a confusing failure later on.
resp = requests.get(f"{DEFAULT_API_URL}/questions", timeout=60)
resp.raise_for_status()
questions = resp.json()

# Download the GAIA validation metadata and build a task_id -> final answer
# lookup. The token comes from the environment (None if neither variable is set).
token = os.getenv("HF_TOKEN") or os.getenv("HUGGINGFACEHUB_API_TOKEN")
path = hf_hub_download(repo_id='gaia-benchmark/GAIA', filename='2023/validation/metadata.parquet', repo_type='dataset', token=token)
df = pq.read_table(path).to_pandas()
answer_map = dict(zip(df['task_id'], df['Final answer']))

# Run the agent on the first 20 questions (fewer if the API returned fewer)
# and print a one-line verdict per question. Slicing instead of indexing
# range(20) avoids an IndexError on a short question list.
for i, q in enumerate(questions[:20], start=1):
    task_id = q['task_id']
    question = q['question']
    ground_truth = answer_map.get(task_id, "NOT FOUND")
    # NOTE(review): file_name is read but not used below — confirm whether
    # the attachment should be made available to the agent.
    file_name = q.get('file_name', '')

    result = graph.invoke({"messages": [HumanMessage(content=question)]})
    answer_raw = result['messages'][-1].content
    answer = extract_answer(answer_raw)

    # Exact match after trimming whitespace and lower-casing both sides.
    is_correct = answer.strip().lower() == str(ground_truth).strip().lower()
    status = "OK" if is_correct else "FAIL"
    print(f"[Q{i:2d}] {status} | GT: {str(ground_truth)[:20]} | Ans: {answer[:20]}")