File size: 1,647 Bytes
eccf8e4
01ce061
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
import requests
from datasets import load_dataset
from transformers import pipeline

# ---------------------------
# CONFIG
# ---------------------------
SCORING_API = "https://agents-course-unit4-scoring.hf.space"  # GAIA scoring service base URL
MODEL_NAME = "google/flan-t5-base"  # small seq2seq model used to answer questions

# ---------------------------
# Load model
# ---------------------------
print("Loading model...")
# text2text-generation pipeline; cap generated answers at 64 new tokens.
qa = pipeline(
    "text2text-generation",
    model=MODEL_NAME,
    max_new_tokens=64,
)

# ---------------------------
# Fetch the 20 questions
# ---------------------------
print("Fetching GAIA questions...")
# Use a timeout so a hung scoring service cannot block the script forever,
# and fail fast on an HTTP error status instead of crashing later with a
# confusing JSON decode error.
response = requests.get(f"{SCORING_API}/questions", timeout=30)
response.raise_for_status()
questions = response.json()

# Task ids of the fetched questions, used to filter the validation split below.
task_ids = [q["task_id"] for q in questions]

# ---------------------------
# Load GAIA validation dataset
# ---------------------------
print("Loading GAIA validation set...")
dataset = load_dataset(
    "gaia-benchmark/GAIA",
    "2023_level1",
    split="validation",
)

# Map task_id -> correct answer, restricted to the fetched questions.
# Build the id set once: O(1) membership per item instead of O(n) per
# item against the task_ids list.
wanted_ids = set(task_ids)
ground_truth = {
    item["task_id"]: item["Final answer"]
    for item in dataset
    if item["task_id"] in wanted_ids
}

# ---------------------------
# Evaluate
# ---------------------------
correct = 0

for q in questions:
    task_id = q["task_id"]
    question = q["question"]
    # Normalize both sides: GAIA scoring is exact-match, so trim whitespace
    # and compare case-insensitively.
    true_answer = ground_truth.get(task_id, "").strip().lower()

    model_output = qa(question)[0]["generated_text"].strip().lower()

    # An empty true_answer means the task_id was missing from the validation
    # split; never count that as a hit, even if the model also emits nothing.
    match = bool(true_answer) and model_output == true_answer
    correct += int(match)

    print("\n" + "="*80)
    print(f"QUESTION:\n{question}")
    print(f"\nEXPECTED:\n{true_answer}")
    print(f"\nMODEL:\n{model_output}")
    print(f"\nMATCH: {'✅' if match else '❌'}")

print("\n" + "="*80)
# Use the actual question count as the denominator rather than a
# hard-coded 20, so the score is correct if the API returns fewer/more.
print(f"FINAL SCORE: {correct}/{len(questions)}")