"""Baseline evaluation of google/flan-t5-base on the GAIA Level 1 validation
questions served by the HF Agents Course Unit 4 scoring API."""
import requests
from datasets import load_dataset
from transformers import pipeline
# ---------------------------
# CONFIG
# ---------------------------
# Scoring service that serves the 20 GAIA questions for this exercise.
SCORING_API = "https://agents-course-unit4-scoring.hf.space"
# Small seq2seq baseline model; answers are capped at 64 new tokens.
MODEL_NAME = "google/flan-t5-base"

# ---------------------------
# Load model
# ---------------------------
print("Loading model...")
qa = pipeline("text2text-generation", model=MODEL_NAME, max_new_tokens=64)

# ---------------------------
# Fetch the 20 questions
# ---------------------------
print("Fetching GAIA questions...")
response = requests.get(f"{SCORING_API}/questions")
questions = response.json()
# Collect the task ids so the GAIA split can be filtered down to these items.
task_ids = [item["task_id"] for item in questions]
# ---------------------------
# Load GAIA validation dataset
# ---------------------------
# The reference answers live in the gated GAIA dataset itself (the scoring
# API only serves the questions), so pull the Level 1 validation split.
print("Loading GAIA validation set...")
dataset = load_dataset(
    "gaia-benchmark/GAIA",
    "2023_level1",
    split="validation"
)
# Map task_id -> correct answer, keeping only the tasks the API asked about.
# NOTE(review): "Final answer" is assumed to be the exact column name in the
# GAIA dataset schema — confirm against the dataset card.
ground_truth = {
    item["task_id"]: item["Final answer"]
    for item in dataset
    if item["task_id"] in task_ids
}
# ---------------------------
# Evaluate
# ---------------------------
# Exact-match scoring after normalization: both sides are stripped and
# lower-cased, so the comparison is case/whitespace-insensitive but otherwise
# strict (no fuzzy matching).
correct = 0
for q in questions:
    task_id = q["task_id"]
    question = q["question"]
    # Unknown task ids fall back to "" so a missing ground-truth entry
    # counts as a miss instead of raising KeyError.
    true_answer = ground_truth.get(task_id, "").strip().lower()
    model_output = qa(question)[0]["generated_text"].strip().lower()
    match = model_output == true_answer
    correct += int(match)
    print("\n" + "=" * 80)
    print(f"QUESTION:\n{question}")
    print(f"\nEXPECTED:\n{true_answer}")
    print(f"\nMODEL:\n{model_output}")
    print(f"\nMATCH: {'✅' if match else '❌'}")
print("\n" + "=" * 80)
# Denominator derived from the fetched question list rather than a
# hardcoded 20, so the script stays correct if the API changes.
print(f"FINAL SCORE: {correct}/{len(questions)}")