import requests from datasets import load_dataset from transformers import pipeline # --------------------------- # CONFIG # --------------------------- SCORING_API = "https://agents-course-unit4-scoring.hf.space" MODEL_NAME = "google/flan-t5-base" # --------------------------- # Load model # --------------------------- print("Loading model...") qa = pipeline("text2text-generation", model=MODEL_NAME, max_new_tokens=64) # --------------------------- # Fetch the 20 questions # --------------------------- print("Fetching GAIA questions...") questions = requests.get(f"{SCORING_API}/questions").json() task_ids = [q["task_id"] for q in questions] # --------------------------- # Load GAIA validation dataset # --------------------------- print("Loading GAIA validation set...") dataset = load_dataset( "gaia-benchmark/GAIA", "2023_level1", split="validation" ) # Map task_id → correct answer ground_truth = { item["task_id"]: item["Final answer"] for item in dataset if item["task_id"] in task_ids } # --------------------------- # Evaluate # --------------------------- correct = 0 for q in questions: task_id = q["task_id"] question = q["question"] true_answer = ground_truth.get(task_id, "").strip().lower() model_output = qa(question)[0]["generated_text"].strip().lower() match = model_output == true_answer correct += int(match) print("\n" + "="*80) print(f"QUESTION:\n{question}") print(f"\nEXPECTED:\n{true_answer}") print(f"\nMODEL:\n{model_output}") print(f"\nMATCH: {'✅' if match else '❌'}") print("\n" + "="*80) print(f"FINAL SCORE: {correct}/20")