"""Score a small text2text model against the GAIA questions of the scoring API.

The script runs top to bottom:

1. loads the ``MODEL_NAME`` text2text-generation pipeline,
2. fetches the question list from ``SCORING_API``,
3. loads the GAIA ``2023_level1`` validation split and keeps the reference
   answers for the fetched task ids,
4. asks the model each question, compares the stripped, lower-cased output
   with the reference answer by exact equality, and prints a per-question
   report followed by the final score.
"""

import requests
from datasets import load_dataset
from transformers import pipeline

SCORING_API = "https://agents-course-unit4-scoring.hf.space"
MODEL_NAME = "google/flan-t5-base"

# Seconds to wait for the scoring API; without a timeout a stalled server
# would hang the script forever.
REQUEST_TIMEOUT = 30

print("Loading model...")
qa = pipeline("text2text-generation", model=MODEL_NAME, max_new_tokens=64)

print("Fetching GAIA questions...")
response = requests.get(f"{SCORING_API}/questions", timeout=REQUEST_TIMEOUT)
# Fail loudly on an HTTP error instead of trying to parse an error page as JSON.
response.raise_for_status()
questions = response.json()

# A set keeps the membership test below O(1) per dataset row.
task_ids = {q["task_id"] for q in questions}

print("Loading GAIA validation set...")
dataset = load_dataset(
    "gaia-benchmark/GAIA",
    "2023_level1",
    split="validation",
)

# Reference answers, restricted to the tasks the scoring API actually serves.
ground_truth = {
    item["task_id"]: item["Final answer"]
    for item in dataset
    if item["task_id"] in task_ids
}

correct = 0

for q in questions:
    task_id = q["task_id"]
    question = q["question"]

    # A question may have no entry in the loaded split; remember that so an
    # empty model output is not mistaken for a correct answer.
    has_truth = task_id in ground_truth
    true_answer = ground_truth.get(task_id, "").strip().lower()

    model_output = qa(question)[0]["generated_text"].strip().lower()

    match = has_truth and model_output == true_answer
    correct += int(match)

    expected_display = true_answer if has_truth else "(no ground truth available)"

    print("\n" + "=" * 80)
    print(f"QUESTION:\n{question}")
    print(f"\nEXPECTED:\n{expected_display}")
    print(f"\nMODEL:\n{model_output}")
    print(f"\nMATCH: {'✅' if match else '❌'}")

print("\n" + "=" * 80)
# Report against the number of questions actually fetched, not a fixed 20.
print(f"FINAL SCORE: {correct}/{len(questions)}")