# Standard library
import os

# Third-party
import pandas as pd
import requests
from dotenv import load_dotenv
from huggingface_hub import hf_hub_download

# Load variables from a local .env file (e.g. HF tokens).
# override=True lets the .env values win over already-set process env vars.
load_dotenv(override=True)
|
|
| |
# Scoring-server endpoint that lists the evaluation questions.
QUESTIONS_URL = "https://agents-course-unit4-scoring.hf.space/questions"

print(f"Fetching questions from {QUESTIONS_URL}...")
try:
    # timeout= guards against the request hanging indefinitely;
    # raise_for_status() turns HTTP 4xx/5xx into an exception.
    resp = requests.get(QUESTIONS_URL, timeout=30)
    resp.raise_for_status()
    current_questions = resp.json()
except (requests.RequestException, ValueError) as e:
    # Network/HTTP failure or malformed JSON body: fall back to an empty
    # question list so the rest of the script can still run.
    print(f"Error fetching questions: {e}")
    current_questions = []
|
|
| |
# --- Look up ground-truth answers for the fetched questions in the GAIA
# --- validation split and dump them to CSV. Best-effort: any failure is
# --- reported, not fatal.
print("Downloading GAIA validation metadata...")
try:
    # Either env-var name works; GAIA is a gated dataset, so a token is
    # normally required — hf_hub_download will fail cleanly without one.
    token = os.getenv("HUGGINGFACEHUB_API_TOKEN") or os.getenv("HF_TOKEN")
    file_path = hf_hub_download(
        repo_id="gaia-benchmark/GAIA",
        filename="2023/validation/metadata.parquet",
        repo_type="dataset",
        token=token,
    )

    # Requires a parquet engine (pyarrow or fastparquet) to be installed.
    df = pd.read_parquet(file_path)

    print("\n--- GAIA GROUND TRUTH ANSWERS ---")
    results = []
    for i, q in enumerate(current_questions):
        # Default to "" so a missing/None task_id can't crash the slice below.
        tid = q.get("task_id") or ""
        match = df[df["task_id"] == tid]
        if not match.empty:
            answer = match.iloc[0]["Final answer"]
            # None-safe question text; only append an ellipsis when the
            # text is actually truncated (original unconditionally added "...").
            question_text = q.get("question") or ""
            if len(question_text) > 100:
                preview = question_text[:100] + "..."
            else:
                preview = question_text
            results.append({
                "Index": i + 1,
                "Task ID": tid,
                "Question": preview,
                "Answer": answer,
            })
            print(f"{i+1}. [ID: {tid[:8]}] Answer: {answer}")
        else:
            print(f"{i+1}. [ID: {tid[:8]}] NOT FOUND in validation set.")

    if results:
        res_df = pd.DataFrame(results)
        res_df.to_csv("gaia_ground_truth.csv", index=False)
        print("\nFull list saved to 'gaia_ground_truth.csv'")

except Exception as e:
    # Top-level boundary for this best-effort script: report and exit the
    # block rather than crash (common cause: missing parquet engine).
    print(f"Error processing Parquet: {e}")
    print("Tip: You might need 'pip install pyarrow' to read parquet files.")
|
|