import os
import requests
import pandas as pd
import pyarrow.parquet as pq
import json
import time
from langchain_core.messages import HumanMessage
from agent import build_graph
from huggingface_hub import hf_hub_download
from dotenv import load_dotenv
load_dotenv(override=True)
DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
class BasicAgent:
    """Thin callable wrapper around the LangGraph agent from ``build_graph``."""

    def __init__(self):
        print("BasicAgent initialized.")
        self.graph = build_graph()

    def __call__(self, question: str) -> str:
        """Invoke the graph on one question and return the final message text."""
        result = self.graph.invoke({"messages": [HumanMessage(content=question)]})
        return result["messages"][-1].content
def file_extract(local_file_path, task_id):
    """Resolve a GAIA attachment filename to a local path via the HF Hub.

    Tries each dataset split prefix in turn and returns the path of the
    first successful download, or None when the name is empty or no
    prefix matches. ``task_id`` is accepted for caller symmetry but is
    not used here.
    """
    if not local_file_path:
        return None
    hf_token = os.getenv("HUGGINGFACEHUB_API_TOKEN") or os.getenv("HF_TOKEN")
    for split_prefix in ("2023/validation/", "2023/test/", "2023/train/", ""):
        try:
            return hf_hub_download(
                repo_id="gaia-benchmark/GAIA",
                filename=f"{split_prefix}{local_file_path}",
                repo_type="dataset",
                token=hf_token,
            )
        except Exception:
            # File absent under this prefix — fall through to the next one.
            continue
    return None
def main():
    """Fetch GAIA questions, run the agent on each, score against ground
    truth, and save the results to disk."""
    # 1. Fetch questions
    questions_data = _fetch_questions()
    # 2. Load ground truth
    answer_map = _load_ground_truth()
    # 3. Initialize agent
    agent = BasicAgent()
    # 4. Run on all questions (can slice questions_data for testing)
    results = _run_all(agent, questions_data, answer_map)
    # 5. Calculate score
    correct_count = sum(1 for r in results if r["correct"])
    total = len(results)
    score_pct = correct_count / total * 100 if total > 0 else 0
    print("\n" + "="*60)
    print(f"FINAL SCORE: {correct_count}/{total} = {score_pct:.0f}%")
    print("="*60)
    # 6. Save results
    _save_results(results, score_pct, correct_count, total)


def _fetch_questions():
    """GET the question list from the scoring API; raise on HTTP errors."""
    print("Fetching questions...")
    questions_url = f"{DEFAULT_API_URL}/questions"
    response = requests.get(questions_url, timeout=15)
    # Fail fast on a bad status instead of JSON-decoding an error page.
    response.raise_for_status()
    questions_data = response.json()
    print(f"Fetched {len(questions_data)} questions")
    return questions_data


def _load_ground_truth():
    """Download GAIA validation metadata and map task_id -> final answer."""
    print("Loading ground truth...")
    token = os.getenv("HF_TOKEN") or os.getenv("HUGGINGFACEHUB_API_TOKEN")
    path = hf_hub_download(repo_id='gaia-benchmark/GAIA', filename='2023/validation/metadata.parquet', repo_type='dataset', token=token)
    df = pq.read_table(path).to_pandas()
    return dict(zip(df['task_id'], df['Final answer']))


def _run_all(agent, questions_data, answer_map):
    """Run the agent on every question and return per-task result dicts.

    Agent exceptions are recorded as "ERROR: ..." answers so a single
    failure does not abort the whole run.
    """
    results = []
    for i, item in enumerate(questions_data):
        task_id = item.get("task_id")
        question_text = item.get("question")
        file_name = item.get("file_name")
        if not task_id or question_text is None:
            # Malformed entry — skip rather than crash the run.
            continue
        if file_name:
            resolved_path = file_extract(file_name, task_id)
            if resolved_path:
                # Tell the agent where the attachment lives on disk.
                question_text += f"\n\n[Attached File Local Path: {resolved_path}]"
        print(f"\n[{i+1}/{len(questions_data)}] Task: {task_id[:20]}...")
        try:
            answer = agent(question_text)
        except Exception as e:
            answer = f"ERROR: {e}"
        ground_truth = answer_map.get(task_id, "NOT FOUND")
        # Case/whitespace-insensitive exact match, same as before.
        is_correct = str(answer).strip().lower() == str(ground_truth).strip().lower()
        results.append({
            "task_id": task_id,
            "question": item.get("question"),
            "submitted_answer": answer,
            "ground_truth": ground_truth,
            "correct": is_correct
        })
        status = "✅" if is_correct else "❌"
        print(f" {status} Submitted: {str(answer)[:40]}")
        print(f" Ground: {str(ground_truth)[:40]}")
        time.sleep(1.5)  # stay gentle with the model/API rate limits
    return results


def _save_results(results, score_pct, correct_count, total):
    """Write the run summary to gaia_results.json and gaia_results.csv."""
    output = {"score": score_pct, "correct": correct_count, "total": total, "results": results}
    with open("gaia_results.json", "w") as f:
        json.dump(output, f, indent=2)
    pd.DataFrame(results).to_csv("gaia_results.csv", index=False)
    print("Results saved!")


if __name__ == "__main__":
    main()