File size: 4,019 Bytes
21be703
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
import os
import requests
import pandas as pd
import pyarrow.parquet as pq
import json
import time
from langchain_core.messages import HumanMessage
from agent import build_graph 
from huggingface_hub import hf_hub_download
from dotenv import load_dotenv

# Load variables from a local .env file; override=True lets the .env win
# over values already set in the process environment.
load_dotenv(override=True)

# Base URL of the Agents-course Unit 4 scoring service (question source).
DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"

class BasicAgent:
    """Thin callable wrapper that routes a question through the agent graph."""

    def __init__(self):
        print("BasicAgent initialized.")
        # Build the LangGraph pipeline once; reused for every question.
        self.graph = build_graph()

    def __call__(self, question: str) -> str:
        """Invoke the graph on a single question and return the final reply text."""
        initial_state = {"messages": [HumanMessage(content=question)]}
        outcome = self.graph.invoke(initial_state)
        # The last message in the returned state carries the agent's answer.
        return outcome["messages"][-1].content

def file_extract(local_file_path, task_id):
    """Resolve a GAIA attachment filename to a locally cached file path.

    Each known dataset split prefix is tried in order; the first successful
    hub download wins. Returns None when no filename was given or no prefix
    resolves. (``task_id`` is accepted for call-site parity but is not
    consulted during resolution.)
    """
    if not local_file_path:
        return None

    hf_token = os.getenv("HUGGINGFACEHUB_API_TOKEN") or os.getenv("HF_TOKEN")

    for split_prefix in ("2023/validation/", "2023/test/", "2023/train/", ""):
        try:
            return hf_hub_download(
                repo_id="gaia-benchmark/GAIA",
                filename=f"{split_prefix}{local_file_path}",
                repo_type="dataset",
                token=hf_token,
            )
        except Exception:
            # File not under this split prefix — fall through to the next.
            continue

    return None

def main():
    """Run the agent over all GAIA questions, score against ground truth,
    and persist results to ``gaia_results.json`` / ``gaia_results.csv``.
    """
    # 1. Fetch questions from the scoring service.
    print("Fetching questions...")
    questions_url = f"{DEFAULT_API_URL}/questions"
    response = requests.get(questions_url, timeout=15)
    # Fail fast with a clear HTTPError instead of a confusing JSON decode
    # error when the service returns a non-2xx status.
    response.raise_for_status()
    questions_data = response.json()
    print(f"Fetched {len(questions_data)} questions")

    # 2. Load ground-truth answers from the GAIA validation metadata.
    print("Loading ground truth...")
    token = os.getenv("HF_TOKEN") or os.getenv("HUGGINGFACEHUB_API_TOKEN")
    path = hf_hub_download(
        repo_id='gaia-benchmark/GAIA',
        filename='2023/validation/metadata.parquet',
        repo_type='dataset',
        token=token,
    )
    df = pq.read_table(path).to_pandas()
    answer_map = dict(zip(df['task_id'], df['Final answer']))

    # 3. Initialize the agent once and reuse it for every question.
    agent = BasicAgent()

    # 4. Run on all questions (slice questions_data here for a quick test run).
    results = []
    for i, item in enumerate(questions_data):
        task_id = item.get("task_id")
        question_text = item.get("question")
        file_name = item.get("file_name")

        # Skip malformed entries rather than crashing the run.
        if not task_id or question_text is None:
            continue

        # If the question references an attachment, resolve it locally and
        # surface the path to the agent inside the prompt.
        if file_name:
            resolved_path = file_extract(file_name, task_id)
            if resolved_path:
                question_text += f"\n\n[Attached File Local Path: {resolved_path}]"

        print(f"\n[{i+1}/{len(questions_data)}] Task: {task_id[:20]}...")

        try:
            answer = agent(question_text)
        except Exception as e:
            # Record the failure on this question and keep going.
            answer = f"ERROR: {e}"

        ground_truth = answer_map.get(task_id, "NOT FOUND")
        # Exact-match scoring, normalized only for case and edge whitespace.
        is_correct = str(answer).strip().lower() == str(ground_truth).strip().lower()

        results.append({
            "task_id": task_id,
            "question": item.get("question"),
            "submitted_answer": answer,
            "ground_truth": ground_truth,
            "correct": is_correct,
        })

        status = "✅" if is_correct else "❌"
        print(f"   {status} Submitted: {str(answer)[:40]}")
        print(f"      Ground:   {str(ground_truth)[:40]}")

        # Gentle pacing between questions (avoids hammering rate limits).
        time.sleep(1.5)

    # 5. Calculate the score (guard against an empty run).
    correct_count = sum(1 for r in results if r["correct"])
    total = len(results)
    score_pct = correct_count / total * 100 if total > 0 else 0

    print("\n" + "="*60)
    print(f"FINAL SCORE: {correct_count}/{total} = {score_pct:.0f}%")
    print("="*60)

    # 6. Save results. Explicit UTF-8 + ensure_ascii=False so non-ASCII
    # answers round-trip identically on every platform.
    output = {"score": score_pct, "correct": correct_count, "total": total, "results": results}
    with open("gaia_results.json", "w", encoding="utf-8") as f:
        json.dump(output, f, indent=2, ensure_ascii=False)
    pd.DataFrame(results).to_csv("gaia_results.csv", index=False)
    print("Results saved!")

# Script entry point: run the full evaluation only when executed directly,
# not when imported as a module.
if __name__ == "__main__":
    main()