"""
GAIA Final Challenge agent for the HF AI Agents course.
Uses Claude Haiku 4.5 with tool use.
"""
import os, sys, json, subprocess, tempfile, traceback, base64, mimetypes
sys.stdout.reconfigure(encoding='utf-8')
import requests
import anthropic
# Base URL of the course scoring service; task attachments are fetched from
# f"{API_BASE}/files/{task_id}".
API_BASE = "https://agents-course-unit4-scoring.hf.space"
# Anthropic model id used for every request made by this script.
MODEL = "claude-haiku-4-5"
# Upper bound on agent-loop iterations per question.
MAX_TURNS = 12
# Scratch directory for downloaded attachments; also the cwd of executed
# scripts. The default is the original machine-specific location; set the
# GAIA_WORK_DIR environment variable to run the script anywhere else.
WORK_DIR = os.environ.get("GAIA_WORK_DIR") or "C:/Users/22678/Downloads/test/test/gaia_work"
os.makedirs(WORK_DIR, exist_ok=True)
# No credentials are passed explicitly, so the SDK is expected to pick them
# up from its environment (NOTE(review): confirm ANTHROPIC_API_KEY is set).
client = anthropic.Anthropic()
# ---------- TOOLS ----------
def tool_wikipedia_search(query: str) -> str:
    """Search English Wikipedia and return the top hits as a Markdown-ish list.

    Args:
        query: Free-text search string passed to the MediaWiki search API.

    Returns:
        A multi-line string with one entry per hit (title, page URL, snippet
        with the matched terms wrapped in ``**``), a "No results" message, or
        a string starting with ``"Error: "`` if anything goes wrong. The
        function never raises.
    """
    import html
    from urllib.parse import quote

    try:
        r = requests.get(
            "https://en.wikipedia.org/w/api.php",
            params={
                "action": "query",
                "list": "search",
                "srsearch": query,
                "format": "json",
                "srlimit": 5,
            },
            timeout=20,
            headers={"User-Agent": "gaia-agent/0.1 (course exercise)"},
        )
        # Surface HTTP failures (rate limiting, 5xx) instead of trying to
        # decode an error page as JSON.
        r.raise_for_status()
        results = r.json().get("query", {}).get("search", [])
        if not results:
            return f"No results for '{query}'."
        out = [f"Top {len(results)} Wikipedia hits for '{query}':"]
        for hit in results:
            title = hit["title"]
            # The API highlights matches with <span class="searchmatch">…</span>;
            # turn that markup into **bold** and decode entities such as &quot;.
            # (Replacing the empty string would insert "**" between every
            # character of the snippet.)
            snippet = (
                hit.get("snippet", "")
                .replace('<span class="searchmatch">', "**")
                .replace("</span>", "**")
            )
            snippet = html.unescape(snippet)
            # Percent-encode the title so characters like "?", "#" or "%"
            # do not produce a URL that points somewhere else.
            slug = quote(title.replace(" ", "_"), safe="/:(),")
            url = f"https://en.wikipedia.org/wiki/{slug}"
            out.append(f"\n- **{title}** — {url}\n {snippet}")
        return "\n".join(out)
    except Exception as e:
        return f"Error: {e}"
def tool_fetch_url(url: str, max_chars: int = 8000) -> str:
    """Download ``url`` and return its text, with HTML reduced to visible text.

    HTML handling is used when the Content-Type mentions "html", the URL ends
    in ".html", or the URL contains "wikipedia.org"; any other response is
    returned as its decoded text. Output longer than ``max_chars`` is cut and
    suffixed with a note stating how many characters were dropped. Any failure
    is reported as an "Error fetching ..." string instead of an exception.
    """
    try:
        response = requests.get(
            url,
            timeout=30,
            headers={"User-Agent": "Mozilla/5.0 gaia-agent"},
        )
        content_type = response.headers.get("content-type", "")
        treat_as_html = (
            "html" in content_type
            or url.endswith(".html")
            or "wikipedia.org" in url
        )
        if not treat_as_html:
            body = response.text
        else:
            from bs4 import BeautifulSoup

            soup = BeautifulSoup(response.text, "html.parser")
            # Drop elements that carry no readable page content.
            for tag in soup(["script", "style", "nav", "footer"]):
                tag.decompose()
            stripped_lines = (
                raw.strip() for raw in soup.get_text(separator="\n").splitlines()
            )
            body = "\n".join(ln for ln in stripped_lines if ln)
        overflow = len(body) - max_chars
        if overflow > 0:
            body = body[:max_chars] + f"\n[...truncated {overflow} chars]"
        return body
    except Exception as exc:
        return f"Error fetching {url}: {exc}"
def tool_download_task_file(task_id: str) -> str:
    """Download the file attached to a GAIA task into ``WORK_DIR``.

    Args:
        task_id: Identifier of the task whose attachment should be fetched
            from ``{API_BASE}/files/{task_id}``.

    Returns:
        ``"Downloaded to <path> (<n> bytes)"`` on success, otherwise a string
        starting with ``"Error: "``. For HTTP errors the start of the response
        body is included so the caller can see the server's explanation
        (presumably e.g. "No file path associated ..." — the system prompt
        refers to that message). The function never raises.
    """
    from email.message import Message

    try:
        r = requests.get(f"{API_BASE}/files/{task_id}", timeout=60)
        if not r.ok:
            detail = r.text.strip()[:300]
            return f"Error: HTTP {r.status_code} for task {task_id}: {detail}"

        # Let the stdlib header parser deal with quoting, extra parameters
        # and RFC 2231 ``filename*=`` forms instead of splitting by hand.
        header = Message()
        header["content-disposition"] = r.headers.get("content-disposition", "")
        fname = header.get_filename() or task_id

        # The name comes from the network: keep only its final component so
        # it cannot escape WORK_DIR via "../" or an absolute path.
        fname = os.path.basename(fname.replace("\\", "/")) or os.path.basename(task_id) or "task_file"

        local = os.path.join(WORK_DIR, fname)
        with open(local, "wb") as f:
            f.write(r.content)
        return f"Downloaded to {local} ({len(r.content)} bytes)"
    except Exception as e:
        return f"Error: {e}"
def _run_script(path: str) -> str:
    """Run ``path`` with the current interpreter and format its output."""
    # Force UTF-8 on the child's stdio and decode it the same way, so that
    # non-ASCII output neither crashes the child nor is mis-decoded with the
    # platform's locale codec.
    child_env = {**os.environ, "PYTHONIOENCODING": "utf-8"}
    r = subprocess.run(
        [sys.executable, path],
        capture_output=True,
        text=True,
        encoding="utf-8",
        errors="replace",
        timeout=60,
        cwd=WORK_DIR,
        env=child_env,
    )
    return f"stdout:\n{r.stdout}\n\nstderr:\n{r.stderr}\nreturncode={r.returncode}"


def tool_run_python(code: str, working_file: str = "") -> str:
    """Execute Python code in a subprocess and report stdout/stderr/returncode.

    Args:
        code: Source code to run. Ignored when ``working_file`` names a
            ``.py`` file.
        working_file: Optional path to an existing ``.py`` file to execute
            directly instead of ``code``.

    Returns:
        A string with the captured stdout, stderr and return code, or a
        string starting with ``"Error: "`` on timeout (60 s) or any other
        failure. The subprocess runs with ``WORK_DIR`` as its cwd. The
        function never raises.
    """
    try:
        if working_file and working_file.endswith(".py"):
            return _run_script(working_file)
        # delete=False: the file must be closed before the child opens it
        # (required on Windows); it is removed explicitly below.
        with tempfile.NamedTemporaryFile(
            mode="w", suffix=".py", delete=False, encoding="utf-8"
        ) as f:
            f.write(code)
            tmp = f.name
        try:
            return _run_script(tmp)
        finally:
            os.unlink(tmp)
    except subprocess.TimeoutExpired:
        return "Error: Timed out after 60s"
    except Exception as e:
        return f"Error: {e}\n{traceback.format_exc()}"
def _youtube_video_id(video_url: str) -> str:
    """Extract the video ID from a YouTube URL (or bare ID); "" if none found."""
    import re
    from urllib.parse import parse_qs, urlparse

    text = video_url.strip()
    if re.fullmatch(r"[A-Za-z0-9_-]{11}", text):
        return text  # already a bare video ID
    # Prefix "//" so scheme-less input like "youtu.be/<id>" still gets a host.
    parsed = urlparse(text if "//" in text else f"//{text}")
    from_query = parse_qs(parsed.query).get("v", [""])[0]
    if from_query:
        return from_query
    host = (parsed.hostname or "").lower()
    segments = [part for part in parsed.path.split("/") if part]
    if host.endswith("youtu.be") and segments:
        return segments[0]
    if len(segments) >= 2 and segments[0] in {"shorts", "embed", "live", "v"}:
        return segments[1]
    # Last resort: the original, purely textual extraction.
    if "v=" in text:
        return text.split("v=")[1].split("&")[0]
    return ""


def tool_youtube_transcript(video_url: str) -> str:
    """Fetch the transcript of a YouTube video.

    Args:
        video_url: A watch URL (``...watch?v=<id>``), a ``youtu.be/<id>``,
            ``/shorts/<id>``, ``/embed/<id>`` or ``/live/<id>`` URL, or a
            bare 11-character video ID.

    Returns:
        One line per transcript entry formatted as ``[<start>s] <text>``, or
        a string starting with ``"Error: "`` (unrecognised URL, missing
        library, no transcript available, ...). The function never raises.
    """
    try:
        vid = _youtube_video_id(video_url)
        if not vid:
            return f"Error: could not find a YouTube video ID in {video_url!r}"
        from youtube_transcript_api import YouTubeTranscriptApi

        if hasattr(YouTubeTranscriptApi, "get_transcript"):
            # Older releases: static helper returning a list of dicts.
            transcript = YouTubeTranscriptApi.get_transcript(vid)
        else:
            # Newer releases dropped the static helper in favour of an
            # instance ``fetch``; ``to_raw_data`` yields the same dict shape.
            transcript = YouTubeTranscriptApi().fetch(vid).to_raw_data()
        return "\n".join(f"[{t['start']:.1f}s] {t['text']}" for t in transcript)
    except Exception as e:
        return f"Error: {e}"
# Tool definitions advertised to the model (name, description and JSON schema
# of the arguments). Every entry except `submit_final_answer` has a matching
# handler in TOOL_FNS; `submit_final_answer` is intercepted by the agent loop.
TOOLS = [
    {
        "name": "wikipedia_search",
        "description": "Search English Wikipedia and get top 5 results with snippets and URLs. Use this FIRST for any factual question.",
        "input_schema": {
            "type": "object",
            "properties": {"query": {"type": "string"}},
            "required": ["query"],
        },
    },
    {
        "name": "fetch_url",
        "description": "Fetch a URL (usually a Wikipedia page) and return its cleaned text content.",
        "input_schema": {
            "type": "object",
            "properties": {"url": {"type": "string"}},
            "required": ["url"],
        },
    },
    {
        "name": "download_task_file",
        "description": "Download the file attached to the current GAIA task. Returns the local file path.",
        "input_schema": {
            "type": "object",
            "properties": {"task_id": {"type": "string"}},
            "required": ["task_id"],
        },
    },
    {
        "name": "run_python",
        # The placeholder after "working_file=" had been lost, leaving the
        # model with the meaningless instruction "pass working_file=.".
        "description": "Execute Python code OR run an existing .py file. For .xlsx parsing, use pandas. For .py files just pass working_file=<path to the .py file>.",
        "input_schema": {
            "type": "object",
            "properties": {
                "code": {"type": "string", "description": "Python code to run (ignored if working_file is set)"},
                "working_file": {"type": "string", "description": "Path to a .py file to execute directly"},
            },
            "required": ["code"],
        },
    },
    {
        "name": "youtube_transcript",
        "description": "Fetch transcript of a YouTube video by URL.",
        "input_schema": {
            "type": "object",
            "properties": {"video_url": {"type": "string"}},
            "required": ["video_url"],
        },
    },
    {
        "name": "submit_final_answer",
        "description": "Submit your final answer. The `answer` string will be scored via exact match - no preamble, no explanation. Call this exactly once at the end.",
        "input_schema": {
            "type": "object",
            "properties": {
                "answer": {"type": "string", "description": "The final answer string, formatted exactly as the question requests"},
            },
            "required": ["answer"],
        },
    },
]
# Dispatch table: tool name -> callable that takes the model-supplied argument
# dict and returns the tool's string output. `submit_final_answer` has no
# entry here on purpose; the agent loop handles it itself.
TOOL_FNS = dict(
    wikipedia_search=lambda args: tool_wikipedia_search(args["query"]),
    fetch_url=lambda args: tool_fetch_url(args["url"]),
    download_task_file=lambda args: tool_download_task_file(args["task_id"]),
    run_python=lambda args: tool_run_python(
        args.get("code", ""),
        args.get("working_file", ""),
    ),
    youtube_transcript=lambda args: tool_youtube_transcript(args["video_url"]),
)
# System prompt sent with every request in solve_question. It tells the model
# which tool to prefer per question type, how to format answers for exact-match
# scoring, and that it must finish by calling `submit_final_answer`.
# NOTE(review): the prompt promises "up to 10 tool calls" while MAX_TURNS is
# 12 — confirm whether the difference is intentional slack.
SYSTEM = """You are a research agent solving GAIA benchmark questions for an EXACT-MATCH scoring system.
CRITICAL: You MUST end every task by calling the `submit_final_answer` tool with the clean answer string.
The `answer` argument is what gets scored - no preamble, no explanation, exact format only.
Workflow:
1. For ANY factual / lookup question (people, dates, statistics, geography, articles, history, sports, etc.):
ALWAYS call wikipedia_search FIRST. Do not answer from memory - your memory is often wrong on specifics.
Then call fetch_url on the most relevant Wikipedia URL to read details.
2. For attached file questions: call download_task_file. If it returns "No file path associated",
the file is permanently unavailable - just guess in the right format.
3. For pure reasoning (math, logic, reversed text, group theory): you may answer directly, but use run_python to verify.
4. For YouTube questions: try youtube_transcript with the URL.
Format rules (CRITICAL for exact-match):
- "comma-separated list, alphabetical order" → "apple, banana, cherry" (lowercase, space after comma)
- "first name only" → just one word like "Sarah"
- "IOC country code" → 3 uppercase letters like "USA"
- "USD with two decimal places" → "1234.56" (no $ sign unless asked)
- "just the city name without abbreviations" → "Boston" (full name, no state)
- "last names only, in Roman characters" → "Smith, Jones"
- Numeric → bare number, no unit unless requested
- Never include "FINAL ANSWER:" or quotes
- If you can't determine the answer, still submit your best guess in the correct format
You can use up to 10 tool calls. Then you MUST call submit_final_answer."""
def _load_task_image(task_id: str, file_name: str):
    """Return the task's image attachment as an image content block, or None."""
    if not file_name.lower().endswith((".png", ".jpg", ".jpeg")):
        return None
    tool_download_task_file(task_id)
    # The download helper names the file after the Content-Disposition header
    # and falls back to the bare task_id, so look under both names.
    candidates = (os.path.join(WORK_DIR, file_name), os.path.join(WORK_DIR, task_id))
    local_img = next((p for p in candidates if os.path.isfile(p)), None)
    if local_img is None:
        return None
    with open(local_img, "rb") as f:
        img_data = base64.standard_b64encode(f.read()).decode("utf-8")
    # Guess from file_name: the task_id fallback path has no extension.
    media_type = mimetypes.guess_type(file_name)[0] or "image/png"
    return {"type": "image", "source": {"type": "base64", "media_type": media_type, "data": img_data}}


def _run_tool(block) -> str:
    """Execute one tool_use block and return its (possibly truncated) output."""
    handler = TOOL_FNS.get(block.name)
    if handler is None:
        return f"Tool error: unknown tool {block.name!r}"
    try:
        result = handler(block.input)
    except Exception as e:
        result = f"Tool error: {e}"
    if len(result) > 12000:
        result = result[:12000] + "\n[truncated]"
    return result


def _submitted_answer(block) -> str:
    """Return the `answer` argument of a submit_final_answer call as clean text."""
    raw = block.input.get("answer", "")
    # The schema asks for a string, but tolerate numbers/None from the model.
    return "" if raw is None else str(raw).strip()


def solve_question(q: dict) -> str:
    """Run the agent loop for a single question and return the final answer.

    Args:
        q: Question record with ``task_id`` and ``question`` keys and an
            optional ``file_name`` (may be missing, empty or None).

    Returns:
        The string passed to ``submit_final_answer`` by the model, or
        ``"(no answer)"`` if the model never submits one, even after a final
        forced tool call.

    Raises:
        Whatever ``client.messages.create`` raises inside the main loop;
        only the final forced call is guarded.
    """
    task_id = q["task_id"]
    question = q["question"]
    file_name = q.get("file_name") or ""
    user_content = f"task_id: {task_id}\n\nQuestion:\n{question}"
    if file_name:
        user_content += f"\n\nAttached file: {file_name} (call download_task_file with the task_id above to get it)"

    # Image attachments (e.g. the chess position) are shown to the model
    # directly in the first message.
    image_content = _load_task_image(task_id, file_name)
    if image_content:
        first_content = [image_content, {"type": "text", "text": user_content}]
    else:
        first_content = user_content
    messages = [{"role": "user", "content": first_content}]

    for turn in range(MAX_TURNS):
        resp = client.messages.create(
            model=MODEL,
            max_tokens=4096,
            system=SYSTEM,
            tools=TOOLS,
            messages=messages,
        )
        if resp.stop_reason == "tool_use":
            messages.append({"role": "assistant", "content": resp.content})
            tool_results = []
            for block in resp.content:
                if block.type != "tool_use":
                    continue
                if block.name == "submit_final_answer":
                    final_answer = _submitted_answer(block)
                    print(f" [turn {turn}] >>> submit_final_answer: {final_answer!r}")
                    return final_answer
                print(f" [turn {turn}] tool: {block.name}({json.dumps(block.input)[:120]})")
                tool_results.append({"type": "tool_result", "tool_use_id": block.id, "content": _run_tool(block)})
            messages.append({"role": "user", "content": tool_results})
            continue

        # The model stopped without submitting — ask again.
        print(f" [turn {turn}] end_turn without submit, forcing final answer...")
        # Echo back only the text: an empty assistant message, or a tool_use
        # block left unanswered after a max_tokens cut-off, would make the
        # next request invalid.
        said = "\n".join(b.text for b in resp.content if b.type == "text").strip()
        if said:
            messages.append({"role": "assistant", "content": said})
        messages.append({"role": "user", "content": "You did not call submit_final_answer. Please call it now with your best answer in the exact format requested."})

    # Out of turns: make one last request that can only produce the submit call.
    messages.append({"role": "user", "content": "Max turns reached. Call submit_final_answer NOW with your best guess in the right format."})
    try:
        resp = client.messages.create(
            model=MODEL,
            max_tokens=512,
            system=SYSTEM,
            tools=TOOLS,
            messages=messages,
            tool_choice={"type": "tool", "name": "submit_final_answer"},
        )
        for block in resp.content:
            if block.type == "tool_use" and block.name == "submit_final_answer":
                return _submitted_answer(block)
    except Exception as e:
        # Best effort: fall through to the placeholder, but say why.
        print(f" [forced submit] failed: {e}")
    return "(no answer)"
def extract_clean_answer(question: str, agent_response: str) -> str:
    """Ask the model to reduce an agent response to the bare answer string.

    A blank ``agent_response`` is returned unchanged without calling the API.
    Otherwise the model's text output is stripped of surrounding whitespace
    and of one pair of matching surrounding quotes.

    NOTE(review): EXTRACTOR_SYSTEM is not defined in the visible part of this
    file — confirm it exists before calling this function.
    """
    if agent_response.strip() == "":
        return agent_response

    prompt = (
        f"QUESTION:\n{question}\n\nAGENT'S REASONING:\n{agent_response}\n\n"
        "Now output ONLY the final answer string (no quotes, no preamble):"
    )
    reply = client.messages.create(
        model=MODEL,
        max_tokens=200,
        system=EXTRACTOR_SYSTEM,
        messages=[{"role": "user", "content": prompt}],
    )
    pieces = [blk.text for blk in reply.content if blk.type == "text"]
    cleaned = "".join(pieces).strip()

    # Drop one pair of matching quotes wrapped around the whole answer.
    for quote_char in ('"', "'"):
        if cleaned.startswith(quote_char) and cleaned.endswith(quote_char):
            return cleaned[1:-1]
    return cleaned
def main():
    """Answer the GAIA questions and store the answers as JSON.

    Command-line arguments, if any, restrict the run to the listed questions;
    each may be a task_id, a 1-based index ("3") or a label ("Q3"). Without
    arguments, questions that already have a stored answer are skipped, but
    stored "(error: ...)" placeholders are retried. Answers are written after
    every question so an interrupted run can be resumed.

    The input/output locations default to the original hard-coded paths and
    can be overridden with the GAIA_QUESTIONS_FILE / GAIA_ANSWERS_FILE
    environment variables.
    """
    questions_path = os.environ.get("GAIA_QUESTIONS_FILE") or "C:/Users/22678/Downloads/test/test/gaia_questions.json"
    out_path = os.environ.get("GAIA_ANSWERS_FILE") or "C:/Users/22678/Downloads/test/test/gaia_answers.json"

    with open(questions_path, "r", encoding="utf-8") as f:
        questions = json.load(f)

    only = set(sys.argv[1:])
    results = {}
    if os.path.exists(out_path):
        with open(out_path, "r", encoding="utf-8") as f:
            results = json.load(f)

    def save():
        with open(out_path, "w", encoding="utf-8") as f:
            json.dump(results, f, indent=2, ensure_ascii=False)

    for i, q in enumerate(questions, start=1):
        tid = q["task_id"]
        if only and only.isdisjoint({tid, str(i), f"Q{i}"}):
            continue
        previous = results.get(tid)
        # An "(error: ...)" entry records a failed run, not an answer, so it
        # does not count as "already answered".
        has_answer = tid in results and not str(previous).startswith("(error:")
        if has_answer and not only:
            print(f"Q{i} {tid[:8]} already answered, skipping")
            continue
        print(f"\n{'='*60}\nQ{i} task_id={tid[:8]} file={q.get('file_name','')}\n{'='*60}")
        print(f"Q: {q['question'][:200]}")
        try:
            answer = solve_question(q)
            print(f"\n>>> FINAL: {answer}")
            results[tid] = answer
        except Exception as e:
            print(f"\nERROR: {e}")
            traceback.print_exc()
            results[tid] = f"(error: {e})"
        save()

    # Final write so the answers file exists even if nothing was processed.
    save()
    print(f"\n\nSaved {len(results)} answers to {out_path}")


if __name__ == "__main__":
    main()