"""GAIA benchmark evaluation runner.

Defines a Claude-backed tool-use agent (web search, webpage fetch, Python
execution, file-to-text conversion) and a Gradio UI that fetches the
benchmark questions, runs the agent on each one, and submits the answers.
"""

import base64
import contextlib
import io
import json
import os
import re
import sys
import tempfile
from html.parser import HTMLParser
from io import StringIO
from pathlib import Path

import anthropic
import gradio as gr
import pandas as pd
import requests

# --- Constants ---
DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"


# ============================================================
# Tool Implementations
# ============================================================

def web_search(query: str) -> str:
    """Search the web using DuckDuckGo (no API key needed).

    Returns up to six results formatted as title / URL / snippet blocks, the
    string "No results found." when the search is empty, or a
    "Search error: ..." string if anything fails (including the optional
    ``duckduckgo_search`` package not being installed).
    """
    try:
        # Imported lazily so the module loads without the optional package.
        from duckduckgo_search import DDGS

        with DDGS() as ddgs:
            results = list(ddgs.text(query, max_results=6))
        if not results:
            return "No results found."
        return "\n\n".join(
            f"Title: {r['title']}\nURL: {r['href']}\nSnippet: {r['body']}"
            for r in results
        )
    except Exception as e:
        # Tool boundary: the error text is handed back to the model.
        return f"Search error: {e}"


class _FallbackTextExtractor(HTMLParser):
    """Collect text nodes of an HTML document, skipping script/style content.

    Used by ``visit_webpage`` only when BeautifulSoup is not installed.
    """

    _SKIPPED_TAGS = ("script", "style")

    def __init__(self):
        super().__init__()
        self.parts: list[str] = []
        self._skip = False

    def handle_starttag(self, tag, attrs):
        if tag in self._SKIPPED_TAGS:
            self._skip = True

    def handle_endtag(self, tag):
        if tag in self._SKIPPED_TAGS:
            self._skip = False

    def handle_data(self, data):
        if not self._skip:
            self.parts.append(data)


def visit_webpage(url: str) -> str:
    """Fetch and return the text content of a webpage.

    The page is downloaded with a 15 second timeout, stripped of markup
    (BeautifulSoup when available, a minimal ``HTMLParser`` otherwise),
    whitespace-normalised, and truncated to 8000 characters. Any failure is
    reported as a "Failed to fetch ..." string rather than raised.
    """
    try:
        headers = {"User-Agent": "Mozilla/5.0 (compatible; GAIABot/1.0)"}
        resp = requests.get(url, headers=headers, timeout=15)
        resp.raise_for_status()

        try:
            from bs4 import BeautifulSoup
        except ImportError:
            extractor = _FallbackTextExtractor()
            extractor.feed(resp.text)
            text = " ".join(extractor.parts)
        else:
            soup = BeautifulSoup(resp.text, "html.parser")
            # Drop page chrome and non-content elements before extracting text.
            for tag in soup(["script", "style", "nav", "footer", "header"]):
                tag.decompose()
            text = soup.get_text(separator=" ", strip=True)

        text = re.sub(r"\s+", " ", text).strip()
        return text[:8000]
    except Exception as e:
        # Tool boundary: the error text is handed back to the model.
        return f"Failed to fetch {url}: {e}"


def run_python(code: str) -> str:
    """Execute Python code in a fresh namespace and return what it printed.

    stdout and stderr are captured for the duration of the call. The result
    is the captured stdout, followed by a "[stderr]: ..." section if anything
    was written to stderr, followed by "ExceptionType: message" if the code
    raised. When nothing was produced at all the placeholder
    "(executed — no output)" is returned.
    """
    buf_out, buf_err = StringIO(), StringIO()
    failure = ""

    # NOTE(security): this is NOT a sandbox. The code runs in-process with
    # full builtins (imports, file and network access). It executes
    # model-generated code; only deploy where that is acceptable.
    namespace = {"pd": pd, "__builtins__": __builtins__}
    try:
        with contextlib.redirect_stdout(buf_out), contextlib.redirect_stderr(buf_err):
            exec(code, namespace)
    except (Exception, SystemExit) as exc:
        # SystemExit is caught explicitly: it is not an Exception subclass,
        # so a stray exit()/sys.exit() in the executed code would otherwise
        # propagate past every `except Exception` in the callers.
        failure = f"{type(exc).__name__}: {exc}"

    out = buf_out.getvalue()
    err = buf_err.getvalue()
    if err:
        out += f"\n[stderr]: {err}"
    if failure:
        # Keep whatever was printed before the failure; it is often the
        # most useful debugging information for the model.
        out += f"\n{failure}"
    return out.strip() or "(executed — no output)"


def read_file_as_text(file_bytes: bytes, file_name: str) -> str:
    """Convert various file types to a text representation.

    The file type is chosen from the extension of ``file_name``:

    - plain-text formats: decoded as UTF-8, truncated to 6000 characters
    - ``.csv``: parsed with pandas and rendered with at most 50 rows
    - ``.xlsx`` / ``.xls``: every sheet rendered (50 rows each), 6000 chars total
    - ``.pdf``: text extracted with ``pypdf`` if installed, 6000 chars
    - audio: a placeholder noting that transcription is unavailable
    - anything else: decoded as UTF-8 (4000 chars) if it is valid UTF-8,
      otherwise a "[Binary file ...]" placeholder

    Errors are reported as an "Error reading file ..." string, never raised.
    """
    ext = Path(file_name).suffix.lower()
    try:
        if ext in (".txt", ".py", ".md", ".json", ".xml", ".html", ".css", ".js"):
            return file_bytes.decode("utf-8", errors="replace")[:6000]

        if ext == ".csv":
            df = pd.read_csv(StringIO(file_bytes.decode("utf-8", errors="replace")))
            return df.to_string(max_rows=50)

        if ext in (".xlsx", ".xls"):
            # sheet_name=None makes pandas return a {sheet name: DataFrame} dict.
            sheets = pd.read_excel(io.BytesIO(file_bytes), sheet_name=None)
            parts = [
                f"=== Sheet: {sheet} ===\n{frame.to_string(max_rows=50)}"
                for sheet, frame in sheets.items()
            ]
            return "\n\n".join(parts)[:6000]

        if ext == ".pdf":
            try:
                import pypdf
            except ImportError:
                return "[PDF reading requires pypdf — install with: pip install pypdf]"
            reader = pypdf.PdfReader(io.BytesIO(file_bytes))
            # Guard against a page yielding None instead of a string, which
            # would make str.join raise.
            return "\n".join(p.extract_text() or "" for p in reader.pages)[:6000]

        if ext in (".mp3", ".wav", ".m4a", ".flac"):
            return (
                f"[Audio file: {file_name}, {len(file_bytes):,} bytes — "
                "transcription not available without Whisper API]"
            )

        # Unknown extension: treat as text only if it really is valid UTF-8.
        # Decoding with errors="replace" can never fail, which previously made
        # the binary placeholder unreachable and fed garbage to the model.
        try:
            return file_bytes.decode("utf-8")[:4000]
        except UnicodeDecodeError:
            return f"[Binary file: {file_name}, {len(file_bytes):,} bytes]"
    except Exception as e:
        # Tool boundary: the error text is embedded in the prompt instead.
        return f"Error reading file {file_name}: {e}"


# ============================================================
# Tool Schema (for Anthropic tool_use)
# ============================================================

TOOLS = [
    {
        "name": "web_search",
        "description": (
            "Search the web for current information, facts, Wikipedia content, "
            "news, etc. Returns titles, URLs, and snippets."
        ),
        "input_schema": {
            "type": "object",
            "properties": {
                "query": {"type": "string", "description": "The search query"}
            },
            "required": ["query"],
        },
    },
    {
        "name": "visit_webpage",
        "description": (
            "Fetch the full text of a specific webpage. Use when you need more "
            "detail than a search snippet, e.g. to read a Wikipedia article."
        ),
        "input_schema": {
            "type": "object",
            "properties": {
                "url": {"type": "string", "description": "Full URL to fetch"}
            },
            "required": ["url"],
        },
    },
    {
        "name": "run_python",
        "description": (
            "Execute Python code. Great for arithmetic, counting, sorting, "
            "string manipulation, or processing data. Use print() for output. "
            "pandas (as pd) is pre-imported."
        ),
        "input_schema": {
            "type": "object",
            "properties": {
                "code": {
                    "type": "string",
                    "description": "Python code to run. Always use print() to show results.",
                }
            },
            "required": ["code"],
        },
    },
]

SYSTEM_PROMPT = """You are an expert research assistant solving GAIA benchmark questions.
These are real-world questions requiring careful research and precise answers.

Strategy:
- Use web_search to find facts; follow up with visit_webpage for detail
- Use run_python for any calculation, counting, sorting, or data manipulation
- For files provided in the question, analyse them carefully
- Cross-check facts when accuracy is critical

Answer format (VERY IMPORTANT):
- Provide ONLY the final answer — no preamble, no explanation
- Give exactly what is asked: a number, a name, a date, a word, a short phrase
- Numbers: digits only, unless units are part of the question's expected format
- Lists: comma-separated values unless another format is specified
- Yes/No questions: just "Yes" or "No"

Think step by step, then output your final concise answer."""


# ============================================================
# Agent
# ============================================================

class GAIAAgent:
    """Agentic loop backed by Claude with tool use."""

    # Upper bound on model round-trips per question.
    MAX_ITERATIONS = 15

    # File extensions sent to the model as images, with their media types.
    _IMAGE_MEDIA_TYPES = {
        ".jpg": "image/jpeg",
        ".jpeg": "image/jpeg",
        ".png": "image/png",
        ".gif": "image/gif",
        ".webp": "image/webp",
    }

    def __init__(self):
        """Create the Anthropic client.

        Raises:
            EnvironmentError: if ``ANTHROPIC_API_KEY`` is not set.
        """
        api_key = os.getenv("ANTHROPIC_API_KEY")
        if not api_key:
            raise EnvironmentError("ANTHROPIC_API_KEY environment variable not set.")
        self.client = anthropic.Anthropic(api_key=api_key)
        self.model = "claude-sonnet-4-20250514"
        print(f"GAIAAgent initialised (model: {self.model})")

    # ---- internal helpers ----

    def _dispatch_tool(self, name: str, inputs: dict) -> str:
        """Run the tool called ``name`` with the model-supplied ``inputs``.

        Always returns a string suitable for a ``tool_result`` block. An
        unknown tool name or a missing required input is reported in the
        returned text, so the model can correct itself instead of the whole
        question being aborted by an exception.
        """
        try:
            if name == "web_search":
                return web_search(inputs["query"])
            if name == "visit_webpage":
                return visit_webpage(inputs["url"])
            if name == "run_python":
                return run_python(inputs["code"])
        except KeyError as missing:
            return f"[tool error: {name} is missing required input {missing}]"
        return f"[unknown tool: {name}]"

    def _build_initial_content(
        self, question: str, file_bytes: bytes | None, file_name: str | None
    ) -> list:
        """Return the content list for the first user message.

        Image attachments are sent as a base64 image block followed by the
        question; any other attachment is converted with
        ``read_file_as_text`` and inlined ahead of the question. Without an
        attachment (or without its bytes) only the question is sent.
        """
        if not (file_bytes and file_name):
            return [{"type": "text", "text": question}]

        ext = Path(file_name).suffix.lower()
        media_type = self._IMAGE_MEDIA_TYPES.get(ext)
        if media_type is not None:
            return [
                {
                    "type": "image",
                    "source": {
                        "type": "base64",
                        "media_type": media_type,
                        "data": base64.b64encode(file_bytes).decode(),
                    },
                },
                {
                    "type": "text",
                    "text": f"The image above is the attached file '{file_name}'.\n\n{question}",
                },
            ]

        file_text = read_file_as_text(file_bytes, file_name)
        return [
            {
                "type": "text",
                "text": (
                    f"A file named '{file_name}' is attached. Its contents:\n\n"
                    f"{file_text}\n\n---\n\nQuestion: {question}"
                ),
            }
        ]

    # ---- public interface ----

    def solve(
        self,
        question: str,
        file_bytes: bytes | None = None,
        file_name: str | None = None,
    ) -> str:
        """Answer ``question``, optionally using an attached file.

        Runs up to ``MAX_ITERATIONS`` model calls. After each call:

        - ``end_turn``: the first text block is returned as the answer
          ("No answer generated." if there is none);
        - ``tool_use``: every requested tool is executed and the results are
          appended to the conversation for the next call;
        - anything else: the loop stops.

        Returns the answer text, or a fixed fallback message when the loop
        ends without one.
        """
        print(f"\n[Agent] Question: {question[:120]}{'...' if len(question)>120 else ''}")

        messages = [
            {
                "role": "user",
                "content": self._build_initial_content(question, file_bytes, file_name),
            }
        ]

        for _ in range(self.MAX_ITERATIONS):
            response = self.client.messages.create(
                model=self.model,
                max_tokens=4096,
                system=SYSTEM_PROMPT,
                tools=TOOLS,
                messages=messages,
            )

            if response.stop_reason == "end_turn":
                for block in response.content:
                    if hasattr(block, "text"):
                        answer = block.text.strip()
                        print(f"[Agent] Answer: {answer[:100]}")
                        return answer
                return "No answer generated."

            if response.stop_reason != "tool_use":
                # Unexpected stop reason
                print(f"[Agent] Unexpected stop_reason: {response.stop_reason}")
                break

            tool_results = []
            for block in response.content:
                if block.type != "tool_use":
                    continue
                print(f" [Tool] {block.name}({json.dumps(block.input)[:80]})")
                result = self._dispatch_tool(block.name, block.input)
                print(f" [Tool] → {result[:120]}")
                tool_results.append({
                    "type": "tool_result",
                    "tool_use_id": block.id,
                    "content": result,
                })

            # The assistant turn must be echoed back before its tool results.
            messages.append({"role": "assistant", "content": response.content})
            messages.append({"role": "user", "content": tool_results})

        return "Could not determine answer within iteration limit."

    def __call__(self, question: str) -> str:
        """Compatibility shim for the template's agent(question) calls."""
        return self.solve(question)


# ============================================================
# Evaluation runner
# ============================================================

def _download_task_file(api_url: str, task_id: str, file_name: str) -> bytes | None:
    """Download the attachment for ``task_id``; return None if that fails."""
    try:
        file_resp = requests.get(f"{api_url}/files/{task_id}", timeout=30)
        file_resp.raise_for_status()
    except Exception as e:
        # Best effort: the agent still attempts the question without the file.
        print(f" Could not download file for task {task_id}: {e}")
        return None
    file_bytes = file_resp.content
    print(f" Downloaded '{file_name}' ({len(file_bytes):,} bytes)")
    return file_bytes


def run_and_submit_all(profile: gr.OAuthProfile | None):
    """Fetch questions, run the agent, submit answers, display results.

    Args:
        profile: the logged-in Hugging Face profile, or None when the user
            is not logged in.

    Returns:
        A ``(status message, results)`` pair for the two Gradio outputs.
        ``results`` is a DataFrame of the per-question log, or None when the
        run stopped before any question was attempted.
    """
    space_id = os.getenv("SPACE_ID")

    if not profile:
        return "Please log in to Hugging Face first.", None
    username = profile.username
    print(f"Logged in as: {username}")

    api_url = DEFAULT_API_URL
    questions_url = f"{api_url}/questions"
    submit_url = f"{api_url}/submit"

    # 1. Build agent
    try:
        agent = GAIAAgent()
    except Exception as e:
        return f"Error initialising agent: {e}", None

    agent_code = f"https://huggingface.co/spaces/{space_id}/tree/main" if space_id else "unknown"

    # 2. Fetch questions
    print(f"Fetching questions from {questions_url} …")
    try:
        resp = requests.get(questions_url, timeout=15)
        resp.raise_for_status()
        questions_data = resp.json()
        if not questions_data:
            return "Questions list is empty.", None
        print(f"Fetched {len(questions_data)} questions.")
    except Exception as e:
        return f"Error fetching questions: {e}", None

    # 3. Run agent on each question
    results_log = []
    answers_payload = []

    for item in questions_data:
        task_id = item.get("task_id")
        question_text = item.get("question")
        file_name = item.get("file_name", "")

        if not task_id or question_text is None:
            print(f"Skipping malformed item: {item}")
            continue

        # Download attached file if present
        file_bytes = _download_task_file(api_url, task_id, file_name) if file_name else None

        try:
            submitted_answer = agent.solve(question_text, file_bytes, file_name)
        except Exception as e:
            # One failing question must not abort the whole evaluation run.
            submitted_answer = f"AGENT ERROR: {e}"
            print(f" Agent error on {task_id}: {e}")

        answers_payload.append({"task_id": task_id, "submitted_answer": submitted_answer})
        results_log.append({
            "Task ID": task_id,
            "Question": question_text[:120],
            "File": file_name or "—",
            "Submitted Answer": submitted_answer,
        })

    if not answers_payload:
        return "Agent produced no answers.", pd.DataFrame(results_log)

    # 4. Submit
    submission = {
        "username": username.strip(),
        "agent_code": agent_code,
        "answers": answers_payload,
    }
    print(f"Submitting {len(answers_payload)} answers …")
    try:
        resp = requests.post(submit_url, json=submission, timeout=120)
        resp.raise_for_status()
        result = resp.json()
        status = (
            f"Submission Successful!\n"
            f"User: {result.get('username')}\n"
            f"Score: {result.get('score', 'N/A')}% "
            f"({result.get('correct_count', '?')}/{result.get('total_attempted', '?')} correct)\n"
            f"Message: {result.get('message', '')}"
        )
    except requests.exceptions.HTTPError as e:
        # Prefer the server's JSON "detail" field; fall back to the raw body.
        try:
            detail = e.response.json().get("detail", e.response.text)
        except Exception:
            detail = e.response.text[:500]
        status = f"Submission failed (HTTP {e.response.status_code}): {detail}"
    except Exception as e:
        status = f"Submission error: {e}"

    print(status)
    return status, pd.DataFrame(results_log)


# ============================================================
# Gradio UI
# ============================================================

with gr.Blocks() as demo:
    gr.Markdown("# GAIA Agent Evaluation Runner")
    gr.Markdown(
        """
        **Setup:**
        1. Set `ANTHROPIC_API_KEY` as a Space secret.
        2. Log in with your Hugging Face account below.
        3. Click **Run Evaluation** to fetch questions, run the agent, and submit.

        The agent uses Claude with web search, code execution, and file analysis.
        """
    )
    gr.LoginButton()
    run_btn = gr.Button("Run Evaluation & Submit All Answers", variant="primary")
    status_box = gr.Textbox(label="Status / Result", lines=6, interactive=False)
    results_table = gr.DataFrame(label="Questions & Answers", wrap=True)
    run_btn.click(fn=run_and_submit_all, outputs=[status_box, results_table])


if __name__ == "__main__":
    print("\n" + "=" * 60)
    space_host = os.getenv("SPACE_HOST")
    space_id = os.getenv("SPACE_ID")
    if space_host:
        print(f"SPACE_HOST : {space_host}")
    if space_id:
        print(f"SPACE_ID : {space_id}")
    if not os.getenv("ANTHROPIC_API_KEY"):
        print("⚠️ ANTHROPIC_API_KEY is NOT set — agent will fail.")
    else:
        print("✅ ANTHROPIC_API_KEY found.")
    print("=" * 60 + "\n")
    demo.launch(debug=True, share=False)