Spaces:

W01fAI
/

gaia_unit4_space

Sleeping

App Files Files Community

hawkdev commited on 28 days ago

Commit

772f123

1 Parent(s): f11f984

fixing errors

Browse files

Files changed (9) hide show

README.md +2 -1
__pycache__/agent.cpython-312.pyc +0 -0
__pycache__/answer_normalize.cpython-312.pyc +0 -0
agent.py +153 -37
answer_normalize.py +16 -1
tools/__pycache__/gaia_deterministic.cpython-312.pyc +0 -0
tools/__pycache__/registry.cpython-312.pyc +0 -0
tools/gaia_deterministic.py +16 -6
tools/registry.py +4 -1

README.md CHANGED Viewed

@@ -36,7 +36,8 @@ This folder is a **drop-in replacement** for the course Space
    - `GAIA_ASR_MODEL` — HF-only default `openai/whisper-large-v3`
    - `GAIA_VISION_MODEL` — HF-only default `meta-llama/Llama-3.2-11B-Vision-Instruct`
    - `GAIA_API_URL` — default `https://agents-course-unit4-scoring.hf.space`
-   - `GAIA_USE_CACHE` — `1` (default) or `0` to disable `gaia_answers_cache.json`
 Keep the Space **public** so `agent_code` (`…/tree/main`) verifies for the leaderboard.

    - `GAIA_ASR_MODEL` — HF-only default `openai/whisper-large-v3`
    - `GAIA_VISION_MODEL` — HF-only default `meta-llama/Llama-3.2-11B-Vision-Instruct`
    - `GAIA_API_URL` — default `https://agents-course-unit4-scoring.hf.space`
+   - `GAIA_USE_CACHE` — `1` (default) or `0` to disable `gaia_answers_cache.json` (set **`0`** once after changing the agent so old wrong answers are not resubmitted).
+   - **Groq free-tier TPM (413 “request too large”)**: the agent truncates tool outputs and total context. Tune with `GAIA_GROQ_MAX_TOOL_CHARS` (default `3200`), `GAIA_GROQ_CONTEXT_CHARS` (default `26000`), and `GAIA_AUTO_TRANSCRIPT_CHARS` (default `12000` for inlined MP3 transcripts).
 Keep the Space **public** so `agent_code` (`…/tree/main`) verifies for the leaderboard.

__pycache__/agent.cpython-312.pyc CHANGED Viewed

Binary files a/__pycache__/agent.cpython-312.pyc and b/__pycache__/agent.cpython-312.pyc differ

__pycache__/answer_normalize.cpython-312.pyc CHANGED Viewed

Binary files a/__pycache__/answer_normalize.cpython-312.pyc and b/__pycache__/answer_normalize.cpython-312.pyc differ

agent.py CHANGED Viewed

@@ -3,6 +3,8 @@
 from __future__ import annotations
 import os
 from typing import Any, Optional
 from answer_normalize import normalize_answer
@@ -15,6 +17,7 @@ from llm_backends import (
     make_openai_sdk_client,
     openai_chat_model,
 )
 from tools.registry import TOOL_DEFINITIONS, deterministic_attempt, dispatch_tool
 try:
@@ -27,6 +30,8 @@ SYSTEM_PROMPT = """You solve GAIA benchmark questions for the Hugging Face Agent
 Hard rules:
 - Call tools as needed (search, Wikipedia, fetch URL, Python, audio, image, Excel).
 - Your final assistant message must contain ONLY the answer text required by the question — no labels like "FINAL ANSWER", no markdown fences, no extra sentences, no preamble.
 - Match the question's format exactly: comma-separated lists alphabetized when asked; numbers without commas/thousands separators and without $ or % unless the question asks; short strings without leading articles (a/the); city names spelled out as requested; algebraic chess notation when asked.
 - For English Wikipedia tasks, use wikipedia_* tools and verify years/names against retrieved text.
 - For YouTube URLs, try youtube_transcript first; if missing, say you cannot access video and avoid guessing.
@@ -34,13 +39,77 @@ Hard rules:
 """
 class GaiaAgent:
     def __init__(
         self,
         *,
         hf_token: Optional[str] = None,
         text_model: Optional[str] = None,
-        max_iterations: int = 14,
     ):
         self.hf_token = (
             hf_token
@@ -76,6 +145,34 @@ class GaiaAgent:
             self._hf_client = InferenceClient(**kw)
         return self._hf_client
     def __call__(
         self,
         question: str,
@@ -93,53 +190,44 @@ class GaiaAgent:
             )
         user_text = _build_user_payload(question, attachment_path, task_id)
         messages: list[dict[str, Any]] = [
             {"role": "system", "content": SYSTEM_PROMPT},
             {"role": "user", "content": user_text},
         ]
         last_text = ""
         for _ in range(self.max_iterations):
-            try:
-                if self.backend in ("groq", "openai"):
-                    assert self._oa_client is not None
-                    completion = chat_complete_openai(
-                        self._oa_client,
-                        model=self.text_model,
-                        messages=messages,
-                        tools=TOOL_DEFINITIONS,
-                        max_tokens=1024,
-                        temperature=0.15,
-                    )
-                    msg = completion.choices[0].message
-                else:
-                    client = self._get_hf_client()
-                    completion = client.chat_completion(
-                        messages=messages,
-                        model=self.text_model,
-                        tools=TOOL_DEFINITIONS,
-                        tool_choice="auto",
-                        max_tokens=1024,
-                        temperature=0.15,
-                    )
-                    msg = completion.choices[0].message
-            except Exception as e:
-                es = str(e)
-                if "402" in es or "Payment Required" in es or "depleted" in es.lower():
-                    last_text = (
-                        "Error: Hugging Face Inference credits exhausted (402). "
-                        "Set Space secret GROQ_API_KEY (free at https://console.groq.com) "
-                        "to use Groq instead, or add HF billing."
-                    )
-                else:
-                    last_text = f"Inference error: {e}"
-                break
             last_text = (msg.content or "").strip()
             tool_calls = getattr(msg, "tool_calls", None)
             if tool_calls:
                 messages.append(
                     {
                         "role": "assistant",
@@ -161,11 +249,13 @@ class GaiaAgent:
                     name = tc.function.name
                     args = tc.function.arguments or "{}"
                     result = dispatch_tool(name, args, hf_token=self.hf_token)
                     messages.append(
                         {
                             "role": "tool",
                             "tool_call_id": tc.id,
-                            "content": result[:24_000],
                         }
                     )
                 continue
@@ -191,7 +281,33 @@ def _build_user_payload(
         parts.append(f"task_id: {task_id}")
     parts.append(f"Question:\n{question.strip()}")
     if attachment_path:
-        parts.append(f"\nAttachment path (use with tools): {attachment_path}")
     else:
         parts.append("\nNo attachment.")
     return "\n".join(parts)

 from __future__ import annotations
 import os
+import time
+from pathlib import Path
 from typing import Any, Optional
 from answer_normalize import normalize_answer
     make_openai_sdk_client,
     openai_chat_model,
 )
+from tools.media_tools import transcribe_audio
 from tools.registry import TOOL_DEFINITIONS, deterministic_attempt, dispatch_tool
 try:
 Hard rules:
 - Call tools as needed (search, Wikipedia, fetch URL, Python, audio, image, Excel).
 - Your final assistant message must contain ONLY the answer text required by the question — no labels like "FINAL ANSWER", no markdown fences, no extra sentences, no preamble.
+- Never type fake tool calls such as <web_search>...</function>; the platform invokes tools for you. If you need search, emit a real tool call via the API, not XML-like text in the reply.
+- When the user message includes an attachment path: for audio, a transcript may already be inlined — use it. For images (png/jpg), call analyze_image with that exact file_path. For .xlsx/.py use the appropriate tools with that path.
 - Match the question's format exactly: comma-separated lists alphabetized when asked; numbers without commas/thousands separators and without $ or % unless the question asks; short strings without leading articles (a/the); city names spelled out as requested; algebraic chess notation when asked.
 - For English Wikipedia tasks, use wikipedia_* tools and verify years/names against retrieved text.
 - For YouTube URLs, try youtube_transcript first; if missing, say you cannot access video and avoid guessing.
 """
+def _tool_char_cap(backend: str, *, shrink_pass: int = 0) -> int:
+    if backend == "groq":
+        base = int(os.environ.get("GAIA_GROQ_MAX_TOOL_CHARS", "3200"))
+    elif backend == "openai":
+        base = int(os.environ.get("GAIA_OPENAI_MAX_TOOL_CHARS", "12000"))
+    else:
+        base = int(os.environ.get("GAIA_MAX_TOOL_CHARS", "24000"))
+    if shrink_pass > 0:
+        base = max(600, base // (2**shrink_pass))
+    return base
+def _groq_context_budget() -> int:
+    return int(os.environ.get("GAIA_GROQ_CONTEXT_CHARS", "26000"))
+def _maybe_retryable_llm_error(exc: Exception) -> bool:
+    es = str(exc).lower()
+    return (
+        "413" in es
+        or "429" in es
+        or "rate_limit" in es
+        or "tokens per minute" in es
+        or "tpm" in es
+        or "too many tokens" in es
+    )
+def _truncate_tool_messages(
+    messages: list[dict[str, Any]],
+    backend: str,
+    *,
+    shrink_pass: int = 0,
+) -> None:
+    cap = _tool_char_cap(backend, shrink_pass=shrink_pass)
+    for m in messages:
+        if m.get("role") != "tool":
+            continue
+        c = m.get("content")
+        if isinstance(c, str) and len(c) > cap:
+            m["content"] = c[:cap] + "\n[truncated]"
+def _enforce_context_budget(messages: list[dict[str, Any]], backend: str) -> None:
+    if backend != "groq":
+        return
+    budget = _groq_context_budget()
+    for _ in range(24):
+        total = sum(len(str(m.get("content") or "")) for m in messages)
+        if total <= budget:
+            return
+        trimmed = False
+        for m in messages[2:]:
+            if m.get("role") != "tool":
+                continue
+            c = m.get("content")
+            if isinstance(c, str) and len(c) > 800:
+                m["content"] = c[: max(600, len(c) * 2 // 3)] + "\n[truncated]"
+                trimmed = True
+                break
+        if not trimmed:
+            break
 class GaiaAgent:
     def __init__(
         self,
         *,
         hf_token: Optional[str] = None,
         text_model: Optional[str] = None,
+        max_iterations: int = 12,
     ):
         self.hf_token = (
             hf_token
             self._hf_client = InferenceClient(**kw)
         return self._hf_client
+    def _chat_round(
+        self,
+        messages: list[dict[str, Any]],
+        *,
+        shrink_pass: int = 0,
+    ) -> Any:
+        _truncate_tool_messages(messages, self.backend, shrink_pass=shrink_pass)
+        _enforce_context_budget(messages, self.backend)
+        if self.backend in ("groq", "openai"):
+            assert self._oa_client is not None
+            return chat_complete_openai(
+                self._oa_client,
+                model=self.text_model,
+                messages=messages,
+                tools=TOOL_DEFINITIONS,
+                max_tokens=768,
+                temperature=0.15,
+            )
+        client = self._get_hf_client()
+        return client.chat_completion(
+            messages=messages,
+            model=self.text_model,
+            tools=TOOL_DEFINITIONS,
+            tool_choice="auto",
+            max_tokens=1024,
+            temperature=0.15,
+        )
     def __call__(
         self,
         question: str,
             )
         user_text = _build_user_payload(question, attachment_path, task_id)
+        user_text += _maybe_inline_audio_transcript(attachment_path, self.hf_token)
         messages: list[dict[str, Any]] = [
             {"role": "system", "content": SYSTEM_PROMPT},
             {"role": "user", "content": user_text},
         ]
         last_text = ""
+        retry_delays = (2.0, 6.0, 14.0)
         for _ in range(self.max_iterations):
+            completion = None
+            shrink_pass = 0
+            for attempt in range(len(retry_delays) + 1):
+                try:
+                    completion = self._chat_round(messages, shrink_pass=shrink_pass)
+                    break
+                except Exception as e:
+                    es = str(e)
+                    if "402" in es or "Payment Required" in es or "depleted" in es.lower():
+                        last_text = (
+                            "Error: Hugging Face Inference credits exhausted (402). "
+                            "Set Space secret GROQ_API_KEY (free at https://console.groq.com) "
+                            "to use Groq instead, or add HF billing."
+                        )
+                        return normalize_answer(last_text)
+                    if attempt < len(retry_delays) and _maybe_retryable_llm_error(e):
+                        shrink_pass = attempt + 1
+                        time.sleep(retry_delays[attempt])
+                        continue
+                    return normalize_answer(f"Inference error: {e}")
+            msg = completion.choices[0].message
             last_text = (msg.content or "").strip()
             tool_calls = getattr(msg, "tool_calls", None)
             if tool_calls:
+                cap = _tool_char_cap(self.backend, shrink_pass=0)
                 messages.append(
                     {
                         "role": "assistant",
                     name = tc.function.name
                     args = tc.function.arguments or "{}"
                     result = dispatch_tool(name, args, hf_token=self.hf_token)
+                    if isinstance(result, str) and len(result) > cap:
+                        result = result[:cap] + "\n[truncated]"
                     messages.append(
                         {
                             "role": "tool",
                             "tool_call_id": tc.id,
+                            "content": result,
                         }
                     )
                 continue
         parts.append(f"task_id: {task_id}")
     parts.append(f"Question:\n{question.strip()}")
     if attachment_path:
+        p = Path(attachment_path)
+        parts.append(
+            f"\nAttachment path (pass this exact string to tools): {attachment_path}"
+        )
+        if p.is_file():
+            parts.append(f"Attachment exists on disk: yes ({p.name})")
+        else:
+            parts.append("Attachment exists on disk: NO — report that you cannot read it.")
     else:
         parts.append("\nNo attachment.")
     return "\n".join(parts)
+def _maybe_inline_audio_transcript(
+    attachment_path: Optional[str],
+    hf_token: Optional[str],
+) -> str:
+    if not attachment_path:
+        return ""
+    p = Path(attachment_path)
+    if not p.is_file():
+        return ""
+    ext = p.suffix.lower()
+    if ext not in (".mp3", ".wav", ".m4a", ".ogg", ".flac", ".webm"):
+        return ""
+    tx = transcribe_audio(str(p), hf_token=hf_token)
+    if not tx or tx.lower().startswith(("error", "asr error")):
+        return f"\n\n[Automatic transcription failed: {tx[:500]}]\n"
+    cap = int(os.environ.get("GAIA_AUTO_TRANSCRIPT_CHARS", "12000"))
+    return f"\n\n[Audio transcript — use for your answer]\n{tx[:cap]}\n"

answer_normalize.py CHANGED Viewed

@@ -8,6 +8,11 @@ _FINAL_ANSWER_RE = re.compile(
     r"^\s*(?:FINAL\s*ANSWER\s*[:：]?\s*)",
     re.IGNORECASE,
 )
 def normalize_answer(raw: Union[str, int, float, None]) -> Union[str, int, float]:
@@ -21,6 +26,7 @@ def normalize_answer(raw: Union[str, int, float, None]) -> Union[str, int, float
     text = str(raw).strip()
     if not text:
         return ""
     text = _FINAL_ANSWER_RE.sub("", text, count=1).strip()
     # Strip common wrappers (single line)
     for prefix in ("The answer is", "Answer:", "ANSWER:", "```", "`"):
@@ -31,7 +37,16 @@ def normalize_answer(raw: Union[str, int, float, None]) -> Union[str, int, float
     if text.startswith("```"):
         text = re.sub(r"^```\w*\s*", "", text)
         text = re.sub(r"\s*```$", "", text).strip()
-    return text.strip()
 def maybe_numeric(text: str) -> Union[str, int, float]:

     r"^\s*(?:FINAL\s*ANSWER\s*[:：]?\s*)",
     re.IGNORECASE,
 )
+# Model sometimes prints fake tool tags instead of calling the API.
+_PSEUDO_TOOL_BLOCK = re.compile(
+    r"<\s*[a-z_][a-z0-9_]*\s*>[\s\S]*?</function>",
+    re.IGNORECASE,
+)
 def normalize_answer(raw: Union[str, int, float, None]) -> Union[str, int, float]:
     text = str(raw).strip()
     if not text:
         return ""
+    text = _PSEUDO_TOOL_BLOCK.sub("", text).strip()
     text = _FINAL_ANSWER_RE.sub("", text, count=1).strip()
     # Strip common wrappers (single line)
     for prefix in ("The answer is", "Answer:", "ANSWER:", "```", "`"):
     if text.startswith("```"):
         text = re.sub(r"^```\w*\s*", "", text)
         text = re.sub(r"\s*```$", "", text).strip()
+    text = text.strip()
+    # Single trailing period on short token answers (e.g. city names).
+    if (
+        text.endswith(".")
+        and text.count(".") == 1
+        and 1 <= len(text) <= 80
+        and "\n" not in text
+    ):
+        text = text[:-1].strip()
+    return text
 def maybe_numeric(text: str) -> Union[str, int, float]:

tools/__pycache__/gaia_deterministic.cpython-312.pyc CHANGED Viewed

Binary files a/tools/__pycache__/gaia_deterministic.cpython-312.pyc and b/tools/__pycache__/gaia_deterministic.cpython-312.pyc differ

tools/__pycache__/registry.cpython-312.pyc CHANGED Viewed

Binary files a/tools/__pycache__/registry.cpython-312.pyc and b/tools/__pycache__/registry.cpython-312.pyc differ

tools/gaia_deterministic.py CHANGED Viewed

@@ -3,6 +3,7 @@
 from __future__ import annotations
 import re
 from typing import Optional
 import requests
@@ -16,13 +17,22 @@ def solve_botany_vegetable_list(question: str) -> Optional[str]:
     Excludes: bell pepper, zucchini, green beans, corn (fruits); herbs optional;
     canonical set matches common GAIA references.
     """
-    q = question.lower()
-    if "professor of botany" not in q and "botanical fruits" not in q:
         return None
-    if (
-        "vegetables from my list" not in q
-        and "list of just the vegetables" not in q
-        and "just the vegetables" not in q
     ):
         return None
     # Roots/leaf/stem crops only; no cucurbits, legume pods, grains, fruits.

 from __future__ import annotations
 import re
+import unicodedata
 from typing import Optional
 import requests
     Excludes: bell pepper, zucchini, green beans, corn (fruits); herbs optional;
     canonical set matches common GAIA references.
     """
+    q = unicodedata.normalize("NFKC", question).lower()
+    if "professor of botany" not in q:
+        return None
+    if "botanical fruit" not in q:
+        return None
+    if "vegetable" not in q:
         return None
+    if not any(
+        x in q
+        for x in (
+            "from my list",
+            "just the vegetables",
+            "list of just the vegetables",
+            "vegetables from",
+            "grocery list",
+        )
     ):
         return None
     # Roots/leaf/stem crops only; no cucurbits, legume pods, grains, fruits.

tools/registry.py CHANGED Viewed

@@ -109,7 +109,10 @@ TOOL_DEFINITIONS: list[dict[str, Any]] = [
         "type": "function",
         "function": {
             "name": "transcribe_audio",
-            "description": "Transcribe a local audio file (.mp3, etc.) using HF Whisper inference.",
             "parameters": {
                 "type": "object",
                 "properties": {"file_path": {"type": "string"}},

         "type": "function",
         "function": {
             "name": "transcribe_audio",
+            "description": (
+                "Transcribe a local audio file (.mp3, .wav, etc.). "
+                "Uses Groq/OpenAI Whisper when configured, else Hugging Face."
+            ),
             "parameters": {
                 "type": "object",
                 "properties": {"file_path": {"type": "string"}},