hawkdev committed on
Commit
2bf50d9
·
1 Parent(s): 9035818

Groq: smaller requests and retries for free-tier 413/TPM

Browse files

- Lower default tool/context caps; count tool_calls in context budget
- Drop oldest tool rounds when over Groq char budget
- Shorter inlined audio for groq (GAIA_GROQ_AUTO_TRANSCRIPT_CHARS)
- GAIA_GROQ_MAX_TOKENS default 384; more retry sleeps with stronger shrink
- Return empty after exhausted rate/size errors; strip any "Inference error:" answers

Made-with: Cursor

Files changed (3) hide show
  1. README.md +1 -1
  2. agent.py +52 -11
  3. answer_normalize.py +4 -1
README.md CHANGED
@@ -37,7 +37,7 @@ This folder is a **drop-in replacement** for the course Space
37
  - `GAIA_VISION_MODEL` — HF-only default `meta-llama/Llama-3.2-11B-Vision-Instruct`
38
  - `GAIA_API_URL` — default `https://agents-course-unit4-scoring.hf.space`
39
  - `GAIA_USE_CACHE` — `1` (default) or `0` to disable `gaia_answers_cache.json` (set **`0`** once after changing the agent so old wrong answers are not resubmitted).
40
- - **Groq free-tier TPM (413 “request too large”)**: the agent truncates tool outputs and total context. Tune with `GAIA_GROQ_MAX_TOOL_CHARS` (default `3200`), `GAIA_GROQ_CONTEXT_CHARS` (default `26000`), and `GAIA_AUTO_TRANSCRIPT_CHARS` (default `12000` for inlined MP3 transcripts).
41
 
42
  Keep the Space **public** so `agent_code` (`…/tree/main`) verifies for the leaderboard.
43
 
 
37
  - `GAIA_VISION_MODEL` — HF-only default `meta-llama/Llama-3.2-11B-Vision-Instruct`
38
  - `GAIA_API_URL` — default `https://agents-course-unit4-scoring.hf.space`
39
  - `GAIA_USE_CACHE` — `1` (default) or `0` to disable `gaia_answers_cache.json` (set **`0`** once after changing the agent so old wrong answers are not resubmitted).
40
+ - **Groq free-tier TPM / 413 “request too large”**: defaults are conservative (`GAIA_GROQ_MAX_TOOL_CHARS` `1400`, `GAIA_GROQ_CONTEXT_CHARS` `12000`, `GAIA_GROQ_MAX_TOKENS` `384`, `GAIA_AUTO_TRANSCRIPT_CHARS` `8000`, `GAIA_GROQ_AUTO_TRANSCRIPT_CHARS` `3600` for inlined MP3 text). Increase only if you have higher Groq limits. After changing the agent, set `GAIA_USE_CACHE=0` once so cached **Inference error** strings are not resubmitted.
41
 
42
  Keep the Space **public** so `agent_code` (`…/tree/main`) verifies for the leaderboard.
43
 
agent.py CHANGED
@@ -45,18 +45,19 @@ Hard rules:
45
 
46
  def _tool_char_cap(backend: str, *, shrink_pass: int = 0) -> int:
47
  if backend == "groq":
48
- base = int(os.environ.get("GAIA_GROQ_MAX_TOOL_CHARS", "3200"))
 
49
  elif backend == "openai":
50
  base = int(os.environ.get("GAIA_OPENAI_MAX_TOOL_CHARS", "12000"))
51
  else:
52
  base = int(os.environ.get("GAIA_MAX_TOOL_CHARS", "24000"))
53
  if shrink_pass > 0:
54
- base = max(600, base // (2**shrink_pass))
55
  return base
56
 
57
 
58
  def _groq_context_budget() -> int:
59
- return int(os.environ.get("GAIA_GROQ_CONTEXT_CHARS", "26000"))
60
 
61
 
62
  def _maybe_retryable_llm_error(exc: Exception) -> bool:
@@ -86,21 +87,44 @@ def _truncate_tool_messages(
86
  m["content"] = c[:cap] + "\n[truncated]"
87
 
88
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
89
  def _enforce_context_budget(messages: list[dict[str, Any]], backend: str) -> None:
90
  if backend != "groq":
91
  return
92
  budget = _groq_context_budget()
93
- for _ in range(24):
94
- total = sum(len(str(m.get("content") or "")) for m in messages)
95
  if total <= budget:
96
  return
 
 
97
  trimmed = False
98
  for m in messages[2:]:
99
  if m.get("role") != "tool":
100
  continue
101
  c = m.get("content")
102
- if isinstance(c, str) and len(c) > 800:
103
- m["content"] = c[: max(600, len(c) * 2 // 3)] + "\n[truncated]"
104
  trimmed = True
105
  break
106
  if not trimmed:
@@ -159,12 +183,17 @@ class GaiaAgent:
159
  _enforce_context_budget(messages, self.backend)
160
  if self.backend in ("groq", "openai"):
161
  assert self._oa_client is not None
 
 
 
 
 
162
  return chat_complete_openai(
163
  self._oa_client,
164
  model=self.text_model,
165
  messages=messages,
166
  tools=TOOL_DEFINITIONS,
167
- max_tokens=768,
168
  temperature=0.15,
169
  )
170
  client = self._get_hf_client()
@@ -191,7 +220,9 @@ class GaiaAgent:
191
  return normalize_answer("", context_question=question)
192
 
193
  user_text = _build_user_payload(question, attachment_path, task_id)
194
- user_text += _maybe_inline_audio_transcript(attachment_path, self.hf_token)
 
 
195
 
196
  messages: list[dict[str, Any]] = [
197
  {"role": "system", "content": SYSTEM_PROMPT},
@@ -199,7 +230,8 @@ class GaiaAgent:
199
  ]
200
 
201
  last_text = ""
202
- retry_delays = (2.0, 6.0, 14.0)
 
203
 
204
  for _ in range(self.max_iterations):
205
  completion = None
@@ -219,6 +251,8 @@ class GaiaAgent:
219
  continue
220
  if "402" in str(e) or "payment required" in str(e).lower():
221
  return normalize_answer("", context_question=question)
 
 
222
  return normalize_answer(
223
  f"Inference error: {e}", context_question=question
224
  )
@@ -298,6 +332,8 @@ def _build_user_payload(
298
  def _maybe_inline_audio_transcript(
299
  attachment_path: Optional[str],
300
  hf_token: Optional[str],
 
 
301
  ) -> str:
302
  if not attachment_path:
303
  return ""
@@ -310,5 +346,10 @@ def _maybe_inline_audio_transcript(
310
  tx = transcribe_audio(str(p), hf_token=hf_token)
311
  if not tx or tx.lower().startswith(("error", "asr error")):
312
  return f"\n\n[Automatic transcription failed: {tx[:500]}]\n"
313
- cap = int(os.environ.get("GAIA_AUTO_TRANSCRIPT_CHARS", "12000"))
 
 
 
 
 
314
  return f"\n\n[Audio transcript — use for your answer]\n{tx[:cap]}\n"
 
45
 
46
  def _tool_char_cap(backend: str, *, shrink_pass: int = 0) -> int:
47
  if backend == "groq":
48
+ # Free-tier Groq often rejects ~6k TPM per request; keep tool payloads small.
49
+ base = int(os.environ.get("GAIA_GROQ_MAX_TOOL_CHARS", "1400"))
50
  elif backend == "openai":
51
  base = int(os.environ.get("GAIA_OPENAI_MAX_TOOL_CHARS", "12000"))
52
  else:
53
  base = int(os.environ.get("GAIA_MAX_TOOL_CHARS", "24000"))
54
  if shrink_pass > 0:
55
+ base = max(280, base // (2**shrink_pass))
56
  return base
57
 
58
 
59
  def _groq_context_budget() -> int:
60
+ return int(os.environ.get("GAIA_GROQ_CONTEXT_CHARS", "12000"))
61
 
62
 
63
  def _maybe_retryable_llm_error(exc: Exception) -> bool:
 
87
  m["content"] = c[:cap] + "\n[truncated]"
88
 
89
 
90
+ def _groq_message_chars(m: dict[str, Any]) -> int:
91
+ n = len(str(m.get("content") or ""))
92
+ tc = m.get("tool_calls")
93
+ if tc:
94
+ n += len(str(tc))
95
+ return n
96
+
97
+
98
+ def _drop_oldest_tool_round(messages: list[dict[str, Any]]) -> bool:
99
+ """Remove the earliest assistant+tool_calls block and its tool replies."""
100
+ i = 2
101
+ while i < len(messages):
102
+ if messages[i].get("role") == "assistant" and messages[i].get("tool_calls"):
103
+ del messages[i]
104
+ while i < len(messages) and messages[i].get("role") == "tool":
105
+ del messages[i]
106
+ return True
107
+ i += 1
108
+ return False
109
+
110
+
111
  def _enforce_context_budget(messages: list[dict[str, Any]], backend: str) -> None:
112
  if backend != "groq":
113
  return
114
  budget = _groq_context_budget()
115
+ for _ in range(40):
116
+ total = sum(_groq_message_chars(m) for m in messages)
117
  if total <= budget:
118
  return
119
+ if _drop_oldest_tool_round(messages):
120
+ continue
121
  trimmed = False
122
  for m in messages[2:]:
123
  if m.get("role") != "tool":
124
  continue
125
  c = m.get("content")
126
+ if isinstance(c, str) and len(c) > 400:
127
+ m["content"] = c[: max(400, len(c) * 2 // 3)] + "\n[truncated]"
128
  trimmed = True
129
  break
130
  if not trimmed:
 
183
  _enforce_context_budget(messages, self.backend)
184
  if self.backend in ("groq", "openai"):
185
  assert self._oa_client is not None
186
+ mt = (
187
+ int(os.environ.get("GAIA_GROQ_MAX_TOKENS", "384"))
188
+ if self.backend == "groq"
189
+ else int(os.environ.get("GAIA_OPENAI_MAX_TOKENS", "768"))
190
+ )
191
  return chat_complete_openai(
192
  self._oa_client,
193
  model=self.text_model,
194
  messages=messages,
195
  tools=TOOL_DEFINITIONS,
196
+ max_tokens=mt,
197
  temperature=0.15,
198
  )
199
  client = self._get_hf_client()
 
220
  return normalize_answer("", context_question=question)
221
 
222
  user_text = _build_user_payload(question, attachment_path, task_id)
223
+ user_text += _maybe_inline_audio_transcript(
224
+ attachment_path, self.hf_token, backend=self.backend
225
+ )
226
 
227
  messages: list[dict[str, Any]] = [
228
  {"role": "system", "content": SYSTEM_PROMPT},
 
230
  ]
231
 
232
  last_text = ""
233
+ # Extra delays so Groq free-tier TPM / oversized-request errors can retry after shrink.
234
+ retry_delays = (2.0, 4.0, 8.0, 14.0, 22.0)
235
 
236
  for _ in range(self.max_iterations):
237
  completion = None
 
251
  continue
252
  if "402" in str(e) or "payment required" in str(e).lower():
253
  return normalize_answer("", context_question=question)
254
+ if _maybe_retryable_llm_error(e):
255
+ return normalize_answer("", context_question=question)
256
  return normalize_answer(
257
  f"Inference error: {e}", context_question=question
258
  )
 
332
  def _maybe_inline_audio_transcript(
333
  attachment_path: Optional[str],
334
  hf_token: Optional[str],
335
+ *,
336
+ backend: str = "hf",
337
  ) -> str:
338
  if not attachment_path:
339
  return ""
 
346
  tx = transcribe_audio(str(p), hf_token=hf_token)
347
  if not tx or tx.lower().startswith(("error", "asr error")):
348
  return f"\n\n[Automatic transcription failed: {tx[:500]}]\n"
349
+ cap = int(os.environ.get("GAIA_AUTO_TRANSCRIPT_CHARS", "8000"))
350
+ if backend == "groq":
351
+ cap = min(
352
+ cap,
353
+ int(os.environ.get("GAIA_GROQ_AUTO_TRANSCRIPT_CHARS", "3600")),
354
+ )
355
  return f"\n\n[Audio transcript — use for your answer]\n{tx[:cap]}\n"
answer_normalize.py CHANGED
@@ -73,10 +73,13 @@ def normalize_answer(
73
  if not text:
74
  return ""
75
  low = text.lower()
 
 
76
  if (
77
  "hugging face inference credits exhausted" in low
78
  or "inference credits exhausted" in low
79
- or ("inference error:" in low and "402" in text)
 
80
  ):
81
  return ""
82
  if "wikipedia_search:" in low and low.count("wikipedia_search:") >= 4:
 
73
  if not text:
74
  return ""
75
  low = text.lower()
76
+ if low.startswith("inference error:"):
77
+ return ""
78
  if (
79
  "hugging face inference credits exhausted" in low
80
  or "inference credits exhausted" in low
81
+ or "error code: 413" in low
82
+ or ("rate_limit_exceeded" in low and "413" in text)
83
  ):
84
  return ""
85
  if "wikipedia_search:" in low and low.count("wikipedia_search:") >= 4: