hawkdev committed
Commit 088018b · Parent: 2bf50d9

Fix wrong answer/task pairing and refusal garbage in submissions


- Cache entries store qtag+answer; ignore legacy string cache; default GAIA_USE_CACHE=0
- Normalize: strip pseudo <web_search> XML, AGENT ERROR, model refusal phrases
- Broaden botany deterministic triggers; attachment download User-Agent + redirects

Made-with: Cursor

Files changed (4)
  1. README.md +1 -1
  2. answer_normalize.py +35 -1
  3. app.py +55 -10
  4. tools/gaia_deterministic.py +3 -0
README.md CHANGED
@@ -36,7 +36,7 @@ This folder is a **drop-in replacement** for the course Space
 - `GAIA_ASR_MODEL` — HF-only default `openai/whisper-large-v3`
 - `GAIA_VISION_MODEL` — HF-only default `meta-llama/Llama-3.2-11B-Vision-Instruct`
 - `GAIA_API_URL` — default `https://agents-course-unit4-scoring.hf.space`
-- `GAIA_USE_CACHE` — `1` (default) or `0` to disable `gaia_answers_cache.json` (set **`0`** once after changing the agent so old wrong answers are not resubmitted).
+- `GAIA_USE_CACHE` — default **`0`**. Set `1` to reuse `gaia_answers_cache.json` (entries bind **question text + task_id**, so stale cross-pairings are avoided). Delete the cache file on the Space if you ever see answers paired with the wrong tasks.
 - **Groq free-tier TPM / 413 “request too large”**: defaults are conservative (`GAIA_GROQ_MAX_TOOL_CHARS` `1400`, `GAIA_GROQ_CONTEXT_CHARS` `12000`, `GAIA_GROQ_MAX_TOKENS` `384`, `GAIA_AUTO_TRANSCRIPT_CHARS` `8000`, `GAIA_GROQ_AUTO_TRANSCRIPT_CHARS` `3600` for inlined MP3 text). Increase only if you have higher Groq limits. After changing the agent, set `GAIA_USE_CACHE=0` once so cached **Inference error** strings are not resubmitted.
 
 Keep the Space **public** so `agent_code` (`…/tree/main`) verifies for the leaderboard.
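For reference, the flag is parsed as a simple truthy check (the `app.py` diff below inlines it). A minimal sketch of the semantics; `cache_enabled` is a hypothetical helper name used only for illustration:

```python
import os

def cache_enabled() -> bool:
    # Same check app.py applies: anything other than "1"/"true"/"yes"
    # (including unset) leaves the answer cache disabled.
    return os.getenv("GAIA_USE_CACHE", "0").lower() in ("1", "true", "yes")

os.environ["GAIA_USE_CACHE"] = "YES"   # case-insensitive -> enabled
assert cache_enabled()
os.environ["GAIA_USE_CACHE"] = "off"   # unrecognized value -> disabled
assert not cache_enabled()
```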
answer_normalize.py CHANGED
@@ -17,13 +17,44 @@ _TOOL_RESPONSE_BLOCK = re.compile(
     r"<\s*tool_response\s*>[\s\S]*?</\s*tool_response\s*>",
     re.IGNORECASE,
 )
+# Unclosed pseudo tool XML the model prints instead of calling the API.
+_PSEUDO_TOOL_XML = re.compile(
+    r"<\s*(?:web_search|wikipedia_search|fetch_url|python)\b[^>]*>[\s\S]*",
+    re.IGNORECASE,
+)
 
 
 def _strip_tool_markup(text: str) -> str:
     text = _TOOL_RESPONSE_BLOCK.sub("", text).strip()
+    text = _PSEUDO_TOOL_XML.sub("", text).strip()
     return text
 
 
+def _looks_like_model_refusal(text: str) -> bool:
+    t = text.lower()
+    if len(t) < 24:
+        return False
+    return any(
+        x in t
+        for x in (
+            "unfortunately,",
+            "i cannot ",
+            "i can't ",
+            "i was unable",
+            "unable to find",
+            "cannot provide a final",
+            "cannot provide an answer",
+            "could not find",
+            "did not find",
+            "file is not available",
+            "required excel file",
+            "without the attachment",
+            "no attachment was",
+            "not available to me",
+        )
+    )
+
+
 def _contextual_squeeze(text: str, question: Optional[str]) -> str:
     """Use question wording to pull out the exact payload (number, quote, etc.)."""
     if not question or not text:
@@ -73,7 +104,7 @@ def normalize_answer(
     if not text:
         return ""
     low = text.lower()
-    if low.startswith("inference error:"):
+    if low.startswith("inference error:") or low.startswith("agent error:"):
         return ""
     if (
         "hugging face inference credits exhausted" in low
@@ -117,6 +148,9 @@ def normalize_answer(
 
     text = _contextual_squeeze(text, context_question)
 
+    if context_question and _looks_like_model_refusal(text):
+        return ""
+
     if (
         context_question
         and "\n" in text
app.py CHANGED
@@ -18,27 +18,68 @@ def _cache_path() -> Path:
     return Path(__file__).resolve().parent / CACHE_FILENAME
 
 
-def _load_cache() -> dict:
+def _question_cache_tag(question: str) -> str:
+    """Bind cached answers to question text so task_id alone cannot serve stale rows."""
+    s = " ".join(str(question).split())
+    return s[:280]
+
+
+def _load_cache() -> dict[str, dict]:
     p = _cache_path()
     if not p.is_file():
         return {}
     try:
-        return json.loads(p.read_text(encoding="utf-8"))
+        raw = json.loads(p.read_text(encoding="utf-8"))
     except json.JSONDecodeError:
         return {}
+    if not isinstance(raw, dict):
+        return {}
+    out: dict[str, dict] = {}
+    for k, v in raw.items():
+        if not isinstance(k, str):
+            continue
+        if isinstance(v, dict) and isinstance(v.get("a"), str) and isinstance(v.get("qtag"), str):
+            out[k] = v
+    # Legacy format task_id -> plain string (unsafe if questions rotate): ignore.
+    return out
 
 
-def _save_cache(cache: dict) -> None:
+def _save_cache(cache: dict[str, dict]) -> None:
     _cache_path().write_text(json.dumps(cache, indent=2), encoding="utf-8")
 
 
+def _cache_get(cache: dict[str, dict], task_id: str, question_text: str) -> str | None:
+    entry = cache.get(str(task_id))
+    if not entry:
+        return None
+    if entry.get("qtag") != _question_cache_tag(question_text):
+        return None
+    return entry.get("a")
+
+
+def _cache_set(
+    cache: dict[str, dict], task_id: str, question_text: str, answer: str
+) -> None:
+    cache[str(task_id)] = {
+        "qtag": _question_cache_tag(question_text),
+        "a": answer,
+    }
+
+
 def _download_attachment(api_url: str, task_id: str, file_name: str) -> str | None:
     """Save task attachment to a temp file; return path or None."""
     if not file_name or not str(file_name).strip():
         return None
     url = f"{api_url}/files/{task_id}"
     try:
-        r = requests.get(url, timeout=120)
+        r = requests.get(
+            url,
+            timeout=120,
+            allow_redirects=True,
+            headers={
+                "User-Agent": "GAIA-Agent/1.0 (HuggingFace-Space; +https://huggingface.co)"
+            },
+        )
     except requests.RequestException:
         return None
     if r.status_code != 200:
@@ -63,7 +104,7 @@ def _download_attachment(api_url: str, task_id: str, file_name: str) -> str | None:
 
 def run_and_submit_all(profile: gr.OAuthProfile | None):
     space_id = os.getenv("SPACE_ID")
-    use_cache = os.getenv("GAIA_USE_CACHE", "1").lower() in ("1", "true", "yes")
+    use_cache = os.getenv("GAIA_USE_CACHE", "0").lower() in ("1", "true", "yes")
 
     if profile:
         username = f"{profile.username}"
@@ -113,9 +154,10 @@ def run_and_submit_all(profile: gr.OAuthProfile | None):
            continue
 
        cache_key = str(task_id)
-       if use_cache and cache_key in cache:
+       cached_raw = _cache_get(cache, cache_key, str(question_text)) if use_cache else None
+       if cached_raw is not None:
            submitted_answer = normalize_answer(
-               cache[cache_key], context_question=str(question_text)
+               cached_raw, context_question=str(question_text)
            )
            print(f"Cache hit for {task_id}")
        else:
@@ -134,10 +176,13 @@ def run_and_submit_all(profile: gr.OAuthProfile | None):
                    submitted_answer, context_question=str(question_text)
                )
                if use_cache:
-                   cache[cache_key] = (
+                   _cache_set(
+                       cache,
+                       cache_key,
+                       str(question_text),
                        submitted_answer
                        if isinstance(submitted_answer, str)
-                       else str(submitted_answer)
+                       else str(submitted_answer),
                    )
                    _save_cache(cache)
            except Exception as e:
@@ -239,7 +284,7 @@ with gr.Blocks() as demo:
     **Instructions**
 
     1. Duplicate this Space from the course template (or push this repo) and set **Secrets**: `HF_TOKEN` (read access to Inference).
-    2. Optional env vars: `GAIA_TEXT_MODEL`, `GAIA_ASR_MODEL`, `GAIA_VISION_MODEL`, `GAIA_API_URL`, `GAIA_USE_CACHE` (default `1`).
+    2. Optional env vars: `GAIA_TEXT_MODEL`, `GAIA_ASR_MODEL`, `GAIA_VISION_MODEL`, `GAIA_API_URL`, `GAIA_USE_CACHE` (default **`0`** — answers are keyed by `task_id` **and** question text; set `1` only to speed re-runs).
     3. Log in with Hugging Face below (username is used for the leaderboard).
     4. Run **Evaluate & Submit** to answer all questions and post scores.
 
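The `qtag` binding is the heart of the fix: a cached answer is served only when both the `task_id` and the normalized, truncated question text match. A self-contained demo that re-creates the three helpers from the hunk above (the task IDs and questions are invented samples):

```python
def _question_cache_tag(question: str) -> str:
    # Collapse whitespace and truncate so the tag is short and stable.
    s = " ".join(str(question).split())
    return s[:280]

def _cache_set(cache: dict[str, dict], task_id: str, question_text: str, answer: str) -> None:
    cache[str(task_id)] = {"qtag": _question_cache_tag(question_text), "a": answer}

def _cache_get(cache: dict[str, dict], task_id: str, question_text: str) -> str | None:
    entry = cache.get(str(task_id))
    if not entry or entry.get("qtag") != _question_cache_tag(question_text):
        return None
    return entry.get("a")

cache: dict[str, dict] = {}
_cache_set(cache, "t1", "How many  studio albums?", "3")
print(_cache_get(cache, "t1", "How many studio albums?"))  # "3" (whitespace-insensitive)
print(_cache_get(cache, "t1", "What color is the flag?"))  # None (stale row rejected)
```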
 
tools/gaia_deterministic.py CHANGED
@@ -33,10 +33,13 @@ def solve_botany_vegetable_list(question: str) -> Optional[str]:
         x in q
         for x in (
             "from my list",
+            "list i have so far",
             "just the vegetables",
             "list of just the vegetables",
             "vegetables from",
+            "vegetables from my list",
             "grocery list",
+            "fruits and vegetables",
         )
     ):
         return None
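These phrases feed a guard of the form `if not any(x in q for x in (...)): return None`, so the deterministic solver fires only on grocery-list-style questions. A sketch of the guard in isolation; `should_attempt` is a hypothetical name and the sample questions are invented:

```python
_TRIGGERS = (
    "from my list",
    "list i have so far",
    "just the vegetables",
    "list of just the vegetables",
    "vegetables from",
    "vegetables from my list",
    "grocery list",
    "fruits and vegetables",
)

def should_attempt(question: str) -> bool:
    # Lower-case once, then substring-match against the trigger phrases.
    q = question.lower()
    return any(x in q for x in _TRIGGERS)

print(should_attempt("Here is the list I have so far of fruits and veggies"))  # True
print(should_attempt("What is the boiling point of water?"))                   # False
```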