hawkdev committed
Commit 088018b · Parent: 2bf50d9

Fix wrong answer/task pairing and refusal garbage in submissions


- Cache entries store qtag+answer; ignore legacy string cache; default GAIA_USE_CACHE=0
- Normalize: strip pseudo <web_search> XML, AGENT ERROR, model refusal phrases
- Broaden botany deterministic triggers; attachment download User-Agent + redirects

Made-with: Cursor

Files changed (4)
  1. README.md +1 -1
  2. answer_normalize.py +35 -1
  3. app.py +55 -10
  4. tools/gaia_deterministic.py +3 -0
README.md CHANGED
@@ -36,7 +36,7 @@ This folder is a **drop-in replacement** for the course Space
 - `GAIA_ASR_MODEL` — HF-only default `openai/whisper-large-v3`
 - `GAIA_VISION_MODEL` — HF-only default `meta-llama/Llama-3.2-11B-Vision-Instruct`
 - `GAIA_API_URL` — default `https://agents-course-unit4-scoring.hf.space`
-- `GAIA_USE_CACHE` — `1` (default) or `0` to disable `gaia_answers_cache.json` (set **`0`** once after changing the agent so old wrong answers are not resubmitted).
+- `GAIA_USE_CACHE` — default **`0`**. Set `1` to reuse `gaia_answers_cache.json` (entries bind **question text + task_id**, so stale cross-pairings are avoided). Delete the cache file on the Space if you ever see answers paired with the wrong tasks.
 - **Groq free-tier TPM / 413 “request too large”**: defaults are conservative (`GAIA_GROQ_MAX_TOOL_CHARS` `1400`, `GAIA_GROQ_CONTEXT_CHARS` `12000`, `GAIA_GROQ_MAX_TOKENS` `384`, `GAIA_AUTO_TRANSCRIPT_CHARS` `8000`, `GAIA_GROQ_AUTO_TRANSCRIPT_CHARS` `3600` for inlined MP3 text). Increase only if you have higher Groq limits. After changing the agent, set `GAIA_USE_CACHE=0` once so cached **Inference error** strings are not resubmitted.
 
 Keep the Space **public** so `agent_code` (`…/tree/main`) verifies for the leaderboard.
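For reference, the flag is parsed as a simple truthy check (the `app.py` diff below inlines it). A minimal sketch of the semantics; `cache_enabled` is a hypothetical helper name used only for illustration:

```python
import os

def cache_enabled() -> bool:
    # Same check app.py applies: anything other than "1"/"true"/"yes"
    # (including unset) leaves the answer cache disabled.
    return os.getenv("GAIA_USE_CACHE", "0").lower() in ("1", "true", "yes")

os.environ["GAIA_USE_CACHE"] = "YES"   # case-insensitive -> enabled
assert cache_enabled()
os.environ["GAIA_USE_CACHE"] = "off"   # unrecognized value -> disabled
assert not cache_enabled()
```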
answer_normalize.py CHANGED
@@ -17,13 +17,44 @@ _TOOL_RESPONSE_BLOCK = re.compile(
     r"<\s*tool_response\s*>[\s\S]*?</\s*tool_response\s*>",
     re.IGNORECASE,
 )
+# Unclosed pseudo tool XML the model prints instead of calling the API.
+_PSEUDO_TOOL_XML = re.compile(
+    r"<\s*(?:web_search|wikipedia_search|fetch_url|python)\b[^>]*>[\s\S]*",
+    re.IGNORECASE,
+)
 
 
 def _strip_tool_markup(text: str) -> str:
     text = _TOOL_RESPONSE_BLOCK.sub("", text).strip()
+    text = _PSEUDO_TOOL_XML.sub("", text).strip()
     return text
 
 
+def _looks_like_model_refusal(text: str) -> bool:
+    t = text.lower()
+    if len(t) < 24:
+        return False
+    return any(
+        x in t
+        for x in (
+            "unfortunately,",
+            "i cannot ",
+            "i can't ",
+            "i was unable",
+            "unable to find",
+            "cannot provide a final",
+            "cannot provide an answer",
+            "could not find",
+            "did not find",
+            "file is not available",
+            "required excel file",
+            "without the attachment",
+            "no attachment was",
+            "not available to me",
+        )
+    )
+
+
 def _contextual_squeeze(text: str, question: Optional[str]) -> str:
     """Use question wording to pull out the exact payload (number, quote, etc.)."""
     if not question or not text:
@@ -73,7 +104,7 @@ def normalize_answer(
     if not text:
         return ""
     low = text.lower()
-    if low.startswith("inference error:"):
+    if low.startswith("inference error:") or low.startswith("agent error:"):
         return ""
     if (
         "hugging face inference credits exhausted" in low
@@ -117,6 +148,9 @@ def normalize_answer(
 
     text = _contextual_squeeze(text, context_question)
 
+    if context_question and _looks_like_model_refusal(text):
+        return ""
+
     if (
         context_question
         and "\n" in text
app.py CHANGED
@@ -18,27 +18,68 @@ def _cache_path() -> Path:
     return Path(__file__).resolve().parent / CACHE_FILENAME
 
 
-def _load_cache() -> dict:
+def _question_cache_tag(question: str) -> str:
+    """Bind cached answers to question text so task_id alone cannot serve stale rows."""
+    s = " ".join(str(question).split())
+    return s[:280]
+
+
+def _load_cache() -> dict[str, dict]:
     p = _cache_path()
     if not p.is_file():
         return {}
     try:
-        return json.loads(p.read_text(encoding="utf-8"))
+        raw = json.loads(p.read_text(encoding="utf-8"))
     except json.JSONDecodeError:
         return {}
+    if not isinstance(raw, dict):
+        return {}
+    out: dict[str, dict] = {}
+    for k, v in raw.items():
+        if not isinstance(k, str):
+            continue
+        if isinstance(v, dict) and isinstance(v.get("a"), str) and isinstance(v.get("qtag"), str):
+            out[k] = v
+    # Legacy format task_id -> plain string (unsafe if questions rotate): ignore.
+    return out
 
 
-def _save_cache(cache: dict) -> None:
+def _save_cache(cache: dict[str, dict]) -> None:
     _cache_path().write_text(json.dumps(cache, indent=2), encoding="utf-8")
 
 
+def _cache_get(cache: dict[str, dict], task_id: str, question_text: str) -> str | None:
+    entry = cache.get(str(task_id))
+    if not entry:
+        return None
+    if entry.get("qtag") != _question_cache_tag(question_text):
+        return None
+    return entry.get("a")
+
+
+def _cache_set(
+    cache: dict[str, dict], task_id: str, question_text: str, answer: str
+) -> None:
+    cache[str(task_id)] = {
+        "qtag": _question_cache_tag(question_text),
+        "a": answer,
+    }
+
+
 def _download_attachment(api_url: str, task_id: str, file_name: str) -> str | None:
     """Save task attachment to a temp file; return path or None."""
     if not file_name or not str(file_name).strip():
         return None
     url = f"{api_url}/files/{task_id}"
     try:
-        r = requests.get(url, timeout=120)
+        r = requests.get(
+            url,
+            timeout=120,
+            allow_redirects=True,
+            headers={
+                "User-Agent": "GAIA-Agent/1.0 (HuggingFace-Space; +https://huggingface.co)"
+            },
+        )
     except requests.RequestException:
         return None
     if r.status_code != 200:
@@ -63,7 +104,7 @@ def _download_attachment(api_url: str, task_id: str, file_name: str) -> str | None:
 
 def run_and_submit_all(profile: gr.OAuthProfile | None):
     space_id = os.getenv("SPACE_ID")
-    use_cache = os.getenv("GAIA_USE_CACHE", "1").lower() in ("1", "true", "yes")
+    use_cache = os.getenv("GAIA_USE_CACHE", "0").lower() in ("1", "true", "yes")
 
     if profile:
         username = f"{profile.username}"
@@ -113,9 +154,10 @@ def run_and_submit_all(profile: gr.OAuthProfile | None):
            continue
 
        cache_key = str(task_id)
-       if use_cache and cache_key in cache:
+       cached_raw = _cache_get(cache, cache_key, str(question_text)) if use_cache else None
+       if cached_raw is not None:
            submitted_answer = normalize_answer(
-               cache[cache_key], context_question=str(question_text)
+               cached_raw, context_question=str(question_text)
            )
            print(f"Cache hit for {task_id}")
        else:
@@ -134,10 +176,13 @@ def run_and_submit_all(profile: gr.OAuthProfile | None):
                    submitted_answer, context_question=str(question_text)
                )
                if use_cache:
-                   cache[cache_key] = (
+                   _cache_set(
+                       cache,
+                       cache_key,
+                       str(question_text),
                        submitted_answer
                        if isinstance(submitted_answer, str)
-                       else str(submitted_answer)
+                       else str(submitted_answer),
                    )
                    _save_cache(cache)
            except Exception as e:
@@ -239,7 +284,7 @@ with gr.Blocks() as demo:
     **Instructions**
 
     1. Duplicate this Space from the course template (or push this repo) and set **Secrets**: `HF_TOKEN` (read access to Inference).
-    2. Optional env vars: `GAIA_TEXT_MODEL`, `GAIA_ASR_MODEL`, `GAIA_VISION_MODEL`, `GAIA_API_URL`, `GAIA_USE_CACHE` (default `1`).
+    2. Optional env vars: `GAIA_TEXT_MODEL`, `GAIA_ASR_MODEL`, `GAIA_VISION_MODEL`, `GAIA_API_URL`, `GAIA_USE_CACHE` (default **`0`** — answers are keyed by `task_id` **and** question text; set `1` only to speed re-runs).
     3. Log in with Hugging Face below (username is used for the leaderboard).
     4. Run **Evaluate & Submit** to answer all questions and post scores.
 
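The `qtag` binding is the heart of the fix: a cached answer is served only when both the `task_id` and the normalized, truncated question text match. A self-contained demo that re-creates the three helpers from the hunk above (the task IDs and questions are invented samples):

```python
def _question_cache_tag(question: str) -> str:
    # Collapse whitespace and truncate so the tag is short and stable.
    s = " ".join(str(question).split())
    return s[:280]

def _cache_set(cache: dict[str, dict], task_id: str, question_text: str, answer: str) -> None:
    cache[str(task_id)] = {"qtag": _question_cache_tag(question_text), "a": answer}

def _cache_get(cache: dict[str, dict], task_id: str, question_text: str) -> str | None:
    entry = cache.get(str(task_id))
    if not entry or entry.get("qtag") != _question_cache_tag(question_text):
        return None
    return entry.get("a")

cache: dict[str, dict] = {}
_cache_set(cache, "t1", "How many  studio albums?", "3")
print(_cache_get(cache, "t1", "How many studio albums?"))  # "3" (whitespace-insensitive)
print(_cache_get(cache, "t1", "What color is the flag?"))  # None (stale row rejected)
```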
 
tools/gaia_deterministic.py CHANGED
@@ -33,10 +33,13 @@ def solve_botany_vegetable_list(question: str) -> Optional[str]:
         x in q
         for x in (
             "from my list",
+            "list i have so far",
             "just the vegetables",
             "list of just the vegetables",
             "vegetables from",
+            "vegetables from my list",
             "grocery list",
+            "fruits and vegetables",
         )
     ):
         return None
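These phrases feed a guard of the form `if not any(x in q for x in (...)): return None`, so the deterministic solver fires only on grocery-list-style questions. A sketch of the guard in isolation; `should_attempt` is a hypothetical name and the sample questions are invented:

```python
_TRIGGERS = (
    "from my list",
    "list i have so far",
    "just the vegetables",
    "list of just the vegetables",
    "vegetables from",
    "vegetables from my list",
    "grocery list",
    "fruits and vegetables",
)

def should_attempt(question: str) -> bool:
    # Lower-case once, then substring-match against the trigger phrases.
    q = question.lower()
    return any(x in q for x in _TRIGGERS)

print(should_attempt("Here is the list I have so far of fruits and veggies"))  # True
print(should_attempt("What is the boiling point of water?"))                   # False
```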