hawkdev committed on
Commit
772f123
·
1 Parent(s): f11f984

fixing errors

Browse files
README.md CHANGED
@@ -36,7 +36,8 @@ This folder is a **drop-in replacement** for the course Space
36
  - `GAIA_ASR_MODEL` — HF-only default `openai/whisper-large-v3`
37
  - `GAIA_VISION_MODEL` — HF-only default `meta-llama/Llama-3.2-11B-Vision-Instruct`
38
  - `GAIA_API_URL` — default `https://agents-course-unit4-scoring.hf.space`
39
- - `GAIA_USE_CACHE` — `1` (default) or `0` to disable `gaia_answers_cache.json`
 
40
 
41
  Keep the Space **public** so `agent_code` (`…/tree/main`) verifies for the leaderboard.
42
 
 
36
  - `GAIA_ASR_MODEL` — HF-only default `openai/whisper-large-v3`
37
  - `GAIA_VISION_MODEL` — HF-only default `meta-llama/Llama-3.2-11B-Vision-Instruct`
38
  - `GAIA_API_URL` — default `https://agents-course-unit4-scoring.hf.space`
39
+ - `GAIA_USE_CACHE` — `1` (default) or `0` to disable `gaia_answers_cache.json` (set **`0`** once after changing the agent so old wrong answers are not resubmitted).
40
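+ - **Groq free-tier TPM (413 “request too large”)**: the agent truncates tool outputs and, on Groq, also trims the total context. Tune with `GAIA_GROQ_MAX_TOOL_CHARS` (default `3200`), `GAIA_GROQ_CONTEXT_CHARS` (default `26000`), and `GAIA_AUTO_TRANSCRIPT_CHARS` (default `12000`; caps the transcript inlined for audio attachments such as `.mp3` or `.wav`). Other backends cap tool outputs with `GAIA_OPENAI_MAX_TOOL_CHARS` (default `12000`) and `GAIA_MAX_TOOL_CHARS` (default `24000`).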
41
 
42
  Keep the Space **public** so `agent_code` (`…/tree/main`) verifies for the leaderboard.
43
 
__pycache__/agent.cpython-312.pyc CHANGED
Binary files a/__pycache__/agent.cpython-312.pyc and b/__pycache__/agent.cpython-312.pyc differ
 
__pycache__/answer_normalize.cpython-312.pyc CHANGED
Binary files a/__pycache__/answer_normalize.cpython-312.pyc and b/__pycache__/answer_normalize.cpython-312.pyc differ
 
agent.py CHANGED
@@ -3,6 +3,8 @@
3
  from __future__ import annotations
4
 
5
  import os
 
 
6
  from typing import Any, Optional
7
 
8
  from answer_normalize import normalize_answer
@@ -15,6 +17,7 @@ from llm_backends import (
15
  make_openai_sdk_client,
16
  openai_chat_model,
17
  )
 
18
  from tools.registry import TOOL_DEFINITIONS, deterministic_attempt, dispatch_tool
19
 
20
  try:
@@ -27,6 +30,8 @@ SYSTEM_PROMPT = """You solve GAIA benchmark questions for the Hugging Face Agent
27
  Hard rules:
28
  - Call tools as needed (search, Wikipedia, fetch URL, Python, audio, image, Excel).
29
  - Your final assistant message must contain ONLY the answer text required by the question — no labels like "FINAL ANSWER", no markdown fences, no extra sentences, no preamble.
 
 
30
  - Match the question's format exactly: comma-separated lists alphabetized when asked; numbers without commas/thousands separators and without $ or % unless the question asks; short strings without leading articles (a/the); city names spelled out as requested; algebraic chess notation when asked.
31
  - For English Wikipedia tasks, use wikipedia_* tools and verify years/names against retrieved text.
32
  - For YouTube URLs, try youtube_transcript first; if missing, say you cannot access video and avoid guessing.
@@ -34,13 +39,77 @@ Hard rules:
34
  """
35
 
36
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
37
  class GaiaAgent:
38
  def __init__(
39
  self,
40
  *,
41
  hf_token: Optional[str] = None,
42
  text_model: Optional[str] = None,
43
- max_iterations: int = 14,
44
  ):
45
  self.hf_token = (
46
  hf_token
@@ -76,6 +145,34 @@ class GaiaAgent:
76
  self._hf_client = InferenceClient(**kw)
77
  return self._hf_client
78
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
79
  def __call__(
80
  self,
81
  question: str,
@@ -93,53 +190,44 @@ class GaiaAgent:
93
  )
94
 
95
  user_text = _build_user_payload(question, attachment_path, task_id)
 
 
96
  messages: list[dict[str, Any]] = [
97
  {"role": "system", "content": SYSTEM_PROMPT},
98
  {"role": "user", "content": user_text},
99
  ]
100
 
101
  last_text = ""
 
102
 
103
  for _ in range(self.max_iterations):
104
- try:
105
- if self.backend in ("groq", "openai"):
106
- assert self._oa_client is not None
107
- completion = chat_complete_openai(
108
- self._oa_client,
109
- model=self.text_model,
110
- messages=messages,
111
- tools=TOOL_DEFINITIONS,
112
- max_tokens=1024,
113
- temperature=0.15,
114
- )
115
- msg = completion.choices[0].message
116
- else:
117
- client = self._get_hf_client()
118
- completion = client.chat_completion(
119
- messages=messages,
120
- model=self.text_model,
121
- tools=TOOL_DEFINITIONS,
122
- tool_choice="auto",
123
- max_tokens=1024,
124
- temperature=0.15,
125
- )
126
- msg = completion.choices[0].message
127
- except Exception as e:
128
- es = str(e)
129
- if "402" in es or "Payment Required" in es or "depleted" in es.lower():
130
- last_text = (
131
- "Error: Hugging Face Inference credits exhausted (402). "
132
- "Set Space secret GROQ_API_KEY (free at https://console.groq.com) "
133
- "to use Groq instead, or add HF billing."
134
- )
135
- else:
136
- last_text = f"Inference error: {e}"
137
- break
138
 
 
139
  last_text = (msg.content or "").strip()
140
  tool_calls = getattr(msg, "tool_calls", None)
141
 
142
  if tool_calls:
 
143
  messages.append(
144
  {
145
  "role": "assistant",
@@ -161,11 +249,13 @@ class GaiaAgent:
161
  name = tc.function.name
162
  args = tc.function.arguments or "{}"
163
  result = dispatch_tool(name, args, hf_token=self.hf_token)
 
 
164
  messages.append(
165
  {
166
  "role": "tool",
167
  "tool_call_id": tc.id,
168
- "content": result[:24_000],
169
  }
170
  )
171
  continue
@@ -191,7 +281,33 @@ def _build_user_payload(
191
  parts.append(f"task_id: {task_id}")
192
  parts.append(f"Question:\n{question.strip()}")
193
  if attachment_path:
194
- parts.append(f"\nAttachment path (use with tools): {attachment_path}")
 
 
 
 
 
 
 
195
  else:
196
  parts.append("\nNo attachment.")
197
  return "\n".join(parts)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3
  from __future__ import annotations
4
 
5
  import os
6
+ import time
7
+ from pathlib import Path
8
  from typing import Any, Optional
9
 
10
  from answer_normalize import normalize_answer
 
17
  make_openai_sdk_client,
18
  openai_chat_model,
19
  )
20
+ from tools.media_tools import transcribe_audio
21
  from tools.registry import TOOL_DEFINITIONS, deterministic_attempt, dispatch_tool
22
 
23
  try:
 
30
  Hard rules:
31
  - Call tools as needed (search, Wikipedia, fetch URL, Python, audio, image, Excel).
32
  - Your final assistant message must contain ONLY the answer text required by the question — no labels like "FINAL ANSWER", no markdown fences, no extra sentences, no preamble.
33
+ - Never type fake tool calls such as <web_search>...</function>; the platform invokes tools for you. If you need search, emit a real tool call via the API, not XML-like text in the reply.
34
+ - When the user message includes an attachment path: for audio, a transcript may already be inlined — use it. For images (png/jpg), call analyze_image with that exact file_path. For .xlsx/.py use the appropriate tools with that path.
35
  - Match the question's format exactly: comma-separated lists alphabetized when asked; numbers without commas/thousands separators and without $ or % unless the question asks; short strings without leading articles (a/the); city names spelled out as requested; algebraic chess notation when asked.
36
  - For English Wikipedia tasks, use wikipedia_* tools and verify years/names against retrieved text.
37
  - For YouTube URLs, try youtube_transcript first; if missing, say you cannot access video and avoid guessing.
 
39
  """
40
 
41
 
42
def _tool_char_cap(backend: str, *, shrink_pass: int = 0) -> int:
    """Return the maximum number of characters kept from one tool result.

    The base cap depends on the backend and can be overridden through an
    environment variable:

    * ``"groq"``   -> ``GAIA_GROQ_MAX_TOOL_CHARS``   (default 3200)
    * ``"openai"`` -> ``GAIA_OPENAI_MAX_TOOL_CHARS`` (default 12000)
    * otherwise    -> ``GAIA_MAX_TOOL_CHARS``        (default 24000)

    Args:
        backend: Name of the LLM backend in use.
        shrink_pass: Retry counter; every pass above zero halves the cap
            again, but never below 600 characters.

    Returns:
        The character cap, always a positive integer.
    """
    if backend == "groq":
        env_name, default = "GAIA_GROQ_MAX_TOOL_CHARS", 3200
    elif backend == "openai":
        env_name, default = "GAIA_OPENAI_MAX_TOOL_CHARS", 12000
    else:
        env_name, default = "GAIA_MAX_TOOL_CHARS", 24000

    # An empty or malformed setting must not abort the whole agent run, so
    # fall back to the built-in default instead of letting int() raise.
    try:
        base = int(os.environ.get(env_name, "").strip())
    except ValueError:
        base = default
    if base < 1:
        # Zero or negative caps would turn the later slice into nonsense.
        base = default

    if shrink_pass > 0:
        shrunk = max(600, base // (2**shrink_pass))
        # The 600-char floor must never make the cap larger than the
        # configured base (e.g. when the base itself is below 600).
        base = min(base, shrunk)
    return base
52
+
53
+
54
def _groq_context_budget() -> int:
    """Return the total character budget for message contents on Groq.

    Reads ``GAIA_GROQ_CONTEXT_CHARS``; when the variable is unset, empty,
    not an integer, or not positive, the default of 26000 is used so a bad
    setting cannot crash the agent.
    """
    default = 26000
    try:
        budget = int(os.environ.get("GAIA_GROQ_CONTEXT_CHARS", "").strip())
    except ValueError:
        return default
    return budget if budget > 0 else default
56
+
57
+
58
def _maybe_retryable_llm_error(exc: Exception) -> bool:
    """Tell whether *exc* looks like a size or rate-limit failure worth retrying.

    A structured HTTP status is checked first: ``exc.status_code`` or, if
    absent, ``exc.response.status_code``. Statuses 413 and 429 are treated
    as retryable. Otherwise the lower-cased exception text is searched for
    the known rate-limit / request-too-large markers.

    Args:
        exc: The exception raised by the chat-completion call.

    Returns:
        True when the caller may shrink the context, wait, and retry.
    """
    status = getattr(exc, "status_code", None)
    if status is None:
        status = getattr(getattr(exc, "response", None), "status_code", None)
    if status in (413, 429):
        return True

    # Fallback for clients that only surface the status inside the message.
    text = str(exc).lower()
    markers = (
        "413",
        "429",
        "rate_limit",
        "tokens per minute",
        "tpm",
        "too many tokens",
    )
    return any(marker in text for marker in markers)
68
+
69
+
70
def _truncate_tool_messages(
    messages: list[dict[str, Any]],
    backend: str,
    *,
    shrink_pass: int = 0,
) -> None:
    """Clip every over-long ``tool`` message in *messages* in place.

    The limit comes from ``_tool_char_cap(backend, shrink_pass=...)``.
    String contents longer than the limit are cut to that many characters
    and suffixed with a ``[truncated]`` marker line. Messages with other
    roles, or with non-string content, are left untouched.
    """
    limit = _tool_char_cap(backend, shrink_pass=shrink_pass)
    tool_entries = (entry for entry in messages if entry.get("role") == "tool")
    for entry in tool_entries:
        body = entry.get("content")
        if not isinstance(body, str) or len(body) <= limit:
            continue
        entry["content"] = body[:limit] + "\n[truncated]"
83
+
84
+
85
def _enforce_context_budget(messages: list[dict[str, Any]], backend: str) -> None:
    """Shrink tool outputs in place until the Groq context budget is met.

    Only the ``"groq"`` backend is affected; other backends return at once.
    The size of the conversation is measured as the summed length of every
    message's ``content`` (stringified, missing content counts as empty).

    While the total exceeds ``_groq_context_budget()``, ``tool`` messages
    after the first two entries are shortened, earliest first: a content
    longer than 800 characters is cut to two thirds of its length (but not
    below 600 characters) and suffixed with a ``[truncated]`` marker. The
    function stops as soon as the budget is met or no tool message longer
    than 800 characters remains.

    Args:
        messages: Chat messages; modified in place.
        backend: Name of the LLM backend in use.
    """
    if backend != "groq":
        return
    budget = _groq_context_budget()
    total = sum(len(str(m.get("content") or "")) for m in messages)
    if total <= budget:
        return

    # The first two entries are never trimmed, whatever their role.
    for entry in messages[2:]:
        if entry.get("role") != "tool":
            continue
        body = entry.get("content")
        if not isinstance(body, str):
            continue
        # Each step strictly shortens the content (anything above 800 chars
        # ends up at most 2/3 of its size plus the 12-char marker), so this
        # loop always terminates without needing a fixed pass limit.
        while len(body) > 800:
            shorter = body[: max(600, len(body) * 2 // 3)] + "\n[truncated]"
            total -= len(body) - len(shorter)
            body = shorter
            entry["content"] = body
            if total <= budget:
                return
104
+
105
+
106
  class GaiaAgent:
107
  def __init__(
108
  self,
109
  *,
110
  hf_token: Optional[str] = None,
111
  text_model: Optional[str] = None,
112
+ max_iterations: int = 12,
113
  ):
114
  self.hf_token = (
115
  hf_token
 
145
  self._hf_client = InferenceClient(**kw)
146
  return self._hf_client
147
 
148
def _chat_round(
    self,
    messages: list[dict[str, Any]],
    *,
    shrink_pass: int = 0,
) -> Any:
    """Run one chat-completion request for the current conversation.

    Method of ``GaiaAgent``. Before the request, tool messages are clipped
    with ``_truncate_tool_messages`` (honouring *shrink_pass*) and the
    context budget is applied with ``_enforce_context_budget``; both
    modify *messages* in place.

    The ``"groq"`` and ``"openai"`` backends go through
    ``chat_complete_openai`` with ``max_tokens=768``; any other backend
    uses the Hugging Face client's ``chat_completion`` with
    ``tool_choice="auto"`` and ``max_tokens=1024``.

    Returns:
        The completion object produced by the selected client.
    """
    _truncate_tool_messages(messages, self.backend, shrink_pass=shrink_pass)
    _enforce_context_budget(messages, self.backend)

    uses_openai_sdk = self.backend in ("groq", "openai")
    if not uses_openai_sdk:
        hf_client = self._get_hf_client()
        return hf_client.chat_completion(
            messages=messages,
            model=self.text_model,
            tools=TOOL_DEFINITIONS,
            tool_choice="auto",
            max_tokens=1024,
            temperature=0.15,
        )

    assert self._oa_client is not None
    return chat_complete_openai(
        self._oa_client,
        model=self.text_model,
        messages=messages,
        tools=TOOL_DEFINITIONS,
        max_tokens=768,
        temperature=0.15,
    )
175
+
176
  def __call__(
177
  self,
178
  question: str,
 
190
  )
191
 
192
  user_text = _build_user_payload(question, attachment_path, task_id)
193
+ user_text += _maybe_inline_audio_transcript(attachment_path, self.hf_token)
194
+
195
  messages: list[dict[str, Any]] = [
196
  {"role": "system", "content": SYSTEM_PROMPT},
197
  {"role": "user", "content": user_text},
198
  ]
199
 
200
  last_text = ""
201
+ retry_delays = (2.0, 6.0, 14.0)
202
 
203
  for _ in range(self.max_iterations):
204
+ completion = None
205
+ shrink_pass = 0
206
+ for attempt in range(len(retry_delays) + 1):
207
+ try:
208
+ completion = self._chat_round(messages, shrink_pass=shrink_pass)
209
+ break
210
+ except Exception as e:
211
+ es = str(e)
212
+ if "402" in es or "Payment Required" in es or "depleted" in es.lower():
213
+ last_text = (
214
+ "Error: Hugging Face Inference credits exhausted (402). "
215
+ "Set Space secret GROQ_API_KEY (free at https://console.groq.com) "
216
+ "to use Groq instead, or add HF billing."
217
+ )
218
+ return normalize_answer(last_text)
219
+ if attempt < len(retry_delays) and _maybe_retryable_llm_error(e):
220
+ shrink_pass = attempt + 1
221
+ time.sleep(retry_delays[attempt])
222
+ continue
223
+ return normalize_answer(f"Inference error: {e}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
224
 
225
+ msg = completion.choices[0].message
226
  last_text = (msg.content or "").strip()
227
  tool_calls = getattr(msg, "tool_calls", None)
228
 
229
  if tool_calls:
230
+ cap = _tool_char_cap(self.backend, shrink_pass=0)
231
  messages.append(
232
  {
233
  "role": "assistant",
 
249
  name = tc.function.name
250
  args = tc.function.arguments or "{}"
251
  result = dispatch_tool(name, args, hf_token=self.hf_token)
252
+ if isinstance(result, str) and len(result) > cap:
253
+ result = result[:cap] + "\n[truncated]"
254
  messages.append(
255
  {
256
  "role": "tool",
257
  "tool_call_id": tc.id,
258
+ "content": result,
259
  }
260
  )
261
  continue
 
281
  parts.append(f"task_id: {task_id}")
282
  parts.append(f"Question:\n{question.strip()}")
283
  if attachment_path:
284
+ p = Path(attachment_path)
285
+ parts.append(
286
+ f"\nAttachment path (pass this exact string to tools): {attachment_path}"
287
+ )
288
+ if p.is_file():
289
+ parts.append(f"Attachment exists on disk: yes ({p.name})")
290
+ else:
291
+ parts.append("Attachment exists on disk: NO — report that you cannot read it.")
292
  else:
293
  parts.append("\nNo attachment.")
294
  return "\n".join(parts)
295
+
296
+
297
def _maybe_inline_audio_transcript(
    attachment_path: Optional[str],
    hf_token: Optional[str],
) -> str:
    """Return text to append to the user message for an audio attachment.

    Args:
        attachment_path: Path of the task attachment, if any.
        hf_token: Token forwarded to ``transcribe_audio``.

    Returns:
        ``""`` when there is no attachment, the path is not an existing
        file, or its suffix is not one of the recognised audio extensions.
        Otherwise either a "transcription failed" note (empty result or a
        result starting with "error" / "asr error") or the transcript,
        clipped to ``GAIA_AUTO_TRANSCRIPT_CHARS`` characters (default 12000).
    """
    if not attachment_path:
        return ""
    audio_file = Path(attachment_path)
    if not audio_file.is_file():
        return ""
    if audio_file.suffix.lower() not in (
        ".mp3",
        ".wav",
        ".m4a",
        ".ogg",
        ".flac",
        ".webm",
    ):
        return ""

    raw = transcribe_audio(str(audio_file), hf_token=hf_token)
    # A None (or otherwise non-string) result must not crash the slicing
    # and .lower() calls below; treat None as an empty transcript.
    if isinstance(raw, str):
        transcript = raw
    else:
        transcript = "" if raw is None else str(raw)

    if not transcript or transcript.lower().startswith(("error", "asr error")):
        return f"\n\n[Automatic transcription failed: {transcript[:500]}]\n"

    # Fall back to the default when the env override is empty, malformed,
    # or non-positive, rather than raising or slicing from the end.
    try:
        cap = int(os.environ.get("GAIA_AUTO_TRANSCRIPT_CHARS", "").strip())
    except ValueError:
        cap = 12000
    if cap < 1:
        cap = 12000
    return f"\n\n[Audio transcript — use for your answer]\n{transcript[:cap]}\n"
answer_normalize.py CHANGED
@@ -8,6 +8,11 @@ _FINAL_ANSWER_RE = re.compile(
8
  r"^\s*(?:FINAL\s*ANSWER\s*[::]?\s*)",
9
  re.IGNORECASE,
10
  )
 
 
 
 
 
11
 
12
 
13
  def normalize_answer(raw: Union[str, int, float, None]) -> Union[str, int, float]:
@@ -21,6 +26,7 @@ def normalize_answer(raw: Union[str, int, float, None]) -> Union[str, int, float
21
  text = str(raw).strip()
22
  if not text:
23
  return ""
 
24
  text = _FINAL_ANSWER_RE.sub("", text, count=1).strip()
25
  # Strip common wrappers (single line)
26
  for prefix in ("The answer is", "Answer:", "ANSWER:", "```", "`"):
@@ -31,7 +37,16 @@ def normalize_answer(raw: Union[str, int, float, None]) -> Union[str, int, float
31
  if text.startswith("```"):
32
  text = re.sub(r"^```\w*\s*", "", text)
33
  text = re.sub(r"\s*```$", "", text).strip()
34
- return text.strip()
 
 
 
 
 
 
 
 
 
35
 
36
 
37
  def maybe_numeric(text: str) -> Union[str, int, float]:
 
8
  r"^\s*(?:FINAL\s*ANSWER\s*[::]?\s*)",
9
  re.IGNORECASE,
10
  )
11
+ # Model sometimes prints fake tool tags instead of calling the API.
12
+ _PSEUDO_TOOL_BLOCK = re.compile(
13
+ r"<\s*[a-z_][a-z0-9_]*\s*>[\s\S]*?</function>",
14
+ re.IGNORECASE,
15
+ )
16
 
17
 
18
  def normalize_answer(raw: Union[str, int, float, None]) -> Union[str, int, float]:
 
26
  text = str(raw).strip()
27
  if not text:
28
  return ""
29
+ text = _PSEUDO_TOOL_BLOCK.sub("", text).strip()
30
  text = _FINAL_ANSWER_RE.sub("", text, count=1).strip()
31
  # Strip common wrappers (single line)
32
  for prefix in ("The answer is", "Answer:", "ANSWER:", "```", "`"):
 
37
  if text.startswith("```"):
38
  text = re.sub(r"^```\w*\s*", "", text)
39
  text = re.sub(r"\s*```$", "", text).strip()
40
+ text = text.strip()
41
+ # Single trailing period on short token answers (e.g. city names).
42
+ if (
43
+ text.endswith(".")
44
+ and text.count(".") == 1
45
+ and 1 <= len(text) <= 80
46
+ and "\n" not in text
47
+ ):
48
+ text = text[:-1].strip()
49
+ return text
50
 
51
 
52
  def maybe_numeric(text: str) -> Union[str, int, float]:
tools/__pycache__/gaia_deterministic.cpython-312.pyc CHANGED
Binary files a/tools/__pycache__/gaia_deterministic.cpython-312.pyc and b/tools/__pycache__/gaia_deterministic.cpython-312.pyc differ
 
tools/__pycache__/registry.cpython-312.pyc CHANGED
Binary files a/tools/__pycache__/registry.cpython-312.pyc and b/tools/__pycache__/registry.cpython-312.pyc differ
 
tools/gaia_deterministic.py CHANGED
@@ -3,6 +3,7 @@
3
  from __future__ import annotations
4
 
5
  import re
 
6
  from typing import Optional
7
 
8
  import requests
@@ -16,13 +17,22 @@ def solve_botany_vegetable_list(question: str) -> Optional[str]:
16
  Excludes: bell pepper, zucchini, green beans, corn (fruits); herbs optional;
17
  canonical set matches common GAIA references.
18
  """
19
- q = question.lower()
20
- if "professor of botany" not in q and "botanical fruits" not in q:
 
 
 
 
21
  return None
22
- if (
23
- "vegetables from my list" not in q
24
- and "list of just the vegetables" not in q
25
- and "just the vegetables" not in q
 
 
 
 
 
26
  ):
27
  return None
28
  # Roots/leaf/stem crops only; no cucurbits, legume pods, grains, fruits.
 
3
  from __future__ import annotations
4
 
5
  import re
6
+ import unicodedata
7
  from typing import Optional
8
 
9
  import requests
 
17
  Excludes: bell pepper, zucchini, green beans, corn (fruits); herbs optional;
18
  canonical set matches common GAIA references.
19
  """
20
+ q = unicodedata.normalize("NFKC", question).lower()
21
+ if "professor of botany" not in q:
22
+ return None
23
+ if "botanical fruit" not in q:
24
+ return None
25
+ if "vegetable" not in q:
26
  return None
27
+ if not any(
28
+ x in q
29
+ for x in (
30
+ "from my list",
31
+ "just the vegetables",
32
+ "list of just the vegetables",
33
+ "vegetables from",
34
+ "grocery list",
35
+ )
36
  ):
37
  return None
38
  # Roots/leaf/stem crops only; no cucurbits, legume pods, grains, fruits.
tools/registry.py CHANGED
@@ -109,7 +109,10 @@ TOOL_DEFINITIONS: list[dict[str, Any]] = [
109
  "type": "function",
110
  "function": {
111
  "name": "transcribe_audio",
112
- "description": "Transcribe a local audio file (.mp3, etc.) using HF Whisper inference.",
 
 
 
113
  "parameters": {
114
  "type": "object",
115
  "properties": {"file_path": {"type": "string"}},
 
109
  "type": "function",
110
  "function": {
111
  "name": "transcribe_audio",
112
+ "description": (
113
+ "Transcribe a local audio file (.mp3, .wav, etc.). "
114
+ "Uses Groq/OpenAI Whisper when configured, else Hugging Face."
115
+ ),
116
  "parameters": {
117
  "type": "object",
118
  "properties": {"file_path": {"type": "string"}},