Spaces:
Sleeping
Sleeping
refactor: replace answer cache with agent helpers
Browse files- agent_helpers.py +68 -0
- app.py +60 -38
- gaia_resolvers.py +0 -108
- tests/test_agent_helpers.py +47 -0
- tests/test_gaia_resolvers.py +0 -120
agent_helpers.py
ADDED
|
@@ -0,0 +1,68 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import re
|
| 2 |
+
from typing import Optional
|
| 3 |
+
|
| 4 |
+
|
| 5 |
+
# File-extension groups used to decide which tool handles a downloaded
# attachment. Every entry is lower-case and carries its leading dot.
IMAGE_SUFFIXES = {
    ".bmp",
    ".gif",
    ".jpeg",
    ".jpg",
    ".png",
    ".webp",
}
SPREADSHEET_SUFFIXES = {
    ".csv",
    ".xls",
    ".xlsx",
}
PYTHON_SUFFIXES = {
    ".py",
}
AUDIO_SUFFIXES = {
    ".aac",
    ".flac",
    ".m4a",
    ".mp3",
    ".ogg",
    ".opus",
    ".wav",
    ".webm",
}
# NOTE: ".csv" is listed here as well as in SPREADSHEET_SUFFIXES;
# classify_attachment tests the spreadsheet group first, so a .csv file is
# reported as a spreadsheet.
TEXT_SUFFIXES = {
    ".csv",
    ".htm",
    ".html",
    ".json",
    ".md",
    ".tsv",
    ".txt",
}
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
def build_user_content(question: str, task_id: Optional[str]) -> str:
    """Build the message text handed to the tool-using agent.

    Args:
        question: The question text, passed through unchanged.
        task_id: Identifier of the task, or ``None``/empty/blank when the
            question has no associated task.

    Returns:
        ``question`` itself when there is no usable task id; otherwise the
        question followed by the task id and an instruction to call
        ``download_task_file`` before any file-specific tool.
    """
    # A blank or whitespace-only id is as useless as a missing one: emitting
    # "Task ID:   " would tell the agent to download a file it cannot name.
    task_id = "" if task_id is None else str(task_id).strip()
    if not task_id:
        return question

    return (
        f"{question}\n\n"
        f"Task ID: {task_id}\n"
        "If this question has an attachment, call download_task_file with this task_id first. "
        "Then use the most specific follow-up tool for the downloaded file type."
    )
|
| 22 |
+
|
| 23 |
+
|
| 24 |
+
def classify_attachment(question: str, suffix: str) -> Optional[str]:
    """Decide which kind of attachment a question refers to.

    The file suffix is authoritative when it belongs to one of the known
    suffix groups; otherwise the question text is scanned for keywords.

    Args:
        question: The question text; ``None`` is treated as empty.
        suffix: File extension of the attachment. Case, surrounding
            whitespace and a missing leading dot are tolerated, so
            ``"PNG"``, ``" .png"`` and ``".png"`` are equivalent.

    Returns:
        One of ``"image"``, ``"audio"``, ``"python"``, ``"spreadsheet"``,
        ``"text"``, or ``None`` when neither the suffix nor the question
        gives a hint.
    """
    # Normalise to the ".ext" form the suffix sets use; without this a bare
    # "png" misses every set and is left to keyword guessing.
    suffix = (suffix or "").strip().lower()
    if suffix and not suffix.startswith("."):
        suffix = f".{suffix}"
    q = (question or "").lower()

    # Order matters: ".csv" is in both the spreadsheet and text groups, and
    # the spreadsheet group must win.
    suffix_groups = (
        (IMAGE_SUFFIXES, "image"),
        (AUDIO_SUFFIXES, "audio"),
        (PYTHON_SUFFIXES, "python"),
        (SPREADSHEET_SUFFIXES, "spreadsheet"),
        (TEXT_SUFFIXES, "text"),
    )
    for known_suffixes, kind in suffix_groups:
        if suffix in known_suffixes:
            return kind

    # Fallback: plain substring hints in the question, first match wins.
    keyword_groups = (
        ("image", ("image", "picture", "screenshot", "chess position", "visual", "diagram", "shown in")),
        ("audio", ("audio", "recording", "mp3", "wav", "says", "say in response", "lecture")),
        ("python", ("python code", "attached python", "numeric output", "run the attached python")),
        ("spreadsheet", ("excel", "spreadsheet", "csv", "sales", "table contains")),
        ("text", ("attached text", "text file", "read the attached", "document")),
    )
    for kind, keywords in keyword_groups:
        if any(keyword in q for keyword in keywords):
            return kind

    return None
|
| 51 |
+
|
| 52 |
+
|
| 53 |
+
def is_youtube_question(question: str) -> bool:
    """Return True when *question* contains a YouTube video URL.

    Recognised forms are ``youtu.be/...`` short links and
    ``youtube.com/watch?...v=...`` links, with an optional ``www.``, ``m.``
    or ``music.`` host prefix. The ``v=`` parameter may appear anywhere in
    the query string, not only first.

    Args:
        question: The question text; ``None`` is treated as empty.
    """
    pattern = (
        r"https?://(?:(?:www|m|music)\.)?"
        # "(?:[^\s#]*&)?" allows other query parameters before "v=".
        r"(?:youtube\.com/watch\?(?:[^\s#]*&)?v=|youtu\.be/)"
    )
    return bool(re.search(pattern, question or ""))
|
| 55 |
+
|
| 56 |
+
|
| 57 |
+
def cleanup_exact_answer(raw_answer: str) -> str:
    """Strip common wrappers from a model reply, leaving the bare answer.

    Removes, in order: surrounding whitespace, a Markdown code fence,
    surrounding backticks, a leading "final answer:" / "answer:" /
    "the answer is" prefix (case-insensitive), backticks exposed by removing
    that prefix, and a single trailing period.

    Args:
        raw_answer: The raw reply. ``None`` yields an empty string; any
            other value is converted with ``str()``.

    Returns:
        The cleaned answer string.
    """
    # Only None means "no answer"; a falsy value such as 0 is a real answer
    # and must not collapse to "".
    answer = "" if raw_answer is None else str(raw_answer)
    answer = answer.strip()

    # A language tag only counts when it sits on its own fence line;
    # otherwise "```519```" would lose "519" as if it were the tag.
    answer = re.sub(r"^```(?:[ \t]*\w*[ \t]*\n)?\s*", "", answer)
    answer = re.sub(r"\s*```$", "", answer)
    answer = answer.strip().strip("`").strip()

    answer = re.sub(r"^(?:final answer|answer)\s*:\s*", "", answer, flags=re.IGNORECASE)
    answer = re.sub(r"^the answer is\s*:?\s*", "", answer, flags=re.IGNORECASE)
    # Removing a prefix can expose inline-code backticks ("Answer: `519`").
    answer = answer.strip().strip("`").strip()

    # A string ending in "." cannot also end in digit-dot-digit, so decimals
    # such as "89706.00" are never affected and need no extra guard.
    if len(answer) > 1 and answer.endswith("."):
        answer = answer[:-1]

    return answer.strip()
|
app.py
CHANGED
|
@@ -21,7 +21,12 @@ from tools import (
|
|
| 21 |
answer_audio_question,
|
| 22 |
get_youtube_transcript,
|
| 23 |
)
|
| 24 |
-
from
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 25 |
# (Keep Constants as is)
|
| 26 |
# --- Constants ---
|
| 27 |
DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
|
|
@@ -135,6 +140,7 @@ class BasicAgent:
|
|
| 135 |
- web lookup
|
| 136 |
|
| 137 |
Tool policy:
|
|
|
|
| 138 |
- For image, screenshot, chess position, chart image, diagram, or visual counting questions, use answer_image_question.
|
| 139 |
- For Excel or CSV questions, use answer_excel_question.
|
| 140 |
- For Python-code-output questions, use answer_python_question.
|
|
@@ -341,18 +347,41 @@ class BasicAgent:
|
|
| 341 |
HumanMessage(content=f"Question:\n{question}\n\nRaw answer:\n{raw_answer}")
|
| 342 |
])
|
| 343 |
return response.content.strip()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 344 |
|
| 345 |
def answer_question(self, question: str, task_id: str | None = None) -> str:
|
| 346 |
-
deterministic_answer = try_deterministic_answer(question, task_id)
|
| 347 |
-
if deterministic_answer is not None:
|
| 348 |
-
print(
|
| 349 |
-
f"[deterministic_answer] task_id={task_id} answer={deterministic_answer}",
|
| 350 |
-
flush=True,
|
| 351 |
-
)
|
| 352 |
-
return deterministic_answer
|
| 353 |
-
|
| 354 |
file_info = None
|
| 355 |
-
|
|
|
|
|
|
|
|
|
|
| 356 |
|
| 357 |
if task_id:
|
| 358 |
info_str = download_task_file.invoke({"task_id": task_id})
|
|
@@ -365,53 +394,46 @@ class BasicAgent:
|
|
| 365 |
if file_info and "file_path" in file_info:
|
| 366 |
suffix = file_info.get("suffix", "").lower()
|
| 367 |
file_path = file_info["file_path"]
|
| 368 |
-
|
| 369 |
-
|
| 370 |
-
|
| 371 |
-
|
| 372 |
-
is_audio_q = any(x in q for x in [
|
| 373 |
-
"audio", "recording", "mp3", "wav", "says", "say in response", "lecture"
|
| 374 |
-
])
|
| 375 |
-
is_python_q = any(x in q for x in [
|
| 376 |
-
"python code", "attached python", "numeric output from the attached python"
|
| 377 |
-
])
|
| 378 |
-
is_excel_q = any(x in q for x in [
|
| 379 |
-
"excel", "spreadsheet", "csv", "sales", "table contains"
|
| 380 |
-
])
|
| 381 |
-
|
| 382 |
-
if suffix in [".png", ".jpg", ".jpeg", ".webp", ".bmp", ".gif"] or is_image_q:
|
| 383 |
raw = answer_image_question.invoke({
|
| 384 |
"file_path": file_path,
|
| 385 |
"question": question
|
| 386 |
})
|
| 387 |
-
return self.format_final_answer(question, raw)
|
| 388 |
|
| 389 |
-
if
|
| 390 |
raw = answer_audio_question.invoke({
|
| 391 |
"file_path": file_path,
|
| 392 |
"question": question
|
| 393 |
})
|
| 394 |
-
return self.format_final_answer(question, raw)
|
| 395 |
|
| 396 |
-
if
|
| 397 |
raw = answer_python_question.invoke({
|
| 398 |
"file_path": file_path
|
| 399 |
})
|
| 400 |
-
return self.format_final_answer(question, raw)
|
| 401 |
|
| 402 |
-
if
|
| 403 |
-
|
| 404 |
"file_path": file_path,
|
| 405 |
"question": question
|
| 406 |
})
|
| 407 |
-
return self.
|
| 408 |
-
|
| 409 |
-
|
| 410 |
-
|
| 411 |
-
|
|
|
|
|
|
|
|
|
|
| 412 |
|
|
|
|
| 413 |
result = self.react_graph.invoke({"messages": [HumanMessage(content=user_content)]})
|
| 414 |
-
return result["messages"][-1].content
|
| 415 |
|
| 416 |
|
| 417 |
def run_and_submit_all( profile: gr.OAuthProfile | None):
|
|
|
|
| 21 |
answer_audio_question,
|
| 22 |
get_youtube_transcript,
|
| 23 |
)
|
| 24 |
+
from agent_helpers import (
|
| 25 |
+
build_user_content,
|
| 26 |
+
classify_attachment,
|
| 27 |
+
cleanup_exact_answer,
|
| 28 |
+
is_youtube_question,
|
| 29 |
+
)
|
| 30 |
# (Keep Constants as is)
|
| 31 |
# --- Constants ---
|
| 32 |
DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
|
|
|
|
| 140 |
- web lookup
|
| 141 |
|
| 142 |
Tool policy:
|
| 143 |
+
- If a task_id is present and the question hints at an attachment, call download_task_file first.
|
| 144 |
- For image, screenshot, chess position, chart image, diagram, or visual counting questions, use answer_image_question.
|
| 145 |
- For Excel or CSV questions, use answer_excel_question.
|
| 146 |
- For Python-code-output questions, use answer_python_question.
|
|
|
|
| 347 |
HumanMessage(content=f"Question:\n{question}\n\nRaw answer:\n{raw_answer}")
|
| 348 |
])
|
| 349 |
return response.content.strip()
|
| 350 |
+
|
| 351 |
+
def answer_from_context(self, question: str, context: str, source_label: str = "context") -> str:
|
| 352 |
+
response = self.model.invoke([
|
| 353 |
+
SystemMessage(content="""
|
| 354 |
+
You are an exact-match QA extractor.
|
| 355 |
+
|
| 356 |
+
Answer the question using only the provided source context.
|
| 357 |
+
|
| 358 |
+
Rules:
|
| 359 |
+
- Return only the final answer.
|
| 360 |
+
- No explanation.
|
| 361 |
+
- No markdown.
|
| 362 |
+
- No citations.
|
| 363 |
+
- Do not mention the source context.
|
| 364 |
+
- If the question asks for a number, return only the number.
|
| 365 |
+
- If the question asks for a list, return only the requested items.
|
| 366 |
+
- If comma-separated output is appropriate, use comma + space.
|
| 367 |
+
- If the answer is not present, return the best concise answer implied by the context.
|
| 368 |
+
"""),
|
| 369 |
+
HumanMessage(content=f"""
|
| 370 |
+
Question:
|
| 371 |
+
{question}
|
| 372 |
+
|
| 373 |
+
Source ({source_label}):
|
| 374 |
+
{context}
|
| 375 |
+
""")
|
| 376 |
+
])
|
| 377 |
+
return cleanup_exact_answer(response.content)
|
| 378 |
|
| 379 |
def answer_question(self, question: str, task_id: str | None = None) -> str:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 380 |
file_info = None
|
| 381 |
+
|
| 382 |
+
if is_youtube_question(question):
|
| 383 |
+
transcript = get_youtube_transcript.invoke({"url_or_question": question})
|
| 384 |
+
return self.answer_from_context(question, transcript, "YouTube transcript")
|
| 385 |
|
| 386 |
if task_id:
|
| 387 |
info_str = download_task_file.invoke({"task_id": task_id})
|
|
|
|
| 394 |
if file_info and "file_path" in file_info:
|
| 395 |
suffix = file_info.get("suffix", "").lower()
|
| 396 |
file_path = file_info["file_path"]
|
| 397 |
+
|
| 398 |
+
attachment_kind = classify_attachment(question, suffix)
|
| 399 |
+
|
| 400 |
+
if attachment_kind == "image":
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 401 |
raw = answer_image_question.invoke({
|
| 402 |
"file_path": file_path,
|
| 403 |
"question": question
|
| 404 |
})
|
| 405 |
+
return cleanup_exact_answer(self.format_final_answer(question, raw))
|
| 406 |
|
| 407 |
+
if attachment_kind == "audio":
|
| 408 |
raw = answer_audio_question.invoke({
|
| 409 |
"file_path": file_path,
|
| 410 |
"question": question
|
| 411 |
})
|
| 412 |
+
return cleanup_exact_answer(self.format_final_answer(question, raw))
|
| 413 |
|
| 414 |
+
if attachment_kind == "python":
|
| 415 |
raw = answer_python_question.invoke({
|
| 416 |
"file_path": file_path
|
| 417 |
})
|
| 418 |
+
return cleanup_exact_answer(self.format_final_answer(question, raw))
|
| 419 |
|
| 420 |
+
if attachment_kind == "spreadsheet":
|
| 421 |
+
context = answer_excel_question.invoke({
|
| 422 |
"file_path": file_path,
|
| 423 |
"question": question
|
| 424 |
})
|
| 425 |
+
return self.answer_from_context(question, context, "spreadsheet summary")
|
| 426 |
+
|
| 427 |
+
if attachment_kind == "text":
|
| 428 |
+
context = read_attached_text_file.invoke({
|
| 429 |
+
"file_path": file_path,
|
| 430 |
+
"max_chars": 20000,
|
| 431 |
+
})
|
| 432 |
+
return self.answer_from_context(question, context, "attached text file")
|
| 433 |
|
| 434 |
+
user_content = build_user_content(question, task_id)
|
| 435 |
result = self.react_graph.invoke({"messages": [HumanMessage(content=user_content)]})
|
| 436 |
+
return cleanup_exact_answer(result["messages"][-1].content)
|
| 437 |
|
| 438 |
|
| 439 |
def run_and_submit_all( profile: gr.OAuthProfile | None):
|
gaia_resolvers.py
DELETED
|
@@ -1,108 +0,0 @@
|
|
| 1 |
-
import os
|
| 2 |
-
import re
|
| 3 |
-
from typing import Optional
|
| 4 |
-
|
| 5 |
-
|
| 6 |
-
PUBLIC_UNIT4_ANSWERS = {
|
| 7 |
-
"8e867cd7-cff9-4e6c-867a-ff5ddc2550be": "3",
|
| 8 |
-
"a1e91b78-d3d8-4675-bb8d-62741b4b68a6": "3",
|
| 9 |
-
"2d83110e-a098-4ebb-9987-066c06fa42d0": "Right",
|
| 10 |
-
"cca530fc-4052-43b2-b130-b30968d8aa44": "Rd5",
|
| 11 |
-
"4fc2f1ae-8625-45b5-ab34-ad4433bc21f8": "FunkMonk",
|
| 12 |
-
"6f37996b-2ac7-44b0-8e68-6d28256631b4": "b, e",
|
| 13 |
-
"9d191bce-651d-4746-be2d-7ef8ecadb9c2": "Extremely",
|
| 14 |
-
"cabe07ed-9eca-40ea-8ead-410ef5e83f91": "Louvrier",
|
| 15 |
-
"3cef3a44-215e-4aed-8e3b-b1e3f08063b7": "broccoli, celery, fresh basil, lettuce, sweet potatoes",
|
| 16 |
-
"99c9cc74-fdc8-46c6-8f8d-3ce2d3bfeea3": (
|
| 17 |
-
"cornstarch, freshly squeezed lemon juice, granulated sugar, "
|
| 18 |
-
"pure vanilla extract, ripe strawberries"
|
| 19 |
-
),
|
| 20 |
-
"305ac316-eef6-4446-960a-92d80d542f82": "Wojciech",
|
| 21 |
-
"f918266a-b3e0-4914-865d-4faa564f1aef": "0",
|
| 22 |
-
"3f57289b-8c60-48be-bd80-01f8099ca449": "519",
|
| 23 |
-
"1f975693-876d-457b-a649-393859e79bf3": "132, 133, 134, 197, 245",
|
| 24 |
-
"840bfca7-4f7b-481a-8794-c560c340185d": "80GSFC21M0002",
|
| 25 |
-
"bda648d7-d618-4883-88f4-3466eabd860e": "Saint Petersburg",
|
| 26 |
-
"cf106601-ab4f-4af9-b045-5295fe67b37d": "CUB",
|
| 27 |
-
"a0c07678-e491-4bbc-8f0b-07405144218f": "Yoshida, Uehara",
|
| 28 |
-
"7bd855d8-463d-4ed5-93ca-5fe35145f733": "89706.00",
|
| 29 |
-
"5a0c1adf-205e-4841-a666-7c3ef95def9d": "Claus",
|
| 30 |
-
}
|
| 31 |
-
|
| 32 |
-
|
| 33 |
-
def _public_fallbacks_enabled() -> bool:
|
| 34 |
-
return os.getenv("ENABLE_PUBLIC_GAIA_FALLBACKS", "1").strip().lower() not in {
|
| 35 |
-
"0",
|
| 36 |
-
"false",
|
| 37 |
-
"no",
|
| 38 |
-
"off",
|
| 39 |
-
}
|
| 40 |
-
|
| 41 |
-
|
| 42 |
-
def _normalized(text: str) -> str:
|
| 43 |
-
return re.sub(r"\s+", " ", text.lower()).strip()
|
| 44 |
-
|
| 45 |
-
|
| 46 |
-
def _answer_reversed_instruction(question: str) -> Optional[str]:
|
| 47 |
-
reversed_question = question[::-1]
|
| 48 |
-
if "opposite of the word" in reversed_question and "left" in reversed_question:
|
| 49 |
-
return "Right"
|
| 50 |
-
return None
|
| 51 |
-
|
| 52 |
-
|
| 53 |
-
def _answer_commutativity_counterexample(question: str) -> Optional[str]:
|
| 54 |
-
q = _normalized(question)
|
| 55 |
-
if "not commutative" not in q or "set s = {a, b, c, d, e}" not in q:
|
| 56 |
-
return None
|
| 57 |
-
|
| 58 |
-
table = {
|
| 59 |
-
"a": {"a": "a", "b": "b", "c": "c", "d": "b", "e": "d"},
|
| 60 |
-
"b": {"a": "b", "b": "c", "c": "a", "d": "e", "e": "c"},
|
| 61 |
-
"c": {"a": "c", "b": "a", "c": "b", "d": "b", "e": "a"},
|
| 62 |
-
"d": {"a": "b", "b": "e", "c": "b", "d": "e", "e": "d"},
|
| 63 |
-
"e": {"a": "d", "b": "b", "c": "a", "d": "d", "e": "c"},
|
| 64 |
-
}
|
| 65 |
-
involved = set()
|
| 66 |
-
elements = sorted(table)
|
| 67 |
-
|
| 68 |
-
for left in elements:
|
| 69 |
-
for right in elements:
|
| 70 |
-
if table[left][right] != table[right][left]:
|
| 71 |
-
involved.add(left)
|
| 72 |
-
involved.add(right)
|
| 73 |
-
|
| 74 |
-
return ", ".join(sorted(involved))
|
| 75 |
-
|
| 76 |
-
|
| 77 |
-
def _answer_botanical_vegetables(question: str) -> Optional[str]:
|
| 78 |
-
q = _normalized(question)
|
| 79 |
-
if "botany" not in q or "vegetables" not in q or "sweet potatoes" not in q:
|
| 80 |
-
return None
|
| 81 |
-
|
| 82 |
-
vegetables = ["broccoli", "celery", "fresh basil", "lettuce", "sweet potatoes"]
|
| 83 |
-
return ", ".join(sorted(vegetables))
|
| 84 |
-
|
| 85 |
-
|
| 86 |
-
def try_deterministic_answer(question: str, task_id: Optional[str] = None) -> Optional[str]:
|
| 87 |
-
"""
|
| 88 |
-
Return an exact answer for deterministic public Unit 4 cases.
|
| 89 |
-
|
| 90 |
-
Unknown questions return None so the normal tool-using agent can solve them.
|
| 91 |
-
Set ENABLE_PUBLIC_GAIA_FALLBACKS=0 to disable task-id fallbacks while keeping
|
| 92 |
-
general deterministic solvers active.
|
| 93 |
-
"""
|
| 94 |
-
question = question or ""
|
| 95 |
-
|
| 96 |
-
for resolver in (
|
| 97 |
-
_answer_reversed_instruction,
|
| 98 |
-
_answer_commutativity_counterexample,
|
| 99 |
-
_answer_botanical_vegetables,
|
| 100 |
-
):
|
| 101 |
-
answer = resolver(question)
|
| 102 |
-
if answer is not None:
|
| 103 |
-
return answer
|
| 104 |
-
|
| 105 |
-
if task_id and _public_fallbacks_enabled():
|
| 106 |
-
return PUBLIC_UNIT4_ANSWERS.get(task_id)
|
| 107 |
-
|
| 108 |
-
return None
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
tests/test_agent_helpers.py
ADDED
|
@@ -0,0 +1,47 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import unittest
|
| 2 |
+
|
| 3 |
+
from agent_helpers import (
|
| 4 |
+
build_user_content,
|
| 5 |
+
classify_attachment,
|
| 6 |
+
cleanup_exact_answer,
|
| 7 |
+
is_youtube_question,
|
| 8 |
+
)
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
class AgentHelperTests(unittest.TestCase):
    """Unit tests for the pure helper functions in ``agent_helpers``."""

    def test_build_user_content_exposes_task_id_to_tool_agent(self):
        """The question, the task id and the download tool name all appear."""
        content = build_user_content("What is in the attached file?", "abc-123")

        for fragment in ("What is in the attached file?", "abc-123", "download_task_file"):
            with self.subTest(fragment=fragment):
                self.assertIn(fragment, content)

    def test_build_user_content_without_task_id_is_plain_question(self):
        """Without a task id the question is returned untouched."""
        question = "What is 2 + 2?"
        self.assertEqual(build_user_content(question, None), question)

    def test_classify_attachment_prefers_file_suffix(self):
        """A known suffix decides the kind regardless of the wording."""
        cases = (
            ("What does it say?", ".mp3", "audio"),
            ("Analyze the table", ".xlsx", "spreadsheet"),
            ("What is shown?", ".png", "image"),
            ("What is the output?", ".py", "python"),
        )
        for question, suffix, expected in cases:
            with self.subTest(suffix=suffix):
                self.assertEqual(classify_attachment(question, suffix), expected)

    def test_classify_attachment_uses_question_when_suffix_is_missing(self):
        """With an empty suffix the keywords in the question decide."""
        cases = (
            ("Use the attached spreadsheet", "spreadsheet"),
            ("What is said in the recording?", "audio"),
            ("Review the chess position image", "image"),
            ("Run the attached Python code", "python"),
            ("Read the attached text file", "text"),
        )
        for question, expected in cases:
            with self.subTest(question=question):
                self.assertEqual(classify_attachment(question, ""), expected)

    def test_youtube_detection(self):
        """Only questions that contain a YouTube URL are detected."""
        with_url = (
            "Watch https://www.youtube.com/watch?v=L1vXCYZAYYM",
            "See https://youtu.be/L1vXCYZAYYM",
        )
        for text in with_url:
            with self.subTest(text=text):
                self.assertTrue(is_youtube_question(text))

        self.assertFalse(is_youtube_question("This mentions video but has no URL"))

    def test_cleanup_exact_answer_removes_common_wrappers(self):
        """Prefixes, backticks and a trailing period are stripped."""
        cases = (
            ("FINAL ANSWER: 519", "519"),
            ("`b, e`", "b, e"),
            ("The answer is: Right.", "Right"),
        )
        for raw, expected in cases:
            with self.subTest(raw=raw):
                self.assertEqual(cleanup_exact_answer(raw), expected)
|
| 44 |
+
|
| 45 |
+
|
| 46 |
+
if __name__ == "__main__":
|
| 47 |
+
unittest.main()
|
tests/test_gaia_resolvers.py
DELETED
|
@@ -1,120 +0,0 @@
|
|
| 1 |
-
import unittest
|
| 2 |
-
|
| 3 |
-
from gaia_resolvers import try_deterministic_answer
|
| 4 |
-
|
| 5 |
-
|
| 6 |
-
class DeterministicResolverTests(unittest.TestCase):
|
| 7 |
-
def test_public_unit4_questions_return_exact_answers(self):
|
| 8 |
-
cases = [
|
| 9 |
-
(
|
| 10 |
-
"8e867cd7-cff9-4e6c-867a-ff5ddc2550be",
|
| 11 |
-
"How many studio albums were published by Mercedes Sosa between 2000 and 2009 (included)?",
|
| 12 |
-
"3",
|
| 13 |
-
),
|
| 14 |
-
(
|
| 15 |
-
"a1e91b78-d3d8-4675-bb8d-62741b4b68a6",
|
| 16 |
-
"In the video https://www.youtube.com/watch?v=L1vXCYZAYYM, what is the highest number of bird species to be on camera simultaneously?",
|
| 17 |
-
"3",
|
| 18 |
-
),
|
| 19 |
-
(
|
| 20 |
-
"2d83110e-a098-4ebb-9987-066c06fa42d0",
|
| 21 |
-
'.rewsna eht sa "tfel" drow eht fo etisoppo eht etirw ,ecnetnes siht dnatsrednu uoy fI',
|
| 22 |
-
"Right",
|
| 23 |
-
),
|
| 24 |
-
(
|
| 25 |
-
"cca530fc-4052-43b2-b130-b30968d8aa44",
|
| 26 |
-
"Review the chess position provided in the image. It is black's turn.",
|
| 27 |
-
"Rd5",
|
| 28 |
-
),
|
| 29 |
-
(
|
| 30 |
-
"4fc2f1ae-8625-45b5-ab34-ad4433bc21f8",
|
| 31 |
-
"Who nominated the only Featured Article on English Wikipedia about a dinosaur that was promoted in November 2016?",
|
| 32 |
-
"FunkMonk",
|
| 33 |
-
),
|
| 34 |
-
(
|
| 35 |
-
"6f37996b-2ac7-44b0-8e68-6d28256631b4",
|
| 36 |
-
"Given this table defining * on the set S = {a, b, c, d, e}",
|
| 37 |
-
"b, e",
|
| 38 |
-
),
|
| 39 |
-
(
|
| 40 |
-
"9d191bce-651d-4746-be2d-7ef8ecadb9c2",
|
| 41 |
-
"What does Teal'c say in response to the question \"Isn't that hot?\"",
|
| 42 |
-
"Extremely",
|
| 43 |
-
),
|
| 44 |
-
(
|
| 45 |
-
"cabe07ed-9eca-40ea-8ead-410ef5e83f91",
|
| 46 |
-
"What is the surname of the equine veterinarian mentioned in 1.E Exercises?",
|
| 47 |
-
"Louvrier",
|
| 48 |
-
),
|
| 49 |
-
(
|
| 50 |
-
"3cef3a44-215e-4aed-8e3b-b1e3f08063b7",
|
| 51 |
-
"Please alphabetize the list of vegetables",
|
| 52 |
-
"broccoli, celery, fresh basil, lettuce, sweet potatoes",
|
| 53 |
-
),
|
| 54 |
-
(
|
| 55 |
-
"99c9cc74-fdc8-46c6-8f8d-3ce2d3bfeea3",
|
| 56 |
-
"I've attached the recipe as Strawberry pie.mp3.",
|
| 57 |
-
"cornstarch, freshly squeezed lemon juice, granulated sugar, pure vanilla extract, ripe strawberries",
|
| 58 |
-
),
|
| 59 |
-
(
|
| 60 |
-
"305ac316-eef6-4446-960a-92d80d542f82",
|
| 61 |
-
"Who did the actor who played Ray in the Polish-language version of Everybody Loves Raymond play in Magda M.?",
|
| 62 |
-
"Wojciech",
|
| 63 |
-
),
|
| 64 |
-
(
|
| 65 |
-
"f918266a-b3e0-4914-865d-4faa564f1aef",
|
| 66 |
-
"What is the final numeric output from the attached Python code?",
|
| 67 |
-
"0",
|
| 68 |
-
),
|
| 69 |
-
(
|
| 70 |
-
"3f57289b-8c60-48be-bd80-01f8099ca449",
|
| 71 |
-
"How many at bats did the Yankee with the most walks in the 1977 regular season have that same season?",
|
| 72 |
-
"519",
|
| 73 |
-
),
|
| 74 |
-
(
|
| 75 |
-
"1f975693-876d-457b-a649-393859e79bf3",
|
| 76 |
-
"Please provide just the page numbers as a comma-delimited list.",
|
| 77 |
-
"132, 133, 134, 197, 245",
|
| 78 |
-
),
|
| 79 |
-
(
|
| 80 |
-
"840bfca7-4f7b-481a-8794-c560c340185d",
|
| 81 |
-
"Under what NASA award number was the work performed by R. G. Arendt supported by?",
|
| 82 |
-
"80GSFC21M0002",
|
| 83 |
-
),
|
| 84 |
-
(
|
| 85 |
-
"bda648d7-d618-4883-88f4-3466eabd860e",
|
| 86 |
-
"Where were the Vietnamese specimens described by Kuznetzov in Nedoshivina's 2010 paper eventually deposited?",
|
| 87 |
-
"Saint Petersburg",
|
| 88 |
-
),
|
| 89 |
-
(
|
| 90 |
-
"cf106601-ab4f-4af9-b045-5295fe67b37d",
|
| 91 |
-
"What country had the least number of athletes at the 1928 Summer Olympics?",
|
| 92 |
-
"CUB",
|
| 93 |
-
),
|
| 94 |
-
(
|
| 95 |
-
"a0c07678-e491-4bbc-8f0b-07405144218f",
|
| 96 |
-
"Who are the pitchers with the number before and after Taisho Tamai's number as of July 2023?",
|
| 97 |
-
"Yoshida, Uehara",
|
| 98 |
-
),
|
| 99 |
-
(
|
| 100 |
-
"7bd855d8-463d-4ed5-93ca-5fe35145f733",
|
| 101 |
-
"What were the total sales that the chain made from food?",
|
| 102 |
-
"89706.00",
|
| 103 |
-
),
|
| 104 |
-
(
|
| 105 |
-
"5a0c1adf-205e-4841-a666-7c3ef95def9d",
|
| 106 |
-
"What is the first name of the only Malko Competition recipient from the 20th Century?",
|
| 107 |
-
"Claus",
|
| 108 |
-
),
|
| 109 |
-
]
|
| 110 |
-
|
| 111 |
-
for task_id, question, expected in cases:
|
| 112 |
-
with self.subTest(task_id=task_id):
|
| 113 |
-
self.assertEqual(try_deterministic_answer(question, task_id), expected)
|
| 114 |
-
|
| 115 |
-
def test_unknown_question_falls_back_to_agent(self):
|
| 116 |
-
self.assertIsNone(try_deterministic_answer("What is 2 + 2?", None))
|
| 117 |
-
|
| 118 |
-
|
| 119 |
-
if __name__ == "__main__":
|
| 120 |
-
unittest.main()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|