Final_Assignment

Sleeping

App Files Files Community

kenqia committed 21 days ago

Commit

c3e9fb6

1 Parent(s): 79be81b

refactor: replace answer cache with agent helpers

Browse files

Files changed (5) hide show

agent_helpers.py +68 -0
app.py +60 -38
gaia_resolvers.py +0 -108
tests/test_agent_helpers.py +47 -0
tests/test_gaia_resolvers.py +0 -120

agent_helpers.py ADDED Viewed

	@@ -0,0 +1,68 @@

+import re
+from typing import Optional
+# File-extension groups consulted by classify_attachment. Every entry is
+# lowercase and includes the leading dot.
+IMAGE_SUFFIXES = {".png", ".jpg", ".jpeg", ".webp", ".bmp", ".gif"}
+SPREADSHEET_SUFFIXES = {".xlsx", ".xls", ".csv"}
+PYTHON_SUFFIXES = {".py"}
+AUDIO_SUFFIXES = {".mp3", ".wav", ".m4a", ".aac", ".flac", ".ogg", ".opus", ".webm"}
+# NOTE: ".csv" is listed here and in SPREADSHEET_SUFFIXES. classify_attachment
+# tests the spreadsheet set before this one, so ".csv" resolves to "spreadsheet".
+TEXT_SUFFIXES = {".txt", ".md", ".json", ".csv", ".tsv", ".html", ".htm"}
+def build_user_content(question: str, task_id: Optional[str]) -> str:
+    """Build the message text handed to the tool-using agent.
+    Without a task id the question is returned unchanged. With one, the task id
+    and a hint to call download_task_file first are appended to the question.
+    """
+    if task_id:
+        header = f"{question}\n\nTask ID: {task_id}\n"
+        attachment_hint = (
+            "If this question has an attachment, call download_task_file with this task_id first. "
+            "Then use the most specific follow-up tool for the downloaded file type."
+        )
+        return header + attachment_hint
+    return question
+def classify_attachment(question: str, suffix: str) -> Optional[str]:
+    """Pick the attachment kind for a task from its file suffix or question text.
+    The suffix is checked first against the module-level suffix sets; only when
+    no set contains it are the question keywords consulted. Both passes use the
+    same precedence: image, audio, python, spreadsheet, text.
+    Returns:
+        One of "image", "audio", "python", "spreadsheet", "text", or None when
+        neither the suffix nor the question matches.
+    """
+    ext = (suffix or "").lower()
+    text = (question or "").lower()
+    # Pass 1: the file extension.
+    suffix_table = (
+        ("image", IMAGE_SUFFIXES),
+        ("audio", AUDIO_SUFFIXES),
+        ("python", PYTHON_SUFFIXES),
+        ("spreadsheet", SPREADSHEET_SUFFIXES),
+        ("text", TEXT_SUFFIXES),
+    )
+    for kind, known_suffixes in suffix_table:
+        if ext in known_suffixes:
+            return kind
+    # Pass 2: substring hints in the lowercased question.
+    keyword_table = (
+        ("image", ("image", "picture", "screenshot", "chess position", "visual", "diagram", "shown in")),
+        ("audio", ("audio", "recording", "mp3", "wav", "says", "say in response", "lecture")),
+        ("python", ("python code", "attached python", "numeric output", "run the attached python")),
+        ("spreadsheet", ("excel", "spreadsheet", "csv", "sales", "table contains")),
+        ("text", ("attached text", "text file", "read the attached", "document")),
+    )
+    for kind, keywords in keyword_table:
+        for keyword in keywords:
+            if keyword in text:
+                return kind
+    return None
+def is_youtube_question(question: str) -> bool:
+    """Return True when the question contains a youtube.com/watch?v= or youtu.be/ URL."""
+    youtube_url = r"https?://(?:www\.)?(?:youtube\.com/watch\?v=|youtu\.be/)"
+    return re.search(youtube_url, question or "") is not None
+def cleanup_exact_answer(raw_answer: str) -> str:
+    """Strip common LLM wrappers so only the bare answer text remains.
+    Removes, in order: surrounding whitespace, a Markdown code fence (with an
+    optional language tag on the opening line), stray backticks, a leading
+    "Final answer:" / "Answer:" / "The answer is" prefix (case-insensitive),
+    and one sentence-ending period.
+    Args:
+        raw_answer: Model output; None or any other falsy value is treated as "".
+    Returns:
+        The cleaned answer, possibly an empty string.
+    """
+    answer = str(raw_answer or "").strip()
+    # A word after the opening fence counts as a language tag only when it
+    # starts with a letter and is followed by a newline; otherwise a one-line
+    # payload such as "```519```" would be swallowed as the tag.
+    answer = re.sub(r"^```(?:[A-Za-z][\w+-]*[ \t]*(?=\n))?\s*", "", answer)
+    answer = re.sub(r"\s*```$", "", answer)
+    answer = answer.strip().strip("`").strip()
+    answer = re.sub(r"^(?:final answer|answer)\s*:\s*", "", answer, flags=re.IGNORECASE)
+    answer = re.sub(r"^the answer is\s*:?\s*", "", answer, flags=re.IGNORECASE)
+    # Drop one trailing sentence period, but keep it when it closes a dotted
+    # initialism such as "D.C." or "U.S.". Decimals need no guard here: a string
+    # that ends in "." cannot also end in digit-dot-digit.
+    ends_with_initialism = re.search(r"(?:\b[A-Za-z]\.){2,}$", answer) is not None
+    if len(answer) > 1 and answer.endswith(".") and not ends_with_initialism:
+        answer = answer[:-1]
+    return answer.strip()

app.py CHANGED Viewed

@@ -21,7 +21,12 @@ from tools import (
     answer_audio_question,
     get_youtube_transcript,
 )
-from gaia_resolvers import try_deterministic_answer
 # (Keep Constants as is)
 # --- Constants ---
 DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
@@ -135,6 +140,7 @@ class BasicAgent:
         - web lookup
         Tool policy:
         - For image, screenshot, chess position, chart image, diagram, or visual counting questions, use answer_image_question.
         - For Excel or CSV questions, use answer_excel_question.
         - For Python-code-output questions, use answer_python_question.
@@ -341,18 +347,41 @@ class BasicAgent:
                 HumanMessage(content=f"Question:\n{question}\n\nRaw answer:\n{raw_answer}")
             ])
         return response.content.strip()
     def answer_question(self, question: str, task_id: str | None = None) -> str:
-        deterministic_answer = try_deterministic_answer(question, task_id)
-        if deterministic_answer is not None:
-            print(
-                f"[deterministic_answer] task_id={task_id} answer={deterministic_answer}",
-                flush=True,
-            )
-            return deterministic_answer
         file_info = None
-        q = question.lower()
         if task_id:
             info_str = download_task_file.invoke({"task_id": task_id})
@@ -365,53 +394,46 @@ class BasicAgent:
         if file_info and "file_path" in file_info:
             suffix = file_info.get("suffix", "").lower()
             file_path = file_info["file_path"]
-            is_image_q = any(x in q for x in [
-                "image", "picture", "screenshot", "chess position", "visual", "diagram", "shown in"
-            ])
-            is_audio_q = any(x in q for x in [
-                "audio", "recording", "mp3", "wav", "says", "say in response", "lecture"
-            ])
-            is_python_q = any(x in q for x in [
-                "python code", "attached python", "numeric output from the attached python"
-            ])
-            is_excel_q = any(x in q for x in [
-                "excel", "spreadsheet", "csv", "sales", "table contains"
-            ])
-            if suffix in [".png", ".jpg", ".jpeg", ".webp", ".bmp", ".gif"] or is_image_q:
                 raw = answer_image_question.invoke({
                     "file_path": file_path,
                     "question": question
                 })
-                return self.format_final_answer(question, raw)
-            if suffix in [".mp3", ".wav", ".m4a", ".aac", ".flac", ".ogg", ".opus", ".webm"] or is_audio_q:
                 raw = answer_audio_question.invoke({
                     "file_path": file_path,
                     "question": question
                 })
-                return self.format_final_answer(question, raw)
-            if suffix == ".py" or is_python_q:
                 raw = answer_python_question.invoke({
                     "file_path": file_path
                 })
-                return self.format_final_answer(question, raw)
-            if suffix in [".xlsx", ".xls", ".csv"] or is_excel_q:
-                raw = answer_excel_question.invoke({
                     "file_path": file_path,
                     "question": question
                 })
-                return self.format_final_answer(question, raw)
-        user_content = question
-        if task_id:
-            user_content += f"\n\nTask ID: {task_id}."
         result = self.react_graph.invoke({"messages": [HumanMessage(content=user_content)]})
-        return result["messages"][-1].content
 def run_and_submit_all( profile: gr.OAuthProfile | None):

     answer_audio_question,
     get_youtube_transcript,
 )
+from agent_helpers import (
+    build_user_content,
+    classify_attachment,
+    cleanup_exact_answer,
+    is_youtube_question,
+)
 # (Keep Constants as is)
 # --- Constants ---
 DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
         - web lookup
         Tool policy:
+        - If a task_id is present and the question hints at an attachment, call download_task_file first.
         - For image, screenshot, chess position, chart image, diagram, or visual counting questions, use answer_image_question.
         - For Excel or CSV questions, use answer_excel_question.
         - For Python-code-output questions, use answer_python_question.
                 HumanMessage(content=f"Question:\n{question}\n\nRaw answer:\n{raw_answer}")
             ])
         return response.content.strip()
+    def answer_from_context(self, question: str, context: str, source_label: str = "context") -> str:
+        # Ask self.model to answer `question` from `context` alone.
+        # A system message carries the exact-match extraction rules; the human
+        # message carries the question followed by the context, headed by
+        # `source_label`. The model's reply content is passed through
+        # cleanup_exact_answer before being returned.
+        response = self.model.invoke([
+                SystemMessage(content="""
+        You are an exact-match QA extractor.
+        Answer the question using only the provided source context.
+        Rules:
+        - Return only the final answer.
+        - No explanation.
+        - No markdown.
+        - No citations.
+        - Do not mention the source context.
+        - If the question asks for a number, return only the number.
+        - If the question asks for a list, return only the requested items.
+        - If comma-separated output is appropriate, use comma + space.
+        - If the answer is not present, return the best concise answer implied by the context.
+        """),
+                HumanMessage(content=f"""
+        Question:
+        {question}
+        Source ({source_label}):
+        {context}
+        """)
+            ])
+        # Strip fences, "Final answer:" style prefixes and a trailing period.
+        return cleanup_exact_answer(response.content)
     def answer_question(self, question: str, task_id: str | None = None) -> str:
         file_info = None
+        if is_youtube_question(question):
+            transcript = get_youtube_transcript.invoke({"url_or_question": question})
+            return self.answer_from_context(question, transcript, "YouTube transcript")
         if task_id:
             info_str = download_task_file.invoke({"task_id": task_id})
         if file_info and "file_path" in file_info:
             suffix = file_info.get("suffix", "").lower()
             file_path = file_info["file_path"]
+            attachment_kind = classify_attachment(question, suffix)
+            if attachment_kind == "image":
                 raw = answer_image_question.invoke({
                     "file_path": file_path,
                     "question": question
                 })
+                return cleanup_exact_answer(self.format_final_answer(question, raw))
+            if attachment_kind == "audio":
                 raw = answer_audio_question.invoke({
                     "file_path": file_path,
                     "question": question
                 })
+                return cleanup_exact_answer(self.format_final_answer(question, raw))
+            if attachment_kind == "python":
                 raw = answer_python_question.invoke({
                     "file_path": file_path
                 })
+                return cleanup_exact_answer(self.format_final_answer(question, raw))
+            if attachment_kind == "spreadsheet":
+                context = answer_excel_question.invoke({
                     "file_path": file_path,
                     "question": question
                 })
+                return self.answer_from_context(question, context, "spreadsheet summary")
+            if attachment_kind == "text":
+                context = read_attached_text_file.invoke({
+                    "file_path": file_path,
+                    "max_chars": 20000,
+                })
+                return self.answer_from_context(question, context, "attached text file")
+        user_content = build_user_content(question, task_id)
         result = self.react_graph.invoke({"messages": [HumanMessage(content=user_content)]})
+        return cleanup_exact_answer(result["messages"][-1].content)
 def run_and_submit_all( profile: gr.OAuthProfile | None):

gaia_resolvers.py DELETED Viewed

@@ -1,108 +0,0 @@
-import os
-import re
-from typing import Optional
-PUBLIC_UNIT4_ANSWERS = {
-    "8e867cd7-cff9-4e6c-867a-ff5ddc2550be": "3",
-    "a1e91b78-d3d8-4675-bb8d-62741b4b68a6": "3",
-    "2d83110e-a098-4ebb-9987-066c06fa42d0": "Right",
-    "cca530fc-4052-43b2-b130-b30968d8aa44": "Rd5",
-    "4fc2f1ae-8625-45b5-ab34-ad4433bc21f8": "FunkMonk",
-    "6f37996b-2ac7-44b0-8e68-6d28256631b4": "b, e",
-    "9d191bce-651d-4746-be2d-7ef8ecadb9c2": "Extremely",
-    "cabe07ed-9eca-40ea-8ead-410ef5e83f91": "Louvrier",
-    "3cef3a44-215e-4aed-8e3b-b1e3f08063b7": "broccoli, celery, fresh basil, lettuce, sweet potatoes",
-    "99c9cc74-fdc8-46c6-8f8d-3ce2d3bfeea3": (
-        "cornstarch, freshly squeezed lemon juice, granulated sugar, "
-        "pure vanilla extract, ripe strawberries"
-    ),
-    "305ac316-eef6-4446-960a-92d80d542f82": "Wojciech",
-    "f918266a-b3e0-4914-865d-4faa564f1aef": "0",
-    "3f57289b-8c60-48be-bd80-01f8099ca449": "519",
-    "1f975693-876d-457b-a649-393859e79bf3": "132, 133, 134, 197, 245",
-    "840bfca7-4f7b-481a-8794-c560c340185d": "80GSFC21M0002",
-    "bda648d7-d618-4883-88f4-3466eabd860e": "Saint Petersburg",
-    "cf106601-ab4f-4af9-b045-5295fe67b37d": "CUB",
-    "a0c07678-e491-4bbc-8f0b-07405144218f": "Yoshida, Uehara",
-    "7bd855d8-463d-4ed5-93ca-5fe35145f733": "89706.00",
-    "5a0c1adf-205e-4841-a666-7c3ef95def9d": "Claus",
-}
-def _public_fallbacks_enabled() -> bool:
-    return os.getenv("ENABLE_PUBLIC_GAIA_FALLBACKS", "1").strip().lower() not in {
-        "0",
-        "false",
-        "no",
-        "off",
-    }
-def _normalized(text: str) -> str:
-    return re.sub(r"\s+", " ", text.lower()).strip()
-def _answer_reversed_instruction(question: str) -> Optional[str]:
-    reversed_question = question[::-1]
-    if "opposite of the word" in reversed_question and "left" in reversed_question:
-        return "Right"
-    return None
-def _answer_commutativity_counterexample(question: str) -> Optional[str]:
-    q = _normalized(question)
-    if "not commutative" not in q or "set s = {a, b, c, d, e}" not in q:
-        return None
-    table = {
-        "a": {"a": "a", "b": "b", "c": "c", "d": "b", "e": "d"},
-        "b": {"a": "b", "b": "c", "c": "a", "d": "e", "e": "c"},
-        "c": {"a": "c", "b": "a", "c": "b", "d": "b", "e": "a"},
-        "d": {"a": "b", "b": "e", "c": "b", "d": "e", "e": "d"},
-        "e": {"a": "d", "b": "b", "c": "a", "d": "d", "e": "c"},
-    }
-    involved = set()
-    elements = sorted(table)
-    for left in elements:
-        for right in elements:
-            if table[left][right] != table[right][left]:
-                involved.add(left)
-                involved.add(right)
-    return ", ".join(sorted(involved))
-def _answer_botanical_vegetables(question: str) -> Optional[str]:
-    q = _normalized(question)
-    if "botany" not in q or "vegetables" not in q or "sweet potatoes" not in q:
-        return None
-    vegetables = ["broccoli", "celery", "fresh basil", "lettuce", "sweet potatoes"]
-    return ", ".join(sorted(vegetables))
-def try_deterministic_answer(question: str, task_id: Optional[str] = None) -> Optional[str]:
-    """
-    Return an exact answer for deterministic public Unit 4 cases.
-    Unknown questions return None so the normal tool-using agent can solve them.
-    Set ENABLE_PUBLIC_GAIA_FALLBACKS=0 to disable task-id fallbacks while keeping
-    general deterministic solvers active.
-    """
-    question = question or ""
-    for resolver in (
-        _answer_reversed_instruction,
-        _answer_commutativity_counterexample,
-        _answer_botanical_vegetables,
-    ):
-        answer = resolver(question)
-        if answer is not None:
-            return answer
-    if task_id and _public_fallbacks_enabled():
-        return PUBLIC_UNIT4_ANSWERS.get(task_id)
-    return None

tests/test_agent_helpers.py ADDED Viewed

	@@ -0,0 +1,47 @@

+import unittest
+from agent_helpers import (
+    build_user_content,
+    classify_attachment,
+    cleanup_exact_answer,
+    is_youtube_question,
+)
+class AgentHelperTests(unittest.TestCase):
+    # Unit tests for the pure helper functions imported from agent_helpers.
+    def test_build_user_content_exposes_task_id_to_tool_agent(self):
+        question = "What is in the attached file?"
+        content = build_user_content(question, "abc-123")
+        for fragment in (question, "abc-123", "download_task_file"):
+            self.assertIn(fragment, content)
+    def test_build_user_content_without_task_id_is_plain_question(self):
+        question = "What is 2 + 2?"
+        self.assertEqual(build_user_content(question, None), question)
+    def test_classify_attachment_prefers_file_suffix(self):
+        # (question, suffix, expected kind) - the suffix alone decides.
+        cases = (
+            ("What does it say?", ".mp3", "audio"),
+            ("Analyze the table", ".xlsx", "spreadsheet"),
+            ("What is shown?", ".png", "image"),
+            ("What is the output?", ".py", "python"),
+        )
+        for question, suffix, kind in cases:
+            self.assertEqual(classify_attachment(question, suffix), kind)
+    def test_classify_attachment_uses_question_when_suffix_is_missing(self):
+        # (question, expected kind) - empty suffix forces the keyword pass.
+        cases = (
+            ("Use the attached spreadsheet", "spreadsheet"),
+            ("What is said in the recording?", "audio"),
+            ("Review the chess position image", "image"),
+            ("Run the attached Python code", "python"),
+            ("Read the attached text file", "text"),
+        )
+        for question, kind in cases:
+            self.assertEqual(classify_attachment(question, ""), kind)
+    def test_youtube_detection(self):
+        with_url = (
+            "Watch https://www.youtube.com/watch?v=L1vXCYZAYYM",
+            "See https://youtu.be/L1vXCYZAYYM",
+        )
+        for text in with_url:
+            self.assertTrue(is_youtube_question(text))
+        self.assertFalse(is_youtube_question("This mentions video but has no URL"))
+    def test_cleanup_exact_answer_removes_common_wrappers(self):
+        # (raw model output, expected cleaned answer)
+        cases = (
+            ("FINAL ANSWER: 519", "519"),
+            ("`b, e`", "b, e"),
+            ("The answer is: Right.", "Right"),
+        )
+        for raw, cleaned in cases:
+            self.assertEqual(cleanup_exact_answer(raw), cleaned)
+if __name__ == "__main__":
+    unittest.main()

tests/test_gaia_resolvers.py DELETED Viewed

@@ -1,120 +0,0 @@
-import unittest
-from gaia_resolvers import try_deterministic_answer
-class DeterministicResolverTests(unittest.TestCase):
-    def test_public_unit4_questions_return_exact_answers(self):
-        cases = [
-            (
-                "8e867cd7-cff9-4e6c-867a-ff5ddc2550be",
-                "How many studio albums were published by Mercedes Sosa between 2000 and 2009 (included)?",
-                "3",
-            ),
-            (
-                "a1e91b78-d3d8-4675-bb8d-62741b4b68a6",
-                "In the video https://www.youtube.com/watch?v=L1vXCYZAYYM, what is the highest number of bird species to be on camera simultaneously?",
-                "3",
-            ),
-            (
-                "2d83110e-a098-4ebb-9987-066c06fa42d0",
-                '.rewsna eht sa "tfel" drow eht fo etisoppo eht etirw ,ecnetnes siht dnatsrednu uoy fI',
-                "Right",
-            ),
-            (
-                "cca530fc-4052-43b2-b130-b30968d8aa44",
-                "Review the chess position provided in the image. It is black's turn.",
-                "Rd5",
-            ),
-            (
-                "4fc2f1ae-8625-45b5-ab34-ad4433bc21f8",
-                "Who nominated the only Featured Article on English Wikipedia about a dinosaur that was promoted in November 2016?",
-                "FunkMonk",
-            ),
-            (
-                "6f37996b-2ac7-44b0-8e68-6d28256631b4",
-                "Given this table defining * on the set S = {a, b, c, d, e}",
-                "b, e",
-            ),
-            (
-                "9d191bce-651d-4746-be2d-7ef8ecadb9c2",
-                "What does Teal'c say in response to the question \"Isn't that hot?\"",
-                "Extremely",
-            ),
-            (
-                "cabe07ed-9eca-40ea-8ead-410ef5e83f91",
-                "What is the surname of the equine veterinarian mentioned in 1.E Exercises?",
-                "Louvrier",
-            ),
-            (
-                "3cef3a44-215e-4aed-8e3b-b1e3f08063b7",
-                "Please alphabetize the list of vegetables",
-                "broccoli, celery, fresh basil, lettuce, sweet potatoes",
-            ),
-            (
-                "99c9cc74-fdc8-46c6-8f8d-3ce2d3bfeea3",
-                "I've attached the recipe as Strawberry pie.mp3.",
-                "cornstarch, freshly squeezed lemon juice, granulated sugar, pure vanilla extract, ripe strawberries",
-            ),
-            (
-                "305ac316-eef6-4446-960a-92d80d542f82",
-                "Who did the actor who played Ray in the Polish-language version of Everybody Loves Raymond play in Magda M.?",
-                "Wojciech",
-            ),
-            (
-                "f918266a-b3e0-4914-865d-4faa564f1aef",
-                "What is the final numeric output from the attached Python code?",
-                "0",
-            ),
-            (
-                "3f57289b-8c60-48be-bd80-01f8099ca449",
-                "How many at bats did the Yankee with the most walks in the 1977 regular season have that same season?",
-                "519",
-            ),
-            (
-                "1f975693-876d-457b-a649-393859e79bf3",
-                "Please provide just the page numbers as a comma-delimited list.",
-                "132, 133, 134, 197, 245",
-            ),
-            (
-                "840bfca7-4f7b-481a-8794-c560c340185d",
-                "Under what NASA award number was the work performed by R. G. Arendt supported by?",
-                "80GSFC21M0002",
-            ),
-            (
-                "bda648d7-d618-4883-88f4-3466eabd860e",
-                "Where were the Vietnamese specimens described by Kuznetzov in Nedoshivina's 2010 paper eventually deposited?",
-                "Saint Petersburg",
-            ),
-            (
-                "cf106601-ab4f-4af9-b045-5295fe67b37d",
-                "What country had the least number of athletes at the 1928 Summer Olympics?",
-                "CUB",
-            ),
-            (
-                "a0c07678-e491-4bbc-8f0b-07405144218f",
-                "Who are the pitchers with the number before and after Taisho Tamai's number as of July 2023?",
-                "Yoshida, Uehara",
-            ),
-            (
-                "7bd855d8-463d-4ed5-93ca-5fe35145f733",
-                "What were the total sales that the chain made from food?",
-                "89706.00",
-            ),
-            (
-                "5a0c1adf-205e-4841-a666-7c3ef95def9d",
-                "What is the first name of the only Malko Competition recipient from the 20th Century?",
-                "Claus",
-            ),
-        ]
-        for task_id, question, expected in cases:
-            with self.subTest(task_id=task_id):
-                self.assertEqual(try_deterministic_answer(question, task_id), expected)
-    def test_unknown_question_falls_back_to_agent(self):
-        self.assertIsNone(try_deterministic_answer("What is 2 + 2?", None))
-if __name__ == "__main__":
-    unittest.main()