Final_Assignment / agent_helpers.py
kenqia's picture
feat: strengthen evidence-based GAIA agent
f55fed4
import re
from typing import Optional
# File-extension groups used to decide which kind of attachment a file is.
# Every entry is lowercase and includes the leading dot.
# NOTE: ".csv" is listed under both SPREADSHEET_SUFFIXES and TEXT_SUFFIXES;
# classify_attachment tests the spreadsheet group first, so it wins there.
IMAGE_SUFFIXES = {
    ".bmp",
    ".gif",
    ".jpeg",
    ".jpg",
    ".png",
    ".webp",
}
SPREADSHEET_SUFFIXES = {
    ".csv",
    ".xls",
    ".xlsx",
}
PYTHON_SUFFIXES = {
    ".py",
}
AUDIO_SUFFIXES = {
    ".aac",
    ".flac",
    ".m4a",
    ".mp3",
    ".ogg",
    ".opus",
    ".wav",
    ".webm",
}
TEXT_SUFFIXES = {
    ".csv",
    ".htm",
    ".html",
    ".json",
    ".md",
    ".tsv",
    ".txt",
}
def build_user_content(question: str, task_id: Optional[str]) -> str:
    """Build the user message text for a question.

    Args:
        question: The question text.
        task_id: Identifier of the task, or a falsy value (``None`` / ``""``)
            when there is none.

    Returns:
        ``question`` unchanged when ``task_id`` is falsy. Otherwise the
        question followed by a blank line, a ``Task ID: ...`` line, and a
        hint telling the reader to call ``download_task_file`` with that id
        before using a file-type-specific follow-up tool.
    """
    if task_id:
        attachment_hint = (
            "If this question has an attachment, call download_task_file with this task_id first. "
            "Then use the most specific follow-up tool for the downloaded file type."
        )
        return f"{question}\n\nTask ID: {task_id}\n{attachment_hint}"

    # No task id: there is nothing to append.
    return question
def classify_attachment(question: str, suffix: str) -> Optional[str]:
    """Guess the kind of attachment from its file suffix or the question text.

    The suffix is checked first against the module's suffix groups; only when
    it matches none of them is the lowercased question scanned for keyword
    phrases. Both lookups are ordered, and the first match wins.

    Args:
        question: The question text; a falsy value is treated as ``""``.
        suffix: The file suffix (e.g. ``".png"``); a falsy value is treated
            as ``""``. Compared case-insensitively.

    Returns:
        One of ``"image"``, ``"audio"``, ``"python"``, ``"spreadsheet"``,
        ``"text"``, or ``None`` when nothing matches.
    """
    extension = (suffix or "").lower()
    text = (question or "").lower()

    # Pass 1: a recognised extension is decisive. Order matters because
    # ".csv" belongs to both the spreadsheet and the text group.
    by_extension = (
        (IMAGE_SUFFIXES, "image"),
        (AUDIO_SUFFIXES, "audio"),
        (PYTHON_SUFFIXES, "python"),
        (SPREADSHEET_SUFFIXES, "spreadsheet"),
        (TEXT_SUFFIXES, "text"),
    )
    for known_suffixes, kind in by_extension:
        if extension in known_suffixes:
            return kind

    # Pass 2: fall back to plain substring matches on the question wording.
    by_keyword = (
        ("image", ("image", "picture", "screenshot", "chess position", "visual", "diagram", "shown in")),
        ("audio", ("audio", "recording", "mp3", "wav", "says", "say in response", "lecture")),
        ("python", ("python code", "attached python", "numeric output", "run the attached python")),
        ("spreadsheet", ("excel", "spreadsheet", "csv", "sales", "table contains")),
        ("text", ("attached text", "text file", "read the attached", "document")),
    )
    for kind, phrases in by_keyword:
        for phrase in phrases:
            if phrase in text:
                return kind

    return None
# Matches the start of a YouTube video URL:
#   - youtube.com (optionally under www., m. or music.) followed by
#     /watch?...v=  (the "v" parameter may come after other query parameters),
#     /shorts/, /embed/ or /live/
#   - youtu.be/ short links (optionally under www.)
# Scheme and host are matched case-insensitively.
_YOUTUBE_URL_RE = re.compile(
    r"https?://(?:"
    r"(?:(?:www|m|music)\.)?youtube\.com/"
    r"(?:watch\?(?:[^\s#&]+&)*v=|shorts/|embed/|live/)"
    r"|(?:www\.)?youtu\.be/"
    r")",
    re.IGNORECASE,
)


def is_youtube_question(question: str) -> bool:
    """Return True when the question text contains a YouTube video URL.

    Recognised forms are ``youtube.com/watch?v=...`` (including URLs where
    ``v`` is not the first query parameter, e.g.
    ``watch?feature=share&v=...``), ``youtube.com/shorts/...``,
    ``/embed/...``, ``/live/...`` and ``youtu.be/...`` links, over http or
    https, with an optional ``www.``, ``m.`` or ``music.`` host prefix.

    Args:
        question: The question text; a falsy value is treated as ``""``.

    Returns:
        ``True`` if any such URL occurs anywhere in the text, else ``False``.
    """
    return _YOUTUBE_URL_RE.search(question or "") is not None
def is_youtube_visual_question(question: str) -> bool:
    """Decide whether a YouTube question is about what is seen, not what is said.

    The check is a plain substring heuristic on the lowercased question.

    Args:
        question: The question text; a falsy value is treated as ``""``.

    Returns:
        ``True`` only when ``is_youtube_question(question)`` is true, at least
        one visual cue phrase occurs in the text, and no speech cue phrase
        occurs in it. ``False`` otherwise.
    """
    lowered = (question or "").lower()

    # Questions without a YouTube link are never classified as visual.
    if not is_youtube_question(question):
        return False

    visual_cues = (
        "on camera",
        "visible",
        "shown",
        "see in the video",
        "highest number",
        "how many",
        "appears",
        "frame",
    )
    speech_cues = (
        "what does",
        "say",
        "says",
        "spoken",
        "response",
        "transcript",
    )

    mentions_visual = any(cue in lowered for cue in visual_cues)
    if not mentions_visual:
        return False

    # Any speech cue vetoes the visual classification.
    mentions_speech = any(cue in lowered for cue in speech_cues)
    return not mentions_speech
def cleanup_exact_answer(raw_answer: str) -> str:
    """Strip common wrapping from a model reply so only the bare answer remains.

    Steps, in order:

    1. Convert to ``str`` (``None`` becomes ``""``) and trim whitespace.
    2. Remove a leading code fence, including a language tag when the tag
       sits alone on the fence line, and a trailing code fence.
    3. Remove any remaining surrounding backticks and whitespace.
    4. Remove a leading ``final answer:`` / ``answer:`` / ``the answer is``
       prefix (case-insensitive).
    5. Drop one trailing period when the answer is longer than one character.

    Args:
        raw_answer: The raw reply. Non-string values are converted with
            ``str()``, so falsy values such as ``0`` are kept as ``"0"``.

    Returns:
        The cleaned answer string.
    """
    # Only None means "no answer"; `raw_answer or ""` would also erase 0.
    answer = ("" if raw_answer is None else str(raw_answer)).strip()

    # A language tag only counts when it is followed by a newline; otherwise
    # the content of a single-line fence such as ```42``` would be eaten.
    answer = re.sub(r"^```(?:\w+[ \t]*\r?\n)?\s*", "", answer)
    answer = re.sub(r"\s*```$", "", answer)
    answer = answer.strip().strip("`").strip()

    answer = re.sub(r"^(?:final answer|answer)\s*:\s*", "", answer, flags=re.IGNORECASE)
    answer = re.sub(r"^the answer is\s*:?\s*", "", answer, flags=re.IGNORECASE)

    # Decimals such as "3.5" do not end with "." and are therefore untouched;
    # a lone "." is kept as-is.
    if len(answer) > 1 and answer.endswith("."):
        answer = answer[:-1]
    return answer.strip()