Spaces:
Sleeping
Sleeping
shan gao
committed on
Commit
·
15f210e
1
Parent(s):
b9de22e
change
Browse files- agent.py +395 -19
- app.py +5 -0
- requirements.txt +8 -1
agent.py
CHANGED
|
@@ -1,4 +1,3 @@
|
|
| 1 |
-
# agent_v6.py
|
| 2 |
# Develop an AI agent with LangGraph and LangChain
|
| 3 |
# to answer the questions in the "gaia-benchmark/GAIA" dataset.
|
| 4 |
|
|
@@ -14,7 +13,24 @@ from langchain_core.tools import tool
|
|
| 14 |
from langchain_core.messages import HumanMessage, SystemMessage
|
| 15 |
from langchain_openai import ChatOpenAI
|
| 16 |
from langgraph.graph import StateGraph, START, END
|
| 17 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 18 |
|
| 19 |
# Optional: pdf parsing if GAIA sometimes includes PDFs
|
| 20 |
try:
|
|
@@ -26,7 +42,8 @@ except Exception:
|
|
| 26 |
|
| 27 |
# -------------- State -------------
|
| 28 |
class EvidenceItem(TypedDict):
|
| 29 |
-
|
|
|
|
| 30 |
text: str
|
| 31 |
path: Optional[str]
|
| 32 |
meta: Dict[str, Any]
|
|
@@ -40,6 +57,12 @@ class AgentState(TypedDict):
|
|
| 40 |
answer: Optional[str]
|
| 41 |
parsed_final_answer: Optional[str]
|
| 42 |
emit_final_answer: bool # <<< add this (default True if you want old behavior)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 43 |
|
| 44 |
# -------------- helpers ---------------
|
| 45 |
def _filename_from_cd(cd: str) -> str | None:
|
|
@@ -75,6 +98,10 @@ def _summarize_evidence(evidence: List[Dict[str, Any]], limit_chars: int = 6000)
|
|
| 75 |
tag = f"{e.get('kind','?')}"
|
| 76 |
if meta.get("mime"):
|
| 77 |
tag += f"({meta['mime']})"
|
|
|
|
|
|
|
|
|
|
|
|
|
| 78 |
chunks.append(f"[{i}:{tag}] {t}")
|
| 79 |
out = "\n".join(chunks)
|
| 80 |
return out if len(out) <= limit_chars else out[:limit_chars] + " …"
|
|
@@ -129,6 +156,13 @@ def _convert_to_wav_mono16k(src_path: str) -> str:
|
|
| 129 |
raise RuntimeError(f"ffmpeg failed: {p.stderr[-500:]}")
|
| 130 |
return out
|
| 131 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 132 |
# ----------------------Tools ----------------------
|
| 133 |
@tool
|
| 134 |
def download_file(url: str, headers: dict | None = None, auth_token: str | None = None) -> str:
|
|
@@ -163,10 +197,6 @@ def download_file(url: str, headers: dict | None = None, auth_token: str | None
|
|
| 163 |
out_dir = tempfile.mkdtemp(prefix="gaia_tmpdl_")
|
| 164 |
out_path = os.path.join(out_dir, fname)
|
| 165 |
|
| 166 |
-
# # Write to colab folder
|
| 167 |
-
# out_dir: str | Path = "."
|
| 168 |
-
# out_path = Path(out_dir) / fname
|
| 169 |
-
|
| 170 |
print("out_path:", out_path)
|
| 171 |
|
| 172 |
with open(out_path, "wb") as f:
|
|
@@ -177,6 +207,9 @@ def download_file(url: str, headers: dict | None = None, auth_token: str | None
|
|
| 177 |
return out_path
|
| 178 |
|
| 179 |
|
|
|
|
|
|
|
|
|
|
| 180 |
@tool
|
| 181 |
def transcribe_audio(path: str, model_size: str = "base") -> str:
|
| 182 |
"""
|
|
@@ -184,13 +217,15 @@ def transcribe_audio(path: str, model_size: str = "base") -> str:
|
|
| 184 |
Returns the transcript text; raises on failure (caller handles).
|
| 185 |
"""
|
| 186 |
print("running transcribe_audio")
|
|
|
|
| 187 |
try:
|
| 188 |
-
|
| 189 |
-
|
|
|
|
| 190 |
return (result.get("text") or "").strip()
|
| 191 |
except Exception as e:
|
| 192 |
raise RuntimeError(f"Whisper error: {e}")
|
| 193 |
-
|
| 194 |
|
| 195 |
@tool
|
| 196 |
def ocr_image(path: str) -> str:
|
|
@@ -202,6 +237,194 @@ def ocr_image(path: str) -> str:
|
|
| 202 |
return text.strip()
|
| 203 |
|
| 204 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 205 |
# ------------------------------- Nodes ------------------------------
|
| 206 |
def check_attachment_node(state: AgentState) -> AgentState:
|
| 207 |
"""Check if there is attachment."""
|
|
@@ -283,7 +506,6 @@ def preprocess_node(state: AgentState) -> AgentState:
|
|
| 283 |
try:
|
| 284 |
if mime and mime.startswith("audio"):
|
| 285 |
print("mime start with audio")
|
| 286 |
-
# print("path: ", path)
|
| 287 |
# --- ASR ---
|
| 288 |
try:
|
| 289 |
wav = _convert_to_wav_mono16k(path)
|
|
@@ -352,7 +574,7 @@ def solve_multimodal_node(state: AgentState) -> AgentState:
|
|
| 352 |
vision_llm = ChatOpenAI(model="gpt-4o", temperature=0) # vision-capable
|
| 353 |
sys = SystemMessage(content=(
|
| 354 |
"You solve GAIA tasks using the provided evidence and attached images.\n"
|
| 355 |
-
"Be precise, quote numbers/strings exactly. If uncertain, say so.\n"
|
| 356 |
"Your answer to the GAIA tasks should be a number OR as few words as possible OR a comma separated list of numbers and/or strings. If you are asked for a number, don't use comma to write your number neither use units such as $ or percent sign unless specified otherwise. If you are asked for a string, don't use articles, neither abbreviations (e.g. for cities), and write the digits in plain text unless specified otherwise. If you are asked for a comma separated list, apply the above rules depending of whether the element to be put in the list is a number or a string. If your answer only include a single word, make the first letter capital.\n" + end_instr
|
| 357 |
))
|
| 358 |
|
|
@@ -401,7 +623,7 @@ def solve_text_only_node(state: "AgentState") -> "AgentState":
|
|
| 401 |
|
| 402 |
sys = SystemMessage(content=(
|
| 403 |
"You solve GAIA tasks. Use careful step-by-step reasoning but keep it concise.\n"
|
| 404 |
-
"You can use the provided textual evidence if there is any. \n"
|
| 405 |
"Your answer to the GAIA tasks should be a number OR as few words as possible OR a comma separated list of numbers and/or strings. If you are asked for a number, don't use comma to write your number neither use units such as $ or percent sign unless specified otherwise. If you are asked for a string, don't use articles, neither abbreviations (e.g. for cities), and write the digits in plain text unless specified otherwise. If you are asked for a comma separated list, apply the above rules depending of whether the element to be put in the list is a number or a string. If your answer only include a single word, make the first letter capital.\n" + end_instr
|
| 406 |
))
|
| 407 |
|
|
@@ -427,7 +649,7 @@ def validate_format_node(state: AgentState) -> AgentState:
|
|
| 427 |
|
| 428 |
emit = bool(state.get("emit_final_answer", True))
|
| 429 |
txt = (state.get("answer") or "").strip()
|
| 430 |
-
|
| 431 |
if not txt:
|
| 432 |
if emit:
|
| 433 |
state["answer"] = "No answer generated.\n\nfinal_answer: [NO_ANSWER]"
|
|
@@ -468,13 +690,151 @@ def has_images(state: AgentState) -> bool:
|
|
| 468 |
return True
|
| 469 |
return False
|
| 470 |
|
|
|
|
| 471 |
def route_after_preprocess(state: AgentState) -> Literal["vision","text"]:
    """Pick the solver branch: "vision" when the state has images, else "text"."""
    if has_images(state):
        return "vision"
    return "text"
|
| 473 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 474 |
# ---------- Graph ----------
|
| 475 |
# Build graph function
|
| 476 |
def build_graph():
|
| 477 |
g = StateGraph(AgentState)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 478 |
g.add_node("check_attachment", check_attachment_node)
|
| 479 |
g.add_node("fetch", fetch_node)
|
| 480 |
g.add_node("preprocess", preprocess_node)
|
|
@@ -483,7 +843,15 @@ def build_graph():
|
|
| 483 |
g.add_node("validate", validate_format_node)
|
| 484 |
|
| 485 |
# Start the edges
|
| 486 |
-
g.add_edge(START, "
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 487 |
|
| 488 |
# Add conditional branching from check_attachment
|
| 489 |
g.add_conditional_edges(
|
|
@@ -521,18 +889,26 @@ def build_graph():
|
|
| 521 |
if __name__ == "__main__":
|
| 522 |
task_id = '0001'
|
| 523 |
task_q = 'Who is the current president of France'
|
| 524 |
-
|
| 525 |
-
|
|
|
|
| 526 |
"task_id": task_id,
|
| 527 |
"question": task_q,
|
| 528 |
-
"attachment_urls":
|
| 529 |
"local_files": [],
|
| 530 |
"evidence": [],
|
| 531 |
"answer": None,
|
| 532 |
"parsed_final_answer": None,
|
|
|
|
| 533 |
"emit_final_answer": False, # <<< pure output mode
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 534 |
}
|
| 535 |
agent_GAIA = build_graph()
|
| 536 |
out = agent_GAIA.invoke(sample)
|
| 537 |
print("---------------------------")
|
| 538 |
-
print(out["answer"])
|
|
|
|
|
|
|
|
|
| 1 |
# Develop an AI agent with LangGraph and LangChain
|
| 2 |
# to answer the questions in the "gaia-benchmark/GAIA" dataset.
|
| 3 |
|
|
|
|
| 13 |
from langchain_core.messages import HumanMessage, SystemMessage
|
| 14 |
from langchain_openai import ChatOpenAI
|
| 15 |
from langgraph.graph import StateGraph, START, END
|
| 16 |
+
from tavily import TavilyClient
|
| 17 |
+
import serpapi
|
| 18 |
+
import trafilatura
|
| 19 |
+
from readability import Document
|
| 20 |
+
import html as _html
|
| 21 |
+
import wikipedia
|
| 22 |
+
from urllib.parse import parse_qs
|
| 23 |
+
from youtube_transcript_api import YouTubeTranscriptApi, TranscriptsDisabled, NoTranscriptFound
|
| 24 |
+
import yt_dlp
|
| 25 |
+
|
| 26 |
+
# ==== NEW: (optional) tiny helpers used by browsing nodes ====
|
| 27 |
+
def _has_search_key() -> bool:
|
| 28 |
+
"""Return True if any supported search backend is configured."""
|
| 29 |
+
return bool(
|
| 30 |
+
os.getenv("TAVILY_API_KEY")
|
| 31 |
+
or os.getenv("SERPAPI_API_KEY")
|
| 32 |
+
or (os.getenv("GOOGLE_API_KEY") and os.getenv("GOOGLE_CSE_ID"))
|
| 33 |
+
)
|
| 34 |
|
| 35 |
# Optional: pdf parsing if GAIA sometimes includes PDFs
|
| 36 |
try:
|
|
|
|
| 42 |
|
| 43 |
# -------------- State -------------
|
| 44 |
class EvidenceItem(TypedDict):
|
| 45 |
+
# ==== CHANGED: expanded allowed kinds to match actual usage paths ====
|
| 46 |
+
kind: Literal["audio_transcript","image_ocr","image_vqa","doc_text","unknown_file","preprocess_error"]
|
| 47 |
text: str
|
| 48 |
path: Optional[str]
|
| 49 |
meta: Dict[str, Any]
|
|
|
|
| 57 |
answer: Optional[str]
|
| 58 |
parsed_final_answer: Optional[str]
|
| 59 |
emit_final_answer: bool # <<< add this (default True if you want old behavior)
|
| 60 |
+
# ==== NEW: state used by browse pipeline (optional) ====
|
| 61 |
+
use_browsing: Optional[bool]
|
| 62 |
+
web_hits: Optional[List[Dict[str, str]]]
|
| 63 |
+
# ==== NEW: urls found directly in the question ====
|
| 64 |
+
question_urls: Optional[List[str]]
|
| 65 |
+
question_youtube_urls: Optional[List[str]]
|
| 66 |
|
| 67 |
# -------------- helpers ---------------
|
| 68 |
def _filename_from_cd(cd: str) -> str | None:
|
|
|
|
| 98 |
tag = f"{e.get('kind','?')}"
|
| 99 |
if meta.get("mime"):
|
| 100 |
tag += f"({meta['mime']})"
|
| 101 |
+
if meta.get("title"):
|
| 102 |
+
tag += f"[{meta['title']}]"
|
| 103 |
+
if meta.get("url"):
|
| 104 |
+
tag += f"<{meta['url']}>"
|
| 105 |
chunks.append(f"[{i}:{tag}] {t}")
|
| 106 |
out = "\n".join(chunks)
|
| 107 |
return out if len(out) <= limit_chars else out[:limit_chars] + " …"
|
|
|
|
| 156 |
raise RuntimeError(f"ffmpeg failed: {p.stderr[-500:]}")
|
| 157 |
return out
|
| 158 |
|
| 159 |
+
# ==== NEW: URL helpers ====
|
| 160 |
+
_URL_RE = re.compile(r'https?://\S+')
|
| 161 |
+
|
| 162 |
+
def _extract_urls(text: str) -> List[str]:
|
| 163 |
+
return _URL_RE.findall(text or "")
|
| 164 |
+
|
| 165 |
+
|
| 166 |
# ----------------------Tools ----------------------
|
| 167 |
@tool
|
| 168 |
def download_file(url: str, headers: dict | None = None, auth_token: str | None = None) -> str:
|
|
|
|
| 197 |
out_dir = tempfile.mkdtemp(prefix="gaia_tmpdl_")
|
| 198 |
out_path = os.path.join(out_dir, fname)
|
| 199 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 200 |
print("out_path:", out_path)
|
| 201 |
|
| 202 |
with open(out_path, "wb") as f:
|
|
|
|
| 207 |
return out_path
|
| 208 |
|
| 209 |
|
| 210 |
+
# ==== NEW: cache Whisper model so we don't reload each call ====
|
| 211 |
+
_WHISPER = None
|
| 212 |
+
|
| 213 |
@tool
|
| 214 |
def transcribe_audio(path: str, model_size: str = "base") -> str:
|
| 215 |
"""
|
|
|
|
| 217 |
Returns the transcript text; raises on failure (caller handles).
|
| 218 |
"""
|
| 219 |
print("running transcribe_audio")
|
| 220 |
+
global _WHISPER
|
| 221 |
try:
|
| 222 |
+
if _WHISPER is None:
|
| 223 |
+
_WHISPER = whisper.load_model(model_size)
|
| 224 |
+
result = _WHISPER.transcribe(path)
|
| 225 |
return (result.get("text") or "").strip()
|
| 226 |
except Exception as e:
|
| 227 |
raise RuntimeError(f"Whisper error: {e}")
|
| 228 |
+
|
| 229 |
|
| 230 |
@tool
|
| 231 |
def ocr_image(path: str) -> str:
|
|
|
|
| 237 |
return text.strip()
|
| 238 |
|
| 239 |
|
| 240 |
+
# ==== NEW: WEB / WIKI / YOUTUBE TOOLS =========================================
|
| 241 |
+
# Choose your search backend (Tavily simplest). Set env var before use.
|
| 242 |
+
# Preferred search backend: True -> Tavily, False -> SerpAPI.
# The preference is honoured whenever that backend's API key is set; if only
# the other backend has a key, web_search uses that one instead so that a
# single configured key is enough.
_USE_TAVILY = False


def _tavily_search(query: str, k: int) -> List[Dict[str, str]]:
    """Run a Tavily search and normalise results to {title, url, snippet}."""
    tv = TavilyClient(api_key=os.getenv("TAVILY_API_KEY"))
    res = tv.search(
        query=query,
        search_depth="advanced",
        max_results=k,
        include_answer=False,
        include_images=False,
    )
    return [
        {
            "title": r.get("title", ""),
            "url": r.get("url", ""),
            "snippet": (r.get("content", "") or "")[:400],
        }
        for r in res.get("results", [])
    ]


def _serpapi_search(query: str, k: int) -> List[Dict[str, str]]:
    """Run a SerpAPI Google search and normalise results to {title, url, snippet}."""
    params = {"engine": "google", "q": query, "num": k, "api_key": os.getenv("SERPAPI_API_KEY")}
    results = serpapi.search(params)
    return [
        {
            "title": it.get("title", ""),
            "url": it.get("link", ""),
            "snippet": (it.get("snippet", "") or "")[:400],
        }
        for it in results.get("organic_results", [])[:k]
    ]


@tool
def web_search(query: str, k: int = 6) -> List[Dict[str, str]]:
    """
    Web search via Tavily or SerpAPI. Returns a list of {title, url, snippet}.
    Requires TAVILY_API_KEY or SERPAPI_API_KEY.
    """
    have_tavily = bool(os.getenv("TAVILY_API_KEY"))
    have_serpapi = bool(os.getenv("SERPAPI_API_KEY"))

    # Start from the configured preference and switch only when the preferred
    # backend has no key but the other one does.
    use_tavily = _USE_TAVILY
    if use_tavily and not have_tavily and have_serpapi:
        use_tavily = False
    elif not use_tavily and not have_serpapi and have_tavily:
        use_tavily = True

    try:
        if use_tavily:
            return _tavily_search(query, k)
        return _serpapi_search(query, k)
    except Exception as e:
        # Best-effort tool: report the failure as a result row with an empty
        # url instead of raising.
        return [{"title":"", "url":"", "snippet": f"[search error: {e}]"}]
|
| 293 |
+
|
| 294 |
+
@tool
def fetch_url_text(url: str, max_chars: int = 12000, timeout: int = 30) -> Dict[str, Any]:
    """
    Download a web page and extract main article text using trafilatura,
    with a readability-lxml fallback. Returns {url, title, text}.

    Failures are reported in-band: "text" is "[fetch error: ...]" when the
    download fails and "[extraction error: ...]" when both extractors fail.
    """
    headers = {
        "User-Agent": "gaia-agent/1.0 (+https://example.org)",
        "Accept": "text/html,*/*;q=0.8",
    }

    try:
        # The context manager closes the session's pooled connections.
        with requests.Session() as sess:
            r = sess.get(url, headers=headers, timeout=timeout)
            r.raise_for_status()
            html_content = r.text
    except Exception as e:
        return {"url": url, "title": "", "text": f"[fetch error: {e}]"}

    title = ""

    # 1) try trafilatura (best for boilerplate removal)
    try:
        extracted = trafilatura.extract(
            html_content, include_comments=False, include_tables=False, url=url
        )
    except Exception:
        extracted = None

    if extracted and len(extracted) > 200:
        text = extracted
    else:
        # 2) fallback: readability (also used when trafilatura's output is too short)
        try:
            doc = Document(html_content)
            title = doc.short_title() or ""
            text = doc.summary(html_partial=False)
            # rudimentary HTML strip, then decode entities such as &amp;
            text = re.sub(r"<[^>]+>", " ", text)
            text = _html.unescape(text)
            text = re.sub(r"\s+", " ", text).strip()
        except Exception as e2:
            return {"url": url, "title": "", "text": f"[extraction error: {e2}]"}

    if len(text) > max_chars:
        text = text[:max_chars] + " …"

    # Try to fill title from the raw <title> tag if the extractor gave none
    if not title:
        m = re.search(r"<title[^>]*>(.*?)</title>", html_content, flags=re.I|re.S)
        if m:
            # <title> may span lines; collapse runs of whitespace.
            title = re.sub(r"\s+", " ", _html.unescape(m.group(1))).strip()

    return {"url": url, "title": title or "", "text": text}
|
| 343 |
+
|
| 344 |
+
@tool
def wikipedia_lookup(query: str, sentences: int = 4) -> Dict[str, Any]:
    """
    Simple Wikipedia lookup. Returns {title, url, summary}.

    Takes the top search hit for the query and returns its summary. Failures
    are reported in-band: "summary" then starts with "[wikipedia ... error".
    """
    # Step 1: resolve the query to a page title.
    try:
        wikipedia.set_lang("en")
        matches = wikipedia.search(query, results=1)
    except Exception as e:
        return {"title":"", "url":"", "summary": f"[wikipedia search error: {e}]"}

    if not matches:
        # Say so explicitly instead of surfacing "list index out of range".
        return {"title":"", "url":"", "summary": "[wikipedia search error: no results]"}

    title = matches[0]

    # Step 2: fetch the summary and canonical page data for that title.
    try:
        summary = wikipedia.summary(title, sentences=sentences, auto_suggest=False)
        page = wikipedia.page(title, auto_suggest=False, preload=False)
        return {"title": page.title, "url": page.url, "summary": summary}
    except Exception as e:
        return {"title": title, "url":"", "summary": f"[wikipedia fetch error: {e}]"}
|
| 363 |
+
|
| 364 |
+
def _youtube_video_id(url_or_id: str) -> str:
    """Return the video id from a YouTube URL; bare ids pass through unchanged."""
    if "youtube.com" not in url_or_id and "youtu.be" not in url_or_id:
        return url_or_id
    u = urlparse(url_or_id)
    segments = [s for s in u.path.split("/") if s]
    if u.netloc.lower().endswith("youtu.be"):
        # Short links carry the id as the first path segment.
        return segments[0] if segments else ""
    vid = parse_qs(u.query).get("v", [""])[0]
    if not vid and len(segments) >= 2 and segments[0] in ("shorts", "embed", "live", "v"):
        # /shorts/<id>, /embed/<id>, /live/<id>, /v/<id> have no ?v= parameter.
        vid = segments[1]
    return vid


def _transcript_text(chunks) -> str:
    """Join transcript chunks into a single string.

    Accepts dict chunks ({"text": ...}) as well as objects with a ``text``
    attribute.
    """
    parts = []
    for c in chunks:
        piece = c.get("text") if isinstance(c, dict) else getattr(c, "text", None)
        if piece:
            parts.append(piece)
    return " ".join(parts).strip()


@tool
def youtube_get_transcript(url_or_id: str, prefer_langs: List[str] | None = None) -> str:
    """
    Get YouTube transcript via API (no download). Returns plain text.

    Returns "[no captions available]" when the video has no transcript and
    "[youtube transcript error: ...]" on any other failure.
    """
    print('try to get youtube video transcript')
    try:
        prefer_langs = prefer_langs or ["en", "en-US", "en-GB", "auto"]
        vid = _youtube_video_id(url_or_id)
        print("vid: ", vid)
        if not vid:
            return "[youtube transcript error: could not determine video id]"

        # NOTE(review): youtube-transcript-api 1.x appears to expose an
        # instance method .list() in place of the static list_transcripts();
        # confirm against the installed version (requirements.txt is unpinned).
        if hasattr(YouTubeTranscriptApi, "list"):
            trs_list = YouTubeTranscriptApi().list(vid)
        else:
            trs_list = YouTubeTranscriptApi.list_transcripts(vid)

        # choose first matching language
        for lang in prefer_langs:
            try:
                trs = trs_list.find_transcript([lang])
                chunks = trs.fetch()
            except Exception:
                # This language is missing or failed to fetch; try the next.
                continue
            print("transcript from youtube website?")
            return _transcript_text(chunks)

        # fallback: first transcript in any language
        trs = next(iter(trs_list), None)
        if trs is None:
            return "[no captions available]"
        chunks = trs.fetch()
        print("transcript from youtube website?")
        return _transcript_text(chunks)
    except (TranscriptsDisabled, NoTranscriptFound):
        return "[no captions available]"
    except Exception as e:
        return f"[youtube transcript error: {e}]"
|
| 401 |
+
|
| 402 |
+
@tool
def youtube_transcribe_audio(url: str, model_size: str = "base") -> str:
    """
    Download YouTube audio (yt-dlp) and transcribe with Whisper.

    Returns the transcript text, or "[youtube download/transcribe error: ...]"
    if any step fails.
    """
    try:
        # The download directory is removed once transcription has finished,
        # so repeated calls do not pile up audio files on disk.
        with tempfile.TemporaryDirectory(prefix="gaia_yt_") as tmpdir:
            ydl_opts = {
                "format": "bestaudio/best",
                "outtmpl": os.path.join(tmpdir, "%(id)s.%(ext)s"),
                "quiet": True,
                "no_warnings": True,
                "noplaylist": True,
            }
            with yt_dlp.YoutubeDL(ydl_opts) as ydl:
                info = ydl.extract_info(url, download=True)
                path = ydl.prepare_filename(info)
            # convert & transcribe while the downloaded file still exists
            wav = _convert_to_wav_mono16k(path)
            return transcribe_audio.invoke({"path": wav, "model_size": model_size})
    except Exception as e:
        return f"[youtube download/transcribe error: {e}]"
|
| 427 |
+
|
| 428 |
# ------------------------------- Nodes ------------------------------
|
| 429 |
def check_attachment_node(state: AgentState) -> AgentState:
|
| 430 |
"""Check if there is attachment."""
|
|
|
|
| 506 |
try:
|
| 507 |
if mime and mime.startswith("audio"):
|
| 508 |
print("mime start with audio")
|
|
|
|
| 509 |
# --- ASR ---
|
| 510 |
try:
|
| 511 |
wav = _convert_to_wav_mono16k(path)
|
|
|
|
| 574 |
vision_llm = ChatOpenAI(model="gpt-4o", temperature=0) # vision-capable
|
| 575 |
sys = SystemMessage(content=(
|
| 576 |
"You solve GAIA tasks using the provided evidence and attached images.\n"
|
| 577 |
+
"Be precise, quote numbers/strings exactly. If uncertain, say so.\n"
|
| 578 |
"Your answer to the GAIA tasks should be a number OR as few words as possible OR a comma separated list of numbers and/or strings. If you are asked for a number, don't use comma to write your number neither use units such as $ or percent sign unless specified otherwise. If you are asked for a string, don't use articles, neither abbreviations (e.g. for cities), and write the digits in plain text unless specified otherwise. If you are asked for a comma separated list, apply the above rules depending of whether the element to be put in the list is a number or a string. If your answer only include a single word, make the first letter capital.\n" + end_instr
|
| 579 |
))
|
| 580 |
|
|
|
|
| 623 |
|
| 624 |
sys = SystemMessage(content=(
|
| 625 |
"You solve GAIA tasks. Use careful step-by-step reasoning but keep it concise.\n"
|
| 626 |
+
"You can use the provided textual evidence if there is any. \n"
|
| 627 |
"Your answer to the GAIA tasks should be a number OR as few words as possible OR a comma separated list of numbers and/or strings. If you are asked for a number, don't use comma to write your number neither use units such as $ or percent sign unless specified otherwise. If you are asked for a string, don't use articles, neither abbreviations (e.g. for cities), and write the digits in plain text unless specified otherwise. If you are asked for a comma separated list, apply the above rules depending of whether the element to be put in the list is a number or a string. If your answer only include a single word, make the first letter capital.\n" + end_instr
|
| 628 |
))
|
| 629 |
|
|
|
|
| 649 |
|
| 650 |
emit = bool(state.get("emit_final_answer", True))
|
| 651 |
txt = (state.get("answer") or "").strip()
|
| 652 |
+
|
| 653 |
if not txt:
|
| 654 |
if emit:
|
| 655 |
state["answer"] = "No answer generated.\n\nfinal_answer: [NO_ANSWER]"
|
|
|
|
| 690 |
return True
|
| 691 |
return False
|
| 692 |
|
| 693 |
+
# ==== CHANGED: fix return type Literal to match actual branch key ====
|
| 694 |
def route_after_preprocess(state: AgentState) -> Literal["vision","text"]:
    """Pick the solver branch: "vision" when the state has images, else "text"."""
    if has_images(state):
        return "vision"
    return "text"
|
| 696 |
|
| 697 |
+
# ==== NEW: Browsing router ====
|
| 698 |
+
def needs_browsing(q: str) -> bool:
    """Heuristically decide whether a question should trigger web browsing.

    Args:
        q: The question text; ``None`` or empty is treated as "".

    Returns:
        True when a search backend is configured and the lower-cased question
        contains any trigger keyword (substring match).
    """
    q = (q or "").lower()
    # Keywords must be lower-case because the question is lower-cased above;
    # capitalised entries could never match.
    hot = (
        "today", "current", "latest", "price",
        "how", "who", "where", "what", "which",
        "2023", "2024", "2025", "news", "wins",
        "http://", "https://", "wikipedia", "youtube.com",
    )
    # Only browse if we *also* have a search key, so the sample runs without keys.
    return _has_search_key() and any(w in q for w in hot)
|
| 705 |
+
|
| 706 |
+
# ==== NEW: Decide browse node ====
|
| 707 |
+
def decide_browse_node(state: AgentState) -> AgentState:
    """Record URLs found in the question and set the ``use_browsing`` flag.

    Writes ``question_urls``, ``question_youtube_urls`` and ``use_browsing``
    into the state and returns the same state object.
    """
    print("enter decide_browse_node")
    question = state.get("question", "")
    found = _extract_urls(question)
    youtube_links = [link for link in found if _is_youtube(link)]

    # Kept on the state for the search/crawl stages.
    state["question_urls"] = found
    state["question_youtube_urls"] = youtube_links

    if youtube_links:
        # YouTube links in the question can be handled without a search key.
        state["use_browsing"] = True
    else:
        # Otherwise defer to the keyword heuristic (which requires a search key).
        state["use_browsing"] = needs_browsing(question)
    return state
|
| 722 |
+
|
| 723 |
+
|
| 724 |
+
def route_browse(state: AgentState) -> Literal["browse","skip"]:
    """Map the state's ``use_browsing`` flag to the graph branch key."""
    if state.get("use_browsing"):
        return "browse"
    return "skip"
|
| 726 |
+
|
| 727 |
+
# ==== NEW: Search node ====
|
| 728 |
+
def search_node(state: AgentState) -> AgentState:
    """Collect candidate web hits for the question and seed Wikipedia evidence.

    Writes ``web_hits`` (URLs from the question first, then search results)
    and may append one Wikipedia summary to ``evidence``. Returns the same
    state object.
    """
    print("enter search_node")
    q = state.get("question","") or ""

    # Start with links found in the question, YouTube ones first. Each list is
    # defaulted separately (either may be None), and dict.fromkeys drops the
    # YouTube URLs that appear in both lists while preserving order.
    question_links = (state.get("question_youtube_urls") or []) + (state.get("question_urls") or [])
    preseed = [{"title": "(from question)", "url": u, "snippet": ""}
               for u in dict.fromkeys(question_links)]

    # Do a web search only if keys are configured
    hits = []
    if _has_search_key():
        hits = web_search.invoke({"query": q, "k": 6}) or []

    # Optionally seed Wikipedia for short queries
    if len(q.split()) <= 30:
        wiki = wikipedia_lookup.invoke({"query": q, "sentences": 4})
        summary = (wiki.get("summary") or "").strip()
        # wikipedia_lookup reports failures as "[wikipedia ... error: ...]" in
        # the summary field; those are not evidence.
        if summary and not summary.startswith("[wikipedia"):
            state.setdefault("evidence", []).append({
                "kind": "doc_text",
                "text": wiki["summary"],
                "path": None,
                "meta": {"source": "wikipedia", "title": wiki.get("title",""),
                         "url": wiki.get("url",""), "mime":"text/plain"}
            })

    # Combine: question links first, then search hits
    state["web_hits"] = preseed + hits
    return state
|
| 756 |
+
|
| 757 |
+
|
| 758 |
+
def _is_youtube(u: str) -> bool:
|
| 759 |
+
try:
|
| 760 |
+
net = urlparse(u).netloc.lower()
|
| 761 |
+
return ("youtube.com" in net) or ("youtu.be" in net)
|
| 762 |
+
except Exception:
|
| 763 |
+
return False
|
| 764 |
+
|
| 765 |
+
# Prefixes of the in-band failure markers returned by youtube_get_transcript,
# youtube_transcribe_audio and fetch_url_text.
_CRAWL_FAILURE_PREFIXES = (
    "[no captions",
    "[youtube transcript error",
    "[youtube download/transcribe error",
    "[fetch error",
    "[extraction error",
)


def _crawl_domain(u: str) -> str:
    """Return the lower-cased host of *u* without a leading "www."; "" on failure."""
    try:
        # removeprefix drops the literal "www." only; lstrip("www.") would
        # strip any leading 'w'/'.' characters ("web.archive.org" -> "eb...").
        return urlparse(u).netloc.lower().removeprefix("www.")
    except (ValueError, TypeError, AttributeError):
        return ""


def _crawl_failed(text: str) -> bool:
    """True when *text* is one of the tools' bracketed failure markers."""
    return (text or "").startswith(_CRAWL_FAILURE_PREFIXES)


def crawl_node(state: AgentState) -> AgentState:
    """Fetch the top web hits and append their text to the state's evidence.

    Picks up to 4 hits with distinct domains from ``web_hits``. YouTube URLs go
    through the caption API with a download+ASR fallback; other URLs go through
    ``fetch_url_text``. Tool failures are stored with kind "preprocess_error"
    rather than as genuine document or transcript text. Returns the same state
    object with ``evidence`` replaced by the extended list.
    """
    print("enter crawl_node")
    ev = list(state.get("evidence") or [])
    hits: List[Dict[str,str]] = state.get("web_hits", []) or []
    print("hits: ", hits)

    # choose top M distinct domains
    seen_domains = set()
    picked = []
    for h in hits:
        u = h.get("url","")
        d = _crawl_domain(u)
        if not u or not d or d in seen_domains:
            continue
        seen_domains.add(d)
        picked.append(h)
        if len(picked) >= 4:
            break

    print("picked: ", picked)

    # Fetch & extract
    for h in picked:
        u = h["url"]
        print("url: ", u)
        title = h.get("title","")

        # Special-case YouTube
        if _is_youtube(u):
            print("is_youtube? ", True)
            cap = youtube_get_transcript.invoke({"url_or_id": u})
            print('cap: ', cap)
            if cap and not _crawl_failed(cap):
                ev.append({"kind":"doc_text","text":cap,"path":None,
                           "meta":{"source":"youtube","title": title, "url":u,"mime":"text/plain"}})
                continue
            # fallback: download+ASR (heavier); used for missing captions and
            # for caption-API errors alike
            cap2 = youtube_transcribe_audio.invoke({"url": u, "model_size":"base"})
            kind = "preprocess_error" if _crawl_failed(cap2) else "audio_transcript"
            ev.append({"kind":kind,"text":cap2,"path":None,
                       "meta":{"source":"youtube","title": title, "url":u,"mime":"audio"}})
            continue

        out = fetch_url_text.invoke({"url": u, "max_chars": 12000})
        text = out.get("text","") or ""
        page_title = out.get("title","") or title
        if not text:
            continue
        ev.append({
            # fetch/extraction failures are labelled as errors, not page text
            "kind": "preprocess_error" if _crawl_failed(text) else "doc_text",
            "text": text,
            "path": None,
            "meta": {"source":"web", "title": page_title, "url": u, "mime":"text/html"}
        })

    state["evidence"] = ev
    return state
|
| 826 |
+
|
| 827 |
# ---------- Graph ----------
|
| 828 |
# Build graph function
|
| 829 |
def build_graph():
|
| 830 |
g = StateGraph(AgentState)
|
| 831 |
+
|
| 832 |
+
# ==== NEW: browsing nodes ====
|
| 833 |
+
g.add_node("decide_browse", decide_browse_node)
|
| 834 |
+
g.add_node("search", search_node)
|
| 835 |
+
g.add_node("crawl", crawl_node)
|
| 836 |
+
|
| 837 |
+
# Existing nodes
|
| 838 |
g.add_node("check_attachment", check_attachment_node)
|
| 839 |
g.add_node("fetch", fetch_node)
|
| 840 |
g.add_node("preprocess", preprocess_node)
|
|
|
|
| 843 |
g.add_node("validate", validate_format_node)
|
| 844 |
|
| 845 |
# Start the edges
|
| 846 |
+
g.add_edge(START, "decide_browse")
|
| 847 |
+
|
| 848 |
+
# Browse or skip
|
| 849 |
+
g.add_conditional_edges("decide_browse", route_browse, {
|
| 850 |
+
"browse": "search",
|
| 851 |
+
"skip": "check_attachment"
|
| 852 |
+
})
|
| 853 |
+
g.add_edge("search", "crawl")
|
| 854 |
+
g.add_edge("crawl", "check_attachment")
|
| 855 |
|
| 856 |
# Add conditional branching from check_attachment
|
| 857 |
g.add_conditional_edges(
|
|
|
|
| 889 |
if __name__ == "__main__":
    # Manual smoke test: run one attachment-free question through the graph.
    task_id = '0001'
    task_q = 'Who is the current president of France'
    # Flat list of attachment URLs (none for this sample).
    attachment_urls: List[str] = []

    sample: AgentState = {
        "task_id": task_id,
        "question": task_q,
        "attachment_urls": attachment_urls,
        "local_files": [],
        "evidence": [],
        "answer": None,
        "parsed_final_answer": None,
        # False = pure output mode; set True to force a final_answer line for scoring.
        "emit_final_answer": False,
        # Fields used by the browsing stages.
        "use_browsing": None,
        "web_hits": None,
        "question_urls": None,
        "question_youtube_urls": None,
    }

    agent_GAIA = build_graph()
    out = agent_GAIA.invoke(sample)
    print("---------------------------")
    print(out["answer"])
|
| 914 |
+
|
app.py
CHANGED
|
@@ -77,6 +77,11 @@ def run_and_submit_all( profile: bool = True):
|
|
| 77 |
"answer": None,
|
| 78 |
"parsed_final_answer": None,
|
| 79 |
"emit_final_answer": False, # <<< pure output mode
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 80 |
}
|
| 81 |
|
| 82 |
if not task_id or question_text is None:
|
|
|
|
| 77 |
"answer": None,
|
| 78 |
"parsed_final_answer": None,
|
| 79 |
"emit_final_answer": False, # <<< pure output mode
|
| 80 |
+
# new optional fields:
|
| 81 |
+
"use_browsing": None,
|
| 82 |
+
"web_hits": None,
|
| 83 |
+
"question_urls": None,
|
| 84 |
+
"question_youtube_urls": None
|
| 85 |
}
|
| 86 |
|
| 87 |
if not task_id or question_text is None:
|
requirements.txt
CHANGED
|
@@ -8,4 +8,11 @@ langchain-community
|
|
| 8 |
ddgs
|
| 9 |
openai-whisper
|
| 10 |
pytesseract
|
| 11 |
-
ffmpeg
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 8 |
ddgs
|
| 9 |
openai-whisper
|
| 10 |
pytesseract
|
| 11 |
+
ffmpeg
|
| 12 |
+
tavily-python
|
| 13 |
+
trafilatura
|
| 14 |
+
readability-lxml
|
| 15 |
+
youtube-transcript-api
|
| 16 |
+
yt-dlp
|
| 17 |
+
wikipedia
|
| 18 |
+
serpapi
|