shan gao commited on
Commit
a53e629
·
1 Parent(s): 15f210e
Files changed (3) hide show
  1. agent.py +19 -395
  2. app.py +0 -5
  3. requirements.txt +1 -8
agent.py CHANGED
@@ -1,3 +1,4 @@
 
1
  # Develop an AI agent with LangGraph and LangChain
2
  # to answer the questions in the "gaia-benchmark/GAIA" dataset.
3
 
@@ -13,24 +14,7 @@ from langchain_core.tools import tool
13
  from langchain_core.messages import HumanMessage, SystemMessage
14
  from langchain_openai import ChatOpenAI
15
  from langgraph.graph import StateGraph, START, END
16
- from tavily import TavilyClient
17
- import serpapi
18
- import trafilatura
19
- from readability import Document
20
- import html as _html
21
- import wikipedia
22
- from urllib.parse import parse_qs
23
- from youtube_transcript_api import YouTubeTranscriptApi, TranscriptsDisabled, NoTranscriptFound
24
- import yt_dlp
25
-
26
- # ==== NEW: (optional) tiny helpers used by browsing nodes ====
27
- def _has_search_key() -> bool:
28
- """Return True if any supported search backend is configured."""
29
- return bool(
30
- os.getenv("TAVILY_API_KEY")
31
- or os.getenv("SERPAPI_API_KEY")
32
- or (os.getenv("GOOGLE_API_KEY") and os.getenv("GOOGLE_CSE_ID"))
33
- )
34
 
35
  # Optional: pdf parsing if GAIA sometimes includes PDFs
36
  try:
@@ -42,8 +26,7 @@ except Exception:
42
 
43
  # -------------- State -------------
44
  class EvidenceItem(TypedDict):
45
- # ==== CHANGED: expanded allowed kinds to match actual usage paths ====
46
- kind: Literal["audio_transcript","image_ocr","image_vqa","doc_text","unknown_file","preprocess_error"]
47
  text: str
48
  path: Optional[str]
49
  meta: Dict[str, Any]
@@ -57,12 +40,6 @@ class AgentState(TypedDict):
57
  answer: Optional[str]
58
  parsed_final_answer: Optional[str]
59
  emit_final_answer: bool # <<< add this (default True if you want old behavior)
60
- # ==== NEW: state used by browse pipeline (optional) ====
61
- use_browsing: Optional[bool]
62
- web_hits: Optional[List[Dict[str, str]]]
63
- # ==== NEW: urls found directly in the question ====
64
- question_urls: Optional[List[str]]
65
- question_youtube_urls: Optional[List[str]]
66
 
67
  # -------------- helpers ---------------
68
  def _filename_from_cd(cd: str) -> str | None:
@@ -98,10 +75,6 @@ def _summarize_evidence(evidence: List[Dict[str, Any]], limit_chars: int = 6000)
98
  tag = f"{e.get('kind','?')}"
99
  if meta.get("mime"):
100
  tag += f"({meta['mime']})"
101
- if meta.get("title"):
102
- tag += f"[{meta['title']}]"
103
- if meta.get("url"):
104
- tag += f"<{meta['url']}>"
105
  chunks.append(f"[{i}:{tag}] {t}")
106
  out = "\n".join(chunks)
107
  return out if len(out) <= limit_chars else out[:limit_chars] + " …"
@@ -156,13 +129,6 @@ def _convert_to_wav_mono16k(src_path: str) -> str:
156
  raise RuntimeError(f"ffmpeg failed: {p.stderr[-500:]}")
157
  return out
158
 
159
- # ==== NEW: URL helpers ====
160
- _URL_RE = re.compile(r'https?://\S+')
161
-
162
- def _extract_urls(text: str) -> List[str]:
163
- return _URL_RE.findall(text or "")
164
-
165
-
166
  # ----------------------Tools ----------------------
167
  @tool
168
  def download_file(url: str, headers: dict | None = None, auth_token: str | None = None) -> str:
@@ -197,6 +163,10 @@ def download_file(url: str, headers: dict | None = None, auth_token: str | None
197
  out_dir = tempfile.mkdtemp(prefix="gaia_tmpdl_")
198
  out_path = os.path.join(out_dir, fname)
199
 
 
 
 
 
200
  print("out_path:", out_path)
201
 
202
  with open(out_path, "wb") as f:
@@ -207,9 +177,6 @@ def download_file(url: str, headers: dict | None = None, auth_token: str | None
207
  return out_path
208
 
209
 
210
- # ==== NEW: cache Whisper model so we don't reload each call ====
211
- _WHISPER = None
212
-
213
  @tool
214
  def transcribe_audio(path: str, model_size: str = "base") -> str:
215
  """
@@ -217,15 +184,13 @@ def transcribe_audio(path: str, model_size: str = "base") -> str:
217
  Returns the transcript text; raises on failure (caller handles).
218
  """
219
  print("running transcribe_audio")
220
- global _WHISPER
221
  try:
222
- if _WHISPER is None:
223
- _WHISPER = whisper.load_model(model_size)
224
- result = _WHISPER.transcribe(path)
225
  return (result.get("text") or "").strip()
226
  except Exception as e:
227
  raise RuntimeError(f"Whisper error: {e}")
228
-
229
 
230
  @tool
231
  def ocr_image(path: str) -> str:
@@ -237,194 +202,6 @@ def ocr_image(path: str) -> str:
237
  return text.strip()
238
 
239
 
240
- # ==== NEW: WEB / WIKI / YOUTUBE TOOLS =========================================
241
- # Choose your search backend (Tavily simplest). Set env var before use.
242
- _USE_TAVILY = False # flip to False to use SerpAPI example
243
-
244
- if _USE_TAVILY:
245
- @tool
246
- def web_search(query: str, k: int = 6) -> List[Dict[str, str]]:
247
- """
248
- Web search via Tavily. Returns a list of {title, url, snippet}.
249
- Requires TAVILY_API_KEY.
250
- """
251
- try:
252
- tv = TavilyClient(api_key=os.getenv("TAVILY_API_KEY"))
253
- res = tv.search(
254
- query=query,
255
- search_depth="advanced",
256
- max_results=k,
257
- include_answer=False,
258
- include_images=False,
259
- )
260
- out = []
261
- for r in res.get("results", []):
262
- out.append({
263
- "title": r.get("title",""),
264
- "url": r.get("url",""),
265
- "snippet": (r.get("content","") or "")[:400]
266
- })
267
- return out
268
- except Exception as e:
269
- return [{"title":"", "url":"", "snippet": f"[search error: {e}]"}]
270
- else:
271
- @tool
272
- def web_search(query: str, k: int = 6) -> List[Dict[str, str]]:
273
- """
274
- Web search via SerpAPI. Returns a list of {title, url, snippet}.
275
- Requires SERPAPI_API_KEY.
276
- """
277
- try:
278
- params = {"engine":"google", "q":query, "num":k, "api_key":os.getenv("SERPAPI_API_KEY")}
279
- search = serpapi.search(params)
280
- # results = search.get_dict()
281
- results = search
282
- items = results.get("organic_results", [])
283
- out = []
284
- for it in items[:k]:
285
- out.append({
286
- "title": it.get("title",""),
287
- "url": it.get("link",""),
288
- "snippet": (it.get("snippet","") or "")[:400]
289
- })
290
- return out
291
- except Exception as e:
292
- return [{"title":"", "url":"", "snippet": f"[search error: {e}]"}]
293
-
294
- @tool
295
- def fetch_url_text(url: str, max_chars: int = 12000, timeout: int = 30) -> Dict[str, Any]:
296
- """
297
- Download a web page and extract main article text using trafilatura,
298
- with a readability-lxml fallback. Returns {url, title, text}.
299
- """
300
- sess = requests.Session()
301
- headers = {
302
- "User-Agent": "gaia-agent/1.0 (+https://example.org)",
303
- "Accept": "text/html,*/*;q=0.8",
304
- }
305
-
306
- try:
307
- r = sess.get(url, headers=headers, timeout=timeout)
308
- r.raise_for_status()
309
- html_content = r.text
310
- except Exception as e:
311
- return {"url": url, "title": "", "text": f"[fetch error: {e}]"}
312
-
313
- # 1) try trafilatura (best for boilerplate removal)
314
- try:
315
- downloaded = trafilatura.extract(html_content, include_comments=False, include_tables=False, url=url)
316
- if downloaded and len(downloaded) > 200:
317
- text = downloaded
318
- title = ""
319
- else:
320
- raise ValueError("trafilatura extraction too short")
321
- except Exception:
322
- # 2) fallback: readability
323
- try:
324
- doc = Document(html_content)
325
- title = doc.short_title() or ""
326
- text = doc.summary(html_partial=False)
327
- # rudimentary HTML strip
328
- text = re.sub(r"<[^>]+>", " ", text)
329
- text = re.sub(r"\s+", " ", text).strip()
330
- except Exception as e2:
331
- return {"url": url, "title": "", "text": f"[extraction error: {e2}]"}
332
-
333
- if len(text) > max_chars:
334
- text = text[:max_chars] + " …"
335
-
336
- # Try to fill title if empty
337
- if not title:
338
- m = re.search(r"<title[^>]*>(.*?)</title>", html_content, flags=re.I|re.S)
339
- if m:
340
- title = _html.unescape(m.group(1).strip())
341
-
342
- return {"url": url, "title": title or "", "text": text}
343
-
344
- @tool
345
- def wikipedia_lookup(query: str, sentences: int = 4) -> Dict[str, Any]:
346
- """
347
- Simple Wikipedia lookup. Returns {title, url, summary}.
348
- """
349
- try:
350
- wikipedia.set_lang("en")
351
- try:
352
- title = wikipedia.search(query, results=1)[0]
353
- except Exception as e:
354
- return {"title":"", "url":"", "summary": f"[wikipedia search error: {e}]"}
355
- try:
356
- summary = wikipedia.summary(title, sentences=sentences, auto_suggest=False)
357
- page = wikipedia.page(title, auto_suggest=False, preload=False)
358
- return {"title": page.title, "url": page.url, "summary": summary}
359
- except Exception as e:
360
- return {"title": title, "url":"", "summary": f"[wikipedia fetch error: {e}]"}
361
- except Exception as e:
362
- return {"title":"", "url":"", "summary": f"[wikipedia import error: {e}]"}
363
-
364
- @tool
365
- def youtube_get_transcript(url_or_id: str, prefer_langs: List[str] | None = None) -> str:
366
- """
367
- Get YouTube transcript via API (no download). Returns plain text.
368
- """
369
- print('try to get youtube video transcript')
370
- try:
371
- prefer_langs = prefer_langs or ["en", "en-US", "en-GB", "auto"]
372
- vid = url_or_id
373
- print("vid: ", vid)
374
- if "youtube.com" in url_or_id or "youtu.be" in url_or_id:
375
- u = urlparse(url_or_id)
376
- if u.netloc.endswith("youtu.be"):
377
- vid = u.path.lstrip("/")
378
- else:
379
- vid = parse_qs(u.query).get("v", [""])[0]
380
- trs_list = YouTubeTranscriptApi.list_transcripts(vid)
381
- # choose first matching language
382
- for lang in prefer_langs:
383
- try:
384
- trs = trs_list.find_transcript([lang])
385
- chunks = trs.fetch()
386
- print("transcript from youtube website?")
387
- print(" ".join([c["text"] for c in chunks if c.get("text")]).strip())
388
- return " ".join([c["text"] for c in chunks if c.get("text")]).strip()
389
- except Exception:
390
- continue
391
- # fallback: first any transcript
392
- trs = list(trs_list)[0]
393
- chunks = trs.fetch()
394
- print("transcript from youtube website?")
395
- print(" ".join([c["text"] for c in chunks if c.get("text")]).strip())
396
- return " ".join([c["text"] for c in chunks if c.get("text")]).strip()
397
- except (TranscriptsDisabled, NoTranscriptFound):
398
- return "[no captions available]"
399
- except Exception as e:
400
- return f"[youtube transcript error: {e}]"
401
-
402
- @tool
403
- def youtube_transcribe_audio(url: str, model_size: str = "base") -> str:
404
- """
405
- Download YouTube audio (yt-dlp) and transcribe with Whisper.
406
- """
407
- tmpdir = tempfile.mkdtemp(prefix="gaia_yt_")
408
- outfile = os.path.join(tmpdir, "%(id)s.%(ext)s")
409
-
410
- ydl_opts = {
411
- "format": "bestaudio/best",
412
- "outtmpl": outfile,
413
- "quiet": True,
414
- "no_warnings": True,
415
- "noplaylist": True,
416
- }
417
- try:
418
- with yt_dlp.YoutubeDL(ydl_opts) as ydl:
419
- info = ydl.extract_info(url, download=True)
420
- path = ydl.prepare_filename(info)
421
- # convert & transcribe
422
- wav = _convert_to_wav_mono16k(path)
423
- txt = transcribe_audio.invoke({"path": wav, "model_size": model_size})
424
- return txt
425
- except Exception as e:
426
- return f"[youtube download/transcribe error: {e}]"
427
-
428
  # ------------------------------- Nodes ------------------------------
429
  def check_attachment_node(state: AgentState) -> AgentState:
430
  """Check if there is attachment."""
@@ -506,6 +283,7 @@ def preprocess_node(state: AgentState) -> AgentState:
506
  try:
507
  if mime and mime.startswith("audio"):
508
  print("mime start with audio")
 
509
  # --- ASR ---
510
  try:
511
  wav = _convert_to_wav_mono16k(path)
@@ -574,7 +352,7 @@ def solve_multimodal_node(state: AgentState) -> AgentState:
574
  vision_llm = ChatOpenAI(model="gpt-4o", temperature=0) # vision-capable
575
  sys = SystemMessage(content=(
576
  "You solve GAIA tasks using the provided evidence and attached images.\n"
577
- "Be precise, quote numbers/strings exactly. If uncertain, say so.\n"
578
  "Your answer to the GAIA tasks should be a number OR as few words as possible OR a comma separated list of numbers and/or strings. If you are asked for a number, don't use comma to write your number neither use units such as $ or percent sign unless specified otherwise. If you are asked for a string, don't use articles, neither abbreviations (e.g. for cities), and write the digits in plain text unless specified otherwise. If you are asked for a comma separated list, apply the above rules depending of whether the element to be put in the list is a number or a string. If your answer only include a single word, make the first letter capital.\n" + end_instr
579
  ))
580
 
@@ -623,7 +401,7 @@ def solve_text_only_node(state: "AgentState") -> "AgentState":
623
 
624
  sys = SystemMessage(content=(
625
  "You solve GAIA tasks. Use careful step-by-step reasoning but keep it concise.\n"
626
- "You can use the provided textual evidence if there is any. \n"
627
  "Your answer to the GAIA tasks should be a number OR as few words as possible OR a comma separated list of numbers and/or strings. If you are asked for a number, don't use comma to write your number neither use units such as $ or percent sign unless specified otherwise. If you are asked for a string, don't use articles, neither abbreviations (e.g. for cities), and write the digits in plain text unless specified otherwise. If you are asked for a comma separated list, apply the above rules depending of whether the element to be put in the list is a number or a string. If your answer only include a single word, make the first letter capital.\n" + end_instr
628
  ))
629
 
@@ -649,7 +427,7 @@ def validate_format_node(state: AgentState) -> AgentState:
649
 
650
  emit = bool(state.get("emit_final_answer", True))
651
  txt = (state.get("answer") or "").strip()
652
-
653
  if not txt:
654
  if emit:
655
  state["answer"] = "No answer generated.\n\nfinal_answer: [NO_ANSWER]"
@@ -690,151 +468,13 @@ def has_images(state: AgentState) -> bool:
690
  return True
691
  return False
692
 
693
- # ==== CHANGED: fix return type Literal to match actual branch key ====
694
  def route_after_preprocess(state: AgentState) -> Literal["vision","text"]:
695
  return "vision" if has_images(state) else "text"
696
 
697
- # ==== NEW: Browsing router ====
698
- def needs_browsing(q: str) -> bool:
699
- q = (q or "").lower()
700
- hot = ["today","current","latest","price","How","who","where","what","How many",
701
- "2023","2024","2025","news","wins","Which",
702
- "http://","https://","wikipedia","youtube.com"]
703
- # Only browse if we *also* have a search key, so the sample runs without keys.
704
- return _has_search_key() and any(w in q for w in hot)
705
-
706
- # ==== NEW: Decide browse node ====
707
- def decide_browse_node(state: AgentState) -> AgentState:
708
- print("enter decide_browse_node")
709
- q = state.get("question", "")
710
- urls = _extract_urls(q)
711
- yt_urls = [u for u in urls if _is_youtube(u)]
712
-
713
- # Save for later stages
714
- state["question_urls"] = urls
715
- state["question_youtube_urls"] = yt_urls
716
-
717
- # Browse if:
718
- # - we have any YouTube links in the question (can handle w/o search key), OR
719
- # - the normal heuristic says we should browse (requires a search key)
720
- state["use_browsing"] = bool(yt_urls) or needs_browsing(q)
721
- return state
722
-
723
-
724
- def route_browse(state: AgentState) -> Literal["browse","skip"]:
725
- return "browse" if state.get("use_browsing") else "skip"
726
-
727
- # ==== NEW: Search node ====
728
- def search_node(state: AgentState) -> AgentState:
729
- print("enter search_node")
730
- q = state.get("question","")
731
-
732
- # Start with YouTube links found in the question
733
- preseed = [{"title": "(from question)", "url": u, "snippet": ""}
734
- for u in (state.get("question_youtube_urls") + state.get("question_urls") or [])]
735
-
736
- # Do a web search only if keys are configured
737
- hits = []
738
- if _has_search_key():
739
- hits = web_search.invoke({"query": q, "k": 6}) or []
740
-
741
- # Optionally seed Wikipedia for short queries
742
- if len(q.split()) <= 30: #8
743
- wiki = wikipedia_lookup.invoke({"query": q, "sentences": 4})
744
- if (wiki.get("summary") or "").strip():
745
- state.setdefault("evidence", []).append({
746
- "kind": "doc_text",
747
- "text": wiki["summary"],
748
- "path": None,
749
- "meta": {"source": "wikipedia", "title": wiki.get("title",""),
750
- "url": wiki.get("url",""), "mime":"text/plain"}
751
- })
752
-
753
- # Combine: question YouTube links first, then search hits
754
- state["web_hits"] = preseed + hits
755
- return state
756
-
757
-
758
- def _is_youtube(u: str) -> bool:
759
- try:
760
- net = urlparse(u).netloc.lower()
761
- return ("youtube.com" in net) or ("youtu.be" in net)
762
- except Exception:
763
- return False
764
-
765
- def crawl_node(state: AgentState) -> AgentState:
766
- print("enter crawl_node")
767
- ev = list(state.get("evidence", []))
768
- hits: List[Dict[str,str]] = state.get("web_hits", []) or []
769
- print("hits: ", hits)
770
-
771
- # choose top M distinct domains
772
- def _domain(u: str) -> str:
773
- try: return urlparse(u).netloc.lower().lstrip("www.")
774
- except: return ""
775
-
776
- seen_domains = set()
777
- picked = []
778
- for h in hits:
779
- u = h.get("url","")
780
- d = _domain(u)
781
- if not u or not d:
782
- continue
783
- if d in seen_domains:
784
- continue
785
- seen_domains.add(d)
786
- picked.append(h)
787
- if len(picked) >= 4:
788
- break
789
-
790
- print("picked: ", picked)
791
-
792
- # Fetch & extract
793
- for h in picked:
794
- u = h["url"]
795
- print("url: ", u)
796
- title = h.get("title","")
797
- # Special-case YouTube
798
- if _is_youtube(u):
799
- print("is_youtube? ", _is_youtube(u))
800
- cap = youtube_get_transcript.invoke({"url_or_id": u})
801
- print('cap: ', cap)
802
- if cap and not cap.startswith("[no captions"):
803
- ev.append({"kind":"doc_text","text":cap,"path":None,
804
- "meta":{"source":"youtube","title": title, "url":u,"mime":"text/plain"}})
805
- continue
806
- # fallback: download+ASR (heavier)
807
- cap2 = youtube_transcribe_audio.invoke({"url": u, "model_size":"base"})
808
- ev.append({"kind":"audio_transcript","text":cap2,"path":None,
809
- "meta":{"source":"youtube","title": title, "url":u,"mime":"audio"}})
810
- continue
811
-
812
- out = fetch_url_text.invoke({"url": u, "max_chars": 12000})
813
- text = out.get("text","") or ""
814
- page_title = out.get("title","") or title
815
- if not text:
816
- continue
817
- ev.append({
818
- "kind": "doc_text",
819
- "text": text,
820
- "path": None,
821
- "meta": {"source":"web", "title": page_title, "url": u, "mime":"text/html"}
822
- })
823
-
824
- state["evidence"] = ev
825
- return state
826
-
827
  # ---------- Graph ----------
828
  # Build graph function
829
  def build_graph():
830
  g = StateGraph(AgentState)
831
-
832
- # ==== NEW: browsing nodes ====
833
- g.add_node("decide_browse", decide_browse_node)
834
- g.add_node("search", search_node)
835
- g.add_node("crawl", crawl_node)
836
-
837
- # Existing nodes
838
  g.add_node("check_attachment", check_attachment_node)
839
  g.add_node("fetch", fetch_node)
840
  g.add_node("preprocess", preprocess_node)
@@ -843,15 +483,7 @@ def build_graph():
843
  g.add_node("validate", validate_format_node)
844
 
845
  # Start the edges
846
- g.add_edge(START, "decide_browse")
847
-
848
- # Browse or skip
849
- g.add_conditional_edges("decide_browse", route_browse, {
850
- "browse": "search",
851
- "skip": "check_attachment"
852
- })
853
- g.add_edge("search", "crawl")
854
- g.add_edge("crawl", "check_attachment")
855
 
856
  # Add conditional branching from check_attachment
857
  g.add_conditional_edges(
@@ -889,26 +521,18 @@ def build_graph():
889
  if __name__ == "__main__":
890
  task_id = '0001'
891
  task_q = 'Who is the current president of France'
892
- # ==== CHANGED: make it a flat empty list (not `[[]]`)
893
- attachment_urls: List[str] = []
894
- sample: AgentState = {
895
  "task_id": task_id,
896
  "question": task_q,
897
- "attachment_urls": attachment_urls, # from GAIA sample
898
  "local_files": [],
899
  "evidence": [],
900
  "answer": None,
901
  "parsed_final_answer": None,
902
- # Tip: set True to force a final_answer line for scoring
903
  "emit_final_answer": False, # <<< pure output mode
904
- # new optional fields:
905
- "use_browsing": None,
906
- "web_hits": None,
907
- "question_urls": None,
908
- "question_youtube_urls": None
909
  }
910
  agent_GAIA = build_graph()
911
  out = agent_GAIA.invoke(sample)
912
  print("---------------------------")
913
- print(out["answer"])
914
-
 
1
+ # agent_v6.py
2
  # Develop an AI agent with LangGraph and LangChain
3
  # to answer the questions in the "gaia-benchmark/GAIA" dataset.
4
 
 
14
  from langchain_core.messages import HumanMessage, SystemMessage
15
  from langchain_openai import ChatOpenAI
16
  from langgraph.graph import StateGraph, START, END
17
+
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
18
 
19
  # Optional: pdf parsing if GAIA sometimes includes PDFs
20
  try:
 
26
 
27
  # -------------- State -------------
28
  class EvidenceItem(TypedDict):
29
+ kind: Literal["audio_transcript","image_ocr","image_vqa","doc_text"]
 
30
  text: str
31
  path: Optional[str]
32
  meta: Dict[str, Any]
 
40
  answer: Optional[str]
41
  parsed_final_answer: Optional[str]
42
  emit_final_answer: bool # <<< add this (default True if you want old behavior)
 
 
 
 
 
 
43
 
44
  # -------------- helpers ---------------
45
  def _filename_from_cd(cd: str) -> str | None:
 
75
  tag = f"{e.get('kind','?')}"
76
  if meta.get("mime"):
77
  tag += f"({meta['mime']})"
 
 
 
 
78
  chunks.append(f"[{i}:{tag}] {t}")
79
  out = "\n".join(chunks)
80
  return out if len(out) <= limit_chars else out[:limit_chars] + " …"
 
129
  raise RuntimeError(f"ffmpeg failed: {p.stderr[-500:]}")
130
  return out
131
 
 
 
 
 
 
 
 
132
  # ----------------------Tools ----------------------
133
  @tool
134
  def download_file(url: str, headers: dict | None = None, auth_token: str | None = None) -> str:
 
163
  out_dir = tempfile.mkdtemp(prefix="gaia_tmpdl_")
164
  out_path = os.path.join(out_dir, fname)
165
 
166
+ # # Write to colab folder
167
+ # out_dir: str | Path = "."
168
+ # out_path = Path(out_dir) / fname
169
+
170
  print("out_path:", out_path)
171
 
172
  with open(out_path, "wb") as f:
 
177
  return out_path
178
 
179
 
 
 
 
180
  @tool
181
  def transcribe_audio(path: str, model_size: str = "base") -> str:
182
  """
 
184
  Returns the transcript text; raises on failure (caller handles).
185
  """
186
  print("running transcribe_audio")
 
187
  try:
188
+ model = whisper.load_model(model_size)
189
+ result = model.transcribe(path)
 
190
  return (result.get("text") or "").strip()
191
  except Exception as e:
192
  raise RuntimeError(f"Whisper error: {e}")
193
+
194
 
195
  @tool
196
  def ocr_image(path: str) -> str:
 
202
  return text.strip()
203
 
204
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
205
  # ------------------------------- Nodes ------------------------------
206
  def check_attachment_node(state: AgentState) -> AgentState:
207
  """Check if there is attachment."""
 
283
  try:
284
  if mime and mime.startswith("audio"):
285
  print("mime start with audio")
286
+ # print("path: ", path)
287
  # --- ASR ---
288
  try:
289
  wav = _convert_to_wav_mono16k(path)
 
352
  vision_llm = ChatOpenAI(model="gpt-4o", temperature=0) # vision-capable
353
  sys = SystemMessage(content=(
354
  "You solve GAIA tasks using the provided evidence and attached images.\n"
355
+ "Be precise, quote numbers/strings exactly. If uncertain, say so.\n"
356
  "Your answer to the GAIA tasks should be a number OR as few words as possible OR a comma separated list of numbers and/or strings. If you are asked for a number, don't use comma to write your number neither use units such as $ or percent sign unless specified otherwise. If you are asked for a string, don't use articles, neither abbreviations (e.g. for cities), and write the digits in plain text unless specified otherwise. If you are asked for a comma separated list, apply the above rules depending of whether the element to be put in the list is a number or a string. If your answer only include a single word, make the first letter capital.\n" + end_instr
357
  ))
358
 
 
401
 
402
  sys = SystemMessage(content=(
403
  "You solve GAIA tasks. Use careful step-by-step reasoning but keep it concise.\n"
404
+ "You can use the provided textual evidence if there is any. \n"
405
  "Your answer to the GAIA tasks should be a number OR as few words as possible OR a comma separated list of numbers and/or strings. If you are asked for a number, don't use comma to write your number neither use units such as $ or percent sign unless specified otherwise. If you are asked for a string, don't use articles, neither abbreviations (e.g. for cities), and write the digits in plain text unless specified otherwise. If you are asked for a comma separated list, apply the above rules depending of whether the element to be put in the list is a number or a string. If your answer only include a single word, make the first letter capital.\n" + end_instr
406
  ))
407
 
 
427
 
428
  emit = bool(state.get("emit_final_answer", True))
429
  txt = (state.get("answer") or "").strip()
430
+
431
  if not txt:
432
  if emit:
433
  state["answer"] = "No answer generated.\n\nfinal_answer: [NO_ANSWER]"
 
468
  return True
469
  return False
470
 
 
471
  def route_after_preprocess(state: AgentState) -> Literal["vision","text"]:
472
  return "vision" if has_images(state) else "text"
473
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
474
  # ---------- Graph ----------
475
  # Build graph function
476
  def build_graph():
477
  g = StateGraph(AgentState)
 
 
 
 
 
 
 
478
  g.add_node("check_attachment", check_attachment_node)
479
  g.add_node("fetch", fetch_node)
480
  g.add_node("preprocess", preprocess_node)
 
483
  g.add_node("validate", validate_format_node)
484
 
485
  # Start the edges
486
+ g.add_edge(START, "check_attachment")
 
 
 
 
 
 
 
 
487
 
488
  # Add conditional branching from check_attachment
489
  g.add_conditional_edges(
 
521
  if __name__ == "__main__":
522
  task_id = '0001'
523
  task_q = 'Who is the current president of France'
524
+ task_url = []
525
+ sample = {
 
526
  "task_id": task_id,
527
  "question": task_q,
528
+ "attachment_urls": [task_url], # from GAIA sample
529
  "local_files": [],
530
  "evidence": [],
531
  "answer": None,
532
  "parsed_final_answer": None,
 
533
  "emit_final_answer": False, # <<< pure output mode
 
 
 
 
 
534
  }
535
  agent_GAIA = build_graph()
536
  out = agent_GAIA.invoke(sample)
537
  print("---------------------------")
538
+ print(out["answer"])
 
app.py CHANGED
@@ -77,11 +77,6 @@ def run_and_submit_all( profile: bool = True):
77
  "answer": None,
78
  "parsed_final_answer": None,
79
  "emit_final_answer": False, # <<< pure output mode
80
- # new optional fields:
81
- "use_browsing": None,
82
- "web_hits": None,
83
- "question_urls": None,
84
- "question_youtube_urls": None
85
  }
86
 
87
  if not task_id or question_text is None:
 
77
  "answer": None,
78
  "parsed_final_answer": None,
79
  "emit_final_answer": False, # <<< pure output mode
 
 
 
 
 
80
  }
81
 
82
  if not task_id or question_text is None:
requirements.txt CHANGED
@@ -8,11 +8,4 @@ langchain-community
8
  ddgs
9
  openai-whisper
10
  pytesseract
11
- ffmpeg
12
- tavily-python
13
- trafilatura
14
- readability-lxml
15
- youtube-transcript-api
16
- yt-dlp
17
- wikipedia
18
- serpapi
 
8
  ddgs
9
  openai-whisper
10
  pytesseract
11
+ ffmpeg