shan gao committed on
Commit
e9b8f0b
·
1 Parent(s): 5c7c966
Files changed (1) hide show
  1. agent.py +15 -11
agent.py CHANGED
@@ -1,4 +1,3 @@
1
- # v8
2
  # Develop an AI agent with LangGraph and LangChain
3
  # to answer the questions in the "gaia-benchmark/GAIA" dataset.
4
 
@@ -366,6 +365,7 @@ def wikipedia_lookup(query: str, sentences: int = 4) -> Dict[str, Any]:
366
  def youtube_get_transcript(url_or_id: str, prefer_langs: List[str] = ["en"] ) -> str:
367
  """
368
  Get YouTube transcript via API (no download). Returns plain text.
 
369
  """
370
  print('try to get youtube video transcript')
371
  try:
@@ -718,10 +718,17 @@ def decide_browse_node(state: AgentState) -> AgentState:
718
  state["use_browsing"] = bool(yt_urls) or needs_browsing(q)
719
  return state
720
 
721
-
722
  def route_browse(state: AgentState) -> Literal["browse","skip"]:
723
  return "browse" if state.get("use_browsing") else "skip"
724
 
 
 
 
 
 
 
 
 
725
  # ==== NEW: Search node ====
726
  def search_node(state: AgentState) -> AgentState:
727
  print("enter search_node")
@@ -736,6 +743,10 @@ def search_node(state: AgentState) -> AgentState:
736
  if _has_search_key():
737
  hits = web_search.invoke({"query": q, "k": 6}) or []
738
 
 
 
 
 
739
  # Optionally seed Wikipedia for short queries
740
  if len(q.split()) <= 30: #8
741
  wiki = wikipedia_lookup.invoke({"query": q, "sentences": 4})
@@ -752,14 +763,6 @@ def search_node(state: AgentState) -> AgentState:
752
  state["web_hits"] = preseed + hits
753
  return state
754
 
755
-
756
- def _is_youtube(u: str) -> bool:
757
- try:
758
- net = urlparse(u).netloc.lower()
759
- return ("youtube.com" in net) or ("youtu.be" in net)
760
- except Exception:
761
- return False
762
-
763
  def crawl_node(state: AgentState) -> AgentState:
764
  print("enter crawl_node")
765
  ev = list(state.get("evidence", []))
@@ -795,7 +798,8 @@ def crawl_node(state: AgentState) -> AgentState:
795
  # Special-case YouTube
796
  if _is_youtube(u):
797
  print("is_youtube? ", _is_youtube(u))
798
- cap = youtube_get_transcript.invoke({"url_or_id": u})
 
799
  print('caption: ', cap)
800
  if cap and not cap.startswith("[no captions"):
801
  ev.append({"kind":"doc_text","text":cap,"path":None,
 
 
1
  # Develop an AI agent with LangGraph and LangChain
2
  # to answer the questions in the "gaia-benchmark/GAIA" dataset.
3
 
 
365
  def youtube_get_transcript(url_or_id: str, prefer_langs: List[str] = ["en"] ) -> str:
366
  """
367
  Get YouTube transcript via API (no download). Returns plain text.
368
+ If requested too many times, YouTube will block the requests, which leads to an agent error.
369
  """
370
  print('try to get youtube video transcript')
371
  try:
 
718
  state["use_browsing"] = bool(yt_urls) or needs_browsing(q)
719
  return state
720
 
 
721
  def route_browse(state: AgentState) -> Literal["browse","skip"]:
722
  return "browse" if state.get("use_browsing") else "skip"
723
 
724
+
725
+ def _is_youtube(u: str) -> bool:
726
+ try:
727
+ net = urlparse(u).netloc.lower()
728
+ return ("youtube.com" in net) or ("youtu.be" in net)
729
+ except Exception:
730
+ return False
731
+
732
  # ==== NEW: Search node ====
733
  def search_node(state: AgentState) -> AgentState:
734
  print("enter search_node")
 
743
  if _has_search_key():
744
  hits = web_search.invoke({"query": q, "k": 6}) or []
745
 
746
+ # Keep only the non-YouTube links from the search results
747
+ if len(hits) > 0:
748
+ hits = [hit for hit in hits if not _is_youtube(hit["url"])]
749
+
750
  # Optionally seed Wikipedia for short queries
751
  if len(q.split()) <= 30: #8
752
  wiki = wikipedia_lookup.invoke({"query": q, "sentences": 4})
 
763
  state["web_hits"] = preseed + hits
764
  return state
765
 
 
 
 
 
 
 
 
 
766
  def crawl_node(state: AgentState) -> AgentState:
767
  print("enter crawl_node")
768
  ev = list(state.get("evidence", []))
 
798
  # Special-case YouTube
799
  if _is_youtube(u):
800
  print("is_youtube? ", _is_youtube(u))
801
+ # cap = youtube_get_transcript.invoke({"url_or_id": u}) # blocked by youtube
802
+ cap = "[no captions available]"
803
  print('caption: ', cap)
804
  if cap and not cap.startswith("[no captions"):
805
  ev.append({"kind":"doc_text","text":cap,"path":None,