Spaces:
Sleeping
Sleeping
shan gao
committed on
Commit
·
e9b8f0b
1
Parent(s):
5c7c966
change
Browse files
agent.py
CHANGED
|
@@ -1,4 +1,3 @@
|
|
| 1 |
-
# v8
|
| 2 |
# Develop an AI agent with LangGraph and LangChain
|
| 3 |
# to answer the questions in the "gaia-benchmark/GAIA" dataset.
|
| 4 |
|
|
@@ -366,6 +365,7 @@ def wikipedia_lookup(query: str, sentences: int = 4) -> Dict[str, Any]:
|
|
| 366 |
def youtube_get_transcript(url_or_id: str, prefer_langs: List[str] = ["en"] ) -> str:
|
| 367 |
"""
|
| 368 |
Get YouTube transcript via API (no download). Returns plain text.
|
|
|
|
| 369 |
"""
|
| 370 |
print('try to get youtube video transcript')
|
| 371 |
try:
|
|
@@ -718,10 +718,17 @@ def decide_browse_node(state: AgentState) -> AgentState:
|
|
| 718 |
state["use_browsing"] = bool(yt_urls) or needs_browsing(q)
|
| 719 |
return state
|
| 720 |
|
| 721 |
-
|
| 722 |
def route_browse(state: AgentState) -> Literal["browse","skip"]:
|
| 723 |
return "browse" if state.get("use_browsing") else "skip"
|
| 724 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 725 |
# ==== NEW: Search node ====
|
| 726 |
def search_node(state: AgentState) -> AgentState:
|
| 727 |
print("enter search_node")
|
|
@@ -736,6 +743,10 @@ def search_node(state: AgentState) -> AgentState:
|
|
| 736 |
if _has_search_key():
|
| 737 |
hits = web_search.invoke({"query": q, "k": 6}) or []
|
| 738 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 739 |
# Optionally seed Wikipedia for short queries
|
| 740 |
if len(q.split()) <= 30: #8
|
| 741 |
wiki = wikipedia_lookup.invoke({"query": q, "sentences": 4})
|
|
@@ -752,14 +763,6 @@ def search_node(state: AgentState) -> AgentState:
|
|
| 752 |
state["web_hits"] = preseed + hits
|
| 753 |
return state
|
| 754 |
|
| 755 |
-
|
| 756 |
-
def _is_youtube(u: str) -> bool:
|
| 757 |
-
try:
|
| 758 |
-
net = urlparse(u).netloc.lower()
|
| 759 |
-
return ("youtube.com" in net) or ("youtu.be" in net)
|
| 760 |
-
except Exception:
|
| 761 |
-
return False
|
| 762 |
-
|
| 763 |
def crawl_node(state: AgentState) -> AgentState:
|
| 764 |
print("enter crawl_node")
|
| 765 |
ev = list(state.get("evidence", []))
|
|
@@ -795,7 +798,8 @@ def crawl_node(state: AgentState) -> AgentState:
|
|
| 795 |
# Special-case YouTube
|
| 796 |
if _is_youtube(u):
|
| 797 |
print("is_youtube? ", _is_youtube(u))
|
| 798 |
-
cap = youtube_get_transcript.invoke({"url_or_id": u})
|
|
|
|
| 799 |
print('caption: ', cap)
|
| 800 |
if cap and not cap.startswith("[no captions"):
|
| 801 |
ev.append({"kind":"doc_text","text":cap,"path":None,
|
|
|
|
|
|
|
| 1 |
# Develop an AI agent with LangGraph and LangChain
|
| 2 |
# to answer the questions in the "gaia-benchmark/GAIA" dataset.
|
| 3 |
|
|
|
|
| 365 |
def youtube_get_transcript(url_or_id: str, prefer_langs: List[str] = ["en"] ) -> str:
|
| 366 |
"""
|
| 367 |
Get YouTube transcript via API (no download). Returns plain text.
|
| 368 |
+
If request too many times, will be blocked by youtube and lead to Agent error.
|
| 369 |
"""
|
| 370 |
print('try to get youtube video transcript')
|
| 371 |
try:
|
|
|
|
| 718 |
state["use_browsing"] = bool(yt_urls) or needs_browsing(q)
|
| 719 |
return state
|
| 720 |
|
|
|
|
| 721 |
def route_browse(state: AgentState) -> Literal["browse","skip"]:
|
| 722 |
return "browse" if state.get("use_browsing") else "skip"
|
| 723 |
|
| 724 |
+
|
| 725 |
+
def _is_youtube(u: str) -> bool:
|
| 726 |
+
try:
|
| 727 |
+
net = urlparse(u).netloc.lower()
|
| 728 |
+
return ("youtube.com" in net) or ("youtu.be" in net)
|
| 729 |
+
except Exception:
|
| 730 |
+
return False
|
| 731 |
+
|
| 732 |
# ==== NEW: Search node ====
|
| 733 |
def search_node(state: AgentState) -> AgentState:
|
| 734 |
print("enter search_node")
|
|
|
|
| 743 |
if _has_search_key():
|
| 744 |
hits = web_search.invoke({"query": q, "k": 6}) or []
|
| 745 |
|
| 746 |
+
# Create a new list with non-YouTube links in the search results
|
| 747 |
+
if len(hits) > 0:
|
| 748 |
+
hits = [hit for hit in hits if not _is_youtube(hit["url"])]
|
| 749 |
+
|
| 750 |
# Optionally seed Wikipedia for short queries
|
| 751 |
if len(q.split()) <= 30: #8
|
| 752 |
wiki = wikipedia_lookup.invoke({"query": q, "sentences": 4})
|
|
|
|
| 763 |
state["web_hits"] = preseed + hits
|
| 764 |
return state
|
| 765 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 766 |
def crawl_node(state: AgentState) -> AgentState:
|
| 767 |
print("enter crawl_node")
|
| 768 |
ev = list(state.get("evidence", []))
|
|
|
|
| 798 |
# Special-case YouTube
|
| 799 |
if _is_youtube(u):
|
| 800 |
print("is_youtube? ", _is_youtube(u))
|
| 801 |
+
# cap = youtube_get_transcript.invoke({"url_or_id": u}) # blocked by youtube
|
| 802 |
+
cap = "[no captions available]"
|
| 803 |
print('caption: ', cap)
|
| 804 |
if cap and not cap.startswith("[no captions"):
|
| 805 |
ev.append({"kind":"doc_text","text":cap,"path":None,
|