fmarky committed
Commit 96d3600 · Parent: f491e70

feat: create web search sub agent

agents/__init__.py ADDED
@@ -0,0 +1,5 @@
+ """Agents package for the Final Assignment Agents Course."""
+
+ from agents.assistant_agent import AwesomeAgent
+
+ __all__ = ["AwesomeAgent"]
agent.py → agents/assistant_agent.py RENAMED
@@ -24,7 +24,7 @@ from langchain_google_genai import ChatGoogleGenerativeAI
  from langchain_openai import ChatOpenAI
  from langchain_groq import ChatGroq
  from langchain_huggingface import ChatHuggingFace, HuggingFaceEndpoint
- from tools import build_tools
+ from agents.assistant_tools import build_tools
  from langfuse.langchain import CallbackHandler

  load_dotenv()
@@ -83,10 +83,7 @@ class AgentState(TypedDict):

  tools_description = """
  WEB & SEARCH:
- - duckduckgo_search: Search the web
- - wikipedia_tool: Search Wikipedia for knowledge
- - visit_webpage: Visit a webpage and extract readable markdown content
- - arxiv_tool: Search arXiv for research papers
+ - web_search_agent: web search subagent (for Wikipedia, arXiv, Web Search)

  CALCULATIONS:
  - calculator: Basic arithmetic (+, -, *, /)
tools.py → agents/assistant_tools.py RENAMED
@@ -11,42 +11,31 @@
  import base64
  import math
  import os
- import re
  from typing import Optional

  import pandas as pd
- import requests
- from bs4 import BeautifulSoup
  from dotenv import load_dotenv
  from langchain_core.messages import HumanMessage
  from langchain_core.tools import tool
  from langchain_google_genai import ChatGoogleGenerativeAI
- from markdownify import markdownify
- from requests.exceptions import RequestException
- import wikipedia

  # [1] Import Built-in LangChain tools
  # ---

- from langchain_community.tools import (
-     DuckDuckGoSearchRun,
-     ArxivQueryRun,
-     ShellTool,
- )
- from langchain_community.utilities import (
-     DuckDuckGoSearchAPIWrapper,
-     ArxivAPIWrapper,
- )
+ from langchain_community.tools import ShellTool
  from langchain_experimental.tools import PythonREPLTool
  from langchain_community.document_loaders import AssemblyAIAudioTranscriptLoader
  from langchain_community.document_loaders.assemblyai import TranscriptFormat

  # Youtube related tools
- from youtube_transcript import (
+ from agents.youtube_transcript_tool import (
      get_youtube_transcript_tool,
      get_youtube_title_description_tool,
  )

+ # Web search subagent
+ from agents.web_search_subagent import web_search_agent
+
  load_dotenv()
  vision_llm = ChatGoogleGenerativeAI(model=os.getenv("GOOGLE_VISION_MODEL"))

@@ -187,33 +176,6 @@ def read_excel_file(file_path: str, sheet_name: Optional[str] = None) -> str:
      return f"Excel reading error: {str(e)}"


- @tool
- def visit_webpage(url: str) -> str:
-     """
-     Visits a webpage at the given URL and returns its content as a markdown string.
-     Use this to browse and extract readable content from webpages.
-     """
-     try:
-         response = requests.get(url, timeout=20)
-         response.raise_for_status()
-         markdown_content = markdownify(response.text).strip()
-         markdown_content = re.sub(r"\n{3,}", "\n\n", markdown_content)
-         MAX_LEN = 40000
-         if len(markdown_content) > MAX_LEN:
-             return (
-                 markdown_content[: MAX_LEN // 2]
-                 + f"\n\n...[Content truncated to {MAX_LEN} chars]...\n\n"
-                 + markdown_content[-MAX_LEN // 2 :]
-             )
-         return markdown_content
-     except requests.exceptions.Timeout:
-         return "Timeout while trying to access the webpage."
-     except RequestException as e:
-         return f"Request error: {str(e)}"
-     except Exception as e:
-         return f"Unexpected error: {str(e)}"
-
-
  @tool
  def transcribe_mp3(
      file_path: str,
@@ -240,84 +202,6 @@ def transcribe_mp3(
      return f"Transcription error: {str(e)}"


- def _fetch_wikipedia_page_with_tables(page_url: str) -> Optional[str]:
-     """Fetch full Wikipedia page content including tables using markdownify."""
-     try:
-         response = requests.get(
-             page_url, timeout=10, headers={"User-Agent": "Mozilla/5.0"}
-         )
-         if response.status_code == 200:
-             soup = BeautifulSoup(response.text, "html.parser")
-
-             # Extract main content area (preserves infoboxes, tables, article content)
-             main_content = soup.find(id="mw-content-text") or soup.find(
-                 class_="mw-parser-output"
-             )
-
-             if main_content:
-                 # Remove UI elements only
-                 for element in main_content.find_all(
-                     class_=lambda x: x
-                     and any(
-                         term in str(x).lower()
-                         for term in ["mw-jump-link", "mw-editsection", "toc"]
-                     )
-                 ):
-                     element.decompose()
-                 content = markdownify(str(main_content), heading_style="ATX")
-             else:
-                 # Fallback: remove top-level navigation elements
-                 for tag in soup.find_all(["nav", "aside", "footer", "header"]):
-                     tag.decompose()
-                 content = markdownify(str(soup), heading_style="ATX")
-
-             return re.sub(r"\n{3,}", "\n\n", content)
-     except Exception:
-         pass
-     return None
-
-
- @tool
- def wikipedia_tool(query: str) -> str:
-     """
-     A wrapper around Wikipedia. Useful for when you need to answer general questions about
-     people, places, companies, facts, historical events, or other subjects.
-     Returns the FULL CONTENT of Wikipedia pages (not just summaries), including tables.
-     Input should be a search query.
-     """
-     try:
-         # Search returns page titles (strings) - need to resolve to get canonical URL
-         # Example: search("Mercedes") -> ["Mercedes Sosa", ...] (titles, not URLs)
-         page_titles = wikipedia.search(query[:300], results=3)
-         results = []
-
-         for page_title in page_titles[:3]:
-             try:
-                 # Get page object to resolve canonical URL (handles redirects, special chars)
-                 # Example: "Mercedes Sosa" -> "https://en.wikipedia.org/wiki/Mercedes_Sosa" (handles parentheses)
-                 wiki_page = wikipedia.page(title=page_title, auto_suggest=False)
-                 # Fetch full HTML content with tables (better than wiki_page.content which is text-only)
-                 # Example: "Live albums" table preserved in HTML but missing from wiki_page.content
-                 full_content = _fetch_wikipedia_page_with_tables(wiki_page.url)
-                 content = f"Page: {page_title}\nURL: {wiki_page.url}\n\nContent:\n{full_content}"
-                 results.append(content)
-             except (
-                 wikipedia.exceptions.PageError,
-                 wikipedia.exceptions.DisambiguationError,
-             ):
-                 continue
-
-         if not results:
-             return "No good Wikipedia Search Result was found"
-
-         return "\n\n" + "=" * 80 + "\n\n".join(results)
-
-     except ImportError:
-         return "Error: wikipedia package not installed. Install with: pip install wikipedia"
-     except Exception as e:
-         return f"Wikipedia search error: {str(e)}"
-
-
  def build_tools():
      """
      Initialize and return a list of built-in and custom LangChain tools.
@@ -327,11 +211,6 @@ def build_tools():
      # ---

      # Initialize built-in LangChain tools
-     # Note: wikipedia_tool is now a custom tool defined above that returns full page content
-     duckduckgo_search = DuckDuckGoSearchRun(
-         api_wrapper=DuckDuckGoSearchAPIWrapper(max_results=15)
-     )
-     arxiv_tool = ArxivQueryRun(api_wrapper=ArxivAPIWrapper())
      python_repl = PythonREPLTool()
      shell_tool = ShellTool()

@@ -341,13 +220,11 @@
      # Combine built-in tools with custom tools
      all_tools = [
          # Built-in LangChain tools
-         duckduckgo_search,
-         arxiv_tool,
          python_repl,
          shell_tool,
+         # Web search subagent (replaces individual web/search tools for isolated context)
+         web_search_agent,
          # Custom tools for specialized tasks
-         wikipedia_tool,
-         visit_webpage,
          read_excel_file,
          get_youtube_transcript_tool,
          get_youtube_title_description_tool,
@@ -364,9 +241,6 @@
  if __name__ == "__main__":
      from pprint import pprint

-     print("\n--- wikipedia_tool ---")
-     pprint(wikipedia_tool.invoke({"query": "Mercedes Sosa"}))
-
      print("\n--- reverse_text ---")
      pprint(reverse_text.invoke({"text": "hello"}))

@@ -388,10 +262,6 @@ if __name__ == "__main__":
          )
      )

-     print("\n--- visit_webpage ---")
-     result = visit_webpage.invoke({"url": "https://example.com"})
-     print(result[:200] + "...\n")  # truncated for display
-
      print("\n--- ask_question_on_image_content ---")
      pprint(
          ask_question_on_image_content.invoke(
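
A minimal smoke check of the rewired build_tools (hypothetical snippet, not part of the commit; assumes the repo root is on PYTHONPATH and the GOOGLE_* environment variables read at import time are set):

from agents.assistant_tools import build_tools

tool_names = [t.name for t in build_tools()]
assert "web_search_agent" in tool_names        # the subagent surfaces as one tool
assert "duckduckgo_search" not in tool_names   # individual search tools now live inside the subagent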
agents/web_search_subagent.py ADDED
@@ -0,0 +1,100 @@
+ # =============================================================================
+ # WEB SEARCH SUBAGENT - Isolated context for web/search operations
+ #
+ # TABLE OF CONTENTS
+ # [1] Subagent State Definition
+ # [2] Web Search Specialist Node
+ # [3] Graph Builder
+ # [4] Tool Wrapper
+ # =============================================================================
+
+ from typing import TypedDict, Annotated
+
+ from langchain_core.messages import HumanMessage, SystemMessage, AnyMessage
+ from langchain_core.tools import tool
+ from langchain_google_genai import ChatGoogleGenerativeAI
+ from langgraph.graph import START, StateGraph
+ from langgraph.graph.message import add_messages
+ from langgraph.prebuilt import ToolNode, tools_condition
+
+ from agents.web_search_tools import get_web_search_tools
+
+
+ # [1] Subagent State Definition
+ # ---
+
+
+ class WebSearchState(TypedDict):
+     """State for web search subagent with isolated context."""
+
+     messages: Annotated[list[AnyMessage], add_messages]
+
+
+ # [2] Web Search Specialist Node
+ # ---
+
+
+ def _web_search_specialist(state: WebSearchState, llm):
+     """Node that routes web/search queries to appropriate tools."""
+     sys_msg = SystemMessage(
+         content="""
+ You are a web search specialist agent. Answer queries using:
+ - wikipedia_tool: For general knowledge, people, places, historical facts
+ - arxiv_tool: For research papers, scientific articles
+ - duckduckgo_search: For current events, news, general web search
+ - visit_webpage: When a specific URL is provided or found
+
+ Use tools as needed and provide a clear, concise final answer.
+ """
+     )
+     web_tools = get_web_search_tools()
+     llm_with_tools = llm.bind_tools(web_tools)
+     return {"messages": [llm_with_tools.invoke([sys_msg] + state["messages"])]}
+
+
+ # [3] Graph Builder
+ # ---
+
+
+ def _build_web_search_subagent(llm):
+     """Build a subagent that handles web/search tasks with isolated context."""
+     web_tools = get_web_search_tools()
+
+     def web_search_specialist(state: WebSearchState):
+         return _web_search_specialist(state, llm)
+
+     builder = StateGraph(WebSearchState)
+     builder.add_node("web_search_specialist", web_search_specialist)
+     builder.add_node("tools", ToolNode(web_tools))
+     builder.add_edge(START, "web_search_specialist")
+     builder.add_conditional_edges("web_search_specialist", tools_condition)
+     builder.add_edge("tools", "web_search_specialist")
+
+     return builder.compile()
+
+
+ # [4] Tool Wrapper
+ # ---
+
+
+ @tool
+ def web_search_agent(query: str) -> str:
+     """
+     Intelligent web search agent with isolated context.
+
+     Routes and executes web/search tasks. Use this for any web search,
+     Wikipedia lookups, arXiv papers, or webpage visits.
+     Returns only the final answer, keeping the main agent's context clean.
+
+     Example queries:
+     - "Who is Mercedes Sosa?" (uses Wikipedia)
+     - "Latest research on transformers" (uses arXiv)
+     - "Current news about AI" (uses DuckDuckGo)
+     - "Visit https://example.com and summarize" (uses visit_webpage)
+     """
+     llm = ChatGoogleGenerativeAI(model="gemini-2.5-flash", temperature=0.1)
+     subagent = _build_web_search_subagent(llm)
+
+     result = subagent.invoke({"messages": [HumanMessage(content=query)]})
+
+     return result["messages"][-1].content
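
The wrapper can also be exercised on its own; a minimal sketch, assuming GOOGLE_API_KEY is set for the Gemini model:

from agents.web_search_subagent import web_search_agent

# LangChain @tool functions are invoked with a dict of their arguments
answer = web_search_agent.invoke({"query": "Who is Mercedes Sosa?"})
print(answer)  # only the final answer; intermediate tool calls stay in the subagent's own context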
agents/web_search_tools.py ADDED
@@ -0,0 +1,140 @@
+ # =============================================================================
+ # WEB SEARCH TOOLS - Tools for web/search operations
+ #
+ # TABLE OF CONTENTS
+ # [1] Web Search Tools
+ # =============================================================================
+
+ import re
+ from typing import Optional
+
+ import requests
+ import wikipedia
+ from bs4 import BeautifulSoup
+ from langchain_core.tools import tool
+ from langchain_community.tools import DuckDuckGoSearchRun, ArxivQueryRun
+ from langchain_community.utilities import (
+     DuckDuckGoSearchAPIWrapper,
+     ArxivAPIWrapper,
+ )
+ from markdownify import markdownify
+ from requests.exceptions import RequestException
+
+
+ # [1] Web Search Tools
+ # ---
+
+
+ def _fetch_wikipedia_page_with_tables(page_url: str) -> Optional[str]:
+     """Fetch full Wikipedia page content including tables using markdownify."""
+     try:
+         response = requests.get(
+             page_url, timeout=10, headers={"User-Agent": "Mozilla/5.0"}
+         )
+         if response.status_code == 200:
+             soup = BeautifulSoup(response.text, "html.parser")
+
+             # Extract main content area (preserves infoboxes, tables, article content)
+             main_content = soup.find(id="mw-content-text") or soup.find(
+                 class_="mw-parser-output"
+             )
+
+             if main_content:
+                 # Remove UI elements only
+                 for element in main_content.find_all(
+                     class_=lambda x: x
+                     and any(
+                         term in str(x).lower()
+                         for term in ["mw-jump-link", "mw-editsection", "toc"]
+                     )
+                 ):
+                     element.decompose()
+                 content = markdownify(str(main_content), heading_style="ATX")
+             else:
+                 # Fallback: remove top-level navigation elements
+                 for tag in soup.find_all(["nav", "aside", "footer", "header"]):
+                     tag.decompose()
+                 content = markdownify(str(soup), heading_style="ATX")
+
+             return re.sub(r"\n{3,}", "\n\n", content)
+     except Exception:
+         pass
+     return None
+
+
+ @tool
+ def wikipedia_tool(query: str) -> str:
+     """
+     A wrapper around Wikipedia. Useful when you need to answer general questions about
+     people, places, companies, facts, historical events, or other subjects.
+     Returns the FULL CONTENT of Wikipedia pages (not just summaries), including tables.
+     Input should be a search query.
+     """
+     try:
+         # Search returns page titles (strings) - need to resolve to get canonical URL
+         # Example: search("Mercedes") -> ["Mercedes Sosa", ...] (titles, not URLs)
+         page_titles = wikipedia.search(query[:300], results=3)
+         results = []
+
+         for page_title in page_titles[:3]:
+             try:
+                 # Get page object to resolve canonical URL (handles redirects, special chars)
+                 # Example: "Mercedes Sosa" -> "https://en.wikipedia.org/wiki/Mercedes_Sosa" (handles parentheses)
+                 wiki_page = wikipedia.page(title=page_title, auto_suggest=False)
+                 # Fetch full HTML content with tables (better than wiki_page.content, which is text-only)
+                 # Example: "Live albums" table preserved in HTML but missing from wiki_page.content
+                 full_content = _fetch_wikipedia_page_with_tables(wiki_page.url)
+                 content = f"Page: {page_title}\nURL: {wiki_page.url}\n\nContent:\n{full_content}"
+                 results.append(content)
+             except (
+                 wikipedia.exceptions.PageError,
+                 wikipedia.exceptions.DisambiguationError,
+             ):
+                 continue
+
+         if not results:
+             return "No good Wikipedia Search Result was found"
+
+         return "\n\n" + ("\n\n" + "=" * 80 + "\n\n").join(results)
+
+     except ImportError:
+         return "Error: wikipedia package not installed. Install with: pip install wikipedia"
+     except Exception as e:
+         return f"Wikipedia search error: {str(e)}"
+
+
+ @tool
+ def visit_webpage(url: str) -> str:
+     """
+     Visits a webpage at the given URL and returns its content as a markdown string.
+     Use this to browse and extract readable content from webpages.
+     """
+     try:
+         response = requests.get(url, timeout=20)
+         response.raise_for_status()
+         markdown_content = markdownify(response.text).strip()
+         markdown_content = re.sub(r"\n{3,}", "\n\n", markdown_content)
+         MAX_LEN = 40000
+         if len(markdown_content) > MAX_LEN:
+             return (
+                 markdown_content[: MAX_LEN // 2]
+                 + f"\n\n...[Content truncated to {MAX_LEN} chars]...\n\n"
+                 + markdown_content[-MAX_LEN // 2 :]
+             )
+         return markdown_content
+     except requests.exceptions.Timeout:
+         return "Timeout while trying to access the webpage."
+     except RequestException as e:
+         return f"Request error: {str(e)}"
+     except Exception as e:
+         return f"Unexpected error: {str(e)}"
+
+
+ def get_web_search_tools():
+     """Initialize and return web search tools."""
+     duckduckgo_search = DuckDuckGoSearchRun(
+         api_wrapper=DuckDuckGoSearchAPIWrapper(max_results=15)
+     )
+     arxiv_tool = ArxivQueryRun(api_wrapper=ArxivAPIWrapper())
+     return [duckduckgo_search, arxiv_tool, wikipedia_tool, visit_webpage]
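
To see what the subagent gets to work with, a quick inspection sketch (hypothetical; assumes duckduckgo-search, arxiv, wikipedia, beautifulsoup4, and markdownify are installed):

from agents.web_search_tools import get_web_search_tools

for t in get_web_search_tools():
    # each entry is a LangChain tool carrying the name/description the LLM sees
    print(t.name, "->", t.description.strip().splitlines()[0])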
youtube_transcript.py → agents/youtube_transcript_tool.py RENAMED
File without changes
app.py CHANGED
@@ -13,7 +13,7 @@ import requests
  from dotenv import load_dotenv
  from langfuse import get_client

- from agent import AwesomeAgent
+ from agents.assistant_agent import AwesomeAgent

  load_dotenv()