fmarky committed
Commit 96d3600 · Parent: f491e70

feat: create web search sub agent

agents/__init__.py ADDED
@@ -0,0 +1,5 @@
+ """Agents package for the Final Assignment Agents Course."""
+
+ from agents.assistant_agent import AwesomeAgent
+
+ __all__ = ["AwesomeAgent"]
agent.py → agents/assistant_agent.py RENAMED
@@ -24,7 +24,7 @@ from langchain_google_genai import ChatGoogleGenerativeAI
  from langchain_openai import ChatOpenAI
  from langchain_groq import ChatGroq
  from langchain_huggingface import ChatHuggingFace, HuggingFaceEndpoint
- from tools import build_tools
+ from agents.assistant_tools import build_tools
  from langfuse.langchain import CallbackHandler

  load_dotenv()
@@ -83,10 +83,7 @@ class AgentState(TypedDict):

  tools_description = """
  WEB & SEARCH:
- - duckduckgo_search: Search the web
- - wikipedia_tool: Search Wikipedia for knowledge
- - visit_webpage: Visit a webpage and extract readable markdown content
- - arxiv_tool: Search arXiv for research papers
+ - web_search_agent: web search subagent (for Wikipedia, arXiv, Web Search)

  CALCULATIONS:
  - calculator: Basic arithmetic (+, -, *, /)
tools.py → agents/assistant_tools.py RENAMED
@@ -11,42 +11,31 @@
  import base64
  import math
  import os
- import re
  from typing import Optional

  import pandas as pd
- import requests
- from bs4 import BeautifulSoup
  from dotenv import load_dotenv
  from langchain_core.messages import HumanMessage
  from langchain_core.tools import tool
  from langchain_google_genai import ChatGoogleGenerativeAI
- from markdownify import markdownify
- from requests.exceptions import RequestException
- import wikipedia

  # [1] Import Built-in LangChain tools
  # ---

- from langchain_community.tools import (
-     DuckDuckGoSearchRun,
-     ArxivQueryRun,
-     ShellTool,
- )
- from langchain_community.utilities import (
-     DuckDuckGoSearchAPIWrapper,
-     ArxivAPIWrapper,
- )
+ from langchain_community.tools import ShellTool
  from langchain_experimental.tools import PythonREPLTool
  from langchain_community.document_loaders import AssemblyAIAudioTranscriptLoader
  from langchain_community.document_loaders.assemblyai import TranscriptFormat

  # Youtube related tools
- from youtube_transcript import (
+ from agents.youtube_transcript_tool import (
      get_youtube_transcript_tool,
      get_youtube_title_description_tool,
  )

+ # Web search subagent
+ from agents.web_search_subagent import web_search_agent
+
  load_dotenv()
  vision_llm = ChatGoogleGenerativeAI(model=os.getenv("GOOGLE_VISION_MODEL"))

@@ -187,33 +176,6 @@ def read_excel_file(file_path: str, sheet_name: Optional[str] = None) -> str:
      return f"Excel reading error: {str(e)}"


- @tool
- def visit_webpage(url: str) -> str:
-     """
-     Visits a webpage at the given URL and returns its content as a markdown string.
-     Use this to browse and extract readable content from webpages.
-     """
-     try:
-         response = requests.get(url, timeout=20)
-         response.raise_for_status()
-         markdown_content = markdownify(response.text).strip()
-         markdown_content = re.sub(r"\n{3,}", "\n\n", markdown_content)
-         MAX_LEN = 40000
-         if len(markdown_content) > MAX_LEN:
-             return (
-                 markdown_content[: MAX_LEN // 2]
-                 + f"\n\n...[Content truncated to {MAX_LEN} chars]...\n\n"
-                 + markdown_content[-MAX_LEN // 2 :]
-             )
-         return markdown_content
-     except requests.exceptions.Timeout:
-         return "Timeout while trying to access the webpage."
-     except RequestException as e:
-         return f"Request error: {str(e)}"
-     except Exception as e:
-         return f"Unexpected error: {str(e)}"
-
-
  @tool
  def transcribe_mp3(
      file_path: str,
@@ -240,84 +202,6 @@ def transcribe_mp3(
      return f"Transcription error: {str(e)}"


- def _fetch_wikipedia_page_with_tables(page_url: str) -> Optional[str]:
-     """Fetch full Wikipedia page content including tables using markdownify."""
-     try:
-         response = requests.get(
-             page_url, timeout=10, headers={"User-Agent": "Mozilla/5.0"}
-         )
-         if response.status_code == 200:
-             soup = BeautifulSoup(response.text, "html.parser")
-
-             # Extract main content area (preserves infoboxes, tables, article content)
-             main_content = soup.find(id="mw-content-text") or soup.find(
-                 class_="mw-parser-output"
-             )
-
-             if main_content:
-                 # Remove UI elements only
-                 for element in main_content.find_all(
-                     class_=lambda x: x
-                     and any(
-                         term in str(x).lower()
-                         for term in ["mw-jump-link", "mw-editsection", "toc"]
-                     )
-                 ):
-                     element.decompose()
-                 content = markdownify(str(main_content), heading_style="ATX")
-             else:
-                 # Fallback: remove top-level navigation elements
-                 for tag in soup.find_all(["nav", "aside", "footer", "header"]):
-                     tag.decompose()
-                 content = markdownify(str(soup), heading_style="ATX")
-
-             return re.sub(r"\n{3,}", "\n\n", content)
-     except Exception:
-         pass
-     return None
-
-
- @tool
- def wikipedia_tool(query: str) -> str:
-     """
-     A wrapper around Wikipedia. Useful for when you need to answer general questions about
-     people, places, companies, facts, historical events, or other subjects.
-     Returns the FULL CONTENT of Wikipedia pages (not just summaries), including tables.
-     Input should be a search query.
-     """
-     try:
-         # Search returns page titles (strings) - need to resolve to get canonical URL
-         # Example: search("Mercedes") -> ["Mercedes Sosa", ...] (titles, not URLs)
-         page_titles = wikipedia.search(query[:300], results=3)
-         results = []
-
-         for page_title in page_titles[:3]:
-             try:
-                 # Get page object to resolve canonical URL (handles redirects, special chars)
-                 # Example: "Mercedes Sosa" -> "https://en.wikipedia.org/wiki/Mercedes_Sosa" (handles parentheses)
-                 wiki_page = wikipedia.page(title=page_title, auto_suggest=False)
-                 # Fetch full HTML content with tables (better than wiki_page.content which is text-only)
-                 # Example: "Live albums" table preserved in HTML but missing from wiki_page.content
-                 full_content = _fetch_wikipedia_page_with_tables(wiki_page.url)
-                 content = f"Page: {page_title}\nURL: {wiki_page.url}\n\nContent:\n{full_content}"
-                 results.append(content)
-             except (
-                 wikipedia.exceptions.PageError,
-                 wikipedia.exceptions.DisambiguationError,
-             ):
-                 continue
-
-         if not results:
-             return "No good Wikipedia Search Result was found"
-
-         return "\n\n" + "=" * 80 + "\n\n".join(results)
-
-     except ImportError:
-         return "Error: wikipedia package not installed. Install with: pip install wikipedia"
-     except Exception as e:
-         return f"Wikipedia search error: {str(e)}"
-
-
  def build_tools():
      """
      Initialize and return a list of built-in and custom LangChain tools.
@@ -327,11 +211,6 @@ def build_tools():
      # ---

      # Initialize built-in LangChain tools
-     # Note: wikipedia_tool is now a custom tool defined above that returns full page content
-     duckduckgo_search = DuckDuckGoSearchRun(
-         api_wrapper=DuckDuckGoSearchAPIWrapper(max_results=15)
-     )
-     arxiv_tool = ArxivQueryRun(api_wrapper=ArxivAPIWrapper())
      python_repl = PythonREPLTool()
      shell_tool = ShellTool()

@@ -341,13 +220,11 @@
      # Combine built-in tools with custom tools
      all_tools = [
          # Built-in LangChain tools
-         duckduckgo_search,
-         arxiv_tool,
          python_repl,
          shell_tool,
+         # Web search subagent (replaces individual web/search tools for isolated context)
+         web_search_agent,
          # Custom tools for specialized tasks
-         wikipedia_tool,
-         visit_webpage,
          read_excel_file,
          get_youtube_transcript_tool,
          get_youtube_title_description_tool,
@@ -364,9 +241,6 @@
  if __name__ == "__main__":
      from pprint import pprint

-     print("\n--- wikipedia_tool ---")
-     pprint(wikipedia_tool.invoke({"query": "Mercedes Sosa"}))
-
      print("\n--- reverse_text ---")
      pprint(reverse_text.invoke({"text": "hello"}))

@@ -388,10 +262,6 @@ if __name__ == "__main__":
          )
      )

-     print("\n--- visit_webpage ---")
-     result = visit_webpage.invoke({"url": "https://example.com"})
-     print(result[:200] + "...\n")  # truncated for display
-
      print("\n--- ask_question_on_image_content ---")
      pprint(
          ask_question_on_image_content.invoke(
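
A minimal smoke check of the rewired build_tools (hypothetical snippet, not part of the commit; assumes the repo root is on PYTHONPATH and the GOOGLE_* environment variables read at import time are set):

from agents.assistant_tools import build_tools

tool_names = [t.name for t in build_tools()]
assert "web_search_agent" in tool_names        # the subagent surfaces as one tool
assert "duckduckgo_search" not in tool_names   # individual search tools now live inside the subagent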
agents/web_search_subagent.py ADDED
@@ -0,0 +1,100 @@
+ # =============================================================================
+ # WEB SEARCH SUBAGENT - Isolated context for web/search operations
+ #
+ # TABLE OF CONTENTS
+ # [1] Subagent State Definition
+ # [2] Web Search Specialist Node
+ # [3] Graph Builder
+ # [4] Tool Wrapper
+ # =============================================================================
+
+ from typing import TypedDict, Annotated
+
+ from langchain_core.messages import HumanMessage, SystemMessage, AnyMessage
+ from langchain_core.tools import tool
+ from langchain_google_genai import ChatGoogleGenerativeAI
+ from langgraph.graph import START, StateGraph
+ from langgraph.graph.message import add_messages
+ from langgraph.prebuilt import ToolNode, tools_condition
+
+ from agents.web_search_tools import get_web_search_tools
+
+
+ # [1] Subagent State Definition
+ # ---
+
+
+ class WebSearchState(TypedDict):
+     """State for web search subagent with isolated context."""
+
+     messages: Annotated[list[AnyMessage], add_messages]
+
+
+ # [2] Web Search Specialist Node
+ # ---
+
+
+ def _web_search_specialist(state: WebSearchState, llm):
+     """Node that routes web/search queries to appropriate tools."""
+     sys_msg = SystemMessage(
+         content="""
+ You are a web search specialist agent. Answer queries using:
+ - wikipedia_tool: For general knowledge, people, places, historical facts
+ - arxiv_tool: For research papers, scientific articles
+ - duckduckgo_search: For current events, news, general web search
+ - visit_webpage: When a specific URL is provided or found
+
+ Use tools as needed and provide a clear, concise final answer.
+ """
+     )
+     web_tools = get_web_search_tools()
+     llm_with_tools = llm.bind_tools(web_tools)
+     return {"messages": [llm_with_tools.invoke([sys_msg] + state["messages"])]}
+
+
+ # [3] Graph Builder
+ # ---
+
+
+ def _build_web_search_subagent(llm):
+     """Build a subagent that handles web/search tasks with isolated context."""
+     web_tools = get_web_search_tools()
+
+     def web_search_specialist(state: WebSearchState):
+         return _web_search_specialist(state, llm)
+
+     builder = StateGraph(WebSearchState)
+     builder.add_node("web_search_specialist", web_search_specialist)
+     builder.add_node("tools", ToolNode(web_tools))
+     builder.add_edge(START, "web_search_specialist")
+     builder.add_conditional_edges("web_search_specialist", tools_condition)
+     builder.add_edge("tools", "web_search_specialist")
+
+     return builder.compile()
+
+
+ # [4] Tool Wrapper
+ # ---
+
+
+ @tool
+ def web_search_agent(query: str) -> str:
+     """
+     Intelligent web search agent with isolated context.
+
+     Routes and executes web/search tasks. Use this for any web search,
+     Wikipedia lookups, arXiv papers, or webpage visits.
+     Returns only the final answer, keeping the main agent's context clean.
+
+     Example queries:
+     - "Who is Mercedes Sosa?" (uses Wikipedia)
+     - "Latest research on transformers" (uses arXiv)
+     - "Current news about AI" (uses DuckDuckGo)
+     - "Visit https://example.com and summarize" (uses visit_webpage)
+     """
+     llm = ChatGoogleGenerativeAI(model="gemini-2.5-flash", temperature=0.1)
+     subagent = _build_web_search_subagent(llm)
+
+     result = subagent.invoke({"messages": [HumanMessage(content=query)]})
+
+     return result["messages"][-1].content
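
The wrapper can also be exercised on its own; a minimal sketch, assuming GOOGLE_API_KEY is set for the Gemini model:

from agents.web_search_subagent import web_search_agent

# LangChain @tool functions are invoked with a dict of their arguments
answer = web_search_agent.invoke({"query": "Who is Mercedes Sosa?"})
print(answer)  # only the final answer; intermediate tool calls stay in the subagent's own context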
agents/web_search_tools.py ADDED
@@ -0,0 +1,140 @@
+ # =============================================================================
+ # WEB SEARCH TOOLS - Tools for web/search operations
+ #
+ # TABLE OF CONTENTS
+ # [1] Web Search Tools
+ # =============================================================================
+
+ import re
+ from typing import Optional
+
+ import requests
+ import wikipedia
+ from bs4 import BeautifulSoup
+ from langchain_core.tools import tool
+ from langchain_community.tools import DuckDuckGoSearchRun, ArxivQueryRun
+ from langchain_community.utilities import (
+     DuckDuckGoSearchAPIWrapper,
+     ArxivAPIWrapper,
+ )
+ from markdownify import markdownify
+ from requests.exceptions import RequestException
+
+
+ # [1] Web Search Tools
+ # ---
+
+
+ def _fetch_wikipedia_page_with_tables(page_url: str) -> Optional[str]:
+     """Fetch full Wikipedia page content including tables using markdownify."""
+     try:
+         response = requests.get(
+             page_url, timeout=10, headers={"User-Agent": "Mozilla/5.0"}
+         )
+         if response.status_code == 200:
+             soup = BeautifulSoup(response.text, "html.parser")
+
+             # Extract main content area (preserves infoboxes, tables, article content)
+             main_content = soup.find(id="mw-content-text") or soup.find(
+                 class_="mw-parser-output"
+             )
+
+             if main_content:
+                 # Remove UI elements only
+                 for element in main_content.find_all(
+                     class_=lambda x: x
+                     and any(
+                         term in str(x).lower()
+                         for term in ["mw-jump-link", "mw-editsection", "toc"]
+                     )
+                 ):
+                     element.decompose()
+                 content = markdownify(str(main_content), heading_style="ATX")
+             else:
+                 # Fallback: remove top-level navigation elements
+                 for tag in soup.find_all(["nav", "aside", "footer", "header"]):
+                     tag.decompose()
+                 content = markdownify(str(soup), heading_style="ATX")
+
+             return re.sub(r"\n{3,}", "\n\n", content)
+     except Exception:
+         pass
+     return None
+
+
+ @tool
+ def wikipedia_tool(query: str) -> str:
+     """
+     A wrapper around Wikipedia. Useful when you need to answer general questions about
+     people, places, companies, facts, historical events, or other subjects.
+     Returns the FULL CONTENT of Wikipedia pages (not just summaries), including tables.
+     Input should be a search query.
+     """
+     try:
+         # Search returns page titles (strings) - need to resolve to get canonical URL
+         # Example: search("Mercedes") -> ["Mercedes Sosa", ...] (titles, not URLs)
+         page_titles = wikipedia.search(query[:300], results=3)
+         results = []
+
+         for page_title in page_titles[:3]:
+             try:
+                 # Get page object to resolve canonical URL (handles redirects, special chars)
+                 # Example: "Mercedes Sosa" -> "https://en.wikipedia.org/wiki/Mercedes_Sosa" (handles parentheses)
+                 wiki_page = wikipedia.page(title=page_title, auto_suggest=False)
+                 # Fetch full HTML content with tables (better than wiki_page.content, which is text-only)
+                 # Example: "Live albums" table preserved in HTML but missing from wiki_page.content
+                 full_content = _fetch_wikipedia_page_with_tables(wiki_page.url)
+                 content = f"Page: {page_title}\nURL: {wiki_page.url}\n\nContent:\n{full_content}"
+                 results.append(content)
+             except (
+                 wikipedia.exceptions.PageError,
+                 wikipedia.exceptions.DisambiguationError,
+             ):
+                 continue
+
+         if not results:
+             return "No good Wikipedia Search Result was found"
+
+         return "\n\n" + ("\n\n" + "=" * 80 + "\n\n").join(results)
+
+     except ImportError:
+         return "Error: wikipedia package not installed. Install with: pip install wikipedia"
+     except Exception as e:
+         return f"Wikipedia search error: {str(e)}"
+
+
+ @tool
+ def visit_webpage(url: str) -> str:
+     """
+     Visits a webpage at the given URL and returns its content as a markdown string.
+     Use this to browse and extract readable content from webpages.
+     """
+     try:
+         response = requests.get(url, timeout=20)
+         response.raise_for_status()
+         markdown_content = markdownify(response.text).strip()
+         markdown_content = re.sub(r"\n{3,}", "\n\n", markdown_content)
+         MAX_LEN = 40000
+         if len(markdown_content) > MAX_LEN:
+             return (
+                 markdown_content[: MAX_LEN // 2]
+                 + f"\n\n...[Content truncated to {MAX_LEN} chars]...\n\n"
+                 + markdown_content[-MAX_LEN // 2 :]
+             )
+         return markdown_content
+     except requests.exceptions.Timeout:
+         return "Timeout while trying to access the webpage."
+     except RequestException as e:
+         return f"Request error: {str(e)}"
+     except Exception as e:
+         return f"Unexpected error: {str(e)}"
+
+
+ def get_web_search_tools():
+     """Initialize and return web search tools."""
+     duckduckgo_search = DuckDuckGoSearchRun(
+         api_wrapper=DuckDuckGoSearchAPIWrapper(max_results=15)
+     )
+     arxiv_tool = ArxivQueryRun(api_wrapper=ArxivAPIWrapper())
+     return [duckduckgo_search, arxiv_tool, wikipedia_tool, visit_webpage]
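
To see what the subagent gets to work with, a quick inspection sketch (hypothetical; assumes duckduckgo-search, arxiv, wikipedia, beautifulsoup4, and markdownify are installed):

from agents.web_search_tools import get_web_search_tools

for t in get_web_search_tools():
    # each entry is a LangChain tool carrying the name/description the LLM sees
    print(t.name, "->", t.description.strip().splitlines()[0])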
youtube_transcript.py → agents/youtube_transcript_tool.py RENAMED
File without changes
app.py CHANGED
@@ -13,7 +13,7 @@ import requests
  from dotenv import load_dotenv
  from langfuse import get_client

- from agent import AwesomeAgent
+ from agents.assistant_agent import AwesomeAgent

  load_dotenv()