Charles Azam committed on
Commit
7c1f478
·
1 Parent(s): bb62e6b

feat: add test for scrawl web agent

Browse files
src/deepengineer/deepsearch/scawl_web_agent.py CHANGED
@@ -1,10 +1,9 @@
1
- from smolagents import CodeAgent, Tool, LiteLLMModel, tool, ToolCallingAgent
2
  from deepengineer.webcrawler.async_search import (
3
  linkup_search_async, arxiv_search_async,
4
  pubmed_search_async, scientific_search_async,
5
  )
6
  from deepengineer.webcrawler.pdf_utils import get_table_of_contents_per_page_markdown, convert_ocr_response_to_markdown, get_markdown_by_page_numbers, find_in_markdown
7
- from mistralai import OCRResponse
8
  from enum import Enum
9
  import asyncio
10
  from deepengineer.webcrawler.async_search import SearchResponse
@@ -12,7 +11,7 @@ from deepengineer.webcrawler.crawl_database import DataBase
12
 
13
  class ToolNames(Enum):
14
  # Search tools
15
- SEARCH_TOOL = "web search tool"
16
  ARXIV_SEARCH = "arxiv_search"
17
  PUBMED_SEARCH = "pubmed_search"
18
  SCIENCEDIRECT_SEARCH = "sciencedirect_search"
@@ -24,9 +23,9 @@ class ToolNames(Enum):
24
  GET_PAGES_CONTENT = "get_pages_content"
25
  FIND_IN_MARKDOWN = "find_in_markdown"
26
 
27
- def filter_search_results(search_response: SearchResponse, max_nb_results: int = 10) -> SearchResponse:
28
  search_response.search_results = search_response.search_results[:max_nb_results]
29
- return search_response
30
 
31
 
32
  class SearchTool(Tool):
@@ -41,13 +40,12 @@ class SearchTool(Tool):
41
  },
42
  }
43
  output_type = "object"
44
- max_nb_results = 10
45
 
46
- def forward(self, search_query: str) -> SearchResponse:
47
  result = asyncio.run(linkup_search_async(
48
  search_query=search_query,
49
  ))
50
- return filter_search_results(result, SearchTool.max_nb_results)
51
 
52
  class ArxivSearchTool(Tool):
53
  name = ToolNames.ARXIV_SEARCH.value
@@ -62,9 +60,9 @@ class ArxivSearchTool(Tool):
62
  }
63
  output_type = "object"
64
 
65
- def forward(self, search_query: str) -> SearchResponse:
66
  result = asyncio.run(arxiv_search_async(search_query))
67
- return filter_search_results(result, ArxivSearchTool.max_nb_results)
68
 
69
  class PubmedSearchTool(Tool):
70
  name = ToolNames.PUBMED_SEARCH.value
@@ -79,9 +77,9 @@ class PubmedSearchTool(Tool):
79
  }
80
  output_type = "object"
81
 
82
- def forward(self, search_query: str) -> SearchResponse:
83
  result = asyncio.run(pubmed_search_async(search_query))
84
- return filter_search_results(result, PubmedSearchTool.max_nb_results)
85
 
86
  class ScientificSearchTool(Tool):
87
  name = ToolNames.SCIENTIFIC_SEARCH.value
@@ -95,10 +93,9 @@ class ScientificSearchTool(Tool):
95
  }
96
  }
97
  output_type = "object"
98
-
99
  def forward(self, search_query: str) -> dict:
100
  result = asyncio.run(scientific_search_async(search_query))
101
- return result.model_dump()
102
 
103
  URL_EXPLAINATION = """The URL can be be converted to a markdown. If the URL points to a PDF, the pdf is converted to markdown, otherwise the URL is crawled and the markdown is extracted. This markdown is split into pages that are numbered. You can use the page numbers to get the content of the pages."""
104
 
@@ -194,6 +191,7 @@ def create_web_search_agent(model_id="deepseek/deepseek-chat"):
194
  """Create a web search agent with search, crawling, and PDF analysis capabilities."""
195
 
196
  model = LiteLLMModel(model_id=model_id)
 
197
 
198
  # Web search and crawling tools
199
  WEB_SEARCH_TOOLS = [
@@ -201,13 +199,13 @@ def create_web_search_agent(model_id="deepseek/deepseek-chat"):
201
  ArxivSearchTool(),
202
  PubmedSearchTool(),
203
  ScientificSearchTool(),
204
- GetTableOfContentsTool(),
205
- GetMarkdownTool(),
206
- GetPagesContentTool(),
207
- FindInMarkdownTool(),
208
  ]
209
 
210
- web_search_agent = ToolCallingAgent(
211
  model=model,
212
  tools=WEB_SEARCH_TOOLS,
213
  max_steps=20,
 
1
+ from smolagents import CodeAgent, Tool, LiteLLMModel
2
  from deepengineer.webcrawler.async_search import (
3
  linkup_search_async, arxiv_search_async,
4
  pubmed_search_async, scientific_search_async,
5
  )
6
  from deepengineer.webcrawler.pdf_utils import get_table_of_contents_per_page_markdown, convert_ocr_response_to_markdown, get_markdown_by_page_numbers, find_in_markdown
 
7
  from enum import Enum
8
  import asyncio
9
  from deepengineer.webcrawler.async_search import SearchResponse
 
11
 
12
  class ToolNames(Enum):
13
  # Search tools
14
+ SEARCH_TOOL = "web_search_tool"
15
  ARXIV_SEARCH = "arxiv_search"
16
  PUBMED_SEARCH = "pubmed_search"
17
  SCIENCEDIRECT_SEARCH = "sciencedirect_search"
 
23
  GET_PAGES_CONTENT = "get_pages_content"
24
  FIND_IN_MARKDOWN = "find_in_markdown"
25
 
26
+ def filter_search_results(search_response: SearchResponse, max_nb_results: int = 5) -> SearchResponse:
27
  search_response.search_results = search_response.search_results[:max_nb_results]
28
+ return search_response.to_string()
29
 
30
 
31
  class SearchTool(Tool):
 
40
  },
41
  }
42
  output_type = "object"
 
43
 
44
+ def forward(self, search_query: str) -> str:
45
  result = asyncio.run(linkup_search_async(
46
  search_query=search_query,
47
  ))
48
+ return filter_search_results(result)
49
 
50
  class ArxivSearchTool(Tool):
51
  name = ToolNames.ARXIV_SEARCH.value
 
60
  }
61
  output_type = "object"
62
 
63
+ def forward(self, search_query: str) -> str:
64
  result = asyncio.run(arxiv_search_async(search_query))
65
+ return filter_search_results(result)
66
 
67
  class PubmedSearchTool(Tool):
68
  name = ToolNames.PUBMED_SEARCH.value
 
77
  }
78
  output_type = "object"
79
 
80
+ def forward(self, search_query: str) -> str:
81
  result = asyncio.run(pubmed_search_async(search_query))
82
+ return filter_search_results(result)
83
 
84
  class ScientificSearchTool(Tool):
85
  name = ToolNames.SCIENTIFIC_SEARCH.value
 
93
  }
94
  }
95
  output_type = "object"
 
96
  def forward(self, search_query: str) -> dict:
97
  result = asyncio.run(scientific_search_async(search_query))
98
+ return filter_search_results(result)
99
 
100
  URL_EXPLAINATION = """The URL can be be converted to a markdown. If the URL points to a PDF, the pdf is converted to markdown, otherwise the URL is crawled and the markdown is extracted. This markdown is split into pages that are numbered. You can use the page numbers to get the content of the pages."""
101
 
 
191
  """Create a web search agent with search, crawling, and PDF analysis capabilities."""
192
 
193
  model = LiteLLMModel(model_id=model_id)
194
+ database = DataBase()
195
 
196
  # Web search and crawling tools
197
  WEB_SEARCH_TOOLS = [
 
199
  ArxivSearchTool(),
200
  PubmedSearchTool(),
201
  ScientificSearchTool(),
202
+ GetTableOfContentsTool(database),
203
+ GetMarkdownTool(database),
204
+ GetPagesContentTool(database),
205
+ FindInMarkdownTool(database),
206
  ]
207
 
208
+ web_search_agent = CodeAgent(
209
  model=model,
210
  tools=WEB_SEARCH_TOOLS,
211
  max_steps=20,
src/deepengineer/webcrawler/async_search.py CHANGED
@@ -24,6 +24,32 @@ class SearchResponse(BaseModel):
24
  answer: str | None = Field(None, description="Direct answer from the search API if available")
25
  search_results: list[SearchResult] = Field(default_factory=list, description="List of search results")
26
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
27
  class ScientificDomains(str, Enum):
28
  wikipedia = "wikipedia.org"
29
  arxiv = "arxiv.org"
 
24
  answer: str | None = Field(None, description="Direct answer from the search API if available")
25
  search_results: list[SearchResult] = Field(default_factory=list, description="List of search results")
26
 
27
+ def to_string(self):
28
+ """Convert search response to a formatted string suitable for LLM consumption."""
29
+ result_parts = []
30
+
31
+ # Add the query
32
+ result_parts.append(f"Search Query: {self.query}\n")
33
+
34
+ # Add the direct answer if available
35
+ if self.answer:
36
+ result_parts.append(f"Direct Answer: {self.answer}\n")
37
+
38
+ # Add search results
39
+ if self.search_results:
40
+ result_parts.append(f"Found {len(self.search_results)} search results:\n")
41
+
42
+ for i, result in enumerate(self.search_results, 1):
43
+ result_parts.append(f"\n--- Result {i} ---")
44
+ result_parts.append(f"Title: {result.title}")
45
+ result_parts.append(f"URL: {result.url}")
46
+ result_parts.append(f"Content: {result.content[:2000]}...")
47
+ result_parts.append("") # Empty line for separation
48
+ else:
49
+ result_parts.append("No search results found.")
50
+
51
+ return "\n".join(result_parts)
52
+
53
  class ScientificDomains(str, Enum):
54
  wikipedia = "wikipedia.org"
55
  arxiv = "arxiv.org"
tests/{webcrawler/deepsearch → deepsearch}/test_pdf_agent.py RENAMED
File without changes
tests/deepsearch/test_web_agent.py ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ from deepengineer.deepsearch.scawl_web_agent import create_web_search_agent
2
+
3
+ def test_create_web_search_agent():
4
+ agent = create_web_search_agent()
5
+ agent.run("Est il possible de faire un réacteur thermique avec du graphite et du plomb?")