Spaces:
Runtime error
Runtime error
Charles Azam
committed on
Commit
·
7c1f478
1
Parent(s):
bb62e6b
feat: add test for scrawl web agent
Browse files
src/deepengineer/deepsearch/scawl_web_agent.py
CHANGED
|
@@ -1,10 +1,9 @@
|
|
| 1 |
-
from smolagents import CodeAgent, Tool, LiteLLMModel
|
| 2 |
from deepengineer.webcrawler.async_search import (
|
| 3 |
linkup_search_async, arxiv_search_async,
|
| 4 |
pubmed_search_async, scientific_search_async,
|
| 5 |
)
|
| 6 |
from deepengineer.webcrawler.pdf_utils import get_table_of_contents_per_page_markdown, convert_ocr_response_to_markdown, get_markdown_by_page_numbers, find_in_markdown
|
| 7 |
-
from mistralai import OCRResponse
|
| 8 |
from enum import Enum
|
| 9 |
import asyncio
|
| 10 |
from deepengineer.webcrawler.async_search import SearchResponse
|
|
@@ -12,7 +11,7 @@ from deepengineer.webcrawler.crawl_database import DataBase
|
|
| 12 |
|
| 13 |
class ToolNames(Enum):
|
| 14 |
# Search tools
|
| 15 |
-
SEARCH_TOOL = "
|
| 16 |
ARXIV_SEARCH = "arxiv_search"
|
| 17 |
PUBMED_SEARCH = "pubmed_search"
|
| 18 |
SCIENCEDIRECT_SEARCH = "sciencedirect_search"
|
|
@@ -24,9 +23,9 @@ class ToolNames(Enum):
|
|
| 24 |
GET_PAGES_CONTENT = "get_pages_content"
|
| 25 |
FIND_IN_MARKDOWN = "find_in_markdown"
|
| 26 |
|
| 27 |
-
def filter_search_results(search_response: SearchResponse, max_nb_results: int =
|
| 28 |
search_response.search_results = search_response.search_results[:max_nb_results]
|
| 29 |
-
return search_response
|
| 30 |
|
| 31 |
|
| 32 |
class SearchTool(Tool):
|
|
@@ -41,13 +40,12 @@ class SearchTool(Tool):
|
|
| 41 |
},
|
| 42 |
}
|
| 43 |
output_type = "object"
|
| 44 |
-
max_nb_results = 10
|
| 45 |
|
| 46 |
-
def forward(self, search_query: str) ->
|
| 47 |
result = asyncio.run(linkup_search_async(
|
| 48 |
search_query=search_query,
|
| 49 |
))
|
| 50 |
-
return filter_search_results(result
|
| 51 |
|
| 52 |
class ArxivSearchTool(Tool):
|
| 53 |
name = ToolNames.ARXIV_SEARCH.value
|
|
@@ -62,9 +60,9 @@ class ArxivSearchTool(Tool):
|
|
| 62 |
}
|
| 63 |
output_type = "object"
|
| 64 |
|
| 65 |
-
def forward(self, search_query: str) ->
|
| 66 |
result = asyncio.run(arxiv_search_async(search_query))
|
| 67 |
-
return filter_search_results(result
|
| 68 |
|
| 69 |
class PubmedSearchTool(Tool):
|
| 70 |
name = ToolNames.PUBMED_SEARCH.value
|
|
@@ -79,9 +77,9 @@ class PubmedSearchTool(Tool):
|
|
| 79 |
}
|
| 80 |
output_type = "object"
|
| 81 |
|
| 82 |
-
def forward(self, search_query: str) ->
|
| 83 |
result = asyncio.run(pubmed_search_async(search_query))
|
| 84 |
-
return filter_search_results(result
|
| 85 |
|
| 86 |
class ScientificSearchTool(Tool):
|
| 87 |
name = ToolNames.SCIENTIFIC_SEARCH.value
|
|
@@ -95,10 +93,9 @@ class ScientificSearchTool(Tool):
|
|
| 95 |
}
|
| 96 |
}
|
| 97 |
output_type = "object"
|
| 98 |
-
|
| 99 |
def forward(self, search_query: str) -> dict:
|
| 100 |
result = asyncio.run(scientific_search_async(search_query))
|
| 101 |
-
return result
|
| 102 |
|
| 103 |
URL_EXPLAINATION = """The URL can be be converted to a markdown. If the URL points to a PDF, the pdf is converted to markdown, otherwise the URL is crawled and the markdown is extracted. This markdown is split into pages that are numbered. You can use the page numbers to get the content of the pages."""
|
| 104 |
|
|
@@ -194,6 +191,7 @@ def create_web_search_agent(model_id="deepseek/deepseek-chat"):
|
|
| 194 |
"""Create a web search agent with search, crawling, and PDF analysis capabilities."""
|
| 195 |
|
| 196 |
model = LiteLLMModel(model_id=model_id)
|
|
|
|
| 197 |
|
| 198 |
# Web search and crawling tools
|
| 199 |
WEB_SEARCH_TOOLS = [
|
|
@@ -201,13 +199,13 @@ def create_web_search_agent(model_id="deepseek/deepseek-chat"):
|
|
| 201 |
ArxivSearchTool(),
|
| 202 |
PubmedSearchTool(),
|
| 203 |
ScientificSearchTool(),
|
| 204 |
-
GetTableOfContentsTool(),
|
| 205 |
-
GetMarkdownTool(),
|
| 206 |
-
GetPagesContentTool(),
|
| 207 |
-
FindInMarkdownTool(),
|
| 208 |
]
|
| 209 |
|
| 210 |
-
web_search_agent =
|
| 211 |
model=model,
|
| 212 |
tools=WEB_SEARCH_TOOLS,
|
| 213 |
max_steps=20,
|
|
|
|
| 1 |
+
from smolagents import CodeAgent, Tool, LiteLLMModel
|
| 2 |
from deepengineer.webcrawler.async_search import (
|
| 3 |
linkup_search_async, arxiv_search_async,
|
| 4 |
pubmed_search_async, scientific_search_async,
|
| 5 |
)
|
| 6 |
from deepengineer.webcrawler.pdf_utils import get_table_of_contents_per_page_markdown, convert_ocr_response_to_markdown, get_markdown_by_page_numbers, find_in_markdown
|
|
|
|
| 7 |
from enum import Enum
|
| 8 |
import asyncio
|
| 9 |
from deepengineer.webcrawler.async_search import SearchResponse
|
|
|
|
| 11 |
|
| 12 |
class ToolNames(Enum):
|
| 13 |
# Search tools
|
| 14 |
+
SEARCH_TOOL = "web_search_tool"
|
| 15 |
ARXIV_SEARCH = "arxiv_search"
|
| 16 |
PUBMED_SEARCH = "pubmed_search"
|
| 17 |
SCIENCEDIRECT_SEARCH = "sciencedirect_search"
|
|
|
|
| 23 |
GET_PAGES_CONTENT = "get_pages_content"
|
| 24 |
FIND_IN_MARKDOWN = "find_in_markdown"
|
| 25 |
|
| 26 |
+
def filter_search_results(search_response: "SearchResponse", max_nb_results: int = 5) -> str:
    """Keep the first ``max_nb_results`` results and render the response as text.

    Args:
        search_response: Response whose ``search_results`` list is truncated.
            The truncation is done in place, so the caller's object is
            modified as well.
        max_nb_results: Maximum number of results to keep, counted from the
            start of the list.

    Returns:
        The string produced by ``search_response.to_string()`` after the
        truncation (not the response object itself).
    """
    # Slicing never raises, even when fewer than max_nb_results entries exist.
    search_response.search_results = search_response.search_results[:max_nb_results]
    return search_response.to_string()
|
| 29 |
|
| 30 |
|
| 31 |
class SearchTool(Tool):
|
|
|
|
| 40 |
},
|
| 41 |
}
|
| 42 |
output_type = "object"
|
|
|
|
| 43 |
|
| 44 |
+
def forward(self, search_query: str) -> str:
    """Run a Linkup search for ``search_query`` and return the filtered result.

    The async search is driven to completion with ``asyncio.run`` and its
    response is handed to ``filter_search_results``, whose return value is
    passed through unchanged.
    """
    # asyncio.run starts a fresh event loop, so this must not be called from
    # code that is already running inside one.
    search_coroutine = linkup_search_async(search_query=search_query)
    raw_response = asyncio.run(search_coroutine)
    return filter_search_results(raw_response)
|
| 49 |
|
| 50 |
class ArxivSearchTool(Tool):
|
| 51 |
name = ToolNames.ARXIV_SEARCH.value
|
|
|
|
| 60 |
}
|
| 61 |
output_type = "object"
|
| 62 |
|
| 63 |
+
def forward(self, search_query: str) -> str:
    """Run an arXiv search for ``search_query`` and return the filtered result.

    The async search is driven to completion with ``asyncio.run`` and its
    response is handed to ``filter_search_results``, whose return value is
    passed through unchanged.
    """
    # asyncio.run starts a fresh event loop, so this must not be called from
    # code that is already running inside one.
    raw_response = asyncio.run(arxiv_search_async(search_query))
    filtered = filter_search_results(raw_response)
    return filtered
|
| 66 |
|
| 67 |
class PubmedSearchTool(Tool):
|
| 68 |
name = ToolNames.PUBMED_SEARCH.value
|
|
|
|
| 77 |
}
|
| 78 |
output_type = "object"
|
| 79 |
|
| 80 |
+
def forward(self, search_query: str) -> str:
    """Run a PubMed search for ``search_query`` and return the filtered result.

    The async search is driven to completion with ``asyncio.run`` and its
    response is handed to ``filter_search_results``, whose return value is
    passed through unchanged.
    """
    # asyncio.run starts a fresh event loop, so this must not be called from
    # code that is already running inside one.
    pubmed_response = asyncio.run(pubmed_search_async(search_query))
    return filter_search_results(pubmed_response)
|
| 83 |
|
| 84 |
class ScientificSearchTool(Tool):
|
| 85 |
name = ToolNames.SCIENTIFIC_SEARCH.value
|
|
|
|
| 93 |
}
|
| 94 |
}
|
| 95 |
output_type = "object"
|
|
|
|
| 96 |
def forward(self, search_query: str) -> str:
    """Run a scientific search for ``search_query`` and return the filtered result.

    The async search is driven to completion with ``asyncio.run`` and its
    response is handed to ``filter_search_results``. That helper returns the
    response rendered as text, so this method returns a string like the
    other search tools, not a dict.
    """
    # asyncio.run starts a fresh event loop, so this must not be called from
    # code that is already running inside one.
    result = asyncio.run(scientific_search_async(search_query))
    return filter_search_results(result)
|
| 99 |
|
| 100 |
URL_EXPLAINATION = """The URL can be be converted to a markdown. If the URL points to a PDF, the pdf is converted to markdown, otherwise the URL is crawled and the markdown is extracted. This markdown is split into pages that are numbered. You can use the page numbers to get the content of the pages."""
|
| 101 |
|
|
|
|
| 191 |
"""Create a web search agent with search, crawling, and PDF analysis capabilities."""
|
| 192 |
|
| 193 |
model = LiteLLMModel(model_id=model_id)
|
| 194 |
+
database = DataBase()
|
| 195 |
|
| 196 |
# Web search and crawling tools
|
| 197 |
WEB_SEARCH_TOOLS = [
|
|
|
|
| 199 |
ArxivSearchTool(),
|
| 200 |
PubmedSearchTool(),
|
| 201 |
ScientificSearchTool(),
|
| 202 |
+
GetTableOfContentsTool(database),
|
| 203 |
+
GetMarkdownTool(database),
|
| 204 |
+
GetPagesContentTool(database),
|
| 205 |
+
FindInMarkdownTool(database),
|
| 206 |
]
|
| 207 |
|
| 208 |
+
web_search_agent = CodeAgent(
|
| 209 |
model=model,
|
| 210 |
tools=WEB_SEARCH_TOOLS,
|
| 211 |
max_steps=20,
|
src/deepengineer/webcrawler/async_search.py
CHANGED
|
@@ -24,6 +24,32 @@ class SearchResponse(BaseModel):
|
|
| 24 |
answer: str | None = Field(None, description="Direct answer from the search API if available")
|
| 25 |
search_results: list[SearchResult] = Field(default_factory=list, description="List of search results")
|
| 26 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 27 |
class ScientificDomains(str, Enum):
|
| 28 |
wikipedia = "wikipedia.org"
|
| 29 |
arxiv = "arxiv.org"
|
|
|
|
| 24 |
answer: str | None = Field(None, description="Direct answer from the search API if available")
|
| 25 |
search_results: list[SearchResult] = Field(default_factory=list, description="List of search results")
|
| 26 |
|
| 27 |
+
def to_string(self, max_content_chars: int = 2000) -> str:
    """Convert the search response to a formatted string suitable for LLM consumption.

    Args:
        max_content_chars: Maximum number of characters of each result's
            content to include. Longer content is cut at this length and
            marked with a trailing "...".

    Returns:
        A newline-joined text block with the query, the direct answer (when
        one is set), and one titled section per search result, or a
        "No search results found." line when the result list is empty.
    """
    result_parts = [f"Search Query: {self.query}\n"]

    # The direct answer is optional; skip the line when it is missing or empty.
    if self.answer:
        result_parts.append(f"Direct Answer: {self.answer}\n")

    if not self.search_results:
        result_parts.append("No search results found.")
        return "\n".join(result_parts)

    result_parts.append(f"Found {len(self.search_results)} search results:\n")
    for i, result in enumerate(self.search_results, 1):
        # NOTE(review): assumes content may be missing on some results; an
        # empty excerpt is emitted in that case -- confirm against SearchResult.
        content = result.content or ""
        excerpt = content[:max_content_chars]
        # Only add the ellipsis when something was actually cut off, so short
        # content is not presented as if it were truncated.
        if len(content) > max_content_chars:
            excerpt += "..."
        result_parts.extend(
            [
                f"\n--- Result {i} ---",
                f"Title: {result.title}",
                f"URL: {result.url}",
                f"Content: {excerpt}",
                "",  # Empty line for separation
            ]
        )

    return "\n".join(result_parts)
|
| 52 |
+
|
| 53 |
class ScientificDomains(str, Enum):
|
| 54 |
wikipedia = "wikipedia.org"
|
| 55 |
arxiv = "arxiv.org"
|
tests/{webcrawler/deepsearch → deepsearch}/test_pdf_agent.py
RENAMED
|
File without changes
|
tests/deepsearch/test_web_agent.py
ADDED
|
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from deepengineer.deepsearch.scawl_web_agent import create_web_search_agent
|
| 2 |
+
|
| 3 |
+
def test_create_web_search_agent():
    """Smoke test: build the web search agent and run it on one French question.

    There are no assertions; the test passes when neither the construction
    nor the run raises.
    """
    question = "Est il possible de faire un réacteur thermique avec du graphite et du plomb?"
    web_agent = create_web_search_agent()
    web_agent.run(question)
|