Spaces:

charles-azam
/

deepdraft

Runtime error

App Files Community

Charles Azam committed on Jul 6, 2025

Commit

7c1f478

1 Parent(s): bb62e6b

feat: add test for scrawl web agent

Browse files

Files changed (4) hide show

src/deepengineer/deepsearch/scawl_web_agent.py +17 -19
src/deepengineer/webcrawler/async_search.py +26 -0
tests/{webcrawler/deepsearch → deepsearch}/test_pdf_agent.py +0 -0
tests/deepsearch/test_web_agent.py +5 -0

src/deepengineer/deepsearch/scawl_web_agent.py CHANGED Viewed

@@ -1,10 +1,9 @@
-from smolagents import CodeAgent, Tool, LiteLLMModel, tool, ToolCallingAgent
 from deepengineer.webcrawler.async_search import (
     linkup_search_async, arxiv_search_async,
     pubmed_search_async, scientific_search_async,
 )
 from deepengineer.webcrawler.pdf_utils import get_table_of_contents_per_page_markdown, convert_ocr_response_to_markdown, get_markdown_by_page_numbers, find_in_markdown
-from mistralai import OCRResponse
 from enum import Enum
 import asyncio
 from deepengineer.webcrawler.async_search import SearchResponse
@@ -12,7 +11,7 @@ from deepengineer.webcrawler.crawl_database import DataBase
 class ToolNames(Enum):
     # Search tools
-    SEARCH_TOOL = "web search tool"
     ARXIV_SEARCH = "arxiv_search"
     PUBMED_SEARCH = "pubmed_search"
     SCIENCEDIRECT_SEARCH = "sciencedirect_search"
@@ -24,9 +23,9 @@ class ToolNames(Enum):
     GET_PAGES_CONTENT = "get_pages_content"
     FIND_IN_MARKDOWN = "find_in_markdown"
-def filter_search_results(search_response: SearchResponse, max_nb_results: int = 10) -> SearchResponse:
     search_response.search_results = search_response.search_results[:max_nb_results]
-    return search_response
 class SearchTool(Tool):
@@ -41,13 +40,12 @@ class SearchTool(Tool):
         },
     }
     output_type = "object"
-    max_nb_results = 10
-    def forward(self, search_query: str) -> SearchResponse:
         result = asyncio.run(linkup_search_async(
             search_query=search_query,
         ))
-        return filter_search_results(result, SearchTool.max_nb_results)
 class ArxivSearchTool(Tool):
     name = ToolNames.ARXIV_SEARCH.value
@@ -62,9 +60,9 @@ class ArxivSearchTool(Tool):
     }
     output_type = "object"
-    def forward(self, search_query: str) -> SearchResponse:
         result = asyncio.run(arxiv_search_async(search_query))
-        return filter_search_results(result, ArxivSearchTool.max_nb_results)
 class PubmedSearchTool(Tool):
     name = ToolNames.PUBMED_SEARCH.value
@@ -79,9 +77,9 @@ class PubmedSearchTool(Tool):
     }
     output_type = "object"
-    def forward(self, search_query: str) -> SearchResponse:
         result = asyncio.run(pubmed_search_async(search_query))
-        return filter_search_results(result, PubmedSearchTool.max_nb_results)
 class ScientificSearchTool(Tool):
     name = ToolNames.SCIENTIFIC_SEARCH.value
@@ -95,10 +93,9 @@ class ScientificSearchTool(Tool):
         }
     }
     output_type = "object"
     def forward(self, search_query: str) -> dict:
         result = asyncio.run(scientific_search_async(search_query))
-        return result.model_dump()
 URL_EXPLAINATION = """The URL can be be converted to a markdown. If the URL points to a PDF, the pdf is converted to markdown, otherwise the URL is crawled and the markdown is extracted. This markdown is split into pages that are numbered. You can use the page numbers to get the content of the pages."""
@@ -194,6 +191,7 @@ def create_web_search_agent(model_id="deepseek/deepseek-chat"):
     """Create a web search agent with search, crawling, and PDF analysis capabilities."""
     model = LiteLLMModel(model_id=model_id)
     # Web search and crawling tools
     WEB_SEARCH_TOOLS = [
@@ -201,13 +199,13 @@ def create_web_search_agent(model_id="deepseek/deepseek-chat"):
         ArxivSearchTool(),
         PubmedSearchTool(),
         ScientificSearchTool(),
-        GetTableOfContentsTool(),
-        GetMarkdownTool(),
-        GetPagesContentTool(),
-        FindInMarkdownTool(),
     ]
-    web_search_agent = ToolCallingAgent(
         model=model,
         tools=WEB_SEARCH_TOOLS,
         max_steps=20,

+from smolagents import CodeAgent, Tool, LiteLLMModel
 from deepengineer.webcrawler.async_search import (
     linkup_search_async, arxiv_search_async,
     pubmed_search_async, scientific_search_async,
 )
 from deepengineer.webcrawler.pdf_utils import get_table_of_contents_per_page_markdown, convert_ocr_response_to_markdown, get_markdown_by_page_numbers, find_in_markdown
 from enum import Enum
 import asyncio
 from deepengineer.webcrawler.async_search import SearchResponse
 class ToolNames(Enum):
     # Search tools
+    SEARCH_TOOL = "web_search_tool"
     ARXIV_SEARCH = "arxiv_search"
     PUBMED_SEARCH = "pubmed_search"
     SCIENCEDIRECT_SEARCH = "sciencedirect_search"
     GET_PAGES_CONTENT = "get_pages_content"
     FIND_IN_MARKDOWN = "find_in_markdown"
+def filter_search_results(search_response: SearchResponse, max_nb_results: int = 5) -> SearchResponse:
     search_response.search_results = search_response.search_results[:max_nb_results]
+    return search_response.to_string()
 class SearchTool(Tool):
         },
     }
     output_type = "object"
+    def forward(self, search_query: str) -> str:
         result = asyncio.run(linkup_search_async(
             search_query=search_query,
         ))
+        return filter_search_results(result)
 class ArxivSearchTool(Tool):
     name = ToolNames.ARXIV_SEARCH.value
     }
     output_type = "object"
+    def forward(self, search_query: str) -> str:
         result = asyncio.run(arxiv_search_async(search_query))
+        return filter_search_results(result)
 class PubmedSearchTool(Tool):
     name = ToolNames.PUBMED_SEARCH.value
     }
     output_type = "object"
+    def forward(self, search_query: str) -> str:
         result = asyncio.run(pubmed_search_async(search_query))
+        return filter_search_results(result)
 class ScientificSearchTool(Tool):
     name = ToolNames.SCIENTIFIC_SEARCH.value
         }
     }
     output_type = "object"
     def forward(self, search_query: str) -> dict:
         result = asyncio.run(scientific_search_async(search_query))
+        return filter_search_results(result)
 URL_EXPLAINATION = """The URL can be be converted to a markdown. If the URL points to a PDF, the pdf is converted to markdown, otherwise the URL is crawled and the markdown is extracted. This markdown is split into pages that are numbered. You can use the page numbers to get the content of the pages."""
     """Create a web search agent with search, crawling, and PDF analysis capabilities."""
     model = LiteLLMModel(model_id=model_id)
+    database = DataBase()
     # Web search and crawling tools
     WEB_SEARCH_TOOLS = [
         ArxivSearchTool(),
         PubmedSearchTool(),
         ScientificSearchTool(),
+        GetTableOfContentsTool(database),
+        GetMarkdownTool(database),
+        GetPagesContentTool(database),
+        FindInMarkdownTool(database),
     ]
+    web_search_agent = CodeAgent(
         model=model,
         tools=WEB_SEARCH_TOOLS,
         max_steps=20,

src/deepengineer/webcrawler/async_search.py CHANGED Viewed

@@ -24,6 +24,32 @@ class SearchResponse(BaseModel):
     answer: str | None = Field(None, description="Direct answer from the search API if available")
     search_results: list[SearchResult] = Field(default_factory=list, description="List of search results")
 class ScientificDomains(str, Enum):
     wikipedia = "wikipedia.org"
     arxiv = "arxiv.org"

     answer: str | None = Field(None, description="Direct answer from the search API if available")
     search_results: list[SearchResult] = Field(default_factory=list, description="List of search results")
+    def to_string(self):
+        """Convert search response to a formatted string suitable for LLM consumption."""
+        result_parts = []
+        # Add the query
+        result_parts.append(f"Search Query: {self.query}\n")
+        # Add the direct answer if available
+        if self.answer:
+            result_parts.append(f"Direct Answer: {self.answer}\n")
+        # Add search results
+        if self.search_results:
+            result_parts.append(f"Found {len(self.search_results)} search results:\n")
+            for i, result in enumerate(self.search_results, 1):
+                result_parts.append(f"\n--- Result {i} ---")
+                result_parts.append(f"Title: {result.title}")
+                result_parts.append(f"URL: {result.url}")
+                result_parts.append(f"Content: {result.content[:2000]}...")
+                result_parts.append("")  # Empty line for separation
+        else:
+            result_parts.append("No search results found.")
+        return "\n".join(result_parts)
 class ScientificDomains(str, Enum):
     wikipedia = "wikipedia.org"
     arxiv = "arxiv.org"

tests/{webcrawler/deepsearch → deepsearch}/test_pdf_agent.py RENAMED Viewed

File without changes

tests/deepsearch/test_web_agent.py ADDED Viewed

	@@ -0,0 +1,5 @@

+from deepengineer.deepsearch.scawl_web_agent import create_web_search_agent
+def test_create_web_search_agent():
+    agent = create_web_search_agent()
+    agent.run("Est il possible de faire un réacteur thermique avec du graphite et du plomb?")