Charles Azam committed · Commit bb62e6b · Parent: ce79b68
feat: scrawl web agent
docs/webcrawler.py
DELETED
@@ -1,3 +0,0 @@
-from dotenv import load_dotenv
-
-load_dotenv()
src/deepengineer/deepsearch/analyse_markdown_agent.py
CHANGED
@@ -6,7 +6,6 @@ from smolagents import CodeAgent, tool, Tool, LiteLLMModel
 from deepengineer.webcrawler.pdf_utils import get_markdown_by_page_numbers, get_table_of_contents_per_page_markdown, find_in_markdown, convert_ocr_response_to_markdown
 from mistralai import OCRResponse
 from enum import Enum
-from pathlib import Path

 class ToolNames(Enum):
     GET_TABLE_OF_CONTENTS = "get_table_of_contents"
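For reference, the four pdf_utils helpers imported above are exercised later in this commit; their call shapes can be read off the scawl_web_agent.py diff below. A minimal sketch of how they compose — the signatures and return types are inferred from those call sites, not confirmed:

from mistralai import OCRResponse
from deepengineer.webcrawler.pdf_utils import (
    get_table_of_contents_per_page_markdown,
    get_markdown_by_page_numbers,
    find_in_markdown,
    convert_ocr_response_to_markdown,
)

def outline_document(ocr_response: OCRResponse) -> None:
    # Per-page table of contents as a markdown string.
    print(get_table_of_contents_per_page_markdown(ocr_response))
    # Page numbers whose text matches any query (queries here are hypothetical).
    pages = find_in_markdown(ocr_response, ["reactor", "graphite"])
    # Markdown for just those pages.
    print(get_markdown_by_page_numbers(ocr_response, pages))
    # Or the entire document at once (can be very long).
    print(convert_ocr_response_to_markdown(ocr_response))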
src/deepengineer/deepsearch/scawl_web_agent.py
CHANGED
@@ -1,4 +1,4 @@
-from smolagents import CodeAgent, Tool, LiteLLMModel, tool
+from smolagents import CodeAgent, Tool, LiteLLMModel, tool, ToolCallingAgent
 from deepengineer.webcrawler.async_search import (
     linkup_search_async, arxiv_search_async,
     pubmed_search_async, scientific_search_async,
@@ -8,7 +8,7 @@ from mistralai import OCRResponse
 from enum import Enum
 import asyncio
 from deepengineer.webcrawler.async_search import SearchResponse
-
+from deepengineer.webcrawler.crawl_database import DataBase

 class ToolNames(Enum):
     # Search tools
@@ -115,32 +115,43 @@ class GetTableOfContentsTool(Tool):
     }
     output_type = "string"

-    def __init__(self, markdown: OCRResponse):
+    def __init__(self, database: DataBase):
         super().__init__()
-        self.markdown: OCRResponse = markdown
-        self.table_of_contents: str = get_table_of_contents_per_page_markdown(self.markdown)
+        self.database: DataBase = database

     def forward(self, url: str) -> str:
-        return self.table_of_contents
+        markdown = self.database.get_markdown_of_url(url)
+        table_of_contents: str = get_table_of_contents_per_page_markdown(markdown)
+        return table_of_contents

 class GetMarkdownTool(Tool):
     name = ToolNames.GET_MARKDOWN.value
-    description = f"Returns …
-    inputs = {}
+    description = f"Returns in markdown entire content of the url. Beware this might be too long to be useful, except for small documents, use {ToolNames.GET_PAGES_CONTENT.value} instead. You can also use {ToolNames.GET_TABLE_OF_CONTENTS.value} first to get the table of contents of the document including the number of pages."
+    inputs = {
+        "url": {
+            "type": "string",
+            "description": "The URL to get the markdown of."
+        }
+    }
     output_type = "string"

-    def __init__(self, markdown: OCRResponse):
+    def __init__(self, database: DataBase):
         super().__init__()
-        self.markdown: OCRResponse = markdown
-        self.markdown_content: str = convert_ocr_response_to_markdown(self.markdown)
+        self.database: DataBase = database

-    def forward(self) -> str:
-        return self.markdown_content
+    def forward(self, url: str) -> str:
+        markdown = self.database.get_markdown_of_url(url)
+        markdown_content: str = convert_ocr_response_to_markdown(markdown)
+        return markdown_content

 class GetPagesContentTool(Tool):
     name = ToolNames.GET_PAGES_CONTENT.value
-    description = f"Returns the content of the pages. You can use {ToolNames.GET_TABLE_OF_CONTENTS.value} to get the table of contents of the …
+    description = f"Returns the content of the pages. You can use {ToolNames.GET_TABLE_OF_CONTENTS.value} to get the table of contents of the url including the number of pages. Expects a list of page numbers as integers as input. {URL_EXPLAINATION}"
     inputs = {
+        "url": {
+            "type": "string",
+            "description": "The URL to get the content of."
+        },
         "page_numbers": {
             "type": "array",
             "description": "The page numbers to get the content of."
@@ -148,30 +159,36 @@ class GetPagesContentTool(Tool):
     }
     output_type = "string"

-    def __init__(self, markdown: OCRResponse):
+    def __init__(self, database: DataBase):
         super().__init__()
-        self.markdown: OCRResponse = markdown
+        self.database: DataBase = database

-    def forward(self, page_numbers: list[int]) -> str:
-        return get_markdown_by_page_numbers(self.markdown, page_numbers)
+    def forward(self, url: str, page_numbers: list[int]) -> str:
+        markdown = self.database.get_markdown_of_url(url)
+        return get_markdown_by_page_numbers(markdown, page_numbers)

 class FindInMarkdownTool(Tool):
     name = ToolNames.FIND_IN_MARKDOWN.value
-    description = f"Finds the page numbers of the …
+    description = f"Finds the page numbers of the url that contain the search queries. If you are looking for a specific information, you can use this tool to find the page numbers of the url that contain the information and then use {ToolNames.GET_PAGES_CONTENT.value} to get the content of the pages. {URL_EXPLAINATION}"
     inputs = {
+        "url": {
+            "type": "string",
+            "description": "The URL to find in."
+        },
         "search_queries": {
             "type": "array",
-            "description": "The search queries to find in the …
+            "description": "The search queries to find in the url. List of strings."
         }
     }
     output_type = "array"

-    def __init__(self, markdown: OCRResponse):
+    def __init__(self, database: DataBase):
         super().__init__()
-        self.markdown: OCRResponse = markdown
+        self.database: DataBase = database

-    def forward(self, search_queries: list[str]) -> list[int]:
-        return find_in_markdown(self.markdown, search_queries)
+    def forward(self, url: str, search_queries: list[str]) -> list[int]:
+        markdown = self.database.get_markdown_of_url(url)
+        return find_in_markdown(markdown, search_queries)

 def create_web_search_agent(model_id="deepseek/deepseek-chat"):
     """Create a web search agent with search, crawling, and PDF analysis capabilities."""
@@ -180,17 +197,17 @@ def create_web_search_agent(model_id="deepseek/deepseek-chat"):

     # Web search and crawling tools
     WEB_SEARCH_TOOLS = [
-        TavilySearchTool(),
-        LinkupSearchTool(),
+        SearchTool(),
         ArxivSearchTool(),
         PubmedSearchTool(),
         ScientificSearchTool(),
-        CrawlUrlTool(),
-        DownloadPdfTool(),
-        ArxivDownloadPdfTool(),
+        GetTableOfContentsTool(),
+        GetMarkdownTool(),
+        GetPagesContentTool(),
+        FindInMarkdownTool(),
     ]

-    web_search_agent = CodeAgent(
+    web_search_agent = ToolCallingAgent(
         model=model,
         tools=WEB_SEARCH_TOOLS,
         max_steps=20,
@@ -200,58 +217,4 @@ def create_web_search_agent(model_id="deepseek/deepseek-chat"):
         description="""A team member that can search the web, crawl URLs, download PDFs, and analyze documents.""",
     )

-    web_search_agent.prompt_templates["managed_agent"]["task"] += """
-You can search the web using various APIs (Tavily, Linkup, arXiv, PubMed, ScienceDirect).
-You can crawl URLs to extract markdown content.
-You can download PDFs from URLs or arXiv and store them in the data/pdfs directory.
-For PDF analysis, you'll need to first download the PDF and then use the markdown analysis tools.
-"""
-
-    return web_search_agent
-
-def create_web_search_agent_with_pdf_analysis(markdown: OCRResponse, model_id="deepseek/deepseek-chat"):
-    """Create a web search agent that also includes PDF analysis capabilities."""
-
-    model = LiteLLMModel(model_id=model_id)
-
-    # Web search and crawling tools
-    WEB_SEARCH_TOOLS = [
-        TavilySearchTool(),
-        LinkupSearchTool(),
-        ArxivSearchTool(),
-        PubmedSearchTool(),
-        ScientificSearchTool(),
-        CrawlUrlTool(),
-        DownloadPdfTool(),
-        ArxivDownloadPdfTool(),
-    ]
-
-    # PDF analysis tools (if markdown is provided)
-    PDF_ANALYSIS_TOOLS = [
-        GetTableOfContentsTool(markdown),
-        GetMarkdownTool(markdown),
-        GetPagesContentTool(markdown),
-        FindInMarkdownTool(markdown),
-    ]
-
-    all_tools = WEB_SEARCH_TOOLS + PDF_ANALYSIS_TOOLS
-
-    web_search_agent = CodeAgent(
-        model=model,
-        tools=all_tools,
-        max_steps=20,
-        verbosity_level=2,
-        planning_interval=4,
-        name="web_search_agent_with_pdf_analysis",
-        description="""A team member that can search the web, crawl URLs, download PDFs, and analyze the provided PDF document.""",
-        additional_authorized_imports=["numpy", "matplotlib", "scipy", "sympy", "pandas", ],
-    )
-
-    web_search_agent.prompt_templates["managed_agent"]["task"] += """
-You can search the web using various APIs (Linkup, arXiv, PubMed, ScienceDirect).
-You can crawl URLs to extract markdown content.
-You can download PDFs from URLs or arXiv and store them in the data/pdfs directory.
-You can analyze the provided PDF document using the markdown analysis tools.
-"""
-
-    return web_search_agent

     return web_search_agent
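The refactor above moves the PDF tools from holding a single OCRResponse to looking documents up by URL in a shared DataBase. Note that the new WEB_SEARCH_TOOLS list instantiates these tools with no arguments even though their __init__ now requires a database, so a working wiring presumably passes one explicitly. A minimal sketch under that assumption — the DataBase() default constructor and the URL are hypothetical; only get_markdown_of_url is attested in the diff:

from deepengineer.webcrawler.crawl_database import DataBase

database = DataBase()  # assumption: a default constructor exists
toc_tool = GetTableOfContentsTool(database)
pages_tool = GetPagesContentTool(database)

url = "https://example.com/paper.pdf"  # hypothetical URL already crawled into the database
print(toc_tool.forward(url))                         # per-page table of contents
print(pages_tool.forward(url, page_numbers=[1, 2]))  # markdown of pages 1 and 2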
tests/webcrawler/{test_pdf_agent.py → deepsearch/test_pdf_agent.py}
RENAMED
File without changes
tests/webcrawler/test_sync_search_speed.py
DELETED
@@ -1,75 +0,0 @@
-from deepengineer.webcrawler.async_search import linkup_search_async, SearchResponse, SearchResult, ScientificDomains
-from linkup import LinkupClient, LinkupSourcedAnswer
-from typing import Literal
-import time
-import asyncio
-
-def _linkup_search_sync(
-    search_query: str,
-    depth: Literal["standard", "deep"] = "standard",
-    output_type: Literal['searchResults', 'sourcedAnswer', 'structured'] = "sourcedAnswer",
-    include_images: bool = False,
-    include_domains: list[ScientificDomains] = None,
-
-) -> SearchResponse:
-    client = LinkupClient()
-    search_response: LinkupSourcedAnswer = client.search(
-        query=search_query,
-        depth=depth,
-        output_type=output_type,
-        include_images=include_images,
-        include_domains=include_domains,
-    )
-
-    search_results = [
-        SearchResult(
-            title=result.name,
-            url=result.url,
-            content=result.snippet,
-            raw_content=None,
-        )
-        for result in search_response.sources
-    ]
-
-    # Convert to our Pydantic models
-    responses: SearchResponse = SearchResponse(
-        query=search_query,
-        answer=search_response.answer,
-        search_results=search_results
-    )
-    return responses
-
-def linkup_search_speed_test():
-
-    """
-
-    Conclusion: no need to rewrite the async version to sync version. It takes roughly 6 seconds in both cases
-    """
-
-    print("Testing linkup search speed asynchronously...")
-    start_time = time.time()
-    for i in range(5):
-        start_loop_time = time.time()
-        output = asyncio.run(linkup_search_async(
-            search_query="Would it be possible to make a thermal reactor with graphite and lead?",
-        ))
-        print(output.answer[:10])
-        end_loop_time = time.time()
-        print(f"Time taken for loop {i}: {end_loop_time - start_loop_time} seconds")
-
-
-    print("Testing linkup search speed syncronoulsy...")
-    start_time = time.time()
-    for i in range(5):
-        start_loop_time = time.time()
-        _linkup_search_sync(
-            search_query="Would it be possible to make a thermal reactor with graphite and lead?",
-        )
-        end_loop_time = time.time()
-        print(f"Time taken for loop {i}: {end_loop_time - start_loop_time} seconds")
-
-    end_time = time.time()
-    print(f"Total time taken: {end_time - start_time} seconds")
-
-
-
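The deleted benchmark's takeaway stands on its own: wrapping the Linkup call synchronously saves nothing (roughly 6 seconds per query either way), so callers can stay on the async API. A minimal sketch of that usage, reusing only names from the deleted file:

import asyncio
from deepengineer.webcrawler.async_search import linkup_search_async, SearchResponse

async def main() -> None:
    # One standard-depth sourced-answer query, as in the deleted speed test.
    response: SearchResponse = await linkup_search_async(
        search_query="Would it be possible to make a thermal reactor with graphite and lead?",
    )
    print(response.answer)

if __name__ == "__main__":
    asyncio.run(main())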