Charles Azam committed · Commit e040f4f
Parent(s): bcce487
feat: add database to process markdowns or links
Files changed:
- src/deepengineer/deepsearch/analyse_markdown_agent.py  +2 -0
- src/deepengineer/deepsearch/scawl_web_agent.py  +39 -123
- src/deepengineer/webcrawler/async_crawl.py  +3 -2
- src/deepengineer/webcrawler/crawl_database.py  +42 -0
- src/deepengineer/webcrawler/pdf_utils.py  +4 -2
- tests/webcrawler/test_async_crawl.py  +2 -2
- tests/webcrawler/test_crawl_database.py  +19 -0
- tests/webcrawler/test_pdf_agent.py  +4 -5
src/deepengineer/deepsearch/analyse_markdown_agent.py
CHANGED
@@ -78,6 +78,8 @@ class FindInMarkdownTool(Tool):
 
 
 def create_agent(markdown: OCRResponse, model_id="deepseek/deepseek-chat"):
+
+    """This agent is just a test and will not be used as is by the main agent."""
 
     model = LiteLLMModel(model_id=model_id)
 
src/deepengineer/deepsearch/scawl_web_agent.py
CHANGED
@@ -1,61 +1,39 @@
-from smolagents import CodeAgent, Tool, LiteLLMModel
+from smolagents import CodeAgent, Tool, LiteLLMModel, tool
 from deepengineer.webcrawler.async_search import (
-    linkup_search_async,
+    linkup_search_async, arxiv_search_async,
     pubmed_search_async, scientific_search_async,
 )
-from deepengineer.webcrawler.async_crawl import (
-    crawl4ai_extract_markdown_of_url_async, arxiv_download_pdf_async, download_pdf_async
-)
 from deepengineer.webcrawler.pdf_utils import get_table_of_contents_per_page_markdown, convert_ocr_response_to_markdown, get_markdown_by_page_numbers, find_in_markdown
 from mistralai import OCRResponse
 from enum import Enum
-from pathlib import Path
 import asyncio
-from
-from deepengineer.webcrawler.utils import sanitize_filename
-from deepengineer.common_path import DATA_DIR
-from deepengineer.webcrawler.async_search import SearchResult
+from deepengineer.webcrawler.async_search import SearchResponse
 
-class DataBase():
-    def __init__(self):
-        self.storage_path = DATA_DIR
-        self.storage_path.mkdir(exist_ok=True, parents=True)
-        self.sources = dict[str, SearchResult]
-
-    def add_sources(self, sources: list[SearchResult]):
-        for source in sources:
-            self.sources[source.url] = source
-
-    def get_sources_by_url(self, url: str) -> SearchResult:
-        return self.sources[url]
-
-
 
 class ToolNames(Enum):
     # Search tools
-
-    LINKUP_SEARCH = "linkup_search"
+    SEARCH_TOOL = "web search tool"
     ARXIV_SEARCH = "arxiv_search"
     PUBMED_SEARCH = "pubmed_search"
     SCIENCEDIRECT_SEARCH = "sciencedirect_search"
     SCIENTIFIC_SEARCH = "scientific_search"
-
-    # Crawling tools
-    CRAWL_URL = "crawl_url"
-    DOWNLOAD_PDF = "download_pdf"
-    ARXIV_DOWNLOAD_PDF = "arxiv_download_pdf"
 
-    #
-    GET_TABLE_OF_CONTENTS = "
-    GET_MARKDOWN = "
+    # Exploring link tools
+    GET_TABLE_OF_CONTENTS = "get_table_of_contents_of_url"
+    GET_MARKDOWN = "get_markdown_of_url"
     GET_PAGES_CONTENT = "get_pages_content"
     FIND_IN_MARKDOWN = "find_in_markdown"
 
+def filter_search_results(search_response: SearchResponse, max_nb_results: int = 10) -> SearchResponse:
+    search_response.search_results = search_response.search_results[:max_nb_results]
+    return search_response
+
 
 class SearchTool(Tool):
-
-
-
+    name = ToolNames.SEARCH_TOOL.value
+    description = f"""Search the web using Linkup API. Good for deep research with sourced answers.
+    Linkup also provides an answer. This answer is not always correct, so you might want to check the sources.
+    """
    inputs = {
        "search_query": {
            "type": "string",
@@ -63,19 +41,19 @@ class SearchTool(Tool):
        },
    }
    output_type = "object"
+   max_nb_results = 10
 
-    def forward(self, search_query: str
-                output_type: str = "sourcedAnswer") -> dict:
+    def forward(self, search_query: str) -> SearchResponse:
         result = asyncio.run(linkup_search_async(
             search_query=search_query,
-            depth=depth,
-            output_type=output_type
         ))
-        return result.
+        return filter_search_results(result, SearchTool.max_nb_results)
 
 class ArxivSearchTool(Tool):
     name = ToolNames.ARXIV_SEARCH.value
-    description = "Search arXiv for academic papers and preprints.
+    description = """Search arXiv for academic papers and preprints with Linkup API.
+    Linkup also provides an answer. This answer is not always correct, so you might want to check the sources.
+    """
     inputs = {
         "search_query": {
             "type": "string",
@@ -84,13 +62,15 @@ class ArxivSearchTool(Tool):
        }
    output_type = "object"
 
-    def forward(self, search_query: str) ->
+    def forward(self, search_query: str) -> SearchResponse:
         result = asyncio.run(arxiv_search_async(search_query))
-        return result.
+        return filter_search_results(result, ArxivSearchTool.max_nb_results)
 
 class PubmedSearchTool(Tool):
     name = ToolNames.PUBMED_SEARCH.value
-    description = "Search PubMed for medical and scientific literature.
+    description = """Search PubMed for medical and scientific literature with Linkup API.
+    Linkup also provides an answer. This answer is not always correct, so you might want to check the sources.
+    """
     inputs = {
         "search_query": {
             "type": "string",
@@ -99,13 +79,15 @@ class PubmedSearchTool(Tool):
        }
    output_type = "object"
 
-    def forward(self, search_query: str) ->
+    def forward(self, search_query: str) -> SearchResponse:
         result = asyncio.run(pubmed_search_async(search_query))
-        return result.
+        return filter_search_results(result, PubmedSearchTool.max_nb_results)
 
 class ScientificSearchTool(Tool):
     name = ToolNames.SCIENTIFIC_SEARCH.value
-    description = "Search across multiple scientific domains: Wikipedia, arXiv, PubMed, and ScienceDirect.
+    description = """Search across multiple scientific domains: Wikipedia, arXiv, PubMed, and ScienceDirect.
+    Linkup also provides an answer. This answer is not always correct, so you might want to check the sources.
+    """
     inputs = {
         "search_query": {
             "type": "string",
@@ -118,93 +100,27 @@ class ScientificSearchTool(Tool):
         result = asyncio.run(scientific_search_async(search_query))
         return result.model_dump()
 
-
-    name = ToolNames.CRAWL_URL.value
-    description = "Extract markdown content from a URL using crawl4ai."
-    inputs = {
-        "url": {
-            "type": "string",
-            "description": "The URL to crawl and extract markdown from"
-        }
-    }
-    output_type = "string"
-
-    def forward(self, url: str) -> str:
-        return asyncio.run(crawl4ai_extract_markdown_of_url_async(url))
 
-class
-    name = ToolNames.
-    description = "
-
-
-            "type": "string",
-            "description": "The URL of the PDF to download"
-        },
-        "filename": {
-            "type": "string",
-            "description": "The filename to save the PDF as (without .pdf extension)"
-        }
-    }
-    output_type = "string"
-
-    def forward(self, url: str, filename: str) -> str:
-        # Create data directory if it doesn't exist
-        data_dir = Path("data")
-        data_dir.mkdir(exist_ok=True)
-
-        # Create PDFs subdirectory
-        pdfs_dir = data_dir / "pdfs"
-        pdfs_dir.mkdir(exist_ok=True)
-
-        output_path = pdfs_dir / f"{filename}.pdf"
-
-        # Download the PDF
-        result_path = asyncio.run(download_pdf_async(url, output_path))
-        return f"PDF downloaded successfully to: {result_path}"
-
-class ArxivDownloadPdfTool(Tool):
-    name = ToolNames.ARXIV_DOWNLOAD_PDF.value
-    description = "Download a PDF from arXiv by converting the abstract URL to PDF URL."
+URL_EXPLAINATION = """The URL can be be converted to a markdown. If the URL points to a PDF, the pdf is converted to markdown, otherwise the URL is crawled and the markdown is extracted. This markdown is split into pages that are numbered. You can use the page numbers to get the content of the pages."""
+
+class GetTableOfContentsTool(Tool):
+    name = ToolNames.GET_TABLE_OF_CONTENTS.value
+    description = f"""Returns all of the titles in the document along with the page number they are on.
+    {URL_EXPLAINATION}
+    """
     inputs = {
         "url": {
             "type": "string",
-            "description": "The
-        },
-        "filename": {
-            "type": "string",
-            "description": "The filename to save the PDF as (without .pdf extension)"
+            "description": "The URL to get the table of contents of."
         }
     }
     output_type = "string"
 
-    def forward(self, url: str, filename: str) -> str:
-        # Create data directory if it doesn't exist
-        data_dir = Path("data")
-        data_dir.mkdir(exist_ok=True)
-
-        # Create PDFs subdirectory
-        pdfs_dir = data_dir / "pdfs"
-        pdfs_dir.mkdir(exist_ok=True)
-
-        output_path = pdfs_dir / f"{filename}.pdf"
-
-        # Download the PDF
-        result_path = asyncio.run(arxiv_download_pdf_async(url, output_path))
-        return f"arXiv PDF downloaded successfully to: {result_path}"
-
-# Reuse the markdown analysis tools from analyse_markdown_agent.py
-class GetTableOfContentsTool(Tool):
-    name = ToolNames.GET_TABLE_OF_CONTENTS.value
-    description = "Returns all of the titles in the document along with the page number they are on."
-    inputs = {}
-    output_type = "string"
-
     def __init__(self, markdown: OCRResponse):
         super().__init__()
         self.markdown: OCRResponse = markdown
         self.table_of_contents: str = get_table_of_contents_per_page_markdown(self.markdown)
 
-    def forward(self) -> str:
+    def forward(self, url: str) -> str:
         return self.table_of_contents
 
 class GetMarkdownTool(Tool):
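Note: the refactor above replaces the per-tool output_type/depth plumbing with a single truncation helper that every search tool's forward() funnels through. A minimal standalone sketch of that pattern, assuming only that SearchResponse carries a search_results list; FakeSearchResponse and fake_search below are hypothetical stand-ins, not part of the commit:

import asyncio

class FakeSearchResponse:
    # Hypothetical stand-in for deepengineer's SearchResponse.
    def __init__(self, search_results: list[str]):
        self.search_results = search_results

def filter_search_results(response: FakeSearchResponse, max_nb_results: int = 10) -> FakeSearchResponse:
    # Same idea as the helper added in the diff: keep only the first N hits.
    response.search_results = response.search_results[:max_nb_results]
    return response

async def fake_search(query: str) -> FakeSearchResponse:
    # Placeholder for linkup_search_async / arxiv_search_async / etc.
    return FakeSearchResponse([f"result {i} for {query!r}" for i in range(25)])

trimmed = filter_search_results(asyncio.run(fake_search("neutron transport")), max_nb_results=10)
assert len(trimmed.search_results) == 10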
src/deepengineer/webcrawler/async_crawl.py
CHANGED
@@ -20,8 +20,8 @@ async def download_pdf_async(url: str, output_path: Path) -> str:
         await f.write(response.content)
     return output_path
 
-async def arxiv_download_pdf_async(url: str, output_path: Path) -> str:
-    """Download a PDF from arXiv by converting the abstract URL to PDF URL."""
+async def download_pdf_or_arxiv_pdf_async(url: str, output_path: Path) -> str:
+    """Download a PDF from arXiv by converting the abstract URL to PDF URL. Works also for non arXiv URLs."""
     # Extract the arXiv ID from the URL
     if "/abs/" in url:
         arxiv_id = url.split("/abs/")[1].rstrip("/")
@@ -31,3 +31,4 @@ async def arxiv_download_pdf_async(url: str, output_path: Path) -> str:
         pdf_url = url
 
     return await download_pdf_async(pdf_url, output_path)
+
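Note: the rename makes the helper safe to call on any URL — arXiv abstract pages are rewritten to their PDF counterparts, everything else falls through to the plain downloader. A standalone sketch of just the URL rewrite, under the assumption that only the /abs/ branch is shown in the hunk; to_pdf_url is a hypothetical name and the download step is omitted:

def to_pdf_url(url: str) -> str:
    # Mirrors the branch shown in the hunk: /abs/ pages map to arXiv's PDF endpoint.
    if "/abs/" in url:
        arxiv_id = url.split("/abs/")[1].rstrip("/")
        return f"https://arxiv.org/pdf/{arxiv_id}"
    # Non-arXiv URLs (or direct PDF links) pass through unchanged.
    return url

assert to_pdf_url("https://arxiv.org/abs/2105.00643") == "https://arxiv.org/pdf/2105.00643"
assert to_pdf_url("https://example.com/report.pdf") == "https://example.com/report.pdf"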
src/deepengineer/webcrawler/crawl_database.py
ADDED
@@ -0,0 +1,42 @@
+from deepengineer.webcrawler.utils import sanitize_filename
+from deepengineer.common_path import DATA_DIR
+from deepengineer.webcrawler.async_search import SearchResult, SearchResponse
+import asyncio
+from mistralai import OCRResponse
+from deepengineer.webcrawler.async_crawl import download_pdf_or_arxiv_pdf_async, crawl4ai_extract_markdown_of_url_async
+from deepengineer.webcrawler.pdf_utils import convert_raw_markdown_to_ocr_response
+from deepengineer.webcrawler.pdf_utils import convert_pdf_to_markdown_async
+
+class DataBase():
+    def __init__(self):
+        self.urls_to_markdown: dict[str, OCRResponse] = {}
+
+    @staticmethod
+    def preprocess_url(url: str) -> str:
+        """Preprocess the url to make it a valid url."""
+        if "arxiv.org/abs/" in url:
+            return url.replace("arxiv.org/abs/", "arxiv.org/pdf/")
+        else:
+            return url
+
+    def crawl_url(self, url: str) -> str:
+        """Crawl the url, if the url is a pdf, download the pdf and save and return the markdown."""
+        url = self.preprocess_url(url)
+        if "pdf" in url:
+            output_path = (DATA_DIR / sanitize_filename(url)).with_suffix(".pdf")
+            pdf_path = asyncio.run(download_pdf_or_arxiv_pdf_async(url, output_path=output_path))
+            ocr_response = asyncio.run(convert_pdf_to_markdown_async(pdf_path))
+        else:
+            markdown = asyncio.run(crawl4ai_extract_markdown_of_url_async(url))
+            ocr_response = convert_raw_markdown_to_ocr_response(markdown)
+        self.urls_to_markdown[url] = ocr_response
+        return ocr_response
+
+
+    def get_markdown_of_url(self, url: str) -> OCRResponse:
+        url = self.preprocess_url(url)
+        if url in self.urls_to_markdown:
+            return self.urls_to_markdown[url]
+        else:
+            return self.crawl_url(url)
+
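Note: the new class is a lazy cache keyed by the preprocessed URL — crawl_url fetches and stores an OCRResponse, and get_markdown_of_url either returns the cached entry or triggers a crawl. A usage sketch; the arXiv URL is illustrative, and a real run needs network access plus a MISTRAL_API_KEY for the OCR step:

from deepengineer.webcrawler.crawl_database import DataBase

db = DataBase()
# First access downloads the PDF, runs OCR, and caches the result under the
# preprocessed URL, so the /abs/ and /pdf/ forms share one cache entry.
ocr = db.get_markdown_of_url("https://arxiv.org/abs/2105.00643")
print(len(ocr.pages))
# Second access is served from db.urls_to_markdown without re-crawling.
assert db.get_markdown_of_url("https://arxiv.org/pdf/2105.00643") is ocr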
src/deepengineer/webcrawler/pdf_utils.py
CHANGED
@@ -19,7 +19,7 @@ MAX_SIZE_BYTES = 49 * 1024 * 1024
 async def convert_pdf_to_markdown_async(
     pdf_path: Path,
     with_image_description: bool = False,
-) ->
+) -> OCRResponse:
 
     mistral_client = Mistral(api_key=os.getenv("MISTRAL_API_KEY"))
 
@@ -62,7 +62,7 @@ def get_markdown_by_page_numbers(markdown: OCRResponse, page_numbers: list[int],
         markdowns.append(f"*Page {page_number}*\n{markdown.pages[page_number].markdown}")
     return "\n\n".join(markdowns)
 
-def find_in_markdown(markdown: OCRResponse, search_queries: list[str]) -> list[int]:
+def find_in_markdown(markdown: OCRResponse, search_queries: list[str] | str) -> list[int]:
     """
     Find the page numbers of the pdf that contain the search query.
 
@@ -73,6 +73,8 @@ def find_in_markdown(markdown: OCRResponse, search_queries: list[str]) -> list[int]:
     Returns:
         list[int]: The page numbers of the pdf that contain the search query.
     """
+    if isinstance(search_queries, str):
+        search_queries = [search_queries]
     page_numbers: list[int] = []
     for page_number, page in enumerate(markdown.pages):
         for search_query in search_queries:
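Note: with the widened signature, callers can pass either a single query or a list. The matching line itself sits outside the hunk, so the substring test below is an assumption; FakePage and FakeOCR are hypothetical stand-ins so the sketch runs without a real OCRResponse:

class FakePage:
    def __init__(self, markdown: str):
        self.markdown = markdown

class FakeOCR:
    def __init__(self, pages: list[FakePage]):
        self.pages = pages

def find_in_markdown(markdown, search_queries):
    # Normalization added by the commit, followed by an assumed substring scan.
    if isinstance(search_queries, str):
        search_queries = [search_queries]
    page_numbers: list[int] = []
    for page_number, page in enumerate(markdown.pages):
        for search_query in search_queries:
            if search_query in page.markdown:
                page_numbers.append(page_number)
    return page_numbers

doc = FakeOCR([FakePage("thermal neutron flux"), FakePage("gamma shielding")])
assert find_in_markdown(doc, "neutron") == [0]
assert find_in_markdown(doc, ["neutron", "gamma"]) == [0, 1]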
tests/webcrawler/test_async_crawl.py
CHANGED
@@ -2,7 +2,7 @@ import pytest
 from deepengineer.webcrawler.async_crawl import (
     crawl4ai_extract_markdown_of_url_async,
     download_pdf_async,
-    arxiv_download_pdf_async,
+    download_pdf_or_arxiv_pdf_async,
 )
 from mistralai import OCRResponse
 from deepengineer.webcrawler.testing import URL_WIKIPEDIA, URL_PDF, ARXIV_URL
@@ -27,6 +27,6 @@ async def test_arxiv_download_pdf_async():
     output_path = DATA_DIR / "temp.pdf"
     output_path.unlink(missing_ok=True)
     assert not output_path.exists()
-    pdf_path = await arxiv_download_pdf_async(ARXIV_URL, output_path=output_path)
+    pdf_path = await download_pdf_or_arxiv_pdf_async(ARXIV_URL, output_path=output_path)
     assert pdf_path == output_path
     assert output_path.exists()
tests/webcrawler/test_crawl_database.py
ADDED
@@ -0,0 +1,19 @@
+from deepengineer.webcrawler.crawl_database import DataBase
+
+def test_crawl_database_arxiv_pdf():
+    db = DataBase()
+    db.crawl_url("https://arxiv.org/pdf/2105.00643")
+    assert db.get_markdown_of_url("https://arxiv.org/pdf/2105.00643") is not None
+    assert db.get_markdown_of_url("https://arxiv.org/abs/2105.00643") is not None
+    assert db.get_markdown_of_url("https://arxiv.org/pdf/2105.00643").pages[0].markdown is not None
+    assert len(db.get_markdown_of_url("https://arxiv.org/pdf/2105.00643").pages) == 20
+
+def test_crawl_database_arxiv_link():
+    db = DataBase()
+    db.crawl_url("https://arxiv.org/abs/2105.00643")
+    assert db.get_markdown_of_url("https://arxiv.org/abs/2105.00643") is not None
+    assert db.get_markdown_of_url("https://arxiv.org/pdf/2105.00643") is not None
+    assert db.get_markdown_of_url("https://arxiv.org/abs/2105.00643").pages[0].markdown is not None
+    assert len(db.get_markdown_of_url("https://arxiv.org/abs/2105.00643").pages) == 20
+
+
tests/webcrawler/test_pdf_agent.py
CHANGED
@@ -1,4 +1,4 @@
-from deepengineer.deepsearch.analyse_markdown_agent import create_agent, GetTableOfContentsTool, GetMarkdownTool, GetPagesContentTool,
+from deepengineer.deepsearch.analyse_markdown_agent import create_agent, GetTableOfContentsTool, GetMarkdownTool, GetPagesContentTool, FindInMarkdownTool
 from mistralai import OCRResponse
 from deepengineer.common_path import DATA_DIR
 
@@ -11,7 +11,7 @@ def test_pdf_agent():
     ocr_response = load_mock_ocr_response()
     pdf_agent = create_agent(ocr_response)
     assert pdf_agent is not None
-    assert pdf_agent.name == "
+    assert pdf_agent.name == "markdown_agent"
     assert pdf_agent.tools is not None
     assert len(pdf_agent.tools) == 4 + 1  # +1 for the final answer
 
@@ -19,8 +19,7 @@ def test_pdf_agent():
     GetTableOfContentsTool(ocr_response).forward()
     GetMarkdownTool(ocr_response).forward()
     GetPagesContentTool(ocr_response).forward([1,2,3])
-
-    pdf_agent.run("Give me a summary of the document.")
+    FindInMarkdownTool(ocr_response).forward(["thermal neutron", "neutron"])
+    # pdf_agent.run("Give me a summary of the document.")
 
-test_pdf_agent()
 