Commit b5fafa1 · Charles Azam committed
1 Parent(s): 663d842

clean: run linting and formating on repo
Files changed:

- Makefile +1 -4
- pyproject.toml +4 -6
- src/deepengineer/common_path.py +1 -1
- src/deepengineer/deepsearch/analyse_markdown_agent.py +34 -21
- src/deepengineer/deepsearch/draw_agent.py +0 -158
- src/deepengineer/deepsearch/scawl_web_agent.py +62 -46
- src/deepengineer/webcrawler/async_crawl.py +8 -6
- src/deepengineer/webcrawler/async_search.py +61 -42
- src/deepengineer/webcrawler/crawl_database.py +21 -14
- src/deepengineer/webcrawler/pdf_utils.py +37 -25
- src/deepengineer/webcrawler/testing.py +7 -3
- src/deepengineer/webcrawler/utils.py +15 -8
- tests/deepsearch/test_pdf_agent.py +19 -8
- tests/deepsearch/test_web_agent.py +13 -1
- tests/webcrawler/test_async_crawl.py +5 -3
- tests/webcrawler/test_async_search.py +13 -20
- tests/webcrawler/test_crawl_database.py +27 -7
- tests/webcrawler/test_pdfs_utils.py +15 -6
- tests/webcrawler/test_utils.py +28 -19
Makefile
CHANGED

@@ -19,10 +19,7 @@ lint: ## Run ruff linter
 lint-fix: ## Run ruff linter and auto-fix issues
 	uv run ruff check --fix src tests
 
-
-	uv run mypy src
-
-check: format lint type-check ## Run all checks (format, lint, type-check)
+check: format lint ## Run all checks (format, lint)
 
 test: ## Run tests
 	uv run pytest tests
pyproject.toml
CHANGED

@@ -34,13 +34,13 @@ build-backend = "hatchling.build"
 
 [tool.black]
 line-length = 88
-target-version = ['
+target-version = ['py312']
 
 
 [tool.ruff]
-target-version = "
+target-version = "py312"
 line-length = 88
-select = [
+lint.select = [
     "E", # pycodestyle errors
     "W", # pycodestyle warnings
     "F", # pyflakes
@@ -49,11 +49,9 @@ select = [
     "C4", # flake8-comprehensions
     "UP", # pyupgrade
 ]
-ignore = [
+lint.ignore = [
     "E501", # line too long, handled by black
     "B008", # do not perform function calls in argument defaults
     "C901", # too complex
 ]
 
-[tool.ruff.per-file-ignores]
-"__init__.py" = ["F401"]
src/deepengineer/common_path.py
CHANGED

@@ -8,4 +8,4 @@ assert DEEPENGINEER_CODE_DIR.name == "deepengineer"
 assert DEEPENGINEER_SRC_DIR.name == "src"
 
 DATA_DIR = DEEPENGINEER_ROOT_DIR / "data"
-assert DATA_DIR.exists()
+assert DATA_DIR.exists()
src/deepengineer/deepsearch/analyse_markdown_agent.py
CHANGED

@@ -2,86 +2,97 @@
 Simple agent to analyse a markdown, just to test some ideas.
 """
 
-from smolagents import CodeAgent, tool, Tool, LiteLLMModel
-from deepengineer.webcrawler.pdf_utils import get_markdown_by_page_numbers, get_table_of_contents_per_page_markdown, find_in_markdown, convert_ocr_response_to_markdown
-from mistralai import OCRResponse
 from enum import Enum
 
+from mistralai import OCRResponse
+from smolagents import CodeAgent, LiteLLMModel, Tool
+
+from deepengineer.webcrawler.pdf_utils import (
+    convert_ocr_response_to_markdown,
+    find_in_markdown,
+    get_markdown_by_page_numbers,
+    get_table_of_contents_per_page_markdown,
+)
+
+
 class ToolNames(Enum):
     GET_TABLE_OF_CONTENTS = "get_table_of_contents"
     GET_MARKDOWN = "get_markdown"
     GET_PAGES_CONTENT = "get_pages_content"
     FIND_IN_MARKDOWN = "find_in_markdown"
 
+
 class GetTableOfContentsTool(Tool):
     name = ToolNames.GET_TABLE_OF_CONTENTS.value
     description = "Returns all of the titles in the document along with the page number they are on."
     inputs = {}
     output_type = "string"
 
     def __init__(self, markdown: OCRResponse):
         super().__init__()
         self.markdown: OCRResponse = markdown
-        self.table_of_contents: str = get_table_of_contents_per_page_markdown(
+        self.table_of_contents: str = get_table_of_contents_per_page_markdown(
+            self.markdown
+        )
 
     def forward(self) -> str:
         return self.table_of_contents
 
+
 class GetMarkdownTool(Tool):
     name = ToolNames.GET_MARKDOWN.value
     description = f"Returns the markdown entire content of the document. Beware this might be too long to be useful, except for small documents, use {ToolNames.GET_PAGES_CONTENT.value} instead. You can use {ToolNames.GET_TABLE_OF_CONTENTS.value} to get the table of contents of the document including the number of pages."
     inputs = {}
     output_type = "string"
 
     def __init__(self, markdown: OCRResponse):
         super().__init__()
         self.markdown: OCRResponse = markdown
         self.markdown_content: str = convert_ocr_response_to_markdown(self.markdown)
 
     def forward(self) -> str:
         return self.markdown_content
 
+
 class GetPagesContentTool(Tool):
     name = ToolNames.GET_PAGES_CONTENT.value
     description = f"Returns the content of the pages. You can use {ToolNames.GET_TABLE_OF_CONTENTS.value} to get the table of contents of the document including the number of pages. Expects a list of page numbers as integers as input."
     inputs = {
         "page_numbers": {
             "type": "array",
-            "description": "The page numbers to get the content of."
+            "description": "The page numbers to get the content of.",
         },
     }
     output_type = "string"
 
     def __init__(self, markdown: OCRResponse):
         super().__init__()
         self.markdown: OCRResponse = markdown
 
     def forward(self, page_numbers: list[int]) -> str:
         return get_markdown_by_page_numbers(self.markdown, page_numbers)
 
+
 class FindInMarkdownTool(Tool):
     name = ToolNames.FIND_IN_MARKDOWN.value
     description = f"Finds the page numbers of the document that contain the search queries. If you are looking for a specific information, you can use this tool to find the page numbers of the document that contain the information and then use {ToolNames.GET_PAGES_CONTENT.value} to get the content of the pages."
     inputs = {
         "search_queries": {
             "type": "array",
-            "description": "The search queries to find in the document. List of strings."
+            "description": "The search queries to find in the document. List of strings.",
         }
     }
    output_type = "array"
 
     def __init__(self, markdown: OCRResponse):
         super().__init__()
         self.markdown: OCRResponse = markdown
 
     def forward(self, search_queries: list[str]) -> list[int]:
         return find_in_markdown(self.markdown, search_queries)
 
 
 def create_agent(markdown: OCRResponse, model_id="deepseek/deepseek-chat"):
     """This agent is just a test and will not be used as is by the main agent."""
 
     model = LiteLLMModel(model_id=model_id)

@@ -101,6 +112,8 @@ def create_agent(markdown: OCRResponse, model_id="deepseek/deepseek-chat"):
         name="markdown_agent",
         description="""A team member that can analyse a markdown.""",
     )
-    markdown_agent.prompt_templates["managed_agent"][
+    markdown_agent.prompt_templates["managed_agent"][
+        "task"
+    ] += """You can navigate to .txt online files."""
 
     return markdown_agent
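For context, a minimal usage sketch of this module (not part of the commit). It assumes MISTRAL_API_KEY and a DeepSeek key for LiteLLM are configured, and that a local "report.pdf" exists; both names are illustrative only.

# Hypothetical usage sketch of analyse_markdown_agent (assumptions noted above).
import asyncio
from pathlib import Path

from deepengineer.deepsearch.analyse_markdown_agent import create_agent
from deepengineer.webcrawler.pdf_utils import convert_pdf_to_markdown_async

# OCR the PDF into the OCRResponse structure the tools expect.
ocr_response = asyncio.run(convert_pdf_to_markdown_async(Path("report.pdf")))

# Build the test agent and ask a question; run() is the smolagents CodeAgent entry point.
agent = create_agent(ocr_response, model_id="deepseek/deepseek-chat")
print(agent.run("Which pages mention the cooling system?"))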
src/deepengineer/deepsearch/draw_agent.py
CHANGED

@@ -1,158 +0,0 @@
-from io import BytesIO
-from time import sleep
-
-import helium
-from dotenv import load_dotenv
-from PIL import Image
-from selenium import webdriver
-from selenium.webdriver.common.by import By
-from selenium.webdriver.common.keys import Keys
-
-from smolagents import CodeAgent, tool
-from smolagents.agents import ActionStep
-
-# Load environment variables
-load_dotenv()
-
-@tool
-def search_item_ctrl_f(text: str, nth_result: int = 1) -> str:
-    """
-    Searches for text on the current page via Ctrl + F and jumps to the nth occurrence.
-    Args:
-        text: The text to search for
-        nth_result: Which occurrence to jump to (default: 1)
-    """
-    elements = driver.find_elements(By.XPATH, f"//*[contains(text(), '{text}')]")
-    if nth_result > len(elements):
-        raise Exception(f"Match n°{nth_result} not found (only {len(elements)} matches found)")
-    result = f"Found {len(elements)} matches for '{text}'."
-    elem = elements[nth_result - 1]
-    driver.execute_script("arguments[0].scrollIntoView(true);", elem)
-    result += f"Focused on element {nth_result} of {len(elements)}"
-    return result
-
-@tool
-def go_back() -> None:
-    """Goes back to previous page."""
-    driver.back()
-
-@tool
-def close_popups() -> str:
-    """
-    Closes any visible modal or pop-up on the page. Use this to dismiss pop-up windows!
-    This does not work on cookie consent banners.
-    """
-    webdriver.ActionChains(driver).send_keys(Keys.ESCAPE).perform()
-
-
-# Configure Chrome options
-chrome_options = webdriver.ChromeOptions()
-chrome_options.add_argument("--force-device-scale-factor=1")
-chrome_options.add_argument("--window-size=1000,1350")
-chrome_options.add_argument("--disable-pdf-viewer")
-chrome_options.add_argument("--window-position=0,0")
-
-# Initialize the browser
-driver = helium.start_chrome(headless=False, options=chrome_options)
-
-# Set up screenshot callback
-def save_screenshot(memory_step: ActionStep, agent: CodeAgent) -> None:
-    sleep(1.0)  # Let JavaScript animations happen before taking the screenshot
-    driver = helium.get_driver()
-    current_step = memory_step.step_number
-    if driver is not None:
-        for previous_memory_step in agent.memory.steps:  # Remove previous screenshots for lean processing
-            if isinstance(previous_memory_step, ActionStep) and previous_memory_step.step_number <= current_step - 2:
-                previous_memory_step.observations_images = None
-        png_bytes = driver.get_screenshot_as_png()
-        image = Image.open(BytesIO(png_bytes))
-        print(f"Captured a browser screenshot: {image.size} pixels")
-        memory_step.observations_images = [image.copy()]  # Create a copy to ensure it persists
-
-    # Update observations with current URL
-    url_info = f"Current url: {driver.current_url}"
-    memory_step.observations = (
-        url_info if memory_step.observations is None else memory_step.observations + "\n" + url_info
-    )
-
-from smolagents import InferenceClientModel
-
-# Initialize the model
-model_id = "Qwen/Qwen2-VL-72B-Instruct"  # You can change this to your preferred VLM model
-model = InferenceClientModel(model_id=model_id)
-
-# Create the agent
-agent = CodeAgent(
-    tools=[go_back, close_popups, search_item_ctrl_f],
-    model=model,
-    additional_authorized_imports=["helium"],
-    step_callbacks=[save_screenshot],
-    max_steps=20,
-    verbosity_level=2,
-)
-
-# Import helium for the agent
-agent.python_executor("from helium import *", agent.state)
-
-helium_instructions = """
-You can use helium to access websites. Don't bother about the helium driver, it's already managed.
-We've already ran "from helium import *"
-Then you can go to pages!
-Code:
-```py
-go_to('github.com/trending')
-```<end_code>
-
-You can directly click clickable elements by inputting the text that appears on them.
-Code:
-```py
-click("Top products")
-```<end_code>
-
-If it's a link:
-Code:
-```py
-click(Link("Top products"))
-```<end_code>
-
-If you try to interact with an element and it's not found, you'll get a LookupError.
-In general stop your action after each button click to see what happens on your screenshot.
-Never try to login in a page.
-
-To scroll up or down, use scroll_down or scroll_up with as an argument the number of pixels to scroll from.
-Code:
-```py
-scroll_down(num_pixels=1200)  # This will scroll one viewport down
-```<end_code>
-
-When you have pop-ups with a cross icon to close, don't try to click the close icon by finding its element or targeting an 'X' element (this most often fails).
-Just use your built-in tool `close_popups` to close them:
-Code:
-```py
-close_popups()
-```<end_code>
-
-You can use .exists() to check for the existence of an element. For example:
-Code:
-```py
-if Text('Accept cookies?').exists():
-    click('I accept')
-```<end_code>
-"""
-
-search_request = """
-Please navigate to https://en.wikipedia.org/wiki/Chicago and give me a sentence containing the word "1992" that mentions a construction accident.
-"""
-
-agent_output = agent.run(search_request + helium_instructions)
-print("Final output:")
-print(agent_output)
-
-github_request = """
-I'm trying to find how hard I have to work to get a repo in github.com/trending.
-Can you navigate to the profile for the top author of the top trending repo, and give me their total number of commits over the last year?
-"""
-
-agent_output = agent.run(github_request + helium_instructions)
-print("Final output:")
-print(agent_output)
src/deepengineer/deepsearch/scawl_web_agent.py
CHANGED

@@ -1,13 +1,23 @@
+import asyncio
+from enum import Enum
+
+from smolagents import CodeAgent, LiteLLMModel, Tool
+
 from deepengineer.webcrawler.async_search import (
+    SearchResponse,
+    arxiv_search_async,
+    linkup_search_async,
+    pubmed_search_async,
+    scientific_search_async,
 )
-from deepengineer.webcrawler.pdf_utils import get_table_of_contents_per_page_markdown, convert_ocr_response_to_markdown, get_markdown_by_page_numbers, find_in_markdown
-from enum import Enum
-import asyncio
-from deepengineer.webcrawler.async_search import SearchResponse
 from deepengineer.webcrawler.crawl_database import DataBase
+from deepengineer.webcrawler.pdf_utils import (
+    convert_ocr_response_to_markdown,
+    find_in_markdown,
+    get_markdown_by_page_numbers,
+    get_table_of_contents_per_page_markdown,
+)
+
 
 class ToolNames(Enum):
     # Search tools

@@ -16,37 +26,43 @@ class ToolNames(Enum):
     PUBMED_SEARCH = "pubmed_search"
     SCIENCEDIRECT_SEARCH = "sciencedirect_search"
     SCIENTIFIC_SEARCH = "scientific_search"
+
     # Exploring link tools
     GET_TABLE_OF_CONTENTS = "get_table_of_contents_of_url"
     GET_MARKDOWN = "get_markdown_of_url"
     GET_PAGES_CONTENT = "get_pages_content"
     FIND_IN_MARKDOWN = "find_in_markdown"
 
+
+def filter_search_results(
+    search_response: SearchResponse, max_nb_results: int = 5
+) -> SearchResponse:
     search_response.search_results = search_response.search_results[:max_nb_results]
     return search_response.to_string()
 
 
 class SearchTool(Tool):
     name = ToolNames.SEARCH_TOOL.value
-    description =
+    description = """Search the web using Linkup API. Good for deep research with sourced answers.
     Linkup also provides an answer. This answer is not always correct, so you might want to check the sources.
     """
     inputs = {
         "search_query": {
             "type": "string",
-            "description": "The search query to execute"
+            "description": "The search query to execute",
         },
     }
     output_type = "object"
 
     def forward(self, search_query: str) -> str:
-        result = asyncio.run(
+        result = asyncio.run(
+            linkup_search_async(
+                search_query=search_query,
+            )
+        )
         return filter_search_results(result)
 
+
 class ArxivSearchTool(Tool):
     name = ToolNames.ARXIV_SEARCH.value
     description = """Search arXiv for academic papers and preprints with Linkup API.

@@ -55,15 +71,16 @@ class ArxivSearchTool(Tool):
     inputs = {
         "search_query": {
             "type": "string",
-            "description": "The search query to execute on arXiv"
+            "description": "The search query to execute on arXiv",
         }
     }
     output_type = "object"
 
     def forward(self, search_query: str) -> str:
         result = asyncio.run(arxiv_search_async(search_query))
         return filter_search_results(result)
 
+
 class PubmedSearchTool(Tool):
     name = ToolNames.PUBMED_SEARCH.value
     description = """Search PubMed for medical and scientific literature with Linkup API.

@@ -72,15 +89,16 @@ class PubmedSearchTool(Tool):
     inputs = {
         "search_query": {
             "type": "string",
-            "description": "The search query to execute on PubMed"
+            "description": "The search query to execute on PubMed",
         }
     }
     output_type = "object"
 
     def forward(self, search_query: str) -> str:
         result = asyncio.run(pubmed_search_async(search_query))
         return filter_search_results(result)
 
+
 class ScientificSearchTool(Tool):
     name = ToolNames.SCIENTIFIC_SEARCH.value
     description = """Search across multiple scientific domains: Wikipedia, arXiv, PubMed, and ScienceDirect.

@@ -89,16 +107,19 @@ class ScientificSearchTool(Tool):
     inputs = {
         "search_query": {
             "type": "string",
-            "description": "The search query to execute across scientific domains"
+            "description": "The search query to execute across scientific domains",
         }
     }
     output_type = "object"
+
     def forward(self, search_query: str) -> dict:
         result = asyncio.run(scientific_search_async(search_query))
         return filter_search_results(result)
 
+
 URL_EXPLAINATION = """The URL can be be converted to a markdown. If the URL points to a PDF, the pdf is converted to markdown, otherwise the URL is crawled and the markdown is extracted. This markdown is split into pages that are numbered. You can use the page numbers to get the content of the pages."""
 
+
 class GetTableOfContentsTool(Tool):
     name = ToolNames.GET_TABLE_OF_CONTENTS.value
     description = f"""Returns all of the titles in the document along with the page number they are on.

@@ -107,55 +128,51 @@ class GetTableOfContentsTool(Tool):
     inputs = {
         "url": {
             "type": "string",
-            "description": "The URL to get the table of contents of."
+            "description": "The URL to get the table of contents of.",
         }
     }
     output_type = "string"
 
     def __init__(self, database: DataBase):
         super().__init__()
         self.database: DataBase = database
 
     def forward(self, url: str) -> str:
         markdown = self.database.get_markdown_of_url(url)
         table_of_contents: str = get_table_of_contents_per_page_markdown(markdown)
         return table_of_contents
 
+
 class GetMarkdownTool(Tool):
     name = ToolNames.GET_MARKDOWN.value
     description = f"Returns in markdown entire content of the url. Beware this might be too long to be useful, except for small documents, use {ToolNames.GET_PAGES_CONTENT.value} instead. You can also use {ToolNames.GET_TABLE_OF_CONTENTS.value} first to get the table of contents of the document including the number of pages."
     inputs = {
-        "url": {
-            "type": "string",
-            "description": "The URL to get the markdown of."
-        }
+        "url": {"type": "string", "description": "The URL to get the markdown of."}
     }
     output_type = "string"
 
     def __init__(self, database: DataBase):
         super().__init__()
         self.database: DataBase = database
 
     def forward(self, url: str) -> str:
         markdown = self.database.get_markdown_of_url(url)
         markdown_content: str = convert_ocr_response_to_markdown(markdown)
         return markdown_content
 
+
 class GetPagesContentTool(Tool):
     name = ToolNames.GET_PAGES_CONTENT.value
     description = f"Returns the content of the pages. You can use {ToolNames.GET_TABLE_OF_CONTENTS.value} to get the table of contents of the url including the number of pages. Expects a list of page numbers as integers as input. {URL_EXPLAINATION}"
     inputs = {
-        "url": {
-            "type": "string",
-            "description": "The URL to get the content of."
-        },
+        "url": {"type": "string", "description": "The URL to get the content of."},
         "page_numbers": {
             "type": "array",
-            "description": "The page numbers to get the content of."
+            "description": "The page numbers to get the content of.",
         },
     }
     output_type = "string"
 
     def __init__(self, database: DataBase):
         super().__init__()
         self.database: DataBase = database

@@ -164,32 +181,31 @@ class GetPagesContentTool(Tool):
         markdown = self.database.get_markdown_of_url(url)
         return get_markdown_by_page_numbers(markdown, page_numbers)
 
+
 class FindInMarkdownTool(Tool):
     name = ToolNames.FIND_IN_MARKDOWN.value
     description = f"Finds the page numbers of the url that contain the search queries. If you are looking for a specific information, you can use this tool to find the page numbers of the url that contain the information and then use {ToolNames.GET_PAGES_CONTENT.value} to get the content of the pages. {URL_EXPLAINATION}"
     inputs = {
-        "url": {
-            "type": "string",
-            "description": "The URL to find in."
-        },
+        "url": {"type": "string", "description": "The URL to find in."},
         "search_queries": {
             "type": "array",
-            "description": "The search queries to find in the url. List of strings."
-        }
+            "description": "The search queries to find in the url. List of strings.",
+        },
     }
     output_type = "array"
 
     def __init__(self, database: DataBase):
         super().__init__()
         self.database: DataBase = database
 
     def forward(self, url: str, search_queries: list[str]) -> list[int]:
         markdown = self.database.get_markdown_of_url(url)
         return find_in_markdown(markdown, search_queries)
 
+
 def create_web_search_agent(model_id="deepseek/deepseek-chat"):
     """Create a web search agent with search, crawling, and PDF analysis capabilities."""
 
     model = LiteLLMModel(model_id=model_id)
     database = DataBase()

@@ -204,7 +220,7 @@ def create_web_search_agent(model_id="deepseek/deepseek-chat"):
         GetPagesContentTool(database),
         FindInMarkdownTool(database),
     ]
-
+
     web_search_agent = CodeAgent(
         model=model,
         tools=WEB_SEARCH_TOOLS,

@@ -214,5 +230,5 @@ def create_web_search_agent(model_id="deepseek/deepseek-chat"):
         name="web_search_agent",
         description="""A team member that can search the web, crawl URLs, download PDFs, and analyze documents.""",
     )
-
+
     return web_search_agent
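As a side note, a minimal sketch of how the agent assembled in this module might be driven (not part of the commit). It assumes the LINKUP, MISTRAL and DeepSeek/LiteLLM API keys are configured; the query is illustrative.

# Hypothetical usage sketch of create_web_search_agent (assumptions noted above).
from deepengineer.deepsearch.scawl_web_agent import create_web_search_agent

agent = create_web_search_agent(model_id="deepseek/deepseek-chat")
# The agent can chain the search tools and the URL-exploration tools defined above.
result = agent.run("Find a recent arXiv paper on sodium fast reactors and summarise it.")
print(result)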
src/deepengineer/webcrawler/async_crawl.py
CHANGED

@@ -1,15 +1,17 @@
+from pathlib import Path
+
 import aiofiles
-import httpx
 import crawl4ai
-import
+import httpx
+
 
 async def crawl4ai_extract_markdown_of_url_async(url: str) -> str:
     """Extract markdown content from a URL using crawl4ai."""
     async with crawl4ai.AsyncWebCrawler() as crawler:
         result = await crawler.arun(url=url)
         return result.markdown
+
 
 async def download_pdf_async(url: str, output_path: Path) -> str:
     """Download a PDF file from a URL."""
     timeout = httpx.Timeout(30.0, connect=10.0)

@@ -20,6 +22,7 @@ async def download_pdf_async(url: str, output_path: Path) -> str:
         await f.write(response.content)
     return output_path
 
+
 async def download_pdf_or_arxiv_pdf_async(url: str, output_path: Path) -> str:
     """Download a PDF from arXiv by converting the abstract URL to PDF URL. Works also for non arXiv URLs."""
     # Extract the arXiv ID from the URL

@@ -29,6 +32,5 @@ async def download_pdf_or_arxiv_pdf_async(url: str, output_path: Path) -> str:
     else:
         # If it's already a PDF URL, use it as is
         pdf_url = url
 
-    return await download_pdf_async(pdf_url, output_path)
+    return await download_pdf_async(pdf_url, output_path)
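For context, a minimal sketch of the two crawl helpers above (not part of the commit). The URLs and the output filename are illustrative; crawl4ai and httpx must be installed.

# Hypothetical usage sketch of the async_crawl helpers (assumptions noted above).
import asyncio
from pathlib import Path

from deepengineer.webcrawler.async_crawl import (
    crawl4ai_extract_markdown_of_url_async,
    download_pdf_or_arxiv_pdf_async,
)

# Crawl an HTML page and get its markdown rendering.
markdown = asyncio.run(
    crawl4ai_extract_markdown_of_url_async("https://en.wikipedia.org/wiki/Chicago")
)
print(markdown[:500])

# Download an arXiv paper; an abs/ URL is rewritten to the pdf/ URL before download.
pdf_path = asyncio.run(
    download_pdf_or_arxiv_pdf_async(
        "https://arxiv.org/abs/1706.03762", Path("attention.pdf")
    )
)
print(pdf_path)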
src/deepengineer/webcrawler/async_search.py
CHANGED

@@ -1,61 +1,67 @@
 import os
-import asyncio
-import requests
-from pydantic import BaseModel, Field
-from typing import List, Optional, Literal
 from enum import Enum
+from typing import Literal
 
+import requests
 from linkup import LinkupClient, LinkupSourcedAnswer
+from pydantic import BaseModel, Field
 from tavily import AsyncTavilyClient
 
-from langchain_community.retrievers import ArxivRetriever
-from langchain_community.utilities.pubmed import PubMedAPIWrapper
 
 class SearchResult(BaseModel):
     """Represents a single search result from any search API."""
+
     title: str = Field(..., description="Title of the search result")
     url: str = Field(..., description="URL of the result")
     content: str = Field(..., description="Summary/snippet of content")
-    raw_content:
+    raw_content: str | None = Field(None, description="Full page content if available")
 
 
 class SearchResponse(BaseModel):
     """Represents a search response from any search API."""
+
     query: str = Field(..., description="The original search query")
-    answer: str | None = Field(
+    answer: str | None = Field(
+        None, description="Direct answer from the search API if available"
+    )
+    search_results: list[SearchResult] = Field(
+        default_factory=list, description="List of search results"
+    )
 
     def to_string(self):
         """Convert search response to a formatted string suitable for LLM consumption."""
         result_parts = []
 
         # Add the query
         result_parts.append(f"Search Query: {self.query}\n")
 
         # Add the direct answer if available
         if self.answer:
             result_parts.append(f"Direct Answer: {self.answer}\n")
 
         # Add search results
         if self.search_results:
             result_parts.append(f"Found {len(self.search_results)} search results:\n")
 
             for i, result in enumerate(self.search_results, 1):
                 result_parts.append(f"\n--- Result {i} ---")
                 result_parts.append(f"Title: {result.title}")
                 result_parts.append(f"URL: {result.url}")
                 result_parts.append(f"Content: {result.content[:2000]}...")
                 result_parts.append("")  # Empty line for separation
         else:
             result_parts.append("No search results found.")
 
         return "\n".join(result_parts)
 
 
 class ScientificDomains(str, Enum):
     wikipedia = "wikipedia.org"
     arxiv = "arxiv.org"
     pubmed = "pubmed.ncbi.nlm.nih.gov"
     sciencedirect = "sciencedirect.com"
 
 
 def get_tavily_usage():
     url = "https://api.tavily.com/usage"
     headers = {"Authorization": f"Bearer {os.getenv('TAVILY_API_KEY')}"}

@@ -71,14 +77,14 @@ async def tavily_search_async(
     include_answer: Literal["basic", "advanced"] | None = "advanced",
     include_raw_content: Literal["text", "markdown"] | None = "markdown",
     include_images: bool = False,
-    search_depth: Literal[
+    search_depth: Literal["basic", "advanced"] | None = "basic",
     include_domains: list[ScientificDomains] = None,
 ) -> SearchResponse:
     """
     Performs concurrent web searches with the Tavily API
     """
     tavily_async_client = AsyncTavilyClient()
 
     search_response = await tavily_async_client.search(
         query=search_query,
         search_depth=search_depth,

@@ -88,29 +94,29 @@ async def tavily_search_async(
         include_images=include_images,
         include_domains=include_domains,
     )
 
     search_results = [
         SearchResult(
-            title=result.get(
-            url=result.get(
-            content=result.get(
-            raw_content=result.get(
+            title=result.get("title", ""),
+            url=result.get("url", ""),
+            content=result.get("content", ""),
+            raw_content=result.get("raw_content"),
         )
-        for result in search_response.get(
+        for result in search_response.get("results", [])
     ]
 
     # Convert to our Pydantic models
     responses: SearchResponse = SearchResponse(
         query=search_query,
-        answer=search_response.get(
-        search_results=search_results
+        answer=search_response.get("answer", None),
+        search_results=search_results,
     )
     return responses
 
 
 def get_linkup_balance():
     url = "https://api.linkup.so/v1/credits/balance"
 
     headers = {"Authorization": f"Bearer {os.getenv('LINKUP_API_KEY')}"}
 
     response = requests.request("GET", url, headers=headers)

@@ -122,14 +128,16 @@ def get_linkup_balance():
 async def linkup_search_async(
     search_query: str,
     depth: Literal["standard", "deep"] = "standard",
-    output_type: Literal[
+    output_type: Literal[
+        "searchResults", "sourcedAnswer", "structured"
+    ] = "sourcedAnswer",
     include_images: bool = False,
     include_domains: list[ScientificDomains] = None,
 ) -> SearchResponse:
     """
     Performs concurrent web searches using the Linkup API.
     """
 
     client = LinkupClient()
     search_response: LinkupSourcedAnswer = await client.async_search(
         query=search_query,

@@ -138,7 +146,7 @@ async def linkup_search_async(
         include_images=include_images,
         include_domains=include_domains,
     )
-
+
     search_results = [
         SearchResult(
             title=result.name,

@@ -151,37 +159,48 @@ async def linkup_search_async(
 
     # Convert to our Pydantic models
     responses: SearchResponse = SearchResponse(
-        query=search_query,
-        answer=search_response.answer,
-        search_results=search_results
+        query=search_query, answer=search_response.answer, search_results=search_results
     )
     return responses
 
 
 async def arxiv_search_async(
     search_query: str,
 ) -> SearchResponse:
-    response = await linkup_search_async(
+    response = await linkup_search_async(
+        search_query, include_domains=[ScientificDomains.arxiv]
+    )
     return response
 
 
 async def pubmed_search_async(
     search_query: str,
 ) -> SearchResponse:
-    response = await linkup_search_async(
+    response = await linkup_search_async(
+        search_query, include_domains=[ScientificDomains.pubmed]
+    )
     return response
 
 
 async def sciencedirect_search_async(
     search_query: str,
 ) -> SearchResponse:
-    response = await linkup_search_async(
+    response = await linkup_search_async(
+        search_query, include_domains=[ScientificDomains.sciencedirect]
+    )
     return response
 
 
 async def scientific_search_async(
     search_query: str,
 ) -> SearchResponse:
-    response = await linkup_search_async(
+    response = await linkup_search_async(
+        search_query,
+        include_domains=[
+            ScientificDomains.wikipedia,
+            ScientificDomains.arxiv,
+            ScientificDomains.pubmed,
+            ScientificDomains.sciencedirect,
+        ],
+    )
+    return response
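For context, a minimal sketch of the search helpers above (not part of the commit). It assumes LINKUP_API_KEY is set; the query is illustrative.

# Hypothetical usage sketch of async_search (assumptions noted above).
import asyncio

from deepengineer.webcrawler.async_search import arxiv_search_async

response = asyncio.run(arxiv_search_async("transformer architectures for OCR"))
# SearchResponse.to_string() renders the direct answer and the sources for LLM consumption.
print(response.to_string())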
src/deepengineer/webcrawler/crawl_database.py
CHANGED

@@ -1,16 +1,23 @@
-from deepengineer.webcrawler.utils import sanitize_filename
-from deepengineer.common_path import DATA_DIR
-from deepengineer.webcrawler.async_search import SearchResult, SearchResponse
 import asyncio
+
 from mistralai import OCRResponse
-
-from deepengineer.
-from deepengineer.webcrawler.
-
-class DataBase():
+
+from deepengineer.common_path import DATA_DIR
+from deepengineer.webcrawler.async_crawl import (
+    crawl4ai_extract_markdown_of_url_async,
+    download_pdf_or_arxiv_pdf_async,
+)
+from deepengineer.webcrawler.pdf_utils import (
+    convert_pdf_to_markdown_async,
+    convert_raw_markdown_to_ocr_response,
+)
+from deepengineer.webcrawler.utils import sanitize_filename
+
+
+class DataBase:
     def __init__(self):
         self.urls_to_markdown: dict[str, OCRResponse] = {}
-
+
     @staticmethod
     def preprocess_url(url: str) -> str:
        """Preprocess the url to make it a valid url."""

@@ -18,25 +25,25 @@ class DataBase():
             return url.replace("arxiv.org/abs/", "arxiv.org/pdf/")
         else:
             return url
-
+
     def crawl_url(self, url: str) -> str:
         """Crawl the url, if the url is a pdf, download the pdf and save and return the markdown."""
         url = self.preprocess_url(url)
         if "pdf" in url:
             output_path = (DATA_DIR / sanitize_filename(url)).with_suffix(".pdf")
-            pdf_path = asyncio.run(
+            pdf_path = asyncio.run(
+                download_pdf_or_arxiv_pdf_async(url, output_path=output_path)
+            )
             ocr_response = asyncio.run(convert_pdf_to_markdown_async(pdf_path))
         else:
             markdown = asyncio.run(crawl4ai_extract_markdown_of_url_async(url))
             ocr_response = convert_raw_markdown_to_ocr_response(markdown)
         self.urls_to_markdown[url] = ocr_response
         return ocr_response
-
-
+
     def get_markdown_of_url(self, url: str) -> OCRResponse:
         url = self.preprocess_url(url)
         if url in self.urls_to_markdown:
             return self.urls_to_markdown[url]
         else:
             return self.crawl_url(url)
-
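
A short usage sketch of the reworked DataBase cache, using the arXiv URL and calls already exercised in tests/webcrawler/test_crawl_database.py below. Note that crawl_url and get_markdown_of_url call asyncio.run internally, so this is meant for synchronous code.

from deepengineer.webcrawler.crawl_database import DataBase

db = DataBase()
# Abstract URLs are rewritten to the PDF URL by preprocess_url, so both forms hit the same cache entry.
db.crawl_url("https://arxiv.org/abs/2105.00643")
ocr_response = db.get_markdown_of_url("https://arxiv.org/pdf/2105.00643")
print(len(ocr_response.pages))
print(ocr_response.pages[0].markdown)
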
src/deepengineer/webcrawler/pdf_utils.py
CHANGED

@@ -1,16 +1,10 @@
 import os
 from pathlib import Path
-from pypdf import PdfReader, PdfWriter
-import io
-from pathlib import Path
-from mistralai import Mistral
-import os
-from litellm import completion
 
-from
-import yaml
-from tenacity import retry, stop_after_attempt, wait_fixed, RetryError
+from litellm import completion
 from litellm.exceptions import BadRequestError
+from mistralai import Mistral
+from mistralai.models import OCRPageObject, OCRResponse, OCRUsageInfo
 
 # Define the size limit in bytes
 MAX_SIZE_BYTES = 49 * 1024 * 1024

@@ -20,7 +14,6 @@ async def convert_pdf_to_markdown_async(
     pdf_path: Path,
     with_image_description: bool = False,
 ) -> OCRResponse:
-
     mistral_client = Mistral(api_key=os.getenv("MISTRAL_API_KEY"))
 
     uploaded_pdf = await mistral_client.files.upload_async(

@@ -31,7 +24,9 @@ async def convert_pdf_to_markdown_async(
         purpose="ocr",
     )
 
-    signed_url = await mistral_client.files.get_signed_url_async(
+    signed_url = await mistral_client.files.get_signed_url_async(
+        file_id=uploaded_pdf.id
+    )
 
     ocr_response = await mistral_client.ocr.process_async(
         model="mistral-ocr-latest",

@@ -42,27 +37,33 @@
     return ocr_response
 
 
-def convert_ocr_response_to_markdown(
-    ocr_response: OCRResponse
-) -> str:
+def convert_ocr_response_to_markdown(ocr_response: OCRResponse) -> str:
     markdowns: list[str] = []
     for page in ocr_response.pages:
         page_description = page.markdown
         markdowns.append(page_description)
-
+
     return "\n\n".join(markdowns)
 
-
+
+def get_markdown_by_page_numbers(
+    markdown: OCRResponse, page_numbers: list[int], get_full_content: bool = False
+) -> str:
     markdowns: list[str] = []
     page_numbers_to_get = set(page_numbers)
     if get_full_content:
         page_numbers_to_get = set(range(len(markdown.pages)))
 
     for page_number in page_numbers_to_get:
-        markdowns.append(
+        markdowns.append(
+            f"*Page {page_number}*\n{markdown.pages[page_number].markdown}"
+        )
     return "\n\n".join(markdowns)
 
-
+
+def find_in_markdown(
+    markdown: OCRResponse, search_queries: list[str] | str
+) -> list[int]:
     """
     Find the page numbers of the pdf that contain the search query.
 

@@ -82,12 +83,13 @@ def find_in_markdown(markdown: OCRResponse, search_queries: list[str] | str) ->
             page_numbers.append(page_number)
     return page_numbers
 
+
 def get_table_of_contents_per_page_markdown(markdown: OCRResponse) -> str:
     """
     Get the table of contents of the pdf.
-
+
     Finds all the titles of the pdf to reconstruct the table of contents.
-
+
     Args:
         markdown (OCRResponse): The markdown of the pdf.
 

@@ -102,15 +104,26 @@ def get_table_of_contents_per_page_markdown(markdown: OCRResponse) -> str:
             if line.startswith("#"):
                 title_to_page_number[line] = page_number
 
-    table_of_contents = "\n".join(
+    table_of_contents = "\n".join(
+        [
+            f"{title} - Page {page_number}"
+            for title, page_number in title_to_page_number.items()
+        ]
+    )
     return table_of_contents
 
+
 def convert_raw_markdown_to_ocr_response(raw_markdown: str) -> OCRResponse:
     pages = raw_markdown.split("# ")
     usage_info_empty = OCRUsageInfo(pages_processed=0)
-    return OCRResponse(
-
-
+    return OCRResponse(
+        pages=[
+            OCRPageObject(index=i, markdown="# " + page, images=[], dimensions=None)
+            for i, page in enumerate(pages)
+        ],
+        usage_info=usage_info_empty,
+        model="",
+    )
 
 
 def get_images_from_pdf(pdf_path: Path, image_ids: list[str]) -> list[str]:

@@ -141,4 +154,3 @@ def get_images_from_pdf(pdf_path: Path, image_ids: list[str]) -> list[str]:
     except BadRequestError:
         output = ""
     return output
-
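
A small sketch tying the markdown helpers above together on an in-memory document, with no Mistral API call. The sample markdown string is made up; the function calls follow the signatures shown in this diff.

from deepengineer.webcrawler.pdf_utils import (
    convert_raw_markdown_to_ocr_response,
    find_in_markdown,
    get_markdown_by_page_numbers,
    get_table_of_contents_per_page_markdown,
)

# Each "# " heading starts a new pseudo-page in the synthetic OCRResponse.
raw_markdown = "# Introduction\nContext of the study.\n\n# Results\nMeasured thermal neutron flux."
ocr_response = convert_raw_markdown_to_ocr_response(raw_markdown)

# Rebuild a table of contents ("<title> - Page <n>") from the headings.
print(get_table_of_contents_per_page_markdown(ocr_response))

# Locate the pages that contain a query, then dump only those pages.
pages = find_in_markdown(ocr_response, ["thermal neutron"])
print(get_markdown_by_page_numbers(ocr_response, pages))
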
src/deepengineer/webcrawler/testing.py
CHANGED

@@ -6,15 +6,19 @@ TAVILY_RESPONSE_FILE = DATA_DIR / "answers" / "tavily_response.json"
 
 
 def load_linkup_response() -> SearchResponse:
-    with open(LINKUP_RESPONSE_FILE
+    with open(LINKUP_RESPONSE_FILE) as f:
         return SearchResponse.model_validate_json(f.read())
 
+
 def load_tavily_response() -> SearchResponse:
-    with open(TAVILY_RESPONSE_FILE
+    with open(TAVILY_RESPONSE_FILE) as f:
         return SearchResponse.model_validate_json(f.read())
 
+
 URL_WIKIPEDIA = "https://en.wikipedia.org/wiki/Graphite-moderated_reactor"
 URL_PDF = "https://arxiv.org/pdf/1301.1699.pdf"
 ARXIV_URL = "https://arxiv.org/abs/1301.1699"
 PUBMED_URL = "https://pubmed.ncbi.nlm.nih.gov/34100000/"
-SCIENCEDIRECT_URL =
+SCIENCEDIRECT_URL = (
+    "https://www.sciencedirect.com/science/article/abs/pii/0168900289901964"
+)
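
The loaders above replay SearchResponse fixtures saved in the data answers directory, which keeps tests off the paid search APIs. A minimal sketch; the printed fields are the SearchResponse attributes used in the tests.

from deepengineer.webcrawler.testing import URL_WIKIPEDIA, load_linkup_response

# Replay a recorded Linkup SearchResponse instead of calling the API.
response = load_linkup_response()
print(response.query)
print(response.answer)
print(URL_WIKIPEDIA)
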
src/deepengineer/webcrawler/utils.py
CHANGED

@@ -1,6 +1,7 @@
 import re
 import unicodedata
 
+
 def sanitize_filename(filename, replacement_char="_", max_length=255):
     """
     Sanitizes a string to be suitable for use as a filename.

@@ -31,26 +32,32 @@ def sanitize_filename(filename, replacement_char="_", max_length=255):
     # 1. Replace spaces with the replacement_char
     # This is done early to ensure spaces are handled before other replacements
     # to avoid issues with double replacement characters in subsequent steps.
-    cleaned_filename = filename.replace(
+    cleaned_filename = filename.replace(" ", replacement_char)
 
     # 2. Convert to NFKD and encode to ASCII to handle accented characters
     # This transforms 'crème brûlée' into 'creme brulee'
-    cleaned_filename =
+    cleaned_filename = (
+        unicodedata.normalize("NFKD", cleaned_filename)
+        .encode("ascii", "ignore")
+        .decode("utf-8")
+    )
 
     # 3. Remove characters that are not alphanumeric, hyphen, underscore, or period.
     # Replace them with the specified replacement_char.
     # The regex pattern `[^a-zA-Z0-9\-_.]` matches any character that is NOT
     # (a-z, A-Z, 0-9, hyphen, underscore, or period).
-    cleaned_filename = re.sub(r
+    cleaned_filename = re.sub(r"[^a-zA-Z0-9\-_.]", replacement_char, cleaned_filename)
 
     # 4. Replace multiple consecutive replacement_char characters with a single one
-    cleaned_filename = re.sub(
+    cleaned_filename = re.sub(
+        re.escape(replacement_char) + r"+", replacement_char, cleaned_filename
+    )
 
     # 5. Trim leading/trailing replacement_char characters
     cleaned_filename = cleaned_filename.strip(replacement_char)
 
     # 6. Ensure the filename doesn't start with a period (hidden file on some systems)
-    if cleaned_filename.startswith(
+    if cleaned_filename.startswith("."):
         cleaned_filename = replacement_char + cleaned_filename[1:]
 
     # 7. Truncate to max_length

@@ -58,12 +65,12 @@ def sanitize_filename(filename, replacement_char="_", max_length=255):
     if len(cleaned_filename) > max_length:
         # Try to keep the file extension if present
         name, ext = "", ""
-        if
-            parts = cleaned_filename.rsplit(
+        if "." in cleaned_filename:
+            parts = cleaned_filename.rsplit(".", 1)
             name, ext = parts[0], "." + parts[1]
 
         if len(name) > max_length - len(ext):
-            cleaned_filename = name[:max_length - len(ext)] + ext
+            cleaned_filename = name[: max_length - len(ext)] + ext
         else:
             cleaned_filename = cleaned_filename[:max_length]
 
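
The reformatted sanitize_filename keeps its behavior: spaces and disallowed characters become the replacement character, accents are transliterated, and runs of the replacement character collapse. The expected outputs below are the ones asserted in tests/webcrawler/test_utils.py.

from deepengineer.webcrawler.utils import sanitize_filename

print(sanitize_filename("My Document!@#$%^&*.txt"))  # -> "My_Document_.txt"
print(sanitize_filename("Düsseldorf_Report_2023.pdf"))  # -> "Dusseldorf_Report_2023.pdf"
print(sanitize_filename("file_name_with_________many_underscores.txt"))  # -> "file_name_with_many_underscores.txt"
print(sanitize_filename(" "))  # -> "untitled_file"
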
tests/deepsearch/test_pdf_agent.py
CHANGED

@@ -1,9 +1,17 @@
-
-from mistralai import OCRResponse
+import pytest
 from deepengineer.common_path import DATA_DIR
+from deepengineer.deepsearch.analyse_markdown_agent import (
+    FindInMarkdownTool,
+    GetMarkdownTool,
+    GetPagesContentTool,
+    GetTableOfContentsTool,
+    create_agent,
+)
+from mistralai import OCRResponse
+
 
 def load_mock_ocr_response() -> OCRResponse:
-    with open(DATA_DIR / "report_thermal_neutron.json"
+    with open(DATA_DIR / "report_thermal_neutron.json") as f:
         return OCRResponse.model_validate_json(f.read())
 
 

@@ -13,13 +21,16 @@ def test_pdf_agent():
     assert pdf_agent is not None
     assert pdf_agent.name == "markdown_agent"
     assert pdf_agent.tools is not None
-    assert len(pdf_agent.tools) == 4 + 1
-
-
+    assert len(pdf_agent.tools) == 4 + 1  # +1 for the final answer
+
     GetTableOfContentsTool(ocr_response).forward()
     GetMarkdownTool(ocr_response).forward()
-    GetPagesContentTool(ocr_response).forward([1,2,3])
+    GetPagesContentTool(ocr_response).forward([1, 2, 3])
     FindInMarkdownTool(ocr_response).forward(["thermal neutron", "neutron"])
-    # pdf_agent.run("Give me a summary of the document.")
 
 
+@pytest.mark.skip(reason="This test is too expensive to run on CI")
+def test_run_pdf_agent():
+    ocr_response = load_mock_ocr_response()
+    pdf_agent = create_agent(ocr_response)
+    assert pdf_agent.run("Give me a summary of the document.") is not None
tests/deepsearch/test_web_agent.py
CHANGED

@@ -1,5 +1,17 @@
+import pytest
 from deepengineer.deepsearch.scawl_web_agent import create_web_search_agent
 
+
 def test_create_web_search_agent():
+    create_web_search_agent()
+
+
+@pytest.mark.skip(reason="This test is too expensive to run on CI")
+def test_run_web_search_agent():
     agent = create_web_search_agent()
-
+    assert (
+        agent.run(
+            "Est il possible de faire un réacteur thermique avec du graphite et du plomb?"
+        )
+        is not None
+    )
tests/webcrawler/test_async_crawl.py
CHANGED

@@ -1,12 +1,12 @@
 import pytest
+from deepengineer.common_path import DATA_DIR
 from deepengineer.webcrawler.async_crawl import (
     crawl4ai_extract_markdown_of_url_async,
     download_pdf_async,
     download_pdf_or_arxiv_pdf_async,
 )
-from
-
-from deepengineer.common_path import DATA_DIR
+from deepengineer.webcrawler.testing import ARXIV_URL, URL_PDF, URL_WIKIPEDIA
+
 
 @pytest.mark.asyncio
 async def test_crawl4ai_extract_markdown_of_url_async():

@@ -14,6 +14,7 @@ async def test_crawl4ai_extract_markdown_of_url_async():
     assert isinstance(markdown, str)
     assert "Graphite-moderated reactor" in markdown
 
+
 @pytest.mark.asyncio
 async def test_download_pdf_async():
     output_path = DATA_DIR / "temp.pdf"

@@ -22,6 +23,7 @@ async def test_download_pdf_async():
     assert pdf_path == output_path
     assert output_path.exists()
 
+
 @pytest.mark.asyncio
 async def test_arxiv_download_pdf_async():
     output_path = DATA_DIR / "temp.pdf"
tests/webcrawler/test_async_search.py
CHANGED

@@ -1,28 +1,25 @@
-import
+import numpy as np
 import pytest
 from deepengineer.webcrawler.async_search import (
-    tavily_search_async,
     SearchResponse,
+    arxiv_search_async,
+    get_linkup_balance,
     get_tavily_usage,
     linkup_search_async,
-
-    arxiv_search_async
+    tavily_search_async,
 )
-import numpy as np
 
 
 @pytest.mark.expensive
 @pytest.mark.asyncio
 async def test_tavily_search_async():
-
     usage_before = get_tavily_usage()
     print(usage_before)
-
 
     response = await tavily_search_async(
         search_query="Would it be possible to make a thermal reactor with graphite and lead?",
     )
+
     print(response.answer)
     assert response is not None
     assert isinstance(response, SearchResponse)

@@ -43,10 +40,10 @@ async def test_tavily_search_async():
     print(usage_after)
     assert usage_after == usage_before + 1
 
+
 @pytest.mark.expensive
 @pytest.mark.asyncio
 async def test_linkup_search_async():
-
     balance_before = get_linkup_balance()
     print(balance_before)
 

@@ -69,7 +66,8 @@ async def test_linkup_search_async():
     balance_after = get_linkup_balance()
     print(balance_after)
     assert np.isclose(balance_after, balance_before - 0.005)
-
+
+
 @pytest.mark.expensive
 @pytest.mark.asyncio
 async def test_arxiv_search_async():

@@ -78,22 +76,17 @@ async def test_arxiv_search_async():
     response = await arxiv_search_async(
         search_query="Would it be possible to make a thermal reactor with graphite and lead?",
     )
-
+
     assert response is not None
     assert isinstance(response, SearchResponse)
     assert response.query is not None
     assert response.answer is not None
     assert response.search_results is not None
     assert len(response.search_results) >= 10
-    assert any(
+    assert any(
+        result.url.startswith("https://arxiv.org/abs/")
+        for result in response.search_results
+    )
 
     balance_after = get_linkup_balance()
     assert np.isclose(balance_after, balance_before - 0.005)
-
-
-
-
-
-
-
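
These tests rely on the custom expensive marker (together with the skip decorators elsewhere) to keep paid API calls off CI. If the marker is not already registered in the project configuration, a conftest.py along these lines would silence pytest's unknown-marker warning and let a "-m 'not expensive'" run deselect them; this is a sketch, not something added by this commit.

# conftest.py (sketch)
def pytest_configure(config):
    # Register the custom marker so "-m 'not expensive'" can deselect paid-API tests.
    config.addinivalue_line(
        "markers", "expensive: tests that call paid external search APIs"
    )
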
tests/webcrawler/test_crawl_database.py
CHANGED

@@ -1,24 +1,44 @@
 from deepengineer.webcrawler.crawl_database import DataBase
 
+
 def test_crawl_database_arxiv_pdf():
     db = DataBase()
     db.crawl_url("https://arxiv.org/pdf/2105.00643")
     assert db.get_markdown_of_url("https://arxiv.org/pdf/2105.00643") is not None
     assert db.get_markdown_of_url("https://arxiv.org/abs/2105.00643") is not None
-    assert
+    assert (
+        db.get_markdown_of_url("https://arxiv.org/pdf/2105.00643").pages[0].markdown
+        is not None
+    )
     assert len(db.get_markdown_of_url("https://arxiv.org/pdf/2105.00643").pages) == 20
-
+
+
 def test_crawl_database_arxiv_link():
     db = DataBase()
     db.crawl_url("https://arxiv.org/abs/2105.00643")
     assert db.get_markdown_of_url("https://arxiv.org/abs/2105.00643") is not None
     assert db.get_markdown_of_url("https://arxiv.org/pdf/2105.00643") is not None
-    assert
+    assert (
+        db.get_markdown_of_url("https://arxiv.org/abs/2105.00643").pages[0].markdown
+        is not None
+    )
     assert len(db.get_markdown_of_url("https://arxiv.org/abs/2105.00643").pages) == 20
-
+
+
 def test_crawl_database_wikipedia_url():
     db = DataBase()
     db.crawl_url("https://en.wikipedia.org/wiki/Deep_learning")
-    assert
-
-
+    assert (
+        db.get_markdown_of_url("https://en.wikipedia.org/wiki/Deep_learning")
+        is not None
+    )
+    assert (
+        db.get_markdown_of_url("https://en.wikipedia.org/wiki/Deep_learning")
+        .pages[0]
+        .markdown
+        is not None
+    )
+    assert (
+        len(db.get_markdown_of_url("https://en.wikipedia.org/wiki/Deep_learning").pages)
+        >= 40
+    )
tests/webcrawler/test_pdfs_utils.py
CHANGED

@@ -1,10 +1,17 @@
-from deepengineer.webcrawler.pdf_utils import convert_pdf_to_markdown_async, convert_ocr_response_to_markdown, find_in_markdown, get_table_of_contents_per_page_markdown, get_markdown_by_page_numbers
-from mistralai import OCRResponse
-from deepengineer.common_path import DATA_DIR
 import pytest
+from deepengineer.common_path import DATA_DIR
+from deepengineer.webcrawler.pdf_utils import (
+    convert_ocr_response_to_markdown,
+    convert_pdf_to_markdown_async,
+    find_in_markdown,
+    get_markdown_by_page_numbers,
+    get_table_of_contents_per_page_markdown,
+)
+from mistralai import OCRResponse
+
 
 def load_mock_ocr_response() -> OCRResponse:
-    with open(DATA_DIR / "report_thermal_neutron.json"
+    with open(DATA_DIR / "report_thermal_neutron.json") as f:
         return OCRResponse.model_validate_json(f.read())
 
 

@@ -18,18 +25,20 @@ async def test_convert_pdf_to_markdown_async():
     assert isinstance(ocr_response, OCRResponse)
     assert len(ocr_response.pages) == 16
     assert "where each cylinder represent" in markdown
-
+
 
 def test_table_of_contents_per_page_pdf():
     ocr_response = load_mock_ocr_response()
     table_of_contents = get_table_of_contents_per_page_markdown(ocr_response)
     assert "References - Page 15" in table_of_contents
 
+
 def test_find_in_pdf():
     ocr_response = load_mock_ocr_response()
     page_numbers = find_in_markdown(ocr_response, "where each cylinder represent")
     assert page_numbers == [7]
 
+
 def test_get_markdown_by_page_numbers():
     ocr_response = load_mock_ocr_response()
     page_numbers = [7, 15]

@@ -37,4 +46,4 @@ def test_get_markdown_by_page_numbers():
     assert "Page 7" in markdown
     assert "Page 15" in markdown
     assert "References" in markdown
-    assert "where each cylinder represent" in markdown
+    assert "where each cylinder represent" in markdown
tests/webcrawler/test_utils.py
CHANGED

@@ -1,28 +1,37 @@
 from deepengineer.webcrawler.utils import sanitize_filename
 
-def test_sanitize_filename():
 
+def test_sanitize_filename():
     assert sanitize_filename("My Document!@#$%^&*.txt") == "My_Document_.txt"
-    assert
-
-
+    assert (
+        sanitize_filename(" Another file with spaces & special_chars ")
+        == "Another_file_with_spaces_special_chars"
+    )
+    assert (
+        sanitize_filename("Düsseldorf_Report_2023.pdf") == "Dusseldorf_Report_2023.pdf"
+    )
+    assert (
+        sanitize_filename("File with an é, ö, ü, ç, ñ.docx")
+        == "File_with_an_e_o_u_c_n.docx"
+    )
     assert sanitize_filename("Очень важное дело.xlsx") == "_xlsx"
-    assert
+    assert (
+        sanitize_filename(
+            "My.Super.Duper.File.Name.with.lots.of.dots.and.A@#!!%@#$%^&*.txt"
+        )
+        == "My.Super.Duper.File.Name.with.lots.of.dots.and.A_.txt"
+    )
     assert sanitize_filename("........hidden_file.txt") == "_.......hidden_file.txt"
-    assert
+    assert (
+        sanitize_filename(
+            "A very long file name that exceeds typical operating system limits and needs to be truncated gracefully.zip"
+        )
+        == "A_very_long_file_name_that_exceeds_typical_operating_system_limits_and_needs_to_be_truncated_gracefully.zip"
+    )
     assert sanitize_filename(" ") == "untitled_file"
     assert sanitize_filename("!") == "untitled_file"
     assert sanitize_filename(" .some_hidden_file.txt ") == "_some_hidden_file.txt"
-    assert
-
-
-
-
-
-
-
-
-
-
-
+    assert (
+        sanitize_filename("file_name_with_________many_underscores.txt")
+        == "file_name_with_many_underscores.txt"
+    )