Charles Azam committed on
Commit b5fafa1 · 1 Parent(s): 663d842

clean: run linting and formatting on repo

Makefile CHANGED
@@ -19,10 +19,7 @@ lint: ## Run ruff linter
 lint-fix: ## Run ruff linter and auto-fix issues
 	uv run ruff check --fix src tests
 
-type-check: ## Run mypy type checker
-	uv run mypy src
-
-check: format lint type-check ## Run all checks (format, lint, type-check)
+check: format lint ## Run all checks (format, lint)
 
 test: ## Run tests
 	uv run pytest tests
pyproject.toml CHANGED
@@ -34,13 +34,13 @@ build-backend = "hatchling.build"
 
 [tool.black]
 line-length = 88
-target-version = ['py313']
+target-version = ['py312']
 
 
 [tool.ruff]
-target-version = "py313"
+target-version = "py312"
 line-length = 88
-select = [
+lint.select = [
     "E", # pycodestyle errors
     "W", # pycodestyle warnings
     "F", # pyflakes
@@ -49,11 +49,9 @@ select = [
     "C4", # flake8-comprehensions
     "UP", # pyupgrade
 ]
-ignore = [
+lint.ignore = [
     "E501", # line too long, handled by black
     "B008", # do not perform function calls in argument defaults
     "C901", # too complex
 ]
 
-[tool.ruff.per-file-ignores]
-"__init__.py" = ["F401"]
src/deepengineer/common_path.py CHANGED
@@ -8,4 +8,4 @@ assert DEEPENGINEER_CODE_DIR.name == "deepengineer"
 assert DEEPENGINEER_SRC_DIR.name == "src"
 
 DATA_DIR = DEEPENGINEER_ROOT_DIR / "data"
-assert DATA_DIR.exists()
+assert DATA_DIR.exists()
src/deepengineer/deepsearch/analyse_markdown_agent.py CHANGED
@@ -2,86 +2,97 @@
 Simple agent to analyse a markdown, just to test some ideas.
 """
 
-from smolagents import CodeAgent, tool, Tool, LiteLLMModel
-from deepengineer.webcrawler.pdf_utils import get_markdown_by_page_numbers, get_table_of_contents_per_page_markdown, find_in_markdown, convert_ocr_response_to_markdown
-from mistralai import OCRResponse
 from enum import Enum
 
+from mistralai import OCRResponse
+from smolagents import CodeAgent, LiteLLMModel, Tool
+
+from deepengineer.webcrawler.pdf_utils import (
+    convert_ocr_response_to_markdown,
+    find_in_markdown,
+    get_markdown_by_page_numbers,
+    get_table_of_contents_per_page_markdown,
+)
+
+
 class ToolNames(Enum):
     GET_TABLE_OF_CONTENTS = "get_table_of_contents"
     GET_MARKDOWN = "get_markdown"
     GET_PAGES_CONTENT = "get_pages_content"
     FIND_IN_MARKDOWN = "find_in_markdown"
 
+
 class GetTableOfContentsTool(Tool):
     name = ToolNames.GET_TABLE_OF_CONTENTS.value
     description = "Returns all of the titles in the document along with the page number they are on."
     inputs = {}
     output_type = "string"
-
+
     def __init__(self, markdown: OCRResponse):
         super().__init__()
         self.markdown: OCRResponse = markdown
-        self.table_of_contents: str = get_table_of_contents_per_page_markdown(self.markdown)
-
+        self.table_of_contents: str = get_table_of_contents_per_page_markdown(
+            self.markdown
+        )
+
     def forward(self) -> str:
         return self.table_of_contents
-
+
+
 class GetMarkdownTool(Tool):
     name = ToolNames.GET_MARKDOWN.value
     description = f"Returns the markdown entire content of the document. Beware this might be too long to be useful, except for small documents, use {ToolNames.GET_PAGES_CONTENT.value} instead. You can use {ToolNames.GET_TABLE_OF_CONTENTS.value} to get the table of contents of the document including the number of pages."
     inputs = {}
     output_type = "string"
-
+
     def __init__(self, markdown: OCRResponse):
         super().__init__()
         self.markdown: OCRResponse = markdown
         self.markdown_content: str = convert_ocr_response_to_markdown(self.markdown)
-
+
     def forward(self) -> str:
         return self.markdown_content
-
-
+
+
 class GetPagesContentTool(Tool):
     name = ToolNames.GET_PAGES_CONTENT.value
     description = f"Returns the content of the pages. You can use {ToolNames.GET_TABLE_OF_CONTENTS.value} to get the table of contents of the document including the number of pages. Expects a list of page numbers as integers as input."
     inputs = {
         "page_numbers": {
             "type": "array",
-            "description": "The page numbers to get the content of."
+            "description": "The page numbers to get the content of.",
         },
     }
     output_type = "string"
-
+
     def __init__(self, markdown: OCRResponse):
         super().__init__()
         self.markdown: OCRResponse = markdown
 
     def forward(self, page_numbers: list[int]) -> str:
         return get_markdown_by_page_numbers(self.markdown, page_numbers)
-
+
+
 class FindInMarkdownTool(Tool):
     name = ToolNames.FIND_IN_MARKDOWN.value
     description = f"Finds the page numbers of the document that contain the search queries. If you are looking for a specific information, you can use this tool to find the page numbers of the document that contain the information and then use {ToolNames.GET_PAGES_CONTENT.value} to get the content of the pages."
     inputs = {
         "search_queries": {
             "type": "array",
-            "description": "The search queries to find in the document. List of strings."
+            "description": "The search queries to find in the document. List of strings.",
         }
     }
     output_type = "array"
-
+
     def __init__(self, markdown: OCRResponse):
         super().__init__()
         self.markdown: OCRResponse = markdown
-
+
     def forward(self, search_queries: list[str]) -> list[int]:
         return find_in_markdown(self.markdown, search_queries)
 
 
-
 def create_agent(markdown: OCRResponse, model_id="deepseek/deepseek-chat"):
-
     """This agent is just a test and will not be used as is by the main agent."""
 
     model = LiteLLMModel(model_id=model_id)
@@ -101,6 +112,8 @@ def create_agent(markdown: OCRResponse, model_id="deepseek/deepseek-chat"):
         name="markdown_agent",
         description="""A team member that can analyse a markdown.""",
     )
-    markdown_agent.prompt_templates["managed_agent"]["task"] += """You can navigate to .txt online files."""
+    markdown_agent.prompt_templates["managed_agent"][
+        "task"
+    ] += """You can navigate to .txt online files."""
 
-    return markdown_agent
+    return markdown_agent
src/deepengineer/deepsearch/draw_agent.py CHANGED
@@ -1,158 +0,0 @@
-from io import BytesIO
-from time import sleep
-
-import helium
-from dotenv import load_dotenv
-from PIL import Image
-from selenium import webdriver
-from selenium.webdriver.common.by import By
-from selenium.webdriver.common.keys import Keys
-
-from smolagents import CodeAgent, tool
-from smolagents.agents import ActionStep
-
-# Load environment variables
-load_dotenv()
-
-@tool
-def search_item_ctrl_f(text: str, nth_result: int = 1) -> str:
-    """
-    Searches for text on the current page via Ctrl + F and jumps to the nth occurrence.
-    Args:
-        text: The text to search for
-        nth_result: Which occurrence to jump to (default: 1)
-    """
-    elements = driver.find_elements(By.XPATH, f"//*[contains(text(), '{text}')]")
-    if nth_result > len(elements):
-        raise Exception(f"Match n°{nth_result} not found (only {len(elements)} matches found)")
-    result = f"Found {len(elements)} matches for '{text}'."
-    elem = elements[nth_result - 1]
-    driver.execute_script("arguments[0].scrollIntoView(true);", elem)
-    result += f"Focused on element {nth_result} of {len(elements)}"
-    return result
-
-@tool
-def go_back() -> None:
-    """Goes back to previous page."""
-    driver.back()
-
-@tool
-def close_popups() -> str:
-    """
-    Closes any visible modal or pop-up on the page. Use this to dismiss pop-up windows!
-    This does not work on cookie consent banners.
-    """
-    webdriver.ActionChains(driver).send_keys(Keys.ESCAPE).perform()
-
-
-# Configure Chrome options
-chrome_options = webdriver.ChromeOptions()
-chrome_options.add_argument("--force-device-scale-factor=1")
-chrome_options.add_argument("--window-size=1000,1350")
-chrome_options.add_argument("--disable-pdf-viewer")
-chrome_options.add_argument("--window-position=0,0")
-
-# Initialize the browser
-driver = helium.start_chrome(headless=False, options=chrome_options)
-
-# Set up screenshot callback
-def save_screenshot(memory_step: ActionStep, agent: CodeAgent) -> None:
-    sleep(1.0)  # Let JavaScript animations happen before taking the screenshot
-    driver = helium.get_driver()
-    current_step = memory_step.step_number
-    if driver is not None:
-        for previous_memory_step in agent.memory.steps:  # Remove previous screenshots for lean processing
-            if isinstance(previous_memory_step, ActionStep) and previous_memory_step.step_number <= current_step - 2:
-                previous_memory_step.observations_images = None
-        png_bytes = driver.get_screenshot_as_png()
-        image = Image.open(BytesIO(png_bytes))
-        print(f"Captured a browser screenshot: {image.size} pixels")
-        memory_step.observations_images = [image.copy()]  # Create a copy to ensure it persists
-
-    # Update observations with current URL
-    url_info = f"Current url: {driver.current_url}"
-    memory_step.observations = (
-        url_info if memory_step.observations is None else memory_step.observations + "\n" + url_info
-    )
-
-from smolagents import InferenceClientModel
-
-# Initialize the model
-model_id = "Qwen/Qwen2-VL-72B-Instruct"  # You can change this to your preferred VLM model
-model = InferenceClientModel(model_id=model_id)
-
-# Create the agent
-agent = CodeAgent(
-    tools=[go_back, close_popups, search_item_ctrl_f],
-    model=model,
-    additional_authorized_imports=["helium"],
-    step_callbacks=[save_screenshot],
-    max_steps=20,
-    verbosity_level=2,
-)
-
-# Import helium for the agent
-agent.python_executor("from helium import *", agent.state)
-
-helium_instructions = """
-You can use helium to access websites. Don't bother about the helium driver, it's already managed.
-We've already ran "from helium import *"
-Then you can go to pages!
-Code:
-```py
-go_to('github.com/trending')
-```<end_code>
-
-You can directly click clickable elements by inputting the text that appears on them.
-Code:
-```py
-click("Top products")
-```<end_code>
-
-If it's a link:
-Code:
-```py
-click(Link("Top products"))
-```<end_code>
-
-If you try to interact with an element and it's not found, you'll get a LookupError.
-In general stop your action after each button click to see what happens on your screenshot.
-Never try to login in a page.
-
-To scroll up or down, use scroll_down or scroll_up with as an argument the number of pixels to scroll from.
-Code:
-```py
-scroll_down(num_pixels=1200)  # This will scroll one viewport down
-```<end_code>
-
-When you have pop-ups with a cross icon to close, don't try to click the close icon by finding its element or targeting an 'X' element (this most often fails).
-Just use your built-in tool `close_popups` to close them:
-Code:
-```py
-close_popups()
-```<end_code>
-
-You can use .exists() to check for the existence of an element. For example:
-Code:
-```py
-if Text('Accept cookies?').exists():
-    click('I accept')
-```<end_code>
-"""
-
-search_request = """
-Please navigate to https://en.wikipedia.org/wiki/Chicago and give me a sentence containing the word "1992" that mentions a construction accident.
-"""
-
-agent_output = agent.run(search_request + helium_instructions)
-print("Final output:")
-print(agent_output)
-
-github_request = """
-I'm trying to find how hard I have to work to get a repo in github.com/trending.
-Can you navigate to the profile for the top author of the top trending repo, and give me their total number of commits over the last year?
-"""
-
-agent_output = agent.run(github_request + helium_instructions)
-print("Final output:")
-print(agent_output)
src/deepengineer/deepsearch/scawl_web_agent.py CHANGED
@@ -1,13 +1,23 @@
-from smolagents import CodeAgent, Tool, LiteLLMModel
+import asyncio
+from enum import Enum
+
+from smolagents import CodeAgent, LiteLLMModel, Tool
+
 from deepengineer.webcrawler.async_search import (
-    linkup_search_async, arxiv_search_async,
-    pubmed_search_async, scientific_search_async,
+    SearchResponse,
+    arxiv_search_async,
+    linkup_search_async,
+    pubmed_search_async,
+    scientific_search_async,
 )
-from deepengineer.webcrawler.pdf_utils import get_table_of_contents_per_page_markdown, convert_ocr_response_to_markdown, get_markdown_by_page_numbers, find_in_markdown
-from enum import Enum
-import asyncio
-from deepengineer.webcrawler.async_search import SearchResponse
 from deepengineer.webcrawler.crawl_database import DataBase
+from deepengineer.webcrawler.pdf_utils import (
+    convert_ocr_response_to_markdown,
+    find_in_markdown,
+    get_markdown_by_page_numbers,
+    get_table_of_contents_per_page_markdown,
+)
+
 
 class ToolNames(Enum):
     # Search tools
@@ -16,37 +26,43 @@ class ToolNames(Enum):
     PUBMED_SEARCH = "pubmed_search"
     SCIENCEDIRECT_SEARCH = "sciencedirect_search"
     SCIENTIFIC_SEARCH = "scientific_search"
-
+
     # Exploring link tools
     GET_TABLE_OF_CONTENTS = "get_table_of_contents_of_url"
     GET_MARKDOWN = "get_markdown_of_url"
     GET_PAGES_CONTENT = "get_pages_content"
     FIND_IN_MARKDOWN = "find_in_markdown"
 
-def filter_search_results(search_response: SearchResponse, max_nb_results: int = 5) -> SearchResponse:
+
+def filter_search_results(
+    search_response: SearchResponse, max_nb_results: int = 5
+) -> SearchResponse:
     search_response.search_results = search_response.search_results[:max_nb_results]
     return search_response.to_string()
 
 
 class SearchTool(Tool):
     name = ToolNames.SEARCH_TOOL.value
-    description = f"""Search the web using Linkup API. Good for deep research with sourced answers.
+    description = """Search the web using Linkup API. Good for deep research with sourced answers.
     Linkup also provides an answer. This answer is not always correct, so you might want to check the sources.
     """
     inputs = {
         "search_query": {
             "type": "string",
-            "description": "The search query to execute"
+            "description": "The search query to execute",
         },
     }
     output_type = "object"
-
+
     def forward(self, search_query: str) -> str:
-        result = asyncio.run(linkup_search_async(
-            search_query=search_query,
-        ))
+        result = asyncio.run(
+            linkup_search_async(
+                search_query=search_query,
+            )
+        )
        return filter_search_results(result)
 
+
 class ArxivSearchTool(Tool):
     name = ToolNames.ARXIV_SEARCH.value
     description = """Search arXiv for academic papers and preprints with Linkup API.
@@ -55,15 +71,16 @@ class ArxivSearchTool(Tool):
     inputs = {
         "search_query": {
             "type": "string",
-            "description": "The search query to execute on arXiv"
+            "description": "The search query to execute on arXiv",
         }
     }
     output_type = "object"
-
+
     def forward(self, search_query: str) -> str:
         result = asyncio.run(arxiv_search_async(search_query))
         return filter_search_results(result)
 
+
 class PubmedSearchTool(Tool):
     name = ToolNames.PUBMED_SEARCH.value
     description = """Search PubMed for medical and scientific literature with Linkup API.
@@ -72,15 +89,16 @@ class PubmedSearchTool(Tool):
     inputs = {
         "search_query": {
             "type": "string",
-            "description": "The search query to execute on PubMed"
+            "description": "The search query to execute on PubMed",
         }
     }
     output_type = "object"
-
+
     def forward(self, search_query: str) -> str:
         result = asyncio.run(pubmed_search_async(search_query))
         return filter_search_results(result)
 
+
 class ScientificSearchTool(Tool):
     name = ToolNames.SCIENTIFIC_SEARCH.value
     description = """Search across multiple scientific domains: Wikipedia, arXiv, PubMed, and ScienceDirect.
@@ -89,16 +107,19 @@ class ScientificSearchTool(Tool):
     inputs = {
         "search_query": {
             "type": "string",
-            "description": "The search query to execute across scientific domains"
+            "description": "The search query to execute across scientific domains",
         }
     }
     output_type = "object"
+
     def forward(self, search_query: str) -> dict:
         result = asyncio.run(scientific_search_async(search_query))
         return filter_search_results(result)
 
+
 URL_EXPLAINATION = """The URL can be be converted to a markdown. If the URL points to a PDF, the pdf is converted to markdown, otherwise the URL is crawled and the markdown is extracted. This markdown is split into pages that are numbered. You can use the page numbers to get the content of the pages."""
 
+
 class GetTableOfContentsTool(Tool):
     name = ToolNames.GET_TABLE_OF_CONTENTS.value
     description = f"""Returns all of the titles in the document along with the page number they are on.
@@ -107,55 +128,51 @@ class GetTableOfContentsTool(Tool):
     inputs = {
         "url": {
             "type": "string",
-            "description": "The URL to get the table of contents of."
+            "description": "The URL to get the table of contents of.",
         }
     }
     output_type = "string"
-
+
     def __init__(self, database: DataBase):
         super().__init__()
         self.database: DataBase = database
-
+
     def forward(self, url: str) -> str:
         markdown = self.database.get_markdown_of_url(url)
         table_of_contents: str = get_table_of_contents_per_page_markdown(markdown)
         return table_of_contents
 
+
 class GetMarkdownTool(Tool):
     name = ToolNames.GET_MARKDOWN.value
     description = f"Returns in markdown entire content of the url. Beware this might be too long to be useful, except for small documents, use {ToolNames.GET_PAGES_CONTENT.value} instead. You can also use {ToolNames.GET_TABLE_OF_CONTENTS.value} first to get the table of contents of the document including the number of pages."
     inputs = {
-        "url": {
-            "type": "string",
-            "description": "The URL to get the markdown of."
-        }
+        "url": {"type": "string", "description": "The URL to get the markdown of."}
     }
     output_type = "string"
-
+
     def __init__(self, database: DataBase):
         super().__init__()
         self.database: DataBase = database
-
+
     def forward(self, url: str) -> str:
         markdown = self.database.get_markdown_of_url(url)
         markdown_content: str = convert_ocr_response_to_markdown(markdown)
         return markdown_content
 
+
 class GetPagesContentTool(Tool):
     name = ToolNames.GET_PAGES_CONTENT.value
     description = f"Returns the content of the pages. You can use {ToolNames.GET_TABLE_OF_CONTENTS.value} to get the table of contents of the url including the number of pages. Expects a list of page numbers as integers as input. {URL_EXPLAINATION}"
     inputs = {
-        "url": {
-            "type": "string",
-            "description": "The URL to get the content of."
-        },
+        "url": {"type": "string", "description": "The URL to get the content of."},
         "page_numbers": {
             "type": "array",
-            "description": "The page numbers to get the content of."
+            "description": "The page numbers to get the content of.",
         },
     }
     output_type = "string"
-
+
     def __init__(self, database: DataBase):
         super().__init__()
         self.database: DataBase = database
@@ -164,32 +181,31 @@ class GetPagesContentTool(Tool):
         markdown = self.database.get_markdown_of_url(url)
         return get_markdown_by_page_numbers(markdown, page_numbers)
 
+
 class FindInMarkdownTool(Tool):
     name = ToolNames.FIND_IN_MARKDOWN.value
     description = f"Finds the page numbers of the url that contain the search queries. If you are looking for a specific information, you can use this tool to find the page numbers of the url that contain the information and then use {ToolNames.GET_PAGES_CONTENT.value} to get the content of the pages. {URL_EXPLAINATION}"
     inputs = {
-        "url": {
-            "type": "string",
-            "description": "The URL to find in."
-        },
+        "url": {"type": "string", "description": "The URL to find in."},
         "search_queries": {
             "type": "array",
-            "description": "The search queries to find in the url. List of strings."
-        }
+            "description": "The search queries to find in the url. List of strings.",
+        },
     }
     output_type = "array"
-
+
     def __init__(self, database: DataBase):
         super().__init__()
         self.database: DataBase = database
-
+
     def forward(self, url: str, search_queries: list[str]) -> list[int]:
         markdown = self.database.get_markdown_of_url(url)
         return find_in_markdown(markdown, search_queries)
 
+
 def create_web_search_agent(model_id="deepseek/deepseek-chat"):
     """Create a web search agent with search, crawling, and PDF analysis capabilities."""
-
+
     model = LiteLLMModel(model_id=model_id)
     database = DataBase()
 
@@ -204,7 +220,7 @@ def create_web_search_agent(model_id="deepseek/deepseek-chat"):
         GetPagesContentTool(database),
         FindInMarkdownTool(database),
     ]
-
+
     web_search_agent = CodeAgent(
         model=model,
         tools=WEB_SEARCH_TOOLS,
@@ -214,5 +230,5 @@
         name="web_search_agent",
         description="""A team member that can search the web, crawl URLs, download PDFs, and analyze documents.""",
     )
-
+
     return web_search_agent
src/deepengineer/webcrawler/async_crawl.py CHANGED
@@ -1,15 +1,17 @@
+from pathlib import Path
+
 import aiofiles
-import httpx
 import crawl4ai
-import os
-from pathlib import Path
+import httpx
+
 
 async def crawl4ai_extract_markdown_of_url_async(url: str) -> str:
     """Extract markdown content from a URL using crawl4ai."""
     async with crawl4ai.AsyncWebCrawler() as crawler:
         result = await crawler.arun(url=url)
         return result.markdown
-
+
+
 async def download_pdf_async(url: str, output_path: Path) -> str:
     """Download a PDF file from a URL."""
     timeout = httpx.Timeout(30.0, connect=10.0)
@@ -20,6 +22,7 @@ async def download_pdf_async(url: str, output_path: Path) -> str:
             await f.write(response.content)
     return output_path
 
+
 async def download_pdf_or_arxiv_pdf_async(url: str, output_path: Path) -> str:
     """Download a PDF from arXiv by converting the abstract URL to PDF URL. Works also for non arXiv URLs."""
     # Extract the arXiv ID from the URL
@@ -29,6 +32,5 @@ async def download_pdf_or_arxiv_pdf_async(url: str, output_path: Path) -> str:
     else:
         # If it's already a PDF URL, use it as is
         pdf_url = url
-
-    return await download_pdf_async(pdf_url, output_path)
 
+    return await download_pdf_async(pdf_url, output_path)
src/deepengineer/webcrawler/async_search.py CHANGED
@@ -1,61 +1,67 @@
 import os
-import asyncio
-import requests
-from pydantic import BaseModel, Field
-from typing import List, Optional, Literal
 from enum import Enum
+from typing import Literal
 
+import requests
 from linkup import LinkupClient, LinkupSourcedAnswer
+from pydantic import BaseModel, Field
 from tavily import AsyncTavilyClient
 
-from langchain_community.retrievers import ArxivRetriever
-from langchain_community.utilities.pubmed import PubMedAPIWrapper
 
 class SearchResult(BaseModel):
     """Represents a single search result from any search API."""
+
     title: str = Field(..., description="Title of the search result")
     url: str = Field(..., description="URL of the result")
     content: str = Field(..., description="Summary/snippet of content")
-    raw_content: Optional[str] = Field(None, description="Full page content if available")
+    raw_content: str | None = Field(None, description="Full page content if available")
+
 
 class SearchResponse(BaseModel):
     """Represents a search response from any search API."""
+
     query: str = Field(..., description="The original search query")
-    answer: str | None = Field(None, description="Direct answer from the search API if available")
-    search_results: list[SearchResult] = Field(default_factory=list, description="List of search results")
-
+    answer: str | None = Field(
+        None, description="Direct answer from the search API if available"
+    )
+    search_results: list[SearchResult] = Field(
+        default_factory=list, description="List of search results"
+    )
+
     def to_string(self):
         """Convert search response to a formatted string suitable for LLM consumption."""
         result_parts = []
-
+
         # Add the query
         result_parts.append(f"Search Query: {self.query}\n")
-
+
         # Add the direct answer if available
         if self.answer:
             result_parts.append(f"Direct Answer: {self.answer}\n")
-
+
         # Add search results
         if self.search_results:
             result_parts.append(f"Found {len(self.search_results)} search results:\n")
-
+
             for i, result in enumerate(self.search_results, 1):
                 result_parts.append(f"\n--- Result {i} ---")
                 result_parts.append(f"Title: {result.title}")
                 result_parts.append(f"URL: {result.url}")
-                result_parts.append(f"Content: {result.content[:2000]}...")
+                result_parts.append(f"Content: {result.content[:2000]}...")
                 result_parts.append("")  # Empty line for separation
         else:
             result_parts.append("No search results found.")
-
+
         return "\n".join(result_parts)
-
+
+
 class ScientificDomains(str, Enum):
     wikipedia = "wikipedia.org"
     arxiv = "arxiv.org"
     pubmed = "pubmed.ncbi.nlm.nih.gov"
     sciencedirect = "sciencedirect.com"
 
+
 def get_tavily_usage():
     url = "https://api.tavily.com/usage"
     headers = {"Authorization": f"Bearer {os.getenv('TAVILY_API_KEY')}"}
@@ -71,14 +77,14 @@ async def tavily_search_async(
     include_answer: Literal["basic", "advanced"] | None = "advanced",
     include_raw_content: Literal["text", "markdown"] | None = "markdown",
     include_images: bool = False,
-    search_depth: Literal['basic', 'advanced'] | None = "basic",
+    search_depth: Literal["basic", "advanced"] | None = "basic",
     include_domains: list[ScientificDomains] = None,
 ) -> SearchResponse:
     """
     Performs concurrent web searches with the Tavily API
     """
     tavily_async_client = AsyncTavilyClient()
-
+
     search_response = await tavily_async_client.search(
         query=search_query,
         search_depth=search_depth,
@@ -88,29 +94,29 @@ async def tavily_search_async(
         include_images=include_images,
         include_domains=include_domains,
     )
-
+
     search_results = [
         SearchResult(
-            title=result.get('title', ''),
-            url=result.get('url', ''),
-            content=result.get('content', ''),
-            raw_content=result.get('raw_content')
+            title=result.get("title", ""),
+            url=result.get("url", ""),
+            content=result.get("content", ""),
+            raw_content=result.get("raw_content"),
         )
-        for result in search_response.get('results', [])
+        for result in search_response.get("results", [])
     ]
 
     # Convert to our Pydantic models
     responses: SearchResponse = SearchResponse(
         query=search_query,
-        answer=search_response.get('answer', None),
-        search_results=search_results
+        answer=search_response.get("answer", None),
+        search_results=search_results,
     )
     return responses
 
 
 def get_linkup_balance():
     url = "https://api.linkup.so/v1/credits/balance"
-
+
     headers = {"Authorization": f"Bearer {os.getenv('LINKUP_API_KEY')}"}
 
     response = requests.request("GET", url, headers=headers)
@@ -122,14 +128,16 @@
 async def linkup_search_async(
     search_query: str,
     depth: Literal["standard", "deep"] = "standard",
-    output_type: Literal['searchResults', 'sourcedAnswer', 'structured'] = "sourcedAnswer",
+    output_type: Literal[
+        "searchResults", "sourcedAnswer", "structured"
+    ] = "sourcedAnswer",
     include_images: bool = False,
     include_domains: list[ScientificDomains] = None,
 ) -> SearchResponse:
     """
     Performs concurrent web searches using the Linkup API.
     """
-
+
     client = LinkupClient()
     search_response: LinkupSourcedAnswer = await client.async_search(
         query=search_query,
@@ -138,7 +146,7 @@
         include_images=include_images,
         include_domains=include_domains,
     )
-
+
     search_results = [
         SearchResult(
             title=result.name,
@@ -151,37 +159,48 @@
 
     # Convert to our Pydantic models
     responses: SearchResponse = SearchResponse(
-        query=search_query,
-        answer=search_response.answer,
-        search_results=search_results
+        query=search_query, answer=search_response.answer, search_results=search_results
     )
     return responses
 
 
-
-
-
 async def arxiv_search_async(
     search_query: str,
 ) -> SearchResponse:
-    response = await linkup_search_async(search_query, include_domains=[ScientificDomains.arxiv])
+    response = await linkup_search_async(
+        search_query, include_domains=[ScientificDomains.arxiv]
+    )
     return response
 
 
 async def pubmed_search_async(
     search_query: str,
 ) -> SearchResponse:
-    response = await linkup_search_async(search_query, include_domains=[ScientificDomains.pubmed])
+    response = await linkup_search_async(
+        search_query, include_domains=[ScientificDomains.pubmed]
+    )
    return response
 
+
 async def sciencedirect_search_async(
     search_query: str,
 ) -> SearchResponse:
-    response = await linkup_search_async(search_query, include_domains=[ScientificDomains.sciencedirect])
+    response = await linkup_search_async(
+        search_query, include_domains=[ScientificDomains.sciencedirect]
+    )
     return response
 
+
 async def scientific_search_async(
     search_query: str,
 ) -> SearchResponse:
-    response = await linkup_search_async(search_query, include_domains=[ScientificDomains.wikipedia, ScientificDomains.arxiv, ScientificDomains.pubmed, ScientificDomains.sciencedirect])
-    return response
+    response = await linkup_search_async(
+        search_query,
+        include_domains=[
+            ScientificDomains.wikipedia,
+            ScientificDomains.arxiv,
+            ScientificDomains.pubmed,
+            ScientificDomains.sciencedirect,
+        ],
+    )
+    return response
src/deepengineer/webcrawler/crawl_database.py CHANGED
@@ -1,16 +1,23 @@
-from deepengineer.webcrawler.utils import sanitize_filename
-from deepengineer.common_path import DATA_DIR
-from deepengineer.webcrawler.async_search import SearchResult, SearchResponse
 import asyncio
+
 from mistralai import OCRResponse
-from deepengineer.webcrawler.async_crawl import download_pdf_or_arxiv_pdf_async, crawl4ai_extract_markdown_of_url_async
-from deepengineer.webcrawler.pdf_utils import convert_raw_markdown_to_ocr_response
-from deepengineer.webcrawler.pdf_utils import convert_pdf_to_markdown_async
-
-class DataBase():
+
+from deepengineer.common_path import DATA_DIR
+from deepengineer.webcrawler.async_crawl import (
+    crawl4ai_extract_markdown_of_url_async,
+    download_pdf_or_arxiv_pdf_async,
+)
+from deepengineer.webcrawler.pdf_utils import (
+    convert_pdf_to_markdown_async,
+    convert_raw_markdown_to_ocr_response,
+)
+from deepengineer.webcrawler.utils import sanitize_filename
+
+
+class DataBase:
     def __init__(self):
         self.urls_to_markdown: dict[str, OCRResponse] = {}
-
+
     @staticmethod
     def preprocess_url(url: str) -> str:
         """Preprocess the url to make it a valid url."""
@@ -18,25 +25,25 @@ class DataBase():
             return url.replace("arxiv.org/abs/", "arxiv.org/pdf/")
         else:
             return url
-
+
     def crawl_url(self, url: str) -> str:
         """Crawl the url, if the url is a pdf, download the pdf and save and return the markdown."""
         url = self.preprocess_url(url)
         if "pdf" in url:
             output_path = (DATA_DIR / sanitize_filename(url)).with_suffix(".pdf")
-            pdf_path = asyncio.run(download_pdf_or_arxiv_pdf_async(url, output_path=output_path))
+            pdf_path = asyncio.run(
+                download_pdf_or_arxiv_pdf_async(url, output_path=output_path)
+            )
             ocr_response = asyncio.run(convert_pdf_to_markdown_async(pdf_path))
         else:
             markdown = asyncio.run(crawl4ai_extract_markdown_of_url_async(url))
             ocr_response = convert_raw_markdown_to_ocr_response(markdown)
         self.urls_to_markdown[url] = ocr_response
         return ocr_response
-
-
+
     def get_markdown_of_url(self, url: str) -> OCRResponse:
         url = self.preprocess_url(url)
         if url in self.urls_to_markdown:
             return self.urls_to_markdown[url]
         else:
             return self.crawl_url(url)
-
src/deepengineer/webcrawler/pdf_utils.py CHANGED
@@ -1,16 +1,10 @@
1
  import os
2
  from pathlib import Path
3
- from pypdf import PdfReader, PdfWriter
4
- import io
5
- from pathlib import Path
6
- from mistralai import Mistral
7
- import os
8
- from litellm import completion
9
 
10
- from mistralai.models import OCRResponse, OCRPageObject, OCRUsageInfo
11
- import yaml
12
- from tenacity import retry, stop_after_attempt, wait_fixed, RetryError
13
  from litellm.exceptions import BadRequestError
 
 
14
 
15
  # Define the size limit in bytes
16
  MAX_SIZE_BYTES = 49 * 1024 * 1024
@@ -20,7 +14,6 @@ async def convert_pdf_to_markdown_async(
20
  pdf_path: Path,
21
  with_image_description: bool = False,
22
  ) -> OCRResponse:
23
-
24
  mistral_client = Mistral(api_key=os.getenv("MISTRAL_API_KEY"))
25
 
26
  uploaded_pdf = await mistral_client.files.upload_async(
@@ -31,7 +24,9 @@ async def convert_pdf_to_markdown_async(
31
  purpose="ocr",
32
  )
33
 
34
- signed_url = await mistral_client.files.get_signed_url_async(file_id=uploaded_pdf.id)
 
 
35
 
36
  ocr_response = await mistral_client.ocr.process_async(
37
  model="mistral-ocr-latest",
@@ -42,27 +37,33 @@ async def convert_pdf_to_markdown_async(
42
  return ocr_response
43
 
44
 
45
- def convert_ocr_response_to_markdown(
46
- ocr_response: OCRResponse
47
- ) -> str:
48
  markdowns: list[str] = []
49
  for page in ocr_response.pages:
50
  page_description = page.markdown
51
  markdowns.append(page_description)
52
-
53
  return "\n\n".join(markdowns)
54
 
55
- def get_markdown_by_page_numbers(markdown: OCRResponse, page_numbers: list[int], get_full_content: bool = False) -> str:
 
 
 
56
  markdowns: list[str] = []
57
  page_numbers_to_get = set(page_numbers)
58
  if get_full_content:
59
  page_numbers_to_get = set(range(len(markdown.pages)))
60
 
61
  for page_number in page_numbers_to_get:
62
- markdowns.append(f"*Page {page_number}*\n{markdown.pages[page_number].markdown}")
 
 
63
  return "\n\n".join(markdowns)
64
 
65
- def find_in_markdown(markdown: OCRResponse, search_queries: list[str] | str) -> list[int]:
 
 
 
66
  """
67
  Find the page numbers of the pdf that contain the search query.
68
 
@@ -82,12 +83,13 @@ def find_in_markdown(markdown: OCRResponse, search_queries: list[str] | str) ->
82
  page_numbers.append(page_number)
83
  return page_numbers
84
 
 
85
  def get_table_of_contents_per_page_markdown(markdown: OCRResponse) -> str:
86
  """
87
  Get the table of contents of the pdf.
88
-
89
  Finds all the titles of the pdf to reconstruct the table of contents.
90
-
91
  Args:
92
  markdown (OCRResponse): The markdown of the pdf.
93
 
@@ -102,15 +104,26 @@ def get_table_of_contents_per_page_markdown(markdown: OCRResponse) -> str:
         if line.startswith("#"):
             title_to_page_number[line] = page_number
 
-    table_of_contents = "\n".join([f"{title} - Page {page_number}" for title, page_number in title_to_page_number.items()])
+    table_of_contents = "\n".join(
+        [
+            f"{title} - Page {page_number}"
+            for title, page_number in title_to_page_number.items()
+        ]
+    )
     return table_of_contents
 
 
 def convert_raw_markdown_to_ocr_response(raw_markdown: str) -> OCRResponse:
     pages = raw_markdown.split("# ")
     usage_info_empty = OCRUsageInfo(pages_processed=0)
-    return OCRResponse(pages=[OCRPageObject(index=i, markdown="# " + page, images=[], dimensions=None) for i, page in enumerate(pages)], usage_info=usage_info_empty, model="",)
-
-
+    return OCRResponse(
+        pages=[
+            OCRPageObject(index=i, markdown="# " + page, images=[], dimensions=None)
+            for i, page in enumerate(pages)
+        ],
+        usage_info=usage_info_empty,
+        model="",
+    )
 
 
 def get_images_from_pdf(pdf_path: Path, image_ids: list[str]) -> list[str]:
@@ -141,4 +154,3 @@ def get_images_from_pdf(pdf_path: Path, image_ids: list[str]) -> list[str]:
     except BadRequestError:
         output = ""
     return output
-

 import os
 from pathlib import Path
 
+from litellm import completion
 from litellm.exceptions import BadRequestError
+from mistralai import Mistral
+from mistralai.models import OCRPageObject, OCRResponse, OCRUsageInfo
 
 # Define the size limit in bytes
 MAX_SIZE_BYTES = 49 * 1024 * 1024

     pdf_path: Path,
     with_image_description: bool = False,
 ) -> OCRResponse:
     mistral_client = Mistral(api_key=os.getenv("MISTRAL_API_KEY"))
 
     uploaded_pdf = await mistral_client.files.upload_async(
         purpose="ocr",
     )
 
+    signed_url = await mistral_client.files.get_signed_url_async(
+        file_id=uploaded_pdf.id
+    )
 
     ocr_response = await mistral_client.ocr.process_async(
         model="mistral-ocr-latest",

     return ocr_response
 
 
+def convert_ocr_response_to_markdown(ocr_response: OCRResponse) -> str:
     markdowns: list[str] = []
     for page in ocr_response.pages:
         page_description = page.markdown
         markdowns.append(page_description)
+
     return "\n\n".join(markdowns)
 
+
+def get_markdown_by_page_numbers(
+    markdown: OCRResponse, page_numbers: list[int], get_full_content: bool = False
+) -> str:
     markdowns: list[str] = []
     page_numbers_to_get = set(page_numbers)
     if get_full_content:
         page_numbers_to_get = set(range(len(markdown.pages)))
 
     for page_number in page_numbers_to_get:
+        markdowns.append(
+            f"*Page {page_number}*\n{markdown.pages[page_number].markdown}"
+        )
     return "\n\n".join(markdowns)
 
+
+def find_in_markdown(
+    markdown: OCRResponse, search_queries: list[str] | str
+) -> list[int]:
     """
     Find the page numbers of the pdf that contain the search query.
 
         page_numbers.append(page_number)
     return page_numbers
 
+
 def get_table_of_contents_per_page_markdown(markdown: OCRResponse) -> str:
     """
     Get the table of contents of the pdf.
+
     Finds all the titles of the pdf to reconstruct the table of contents.
+
     Args:
         markdown (OCRResponse): The markdown of the pdf.
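
Taken together, the helpers above form a small pipeline: OCR a PDF once, then inspect the result by table of contents, full-text search, or page range. A minimal usage sketch, assuming a valid MISTRAL_API_KEY in the environment and a placeholder file name (report.pdf is hypothetical, not part of this commit):

import asyncio
from pathlib import Path

from deepengineer.webcrawler.pdf_utils import (
    convert_pdf_to_markdown_async,
    find_in_markdown,
    get_markdown_by_page_numbers,
    get_table_of_contents_per_page_markdown,
)


async def main() -> None:
    # OCR the PDF once; all later lookups reuse the same OCRResponse.
    ocr_response = await convert_pdf_to_markdown_async(Path("report.pdf"))

    # Page-indexed table of contents, rebuilt from markdown "#" headings.
    print(get_table_of_contents_per_page_markdown(ocr_response))

    # Locate a phrase, then pull only the matching pages.
    pages = find_in_markdown(ocr_response, "thermal neutron")
    print(get_markdown_by_page_numbers(ocr_response, pages))


asyncio.run(main())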
 
src/deepengineer/webcrawler/testing.py CHANGED
@@ -6,15 +6,19 @@ TAVILY_RESPONSE_FILE = DATA_DIR / "answers" / "tavily_response.json"
 
 
 def load_linkup_response() -> SearchResponse:
-    with open(LINKUP_RESPONSE_FILE, "r") as f:
+    with open(LINKUP_RESPONSE_FILE) as f:
        return SearchResponse.model_validate_json(f.read())
 
+
 def load_tavily_response() -> SearchResponse:
-    with open(TAVILY_RESPONSE_FILE, "r") as f:
+    with open(TAVILY_RESPONSE_FILE) as f:
        return SearchResponse.model_validate_json(f.read())
 
+
 URL_WIKIPEDIA = "https://en.wikipedia.org/wiki/Graphite-moderated_reactor"
 URL_PDF = "https://arxiv.org/pdf/1301.1699.pdf"
 ARXIV_URL = "https://arxiv.org/abs/1301.1699"
 PUBMED_URL = "https://pubmed.ncbi.nlm.nih.gov/34100000/"
-SCIENCEDIRECT_URL = "https://www.sciencedirect.com/science/article/abs/pii/0168900289901964"
+SCIENCEDIRECT_URL = (
+    "https://www.sciencedirect.com/science/article/abs/pii/0168900289901964"
+)
src/deepengineer/webcrawler/utils.py CHANGED
@@ -1,6 +1,7 @@
 import re
 import unicodedata
 
+
 def sanitize_filename(filename, replacement_char="_", max_length=255):
     """
     Sanitizes a string to be suitable for use as a filename.
@@ -31,26 +32,32 @@ def sanitize_filename(filename, replacement_char="_", max_length=255):
     # 1. Replace spaces with the replacement_char
     # This is done early to ensure spaces are handled before other replacements
     # to avoid issues with double replacement characters in subsequent steps.
-    cleaned_filename = filename.replace(' ', replacement_char)
+    cleaned_filename = filename.replace(" ", replacement_char)
 
     # 2. Convert to NFKD and encode to ASCII to handle accented characters
     # This transforms 'crème brûlée' into 'creme brulee'
-    cleaned_filename = unicodedata.normalize('NFKD', cleaned_filename).encode('ascii', 'ignore').decode('utf-8')
+    cleaned_filename = (
+        unicodedata.normalize("NFKD", cleaned_filename)
+        .encode("ascii", "ignore")
+        .decode("utf-8")
+    )
 
     # 3. Remove characters that are not alphanumeric, hyphen, underscore, or period.
     # Replace them with the specified replacement_char.
     # The regex pattern `[^a-zA-Z0-9\-_.]` matches any character that is NOT
     # (a-z, A-Z, 0-9, hyphen, underscore, or period).
-    cleaned_filename = re.sub(r'[^a-zA-Z0-9\-_.]', replacement_char, cleaned_filename)
+    cleaned_filename = re.sub(r"[^a-zA-Z0-9\-_.]", replacement_char, cleaned_filename)
 
     # 4. Replace multiple consecutive replacement_char characters with a single one
-    cleaned_filename = re.sub(re.escape(replacement_char) + r'+', replacement_char, cleaned_filename)
+    cleaned_filename = re.sub(
+        re.escape(replacement_char) + r"+", replacement_char, cleaned_filename
+    )
 
     # 5. Trim leading/trailing replacement_char characters
     cleaned_filename = cleaned_filename.strip(replacement_char)
 
     # 6. Ensure the filename doesn't start with a period (hidden file on some systems)
-    if cleaned_filename.startswith('.'):
+    if cleaned_filename.startswith("."):
         cleaned_filename = replacement_char + cleaned_filename[1:]
 
     # 7. Truncate to max_length
@@ -58,12 +65,12 @@ def sanitize_filename(filename, replacement_char="_", max_length=255):
     if len(cleaned_filename) > max_length:
         # Try to keep the file extension if present
         name, ext = "", ""
-        if '.' in cleaned_filename:
-            parts = cleaned_filename.rsplit('.', 1)
+        if "." in cleaned_filename:
+            parts = cleaned_filename.rsplit(".", 1)
             name, ext = parts[0], "." + parts[1]
 
         if len(name) > max_length - len(ext):
-            cleaned_filename = name[:max_length - len(ext)] + ext
+            cleaned_filename = name[: max_length - len(ext)] + ext
         else:
             cleaned_filename = cleaned_filename[:max_length]
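
The reformatting of sanitize_filename is behavior-neutral: only quoting and line-wrapping change. The documented steps can be checked against the example in the function's own comments, along the lines of this sketch:

from deepengineer.webcrawler.utils import sanitize_filename

# Spaces become "_", accents are ASCII-folded via NFKD, and runs of the
# replacement character collapse to a single one.
assert sanitize_filename("crème brûlée recipe.txt") == "creme_brulee_recipe.txt"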
 
tests/deepsearch/test_pdf_agent.py CHANGED
@@ -1,9 +1,17 @@
-from deepengineer.deepsearch.analyse_markdown_agent import create_agent, GetTableOfContentsTool, GetMarkdownTool, GetPagesContentTool, FindInMarkdownTool
-from mistralai import OCRResponse
+import pytest
 from deepengineer.common_path import DATA_DIR
+from deepengineer.deepsearch.analyse_markdown_agent import (
+    FindInMarkdownTool,
+    GetMarkdownTool,
+    GetPagesContentTool,
+    GetTableOfContentsTool,
+    create_agent,
+)
+from mistralai import OCRResponse
+
 
 def load_mock_ocr_response() -> OCRResponse:
-    with open(DATA_DIR / "report_thermal_neutron.json", "r") as f:
+    with open(DATA_DIR / "report_thermal_neutron.json") as f:
        return OCRResponse.model_validate_json(f.read())
 
 
@@ -13,13 +21,16 @@ def test_pdf_agent():
     assert pdf_agent is not None
     assert pdf_agent.name == "markdown_agent"
     assert pdf_agent.tools is not None
-    assert len(pdf_agent.tools) == 4 + 1 # +1 for the final answer
-
-
+    assert len(pdf_agent.tools) == 4 + 1  # +1 for the final answer
+
     GetTableOfContentsTool(ocr_response).forward()
     GetMarkdownTool(ocr_response).forward()
-    GetPagesContentTool(ocr_response).forward([1,2,3])
+    GetPagesContentTool(ocr_response).forward([1, 2, 3])
     FindInMarkdownTool(ocr_response).forward(["thermal neutron", "neutron"])
-    # pdf_agent.run("Give me a summary of the document.")
 
 
+@pytest.mark.skip(reason="This test is too expensive to run on CI")
+def test_run_pdf_agent():
+    ocr_response = load_mock_ocr_response()
+    pdf_agent = create_agent(ocr_response)
+    assert pdf_agent.run("Give me a summary of the document.") is not None
tests/deepsearch/test_web_agent.py CHANGED
@@ -1,5 +1,17 @@
+import pytest
 from deepengineer.deepsearch.scawl_web_agent import create_web_search_agent
 
+
 def test_create_web_search_agent():
+    create_web_search_agent()
+
+
+@pytest.mark.skip(reason="This test is too expensive to run on CI")
+def test_run_web_search_agent():
     agent = create_web_search_agent()
-    agent.run("Est il possible de faire un réacteur thermique avec du graphite et du plomb?")
+    assert (
+        agent.run(
+            "Est il possible de faire un réacteur thermique avec du graphite et du plomb?"
+        )
+        is not None
+    )
tests/webcrawler/test_async_crawl.py CHANGED
@@ -1,12 +1,12 @@
 import pytest
+from deepengineer.common_path import DATA_DIR
 from deepengineer.webcrawler.async_crawl import (
     crawl4ai_extract_markdown_of_url_async,
     download_pdf_async,
     download_pdf_or_arxiv_pdf_async,
 )
-from mistralai import OCRResponse
-from deepengineer.webcrawler.testing import URL_WIKIPEDIA, URL_PDF, ARXIV_URL
-from deepengineer.common_path import DATA_DIR
+from deepengineer.webcrawler.testing import ARXIV_URL, URL_PDF, URL_WIKIPEDIA
+
 
 @pytest.mark.asyncio
 async def test_crawl4ai_extract_markdown_of_url_async():
@@ -14,6 +14,7 @@ async def test_crawl4ai_extract_markdown_of_url_async():
     assert isinstance(markdown, str)
     assert "Graphite-moderated reactor" in markdown
 
+
 @pytest.mark.asyncio
 async def test_download_pdf_async():
     output_path = DATA_DIR / "temp.pdf"
@@ -22,6 +23,7 @@ async def test_download_pdf_async():
     assert pdf_path == output_path
     assert output_path.exists()
 
+
 @pytest.mark.asyncio
 async def test_arxiv_download_pdf_async():
     output_path = DATA_DIR / "temp.pdf"
tests/webcrawler/test_async_search.py CHANGED
@@ -1,28 +1,25 @@
-import asyncio
+import numpy as np
 import pytest
 from deepengineer.webcrawler.async_search import (
-    tavily_search_async,
     SearchResponse,
+    arxiv_search_async,
+    get_linkup_balance,
     get_tavily_usage,
     linkup_search_async,
-    get_linkup_balance,
-    arxiv_search_async
+    tavily_search_async,
 )
-import numpy as np
 
 
 @pytest.mark.expensive
 @pytest.mark.asyncio
 async def test_tavily_search_async():
-
     usage_before = get_tavily_usage()
     print(usage_before)
-
 
     response = await tavily_search_async(
         search_query="Would it be possible to make a thermal reactor with graphite and lead?",
     )
 
     print(response.answer)
     assert response is not None
     assert isinstance(response, SearchResponse)
@@ -43,10 +40,10 @@ async def test_tavily_search_async():
     print(usage_after)
     assert usage_after == usage_before + 1
 
+
 @pytest.mark.expensive
 @pytest.mark.asyncio
 async def test_linkup_search_async():
-
     balance_before = get_linkup_balance()
     print(balance_before)
 
@@ -69,7 +66,8 @@ async def test_linkup_search_async():
     balance_after = get_linkup_balance()
     print(balance_after)
     assert np.isclose(balance_after, balance_before - 0.005)
-
+
+
 @pytest.mark.expensive
 @pytest.mark.asyncio
 async def test_arxiv_search_async():
@@ -78,22 +76,17 @@ async def test_arxiv_search_async():
     response = await arxiv_search_async(
         search_query="Would it be possible to make a thermal reactor with graphite and lead?",
    )
 
     assert response is not None
     assert isinstance(response, SearchResponse)
     assert response.query is not None
     assert response.answer is not None
     assert response.search_results is not None
     assert len(response.search_results) >= 10
-    assert any(result.url.startswith("https://arxiv.org/abs/") for result in response.search_results)
+    assert any(
+        result.url.startswith("https://arxiv.org/abs/")
+        for result in response.search_results
+    )
 
     balance_after = get_linkup_balance()
     assert np.isclose(balance_after, balance_before - 0.005)
-
-
-
-
-
-
-
-
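
These tests lean on a custom `expensive` mark, and pytest warns about marks it does not know, so the mark presumably has to be registered in the project's pytest configuration. One possible registration sketch (a hypothetical conftest.py, not part of this commit):

# conftest.py (hypothetical): register the custom mark so pytest does not
# emit PytestUnknownMarkWarning, and so `-m "not expensive"` deselects these.
def pytest_configure(config):
    config.addinivalue_line(
        "markers", "expensive: hits paid external search APIs (Tavily, Linkup)."
    )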
tests/webcrawler/test_crawl_database.py CHANGED
@@ -1,24 +1,44 @@
 from deepengineer.webcrawler.crawl_database import DataBase
 
+
 def test_crawl_database_arxiv_pdf():
     db = DataBase()
     db.crawl_url("https://arxiv.org/pdf/2105.00643")
     assert db.get_markdown_of_url("https://arxiv.org/pdf/2105.00643") is not None
     assert db.get_markdown_of_url("https://arxiv.org/abs/2105.00643") is not None
-    assert db.get_markdown_of_url("https://arxiv.org/pdf/2105.00643").pages[0].markdown is not None
+    assert (
+        db.get_markdown_of_url("https://arxiv.org/pdf/2105.00643").pages[0].markdown
+        is not None
+    )
     assert len(db.get_markdown_of_url("https://arxiv.org/pdf/2105.00643").pages) == 20
 
+
 def test_crawl_database_arxiv_link():
     db = DataBase()
     db.crawl_url("https://arxiv.org/abs/2105.00643")
     assert db.get_markdown_of_url("https://arxiv.org/abs/2105.00643") is not None
     assert db.get_markdown_of_url("https://arxiv.org/pdf/2105.00643") is not None
-    assert db.get_markdown_of_url("https://arxiv.org/abs/2105.00643").pages[0].markdown is not None
+    assert (
+        db.get_markdown_of_url("https://arxiv.org/abs/2105.00643").pages[0].markdown
+        is not None
+    )
     assert len(db.get_markdown_of_url("https://arxiv.org/abs/2105.00643").pages) == 20
 
+
 def test_crawl_database_wikipedia_url():
     db = DataBase()
     db.crawl_url("https://en.wikipedia.org/wiki/Deep_learning")
-    assert db.get_markdown_of_url("https://en.wikipedia.org/wiki/Deep_learning") is not None
-    assert db.get_markdown_of_url("https://en.wikipedia.org/wiki/Deep_learning").pages[0].markdown is not None
-    assert len(db.get_markdown_of_url("https://en.wikipedia.org/wiki/Deep_learning").pages) >= 40
+    assert (
+        db.get_markdown_of_url("https://en.wikipedia.org/wiki/Deep_learning")
+        is not None
+    )
+    assert (
+        db.get_markdown_of_url("https://en.wikipedia.org/wiki/Deep_learning")
+        .pages[0]
+        .markdown
+        is not None
+    )
+    assert (
+        len(db.get_markdown_of_url("https://en.wikipedia.org/wiki/Deep_learning").pages)
+        >= 40
+    )
tests/webcrawler/test_pdfs_utils.py CHANGED
@@ -1,10 +1,17 @@
-from deepengineer.webcrawler.pdf_utils import convert_pdf_to_markdown_async, convert_ocr_response_to_markdown, find_in_markdown, get_table_of_contents_per_page_markdown, get_markdown_by_page_numbers
-from mistralai import OCRResponse
-from deepengineer.common_path import DATA_DIR
 import pytest
+from deepengineer.common_path import DATA_DIR
+from deepengineer.webcrawler.pdf_utils import (
+    convert_ocr_response_to_markdown,
+    convert_pdf_to_markdown_async,
+    find_in_markdown,
+    get_markdown_by_page_numbers,
+    get_table_of_contents_per_page_markdown,
+)
+from mistralai import OCRResponse
+
 
 def load_mock_ocr_response() -> OCRResponse:
-    with open(DATA_DIR / "report_thermal_neutron.json", "r") as f:
+    with open(DATA_DIR / "report_thermal_neutron.json") as f:
        return OCRResponse.model_validate_json(f.read())
 
 
@@ -18,18 +25,20 @@ async def test_convert_pdf_to_markdown_async():
     assert isinstance(ocr_response, OCRResponse)
     assert len(ocr_response.pages) == 16
     assert "where each cylinder represent" in markdown
 
 
 def test_table_of_contents_per_page_pdf():
     ocr_response = load_mock_ocr_response()
     table_of_contents = get_table_of_contents_per_page_markdown(ocr_response)
     assert "References - Page 15" in table_of_contents
 
+
 def test_find_in_pdf():
     ocr_response = load_mock_ocr_response()
     page_numbers = find_in_markdown(ocr_response, "where each cylinder represent")
     assert page_numbers == [7]
 
+
 def test_get_markdown_by_page_numbers():
     ocr_response = load_mock_ocr_response()
     page_numbers = [7, 15]
@@ -37,4 +46,4 @@ def test_get_markdown_by_page_numbers():
     assert "Page 7" in markdown
     assert "Page 15" in markdown
     assert "References" in markdown
-    assert "where each cylinder represent" in markdown
+    assert "where each cylinder represent" in markdown
tests/webcrawler/test_utils.py CHANGED
@@ -1,28 +1,37 @@
 from deepengineer.webcrawler.utils import sanitize_filename
 
+
 def test_sanitize_filename():
-
     assert sanitize_filename("My Document!@#$%^&*.txt") == "My_Document_.txt"
-    assert sanitize_filename(" Another file with spaces & special_chars ") == "Another_file_with_spaces_special_chars"
-    assert sanitize_filename("Düsseldorf_Report_2023.pdf") == "Dusseldorf_Report_2023.pdf"
-    assert sanitize_filename("File with an é, ö, ü, ç, ñ.docx") == "File_with_an_e_o_u_c_n.docx"
+    assert (
+        sanitize_filename(" Another file with spaces & special_chars ")
+        == "Another_file_with_spaces_special_chars"
+    )
+    assert (
+        sanitize_filename("Düsseldorf_Report_2023.pdf") == "Dusseldorf_Report_2023.pdf"
+    )
+    assert (
+        sanitize_filename("File with an é, ö, ü, ç, ñ.docx")
+        == "File_with_an_e_o_u_c_n.docx"
+    )
     assert sanitize_filename("Очень важное дело.xlsx") == "_xlsx"
-    assert sanitize_filename("My.Super.Duper.File.Name.with.lots.of.dots.and.A@#!!%@#$%^&*.txt") == "My.Super.Duper.File.Name.with.lots.of.dots.and.A_.txt"
+    assert (
+        sanitize_filename(
+            "My.Super.Duper.File.Name.with.lots.of.dots.and.A@#!!%@#$%^&*.txt"
+        )
+        == "My.Super.Duper.File.Name.with.lots.of.dots.and.A_.txt"
+    )
     assert sanitize_filename("........hidden_file.txt") == "_.......hidden_file.txt"
-    assert sanitize_filename("A very long file name that exceeds typical operating system limits and needs to be truncated gracefully.zip") == "A_very_long_file_name_that_exceeds_typical_operating_system_limits_and_needs_to_be_truncated_gracefully.zip"
+    assert (
+        sanitize_filename(
+            "A very long file name that exceeds typical operating system limits and needs to be truncated gracefully.zip"
+        )
+        == "A_very_long_file_name_that_exceeds_typical_operating_system_limits_and_needs_to_be_truncated_gracefully.zip"
+    )
     assert sanitize_filename(" ") == "untitled_file"
     assert sanitize_filename("!") == "untitled_file"
     assert sanitize_filename(" .some_hidden_file.txt ") == "_some_hidden_file.txt"
-    assert sanitize_filename("file_name_with_________many_underscores.txt") == "file_name_with_many_underscores.txt"
-
-
-
-
-
-
-
-
-
-
-
-
+    assert (
+        sanitize_filename("file_name_with_________many_underscores.txt")
+        == "file_name_with_many_underscores.txt"
+    )