Charles Azam committed
Commit e040f4f · Parent: bcce487

feat: add database to process markdowns or links
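For orientation, a minimal sketch of the feature this commit adds (the arXiv URL is illustrative; DataBase and get_markdown_of_url are defined in src/deepengineer/webcrawler/crawl_database.py below):

    from deepengineer.webcrawler.crawl_database import DataBase

    # Crawl a link (or a PDF) once, then serve repeated lookups from the in-memory cache.
    db = DataBase()
    ocr_response = db.get_markdown_of_url("https://arxiv.org/abs/2105.00643")
    print(ocr_response.pages[0].markdown)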
src/deepengineer/deepsearch/analyse_markdown_agent.py CHANGED
@@ -78,6 +78,8 @@ class FindInMarkdownTool(Tool):
 
 
 def create_agent(markdown: OCRResponse, model_id="deepseek/deepseek-chat"):
+
+    """This agent is just a test and will not be used as is by the main agent."""
 
     model = LiteLLMModel(model_id=model_id)
 
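As context for the new docstring, the smoke test in tests/webcrawler/test_pdf_agent.py drives this entry point roughly as follows (a sketch; ocr_response is a pre-loaded mistralai OCRResponse, e.g. from convert_pdf_to_markdown_async):

    from deepengineer.deepsearch.analyse_markdown_agent import create_agent

    agent = create_agent(ocr_response, model_id="deepseek/deepseek-chat")
    agent.run("Give me a summary of the document.")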
src/deepengineer/deepsearch/scawl_web_agent.py CHANGED
@@ -1,61 +1,39 @@
-from smolagents import CodeAgent, Tool, LiteLLMModel
+from smolagents import CodeAgent, Tool, LiteLLMModel, tool
 from deepengineer.webcrawler.async_search import (
-    linkup_search_async, tavily_search_async, arxiv_search_async,
+    linkup_search_async, arxiv_search_async,
     pubmed_search_async, scientific_search_async,
 )
-from deepengineer.webcrawler.async_crawl import (
-    crawl4ai_extract_markdown_of_url_async, arxiv_download_pdf_async, download_pdf_async
-)
 from deepengineer.webcrawler.pdf_utils import get_table_of_contents_per_page_markdown, convert_ocr_response_to_markdown, get_markdown_by_page_numbers, find_in_markdown
 from mistralai import OCRResponse
 from enum import Enum
-from pathlib import Path
 import asyncio
-from typing import Literal
-from deepengineer.webcrawler.utils import sanitize_filename
-from deepengineer.common_path import DATA_DIR
-from deepengineer.webcrawler.async_search import SearchResult
+from deepengineer.webcrawler.async_search import SearchResponse
 
-class DataBase():
-    def __init__(self):
-        self.storage_path = DATA_DIR
-        self.storage_path.mkdir(exist_ok=True, parents=True)
-        self.sources = dict[str, SearchResult]
-
-    def add_sources(self, sources: list[SearchResult]):
-        for source in sources:
-            self.sources[source.url] = source
-
-    def get_sources_by_url(self, url: str) -> SearchResult:
-        return self.sources[url]
-
-
 
 class ToolNames(Enum):
     # Search tools
-    TAVILY_SEARCH = "tavily_search"
-    LINKUP_SEARCH = "linkup_search"
+    SEARCH_TOOL = "web search tool"
     ARXIV_SEARCH = "arxiv_search"
     PUBMED_SEARCH = "pubmed_search"
     SCIENCEDIRECT_SEARCH = "sciencedirect_search"
     SCIENTIFIC_SEARCH = "scientific_search"
-
-    # Crawling tools
-    CRAWL_URL = "crawl_url"
-    DOWNLOAD_PDF = "download_pdf"
-    ARXIV_DOWNLOAD_PDF = "arxiv_download_pdf"
 
-    # PDF analysis tools (reusing from markdown agent)
-    GET_TABLE_OF_CONTENTS = "get_table_of_contents"
-    GET_MARKDOWN = "get_markdown"
+    # Exploring link tools
+    GET_TABLE_OF_CONTENTS = "get_table_of_contents_of_url"
+    GET_MARKDOWN = "get_markdown_of_url"
     GET_PAGES_CONTENT = "get_pages_content"
     FIND_IN_MARKDOWN = "find_in_markdown"
 
+def filter_search_results(search_response: SearchResponse, max_nb_results: int = 10) -> SearchResponse:
+    search_response.search_results = search_response.search_results[:max_nb_results]
+    return search_response
+
 
 class SearchTool(Tool):
-    provider: Literal["tavily", "linkup"]
-    name = ToolNames.LINKUP_SEARCH.value
-    description = "Search the web using Linkup API. Good for deep research with sourced answers."
+    name = ToolNames.SEARCH_TOOL.value
+    description = f"""Search the web using Linkup API. Good for deep research with sourced answers.
+    Linkup also provides an answer. This answer is not always correct, so you might want to check the sources.
+    """
     inputs = {
         "search_query": {
             "type": "string",
@@ -63,19 +41,19 @@ class SearchTool(Tool):
         },
     }
     output_type = "object"
+    max_nb_results = 10
 
-    def forward(self, search_query: str, depth: str = "standard",
-                output_type: str = "sourcedAnswer") -> dict:
+    def forward(self, search_query: str) -> SearchResponse:
         result = asyncio.run(linkup_search_async(
             search_query=search_query,
-            depth=depth,
-            output_type=output_type
         ))
-        return result.model_dump()
+        return filter_search_results(result, SearchTool.max_nb_results)
 
 class ArxivSearchTool(Tool):
     name = ToolNames.ARXIV_SEARCH.value
-    description = "Search arXiv for academic papers and preprints."
+    description = """Search arXiv for academic papers and preprints with Linkup API.
+    Linkup also provides an answer. This answer is not always correct, so you might want to check the sources.
+    """
    inputs = {
         "search_query": {
             "type": "string",
@@ -84,13 +62,15 @@ class ArxivSearchTool(Tool):
     }
     output_type = "object"
 
-    def forward(self, search_query: str) -> dict:
+    def forward(self, search_query: str) -> SearchResponse:
         result = asyncio.run(arxiv_search_async(search_query))
-        return result.model_dump()
+        return filter_search_results(result, ArxivSearchTool.max_nb_results)
 
 class PubmedSearchTool(Tool):
     name = ToolNames.PUBMED_SEARCH.value
-    description = "Search PubMed for medical and scientific literature."
+    description = """Search PubMed for medical and scientific literature with Linkup API.
+    Linkup also provides an answer. This answer is not always correct, so you might want to check the sources.
+    """
     inputs = {
         "search_query": {
             "type": "string",
@@ -99,13 +79,15 @@ class PubmedSearchTool(Tool):
     }
     output_type = "object"
 
-    def forward(self, search_query: str) -> dict:
+    def forward(self, search_query: str) -> SearchResponse:
         result = asyncio.run(pubmed_search_async(search_query))
-        return result.model_dump()
+        return filter_search_results(result, PubmedSearchTool.max_nb_results)
 
 class ScientificSearchTool(Tool):
     name = ToolNames.SCIENTIFIC_SEARCH.value
-    description = "Search across multiple scientific domains: Wikipedia, arXiv, PubMed, and ScienceDirect."
+    description = """Search across multiple scientific domains: Wikipedia, arXiv, PubMed, and ScienceDirect.
+    Linkup also provides an answer. This answer is not always correct, so you might want to check the sources.
+    """
     inputs = {
         "search_query": {
             "type": "string",
@@ -118,93 +100,27 @@ class ScientificSearchTool(Tool):
         result = asyncio.run(scientific_search_async(search_query))
         return result.model_dump()
 
-class CrawlUrlTool(Tool):
-    name = ToolNames.CRAWL_URL.value
-    description = "Extract markdown content from a URL using crawl4ai."
-    inputs = {
-        "url": {
-            "type": "string",
-            "description": "The URL to crawl and extract markdown from"
-        }
-    }
-    output_type = "string"
-
-    def forward(self, url: str) -> str:
-        return asyncio.run(crawl4ai_extract_markdown_of_url_async(url))
+URL_EXPLAINATION = """The URL can be converted to a markdown. If the URL points to a PDF, the pdf is converted to markdown, otherwise the URL is crawled and the markdown is extracted. This markdown is split into pages that are numbered. You can use the page numbers to get the content of the pages."""
 
-class DownloadPdfTool(Tool):
-    name = ToolNames.DOWNLOAD_PDF.value
-    description = "Download a PDF file from a URL and store it in the data directory."
-    inputs = {
-        "url": {
-            "type": "string",
-            "description": "The URL of the PDF to download"
-        },
-        "filename": {
-            "type": "string",
-            "description": "The filename to save the PDF as (without .pdf extension)"
-        }
-    }
-    output_type = "string"
-
-    def forward(self, url: str, filename: str) -> str:
-        # Create data directory if it doesn't exist
-        data_dir = Path("data")
-        data_dir.mkdir(exist_ok=True)
-
-        # Create PDFs subdirectory
-        pdfs_dir = data_dir / "pdfs"
-        pdfs_dir.mkdir(exist_ok=True)
-
-        output_path = pdfs_dir / f"{filename}.pdf"
-
-        # Download the PDF
-        result_path = asyncio.run(download_pdf_async(url, output_path))
-        return f"PDF downloaded successfully to: {result_path}"
-
-class ArxivDownloadPdfTool(Tool):
-    name = ToolNames.ARXIV_DOWNLOAD_PDF.value
-    description = "Download a PDF from arXiv by converting the abstract URL to PDF URL."
+class GetTableOfContentsTool(Tool):
+    name = ToolNames.GET_TABLE_OF_CONTENTS.value
+    description = f"""Returns all of the titles in the document along with the page number they are on.
+    {URL_EXPLAINATION}
+    """
     inputs = {
         "url": {
             "type": "string",
-            "description": "The arXiv abstract URL (e.g., https://arxiv.org/abs/1234.5678)"
-        },
-        "filename": {
-            "type": "string",
-            "description": "The filename to save the PDF as (without .pdf extension)"
+            "description": "The URL to get the table of contents of."
         }
     }
     output_type = "string"
 
-    def forward(self, url: str, filename: str) -> str:
-        # Create data directory if it doesn't exist
-        data_dir = Path("data")
-        data_dir.mkdir(exist_ok=True)
-
-        # Create PDFs subdirectory
-        pdfs_dir = data_dir / "pdfs"
-        pdfs_dir.mkdir(exist_ok=True)
-
-        output_path = pdfs_dir / f"{filename}.pdf"
-
-        # Download the PDF
-        result_path = asyncio.run(arxiv_download_pdf_async(url, output_path))
-        return f"arXiv PDF downloaded successfully to: {result_path}"
-
-# Reuse the markdown analysis tools from analyse_markdown_agent.py
-class GetTableOfContentsTool(Tool):
-    name = ToolNames.GET_TABLE_OF_CONTENTS.value
-    description = "Returns all of the titles in the document along with the page number they are on."
-    inputs = {}
-    output_type = "string"
-
     def __init__(self, markdown: OCRResponse):
         super().__init__()
         self.markdown: OCRResponse = markdown
         self.table_of_contents: str = get_table_of_contents_per_page_markdown(self.markdown)
 
-    def forward(self) -> str:
+    def forward(self, url: str) -> str:
         return self.table_of_contents
 
 class GetMarkdownTool(Tool):
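The search tools now return a trimmed SearchResponse instead of a model_dump() dict. A minimal sketch of the truncation helper, using a stand-in dataclass since only the search_results attribute matters here (note that the ArxivSearchTool.max_nb_results and PubmedSearchTool.max_nb_results lookups assume each class defines the attribute; only SearchTool does in this hunk):

    from dataclasses import dataclass, field

    @dataclass
    class FakeSearchResponse:  # stand-in for async_search.SearchResponse
        search_results: list = field(default_factory=list)

    resp = FakeSearchResponse(search_results=list(range(25)))
    trimmed = filter_search_results(resp, max_nb_results=10)
    assert len(trimmed.search_results) == 10  # only the first 10 results are kept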
src/deepengineer/webcrawler/async_crawl.py CHANGED
@@ -20,8 +20,8 @@ async def download_pdf_async(url: str, output_path: Path) -> str:
         await f.write(response.content)
     return output_path
 
-async def arxiv_download_pdf_async(url: str, output_path: Path) -> str:
-    """Download a PDF from arXiv by converting the abstract URL to PDF URL."""
+async def download_pdf_or_arxiv_pdf_async(url: str, output_path: Path) -> str:
+    """Download a PDF from arXiv by converting the abstract URL to PDF URL. Also works for non-arXiv URLs."""
     # Extract the arXiv ID from the URL
     if "/abs/" in url:
         arxiv_id = url.split("/abs/")[1].rstrip("/")
@@ -31,3 +31,4 @@ async def arxiv_download_pdf_async(url: str, output_path: Path) -> str:
         pdf_url = url
 
     return await download_pdf_async(pdf_url, output_path)
+
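The renamed helper degrades gracefully on non-arXiv links: it rewrites /abs/ URLs to their /pdf/ form and otherwise downloads the URL as-is. A usage sketch (the output path is illustrative):

    import asyncio
    from pathlib import Path
    from deepengineer.webcrawler.async_crawl import download_pdf_or_arxiv_pdf_async

    # An /abs/ URL is converted to the matching /pdf/ URL before download;
    # any other URL is handed to download_pdf_async unchanged.
    pdf_path = asyncio.run(download_pdf_or_arxiv_pdf_async(
        "https://arxiv.org/abs/2105.00643",
        output_path=Path("data/temp.pdf"),
    ))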
src/deepengineer/webcrawler/crawl_database.py ADDED
@@ -0,0 +1,42 @@
+from deepengineer.webcrawler.utils import sanitize_filename
+from deepengineer.common_path import DATA_DIR
+from deepengineer.webcrawler.async_search import SearchResult, SearchResponse
+import asyncio
+from mistralai import OCRResponse
+from deepengineer.webcrawler.async_crawl import download_pdf_or_arxiv_pdf_async, crawl4ai_extract_markdown_of_url_async
+from deepengineer.webcrawler.pdf_utils import convert_raw_markdown_to_ocr_response
+from deepengineer.webcrawler.pdf_utils import convert_pdf_to_markdown_async
+
+class DataBase():
+    def __init__(self):
+        self.urls_to_markdown: dict[str, OCRResponse] = {}
+
+    @staticmethod
+    def preprocess_url(url: str) -> str:
+        """Preprocess the url to make it a valid url."""
+        if "arxiv.org/abs/" in url:
+            return url.replace("arxiv.org/abs/", "arxiv.org/pdf/")
+        else:
+            return url
+
+    def crawl_url(self, url: str) -> OCRResponse:
+        """Crawl the url; if the url is a pdf, download the pdf, then save and return the markdown."""
+        url = self.preprocess_url(url)
+        if "pdf" in url:
+            output_path = (DATA_DIR / sanitize_filename(url)).with_suffix(".pdf")
+            pdf_path = asyncio.run(download_pdf_or_arxiv_pdf_async(url, output_path=output_path))
+            ocr_response = asyncio.run(convert_pdf_to_markdown_async(pdf_path))
+        else:
+            markdown = asyncio.run(crawl4ai_extract_markdown_of_url_async(url))
+            ocr_response = convert_raw_markdown_to_ocr_response(markdown)
+        self.urls_to_markdown[url] = ocr_response
+        return ocr_response
+
+
+    def get_markdown_of_url(self, url: str) -> OCRResponse:
+        url = self.preprocess_url(url)
+        if url in self.urls_to_markdown:
+            return self.urls_to_markdown[url]
+        else:
+            return self.crawl_url(url)
+
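A short sketch of the cache semantics: because get_markdown_of_url funnels every URL through preprocess_url, the /abs/ and /pdf/ forms of the same paper resolve to one cached entry (the URL mirrors the tests below):

    from deepengineer.webcrawler.crawl_database import DataBase

    db = DataBase()
    first = db.get_markdown_of_url("https://arxiv.org/abs/2105.00643")   # crawls and caches
    second = db.get_markdown_of_url("https://arxiv.org/pdf/2105.00643")  # cache hit, no new crawl
    assert first is second  # both normalize to the same /pdf/ key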
src/deepengineer/webcrawler/pdf_utils.py CHANGED
@@ -19,7 +19,7 @@ MAX_SIZE_BYTES = 49 * 1024 * 1024
 async def convert_pdf_to_markdown_async(
     pdf_path: Path,
     with_image_description: bool = False,
-) -> tuple[OCRResponse]:
+) -> OCRResponse:
 
     mistral_client = Mistral(api_key=os.getenv("MISTRAL_API_KEY"))
 
@@ -62,7 +62,7 @@ def get_markdown_by_page_numbers(markdown: OCRResponse, page_numbers: list[int],
         markdowns.append(f"*Page {page_number}*\n{markdown.pages[page_number].markdown}")
     return "\n\n".join(markdowns)
 
-def find_in_markdown(markdown: OCRResponse, search_queries: list[str]) -> list[int]:
+def find_in_markdown(markdown: OCRResponse, search_queries: list[str] | str) -> list[int]:
     """
     Find the page numbers of the pdf that contain the search query.
 
@@ -73,6 +73,8 @@ def find_in_markdown(markdown: OCRResponse, search_queries: list[str]) -> list[i
     Returns:
         list[int]: The page numbers of the pdf that contain the search query.
     """
+    if isinstance(search_queries, str):
+        search_queries = [search_queries]
     page_numbers: list[int] = []
     for page_number, page in enumerate(markdown.pages):
         for search_query in search_queries:
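With the widened search_queries parameter, a bare string now behaves like a one-element list. A minimal sketch (assumes ocr_response is an OCRResponse obtained earlier, e.g. from convert_pdf_to_markdown_async):

    from deepengineer.webcrawler.pdf_utils import find_in_markdown

    # Both calls return the same page numbers now that a str is wrapped into a list.
    pages_a = find_in_markdown(ocr_response, "thermal neutron")
    pages_b = find_in_markdown(ocr_response, ["thermal neutron"])
    assert pages_a == pages_b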
tests/webcrawler/test_async_crawl.py CHANGED
@@ -2,7 +2,7 @@ import pytest
 from deepengineer.webcrawler.async_crawl import (
     crawl4ai_extract_markdown_of_url_async,
     download_pdf_async,
-    arxiv_download_pdf_async,
+    download_pdf_or_arxiv_pdf_async,
 )
 from mistralai import OCRResponse
 from deepengineer.webcrawler.testing import URL_WIKIPEDIA, URL_PDF, ARXIV_URL
@@ -27,6 +27,6 @@ async def test_arxiv_download_pdf_async():
     output_path = DATA_DIR / "temp.pdf"
     output_path.unlink(missing_ok=True)
     assert not output_path.exists()
-    pdf_path = await arxiv_download_pdf_async(ARXIV_URL, output_path=output_path)
+    pdf_path = await download_pdf_or_arxiv_pdf_async(ARXIV_URL, output_path=output_path)
     assert pdf_path == output_path
     assert output_path.exists()
tests/webcrawler/test_crawl_database.py ADDED
@@ -0,0 +1,19 @@
+from deepengineer.webcrawler.crawl_database import DataBase
+
+def test_crawl_database_arxiv_pdf():
+    db = DataBase()
+    db.crawl_url("https://arxiv.org/pdf/2105.00643")
+    assert db.get_markdown_of_url("https://arxiv.org/pdf/2105.00643") is not None
+    assert db.get_markdown_of_url("https://arxiv.org/abs/2105.00643") is not None
+    assert db.get_markdown_of_url("https://arxiv.org/pdf/2105.00643").pages[0].markdown is not None
+    assert len(db.get_markdown_of_url("https://arxiv.org/pdf/2105.00643").pages) == 20
+
+def test_crawl_database_arxiv_link():
+    db = DataBase()
+    db.crawl_url("https://arxiv.org/abs/2105.00643")
+    assert db.get_markdown_of_url("https://arxiv.org/abs/2105.00643") is not None
+    assert db.get_markdown_of_url("https://arxiv.org/pdf/2105.00643") is not None
+    assert db.get_markdown_of_url("https://arxiv.org/abs/2105.00643").pages[0].markdown is not None
+    assert len(db.get_markdown_of_url("https://arxiv.org/abs/2105.00643").pages) == 20
+
+
tests/webcrawler/test_pdf_agent.py CHANGED
@@ -1,4 +1,4 @@
-from deepengineer.deepsearch.analyse_markdown_agent import create_agent, GetTableOfContentsTool, GetMarkdownTool, GetPagesContentTool, FindInPdfTool
+from deepengineer.deepsearch.analyse_markdown_agent import create_agent, GetTableOfContentsTool, GetMarkdownTool, GetPagesContentTool, FindInMarkdownTool
 from mistralai import OCRResponse
 from deepengineer.common_path import DATA_DIR
 
@@ -11,7 +11,7 @@ def test_pdf_agent():
     ocr_response = load_mock_ocr_response()
     pdf_agent = create_agent(ocr_response)
     assert pdf_agent is not None
-    assert pdf_agent.name == "pdf_agent"
+    assert pdf_agent.name == "markdown_agent"
     assert pdf_agent.tools is not None
     assert len(pdf_agent.tools) == 4 + 1  # +1 for the final answer
 
@@ -19,8 +19,7 @@ def test_pdf_agent():
     GetTableOfContentsTool(ocr_response).forward()
     GetMarkdownTool(ocr_response).forward()
     GetPagesContentTool(ocr_response).forward([1,2,3])
-    FindInPdfTool(ocr_response).forward(["thermal neutron", "neutron"])
-    pdf_agent.run("Give me a summary of the document.")
+    FindInMarkdownTool(ocr_response).forward(["thermal neutron", "neutron"])
+    # pdf_agent.run("Give me a summary of the document.")
 
-test_pdf_agent()
 