Charles Azam committed · Commit e040f4f
Parent(s): bcce487
feat: add database to process markdowns or links
Files changed:
- src/deepengineer/deepsearch/analyse_markdown_agent.py  +2 -0
- src/deepengineer/deepsearch/scawl_web_agent.py  +39 -123
- src/deepengineer/webcrawler/async_crawl.py  +3 -2
- src/deepengineer/webcrawler/crawl_database.py  +42 -0
- src/deepengineer/webcrawler/pdf_utils.py  +4 -2
- tests/webcrawler/test_async_crawl.py  +2 -2
- tests/webcrawler/test_crawl_database.py  +19 -0
- tests/webcrawler/test_pdf_agent.py  +4 -5
src/deepengineer/deepsearch/analyse_markdown_agent.py
CHANGED
@@ -78,6 +78,8 @@ class FindInMarkdownTool(Tool):
 
 
 def create_agent(markdown: OCRResponse, model_id="deepseek/deepseek-chat"):
+
+    """This agent is just a test and will not be used as is by the main agent."""
 
     model = LiteLLMModel(model_id=model_id)
 
src/deepengineer/deepsearch/scawl_web_agent.py
CHANGED
@@ -1,61 +1,39 @@
-from smolagents import CodeAgent, Tool, LiteLLMModel
+from smolagents import CodeAgent, Tool, LiteLLMModel, tool
 from deepengineer.webcrawler.async_search import (
-    linkup_search_async,
+    linkup_search_async, arxiv_search_async,
     pubmed_search_async, scientific_search_async,
 )
-from deepengineer.webcrawler.async_crawl import (
-    crawl4ai_extract_markdown_of_url_async, arxiv_download_pdf_async, download_pdf_async
-)
 from deepengineer.webcrawler.pdf_utils import get_table_of_contents_per_page_markdown, convert_ocr_response_to_markdown, get_markdown_by_page_numbers, find_in_markdown
 from mistralai import OCRResponse
 from enum import Enum
-from pathlib import Path
 import asyncio
-from
-from deepengineer.webcrawler.utils import sanitize_filename
-from deepengineer.common_path import DATA_DIR
-from deepengineer.webcrawler.async_search import SearchResult
+from deepengineer.webcrawler.async_search import SearchResponse
 
-class DataBase():
-    def __init__(self):
-        self.storage_path = DATA_DIR
-        self.storage_path.mkdir(exist_ok=True, parents=True)
-        self.sources = dict[str, SearchResult]
-
-    def add_sources(self, sources: list[SearchResult]):
-        for source in sources:
-            self.sources[source.url] = source
-
-    def get_sources_by_url(self, url: str) -> SearchResult:
-        return self.sources[url]
-
-
 
 class ToolNames(Enum):
     # Search tools
-
-    LINKUP_SEARCH = "linkup_search"
+    SEARCH_TOOL = "web search tool"
     ARXIV_SEARCH = "arxiv_search"
     PUBMED_SEARCH = "pubmed_search"
     SCIENCEDIRECT_SEARCH = "sciencedirect_search"
     SCIENTIFIC_SEARCH = "scientific_search"
-
-    # Crawling tools
-    CRAWL_URL = "crawl_url"
-    DOWNLOAD_PDF = "download_pdf"
-    ARXIV_DOWNLOAD_PDF = "arxiv_download_pdf"
 
-    #
-    GET_TABLE_OF_CONTENTS = "
-    GET_MARKDOWN = "
+    # Exploring link tools
+    GET_TABLE_OF_CONTENTS = "get_table_of_contents_of_url"
+    GET_MARKDOWN = "get_markdown_of_url"
     GET_PAGES_CONTENT = "get_pages_content"
     FIND_IN_MARKDOWN = "find_in_markdown"
 
+def filter_search_results(search_response: SearchResponse, max_nb_results: int = 10) -> SearchResponse:
+    search_response.search_results = search_response.search_results[:max_nb_results]
+    return search_response
+
 
 class SearchTool(Tool):
-
-
-
+    name = ToolNames.SEARCH_TOOL.value
+    description = f"""Search the web using Linkup API. Good for deep research with sourced answers.
+    Linkup also provides an answer. This answer is not always correct, so you might want to check the sources.
+    """
    inputs = {
        "search_query": {
            "type": "string",
@@ -63,19 +41,19 @@ class SearchTool(Tool):
        },
    }
    output_type = "object"
+   max_nb_results = 10
 
-    def forward(self, search_query: str
-                output_type: str = "sourcedAnswer") -> dict:
+    def forward(self, search_query: str) -> SearchResponse:
         result = asyncio.run(linkup_search_async(
             search_query=search_query,
-            depth=depth,
-            output_type=output_type
         ))
-        return result.
+        return filter_search_results(result, SearchTool.max_nb_results)
 
 class ArxivSearchTool(Tool):
     name = ToolNames.ARXIV_SEARCH.value
-    description = "Search arXiv for academic papers and preprints.
+    description = """Search arXiv for academic papers and preprints with Linkup API.
+    Linkup also provides an answer. This answer is not always correct, so you might want to check the sources.
+    """
     inputs = {
         "search_query": {
             "type": "string",
@@ -84,13 +62,15 @@ class ArxivSearchTool(Tool):
        }
    output_type = "object"
 
-    def forward(self, search_query: str) ->
+    def forward(self, search_query: str) -> SearchResponse:
         result = asyncio.run(arxiv_search_async(search_query))
-        return result.
+        return filter_search_results(result, ArxivSearchTool.max_nb_results)
 
 class PubmedSearchTool(Tool):
     name = ToolNames.PUBMED_SEARCH.value
-    description = "Search PubMed for medical and scientific literature.
+    description = """Search PubMed for medical and scientific literature with Linkup API.
+    Linkup also provides an answer. This answer is not always correct, so you might want to check the sources.
+    """
     inputs = {
         "search_query": {
             "type": "string",
@@ -99,13 +79,15 @@ class PubmedSearchTool(Tool):
        }
    output_type = "object"
 
-    def forward(self, search_query: str) ->
+    def forward(self, search_query: str) -> SearchResponse:
         result = asyncio.run(pubmed_search_async(search_query))
-        return result.
+        return filter_search_results(result, PubmedSearchTool.max_nb_results)
 
 class ScientificSearchTool(Tool):
     name = ToolNames.SCIENTIFIC_SEARCH.value
-    description = "Search across multiple scientific domains: Wikipedia, arXiv, PubMed, and ScienceDirect.
+    description = """Search across multiple scientific domains: Wikipedia, arXiv, PubMed, and ScienceDirect.
+    Linkup also provides an answer. This answer is not always correct, so you might want to check the sources.
+    """
     inputs = {
         "search_query": {
             "type": "string",
@@ -118,93 +100,27 @@ class ScientificSearchTool(Tool):
         result = asyncio.run(scientific_search_async(search_query))
         return result.model_dump()
 
-
-    name = ToolNames.CRAWL_URL.value
-    description = "Extract markdown content from a URL using crawl4ai."
-    inputs = {
-        "url": {
-            "type": "string",
-            "description": "The URL to crawl and extract markdown from"
-        }
-    }
-    output_type = "string"
-
-    def forward(self, url: str) -> str:
-        return asyncio.run(crawl4ai_extract_markdown_of_url_async(url))
 
-class
-    name = ToolNames.
-    description = "
-
-
-            "type": "string",
-            "description": "The URL of the PDF to download"
-        },
-        "filename": {
-            "type": "string",
-            "description": "The filename to save the PDF as (without .pdf extension)"
-        }
-    }
-    output_type = "string"
-
-    def forward(self, url: str, filename: str) -> str:
-        # Create data directory if it doesn't exist
-        data_dir = Path("data")
-        data_dir.mkdir(exist_ok=True)
-
-        # Create PDFs subdirectory
-        pdfs_dir = data_dir / "pdfs"
-        pdfs_dir.mkdir(exist_ok=True)
-
-        output_path = pdfs_dir / f"{filename}.pdf"
-
-        # Download the PDF
-        result_path = asyncio.run(download_pdf_async(url, output_path))
-        return f"PDF downloaded successfully to: {result_path}"
-
-class ArxivDownloadPdfTool(Tool):
-    name = ToolNames.ARXIV_DOWNLOAD_PDF.value
-    description = "Download a PDF from arXiv by converting the abstract URL to PDF URL."
+URL_EXPLAINATION = """The URL can be be converted to a markdown. If the URL points to a PDF, the pdf is converted to markdown, otherwise the URL is crawled and the markdown is extracted. This markdown is split into pages that are numbered. You can use the page numbers to get the content of the pages."""
+
+class GetTableOfContentsTool(Tool):
+    name = ToolNames.GET_TABLE_OF_CONTENTS.value
+    description = f"""Returns all of the titles in the document along with the page number they are on.
+    {URL_EXPLAINATION}
+    """
     inputs = {
         "url": {
             "type": "string",
-            "description": "The
-        },
-        "filename": {
-            "type": "string",
-            "description": "The filename to save the PDF as (without .pdf extension)"
+            "description": "The URL to get the table of contents of."
         }
     }
     output_type = "string"
 
-    def forward(self, url: str, filename: str) -> str:
-        # Create data directory if it doesn't exist
-        data_dir = Path("data")
-        data_dir.mkdir(exist_ok=True)
-
-        # Create PDFs subdirectory
-        pdfs_dir = data_dir / "pdfs"
-        pdfs_dir.mkdir(exist_ok=True)
-
-        output_path = pdfs_dir / f"{filename}.pdf"
-
-        # Download the PDF
-        result_path = asyncio.run(arxiv_download_pdf_async(url, output_path))
-        return f"arXiv PDF downloaded successfully to: {result_path}"
-
-# Reuse the markdown analysis tools from analyse_markdown_agent.py
-class GetTableOfContentsTool(Tool):
-    name = ToolNames.GET_TABLE_OF_CONTENTS.value
-    description = "Returns all of the titles in the document along with the page number they are on."
-    inputs = {}
-    output_type = "string"
-
     def __init__(self, markdown: OCRResponse):
         super().__init__()
         self.markdown: OCRResponse = markdown
         self.table_of_contents: str = get_table_of_contents_per_page_markdown(self.markdown)
 
-    def forward(self) -> str:
+    def forward(self, url: str) -> str:
         return self.table_of_contents
 
 class GetMarkdownTool(Tool):
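Note: the refactor above replaces the per-tool output_type/depth plumbing with a single truncation helper that every search tool's forward() funnels through. A minimal standalone sketch of that pattern, assuming only that SearchResponse carries a search_results list; FakeSearchResponse and fake_search below are hypothetical stand-ins, not part of the commit:

import asyncio

class FakeSearchResponse:
    # Hypothetical stand-in for deepengineer's SearchResponse.
    def __init__(self, search_results: list[str]):
        self.search_results = search_results

def filter_search_results(response: FakeSearchResponse, max_nb_results: int = 10) -> FakeSearchResponse:
    # Same idea as the helper added in the diff: keep only the first N hits.
    response.search_results = response.search_results[:max_nb_results]
    return response

async def fake_search(query: str) -> FakeSearchResponse:
    # Placeholder for linkup_search_async / arxiv_search_async / etc.
    return FakeSearchResponse([f"result {i} for {query!r}" for i in range(25)])

trimmed = filter_search_results(asyncio.run(fake_search("neutron transport")), max_nb_results=10)
assert len(trimmed.search_results) == 10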
src/deepengineer/webcrawler/async_crawl.py
CHANGED
@@ -20,8 +20,8 @@ async def download_pdf_async(url: str, output_path: Path) -> str:
         await f.write(response.content)
     return output_path
 
-async def arxiv_download_pdf_async(url: str, output_path: Path) -> str:
-    """Download a PDF from arXiv by converting the abstract URL to PDF URL."""
+async def download_pdf_or_arxiv_pdf_async(url: str, output_path: Path) -> str:
+    """Download a PDF from arXiv by converting the abstract URL to PDF URL. Works also for non arXiv URLs."""
     # Extract the arXiv ID from the URL
     if "/abs/" in url:
         arxiv_id = url.split("/abs/")[1].rstrip("/")
@@ -31,3 +31,4 @@ async def arxiv_download_pdf_async(url: str, output_path: Path) -> str:
         pdf_url = url
 
     return await download_pdf_async(pdf_url, output_path)
+
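Note: the rename makes the helper safe to call on any URL — arXiv abstract pages are rewritten to their PDF counterparts, everything else falls through to the plain downloader. A standalone sketch of just the URL rewrite, under the assumption that only the /abs/ branch is shown in the hunk; to_pdf_url is a hypothetical name and the download step is omitted:

def to_pdf_url(url: str) -> str:
    # Mirrors the branch shown in the hunk: /abs/ pages map to arXiv's PDF endpoint.
    if "/abs/" in url:
        arxiv_id = url.split("/abs/")[1].rstrip("/")
        return f"https://arxiv.org/pdf/{arxiv_id}"
    # Non-arXiv URLs (or direct PDF links) pass through unchanged.
    return url

assert to_pdf_url("https://arxiv.org/abs/2105.00643") == "https://arxiv.org/pdf/2105.00643"
assert to_pdf_url("https://example.com/report.pdf") == "https://example.com/report.pdf"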
src/deepengineer/webcrawler/crawl_database.py
ADDED
@@ -0,0 +1,42 @@
+from deepengineer.webcrawler.utils import sanitize_filename
+from deepengineer.common_path import DATA_DIR
+from deepengineer.webcrawler.async_search import SearchResult, SearchResponse
+import asyncio
+from mistralai import OCRResponse
+from deepengineer.webcrawler.async_crawl import download_pdf_or_arxiv_pdf_async, crawl4ai_extract_markdown_of_url_async
+from deepengineer.webcrawler.pdf_utils import convert_raw_markdown_to_ocr_response
+from deepengineer.webcrawler.pdf_utils import convert_pdf_to_markdown_async
+
+class DataBase():
+    def __init__(self):
+        self.urls_to_markdown: dict[str, OCRResponse] = {}
+
+    @staticmethod
+    def preprocess_url(url: str) -> str:
+        """Preprocess the url to make it a valid url."""
+        if "arxiv.org/abs/" in url:
+            return url.replace("arxiv.org/abs/", "arxiv.org/pdf/")
+        else:
+            return url
+
+    def crawl_url(self, url: str) -> str:
+        """Crawl the url, if the url is a pdf, download the pdf and save and return the markdown."""
+        url = self.preprocess_url(url)
+        if "pdf" in url:
+            output_path = (DATA_DIR / sanitize_filename(url)).with_suffix(".pdf")
+            pdf_path = asyncio.run(download_pdf_or_arxiv_pdf_async(url, output_path=output_path))
+            ocr_response = asyncio.run(convert_pdf_to_markdown_async(pdf_path))
+        else:
+            markdown = asyncio.run(crawl4ai_extract_markdown_of_url_async(url))
+            ocr_response = convert_raw_markdown_to_ocr_response(markdown)
+        self.urls_to_markdown[url] = ocr_response
+        return ocr_response
+
+
+    def get_markdown_of_url(self, url: str) -> OCRResponse:
+        url = self.preprocess_url(url)
+        if url in self.urls_to_markdown:
+            return self.urls_to_markdown[url]
+        else:
+            return self.crawl_url(url)
+
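Note: the new class is a lazy cache keyed by the preprocessed URL — crawl_url fetches and stores an OCRResponse, and get_markdown_of_url either returns the cached entry or triggers a crawl. A usage sketch; the arXiv URL is illustrative, and a real run needs network access plus a MISTRAL_API_KEY for the OCR step:

from deepengineer.webcrawler.crawl_database import DataBase

db = DataBase()
# First access downloads the PDF, runs OCR, and caches the result under the
# preprocessed URL, so the /abs/ and /pdf/ forms share one cache entry.
ocr = db.get_markdown_of_url("https://arxiv.org/abs/2105.00643")
print(len(ocr.pages))
# Second access is served from db.urls_to_markdown without re-crawling.
assert db.get_markdown_of_url("https://arxiv.org/pdf/2105.00643") is ocr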
src/deepengineer/webcrawler/pdf_utils.py
CHANGED
@@ -19,7 +19,7 @@ MAX_SIZE_BYTES = 49 * 1024 * 1024
 async def convert_pdf_to_markdown_async(
     pdf_path: Path,
     with_image_description: bool = False,
-) ->
+) -> OCRResponse:
 
     mistral_client = Mistral(api_key=os.getenv("MISTRAL_API_KEY"))
 
@@ -62,7 +62,7 @@ def get_markdown_by_page_numbers(markdown: OCRResponse, page_numbers: list[int],
         markdowns.append(f"*Page {page_number}*\n{markdown.pages[page_number].markdown}")
     return "\n\n".join(markdowns)
 
-def find_in_markdown(markdown: OCRResponse, search_queries: list[str]) -> list[int]:
+def find_in_markdown(markdown: OCRResponse, search_queries: list[str] | str) -> list[int]:
     """
     Find the page numbers of the pdf that contain the search query.
 
@@ -73,6 +73,8 @@ def find_in_markdown(markdown: OCRResponse, search_queries: list[str]) -> list[int]:
     Returns:
         list[int]: The page numbers of the pdf that contain the search query.
     """
+    if isinstance(search_queries, str):
+        search_queries = [search_queries]
     page_numbers: list[int] = []
     for page_number, page in enumerate(markdown.pages):
         for search_query in search_queries:
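Note: with the widened signature, callers can pass either a single query or a list. The matching line itself sits outside the hunk, so the substring test below is an assumption; FakePage and FakeOCR are hypothetical stand-ins so the sketch runs without a real OCRResponse:

class FakePage:
    def __init__(self, markdown: str):
        self.markdown = markdown

class FakeOCR:
    def __init__(self, pages: list[FakePage]):
        self.pages = pages

def find_in_markdown(markdown, search_queries):
    # Normalization added by the commit, followed by an assumed substring scan.
    if isinstance(search_queries, str):
        search_queries = [search_queries]
    page_numbers: list[int] = []
    for page_number, page in enumerate(markdown.pages):
        for search_query in search_queries:
            if search_query in page.markdown:
                page_numbers.append(page_number)
    return page_numbers

doc = FakeOCR([FakePage("thermal neutron flux"), FakePage("gamma shielding")])
assert find_in_markdown(doc, "neutron") == [0]
assert find_in_markdown(doc, ["neutron", "gamma"]) == [0, 1]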
tests/webcrawler/test_async_crawl.py
CHANGED
@@ -2,7 +2,7 @@ import pytest
 from deepengineer.webcrawler.async_crawl import (
     crawl4ai_extract_markdown_of_url_async,
     download_pdf_async,
-    arxiv_download_pdf_async,
+    download_pdf_or_arxiv_pdf_async,
 )
 from mistralai import OCRResponse
 from deepengineer.webcrawler.testing import URL_WIKIPEDIA, URL_PDF, ARXIV_URL
@@ -27,6 +27,6 @@ async def test_arxiv_download_pdf_async():
     output_path = DATA_DIR / "temp.pdf"
     output_path.unlink(missing_ok=True)
     assert not output_path.exists()
-    pdf_path = await arxiv_download_pdf_async(ARXIV_URL, output_path=output_path)
+    pdf_path = await download_pdf_or_arxiv_pdf_async(ARXIV_URL, output_path=output_path)
     assert pdf_path == output_path
     assert output_path.exists()
tests/webcrawler/test_crawl_database.py
ADDED
@@ -0,0 +1,19 @@
+from deepengineer.webcrawler.crawl_database import DataBase
+
+def test_crawl_database_arxiv_pdf():
+    db = DataBase()
+    db.crawl_url("https://arxiv.org/pdf/2105.00643")
+    assert db.get_markdown_of_url("https://arxiv.org/pdf/2105.00643") is not None
+    assert db.get_markdown_of_url("https://arxiv.org/abs/2105.00643") is not None
+    assert db.get_markdown_of_url("https://arxiv.org/pdf/2105.00643").pages[0].markdown is not None
+    assert len(db.get_markdown_of_url("https://arxiv.org/pdf/2105.00643").pages) == 20
+
+def test_crawl_database_arxiv_link():
+    db = DataBase()
+    db.crawl_url("https://arxiv.org/abs/2105.00643")
+    assert db.get_markdown_of_url("https://arxiv.org/abs/2105.00643") is not None
+    assert db.get_markdown_of_url("https://arxiv.org/pdf/2105.00643") is not None
+    assert db.get_markdown_of_url("https://arxiv.org/abs/2105.00643").pages[0].markdown is not None
+    assert len(db.get_markdown_of_url("https://arxiv.org/abs/2105.00643").pages) == 20
+
+
tests/webcrawler/test_pdf_agent.py
CHANGED
@@ -1,4 +1,4 @@
-from deepengineer.deepsearch.analyse_markdown_agent import create_agent, GetTableOfContentsTool, GetMarkdownTool, GetPagesContentTool,
+from deepengineer.deepsearch.analyse_markdown_agent import create_agent, GetTableOfContentsTool, GetMarkdownTool, GetPagesContentTool, FindInMarkdownTool
 from mistralai import OCRResponse
 from deepengineer.common_path import DATA_DIR
 
@@ -11,7 +11,7 @@ def test_pdf_agent():
     ocr_response = load_mock_ocr_response()
     pdf_agent = create_agent(ocr_response)
     assert pdf_agent is not None
-    assert pdf_agent.name == "
+    assert pdf_agent.name == "markdown_agent"
     assert pdf_agent.tools is not None
     assert len(pdf_agent.tools) == 4 + 1  # +1 for the final answer
 
@@ -19,8 +19,7 @@ def test_pdf_agent():
     GetTableOfContentsTool(ocr_response).forward()
     GetMarkdownTool(ocr_response).forward()
     GetPagesContentTool(ocr_response).forward([1,2,3])
-
-    pdf_agent.run("Give me a summary of the document.")
+    FindInMarkdownTool(ocr_response).forward(["thermal neutron", "neutron"])
+    # pdf_agent.run("Give me a summary of the document.")
 
-test_pdf_agent()
 