Commit b5fafa1 · Charles Azam committed
1 Parent(s): 663d842

clean: run linting and formating on repo
Files changed:

- Makefile +1 -4
- pyproject.toml +4 -6
- src/deepengineer/common_path.py +1 -1
- src/deepengineer/deepsearch/analyse_markdown_agent.py +34 -21
- src/deepengineer/deepsearch/draw_agent.py +0 -158
- src/deepengineer/deepsearch/scawl_web_agent.py +62 -46
- src/deepengineer/webcrawler/async_crawl.py +8 -6
- src/deepengineer/webcrawler/async_search.py +61 -42
- src/deepengineer/webcrawler/crawl_database.py +21 -14
- src/deepengineer/webcrawler/pdf_utils.py +37 -25
- src/deepengineer/webcrawler/testing.py +7 -3
- src/deepengineer/webcrawler/utils.py +15 -8
- tests/deepsearch/test_pdf_agent.py +19 -8
- tests/deepsearch/test_web_agent.py +13 -1
- tests/webcrawler/test_async_crawl.py +5 -3
- tests/webcrawler/test_async_search.py +13 -20
- tests/webcrawler/test_crawl_database.py +27 -7
- tests/webcrawler/test_pdfs_utils.py +15 -6
- tests/webcrawler/test_utils.py +28 -19
Makefile
CHANGED

@@ -19,10 +19,7 @@ lint: ## Run ruff linter
 lint-fix: ## Run ruff linter and auto-fix issues
 	uv run ruff check --fix src tests
 
-
-	uv run mypy src
-
-check: format lint type-check ## Run all checks (format, lint, type-check)
+check: format lint ## Run all checks (format, lint)
 
 test: ## Run tests
 	uv run pytest tests
pyproject.toml
CHANGED

@@ -34,13 +34,13 @@ build-backend = "hatchling.build"
 
 [tool.black]
 line-length = 88
-target-version = ['
+target-version = ['py312']
 
 
 [tool.ruff]
-target-version = "
+target-version = "py312"
 line-length = 88
-select = [
+lint.select = [
     "E", # pycodestyle errors
     "W", # pycodestyle warnings
     "F", # pyflakes
@@ -49,11 +49,9 @@ select = [
     "C4", # flake8-comprehensions
     "UP", # pyupgrade
 ]
-ignore = [
+lint.ignore = [
     "E501", # line too long, handled by black
     "B008", # do not perform function calls in argument defaults
     "C901", # too complex
 ]
 
-[tool.ruff.per-file-ignores]
-"__init__.py" = ["F401"]
src/deepengineer/common_path.py
CHANGED

@@ -8,4 +8,4 @@ assert DEEPENGINEER_CODE_DIR.name == "deepengineer"
 assert DEEPENGINEER_SRC_DIR.name == "src"
 
 DATA_DIR = DEEPENGINEER_ROOT_DIR / "data"
-assert DATA_DIR.exists()
+assert DATA_DIR.exists()
src/deepengineer/deepsearch/analyse_markdown_agent.py
CHANGED

@@ -2,86 +2,97 @@
 Simple agent to analyse a markdown, just to test some ideas.
 """
 
-from smolagents import CodeAgent, tool, Tool, LiteLLMModel
-from deepengineer.webcrawler.pdf_utils import get_markdown_by_page_numbers, get_table_of_contents_per_page_markdown, find_in_markdown, convert_ocr_response_to_markdown
-from mistralai import OCRResponse
 from enum import Enum
 
+from mistralai import OCRResponse
+from smolagents import CodeAgent, LiteLLMModel, Tool
+
+from deepengineer.webcrawler.pdf_utils import (
+    convert_ocr_response_to_markdown,
+    find_in_markdown,
+    get_markdown_by_page_numbers,
+    get_table_of_contents_per_page_markdown,
+)
+
+
 class ToolNames(Enum):
     GET_TABLE_OF_CONTENTS = "get_table_of_contents"
     GET_MARKDOWN = "get_markdown"
     GET_PAGES_CONTENT = "get_pages_content"
     FIND_IN_MARKDOWN = "find_in_markdown"
 
+
 class GetTableOfContentsTool(Tool):
     name = ToolNames.GET_TABLE_OF_CONTENTS.value
     description = "Returns all of the titles in the document along with the page number they are on."
     inputs = {}
     output_type = "string"
 
     def __init__(self, markdown: OCRResponse):
         super().__init__()
         self.markdown: OCRResponse = markdown
-        self.table_of_contents: str = get_table_of_contents_per_page_markdown(
+        self.table_of_contents: str = get_table_of_contents_per_page_markdown(
+            self.markdown
+        )
 
     def forward(self) -> str:
         return self.table_of_contents
 
+
 class GetMarkdownTool(Tool):
     name = ToolNames.GET_MARKDOWN.value
     description = f"Returns the markdown entire content of the document. Beware this might be too long to be useful, except for small documents, use {ToolNames.GET_PAGES_CONTENT.value} instead. You can use {ToolNames.GET_TABLE_OF_CONTENTS.value} to get the table of contents of the document including the number of pages."
     inputs = {}
     output_type = "string"
 
     def __init__(self, markdown: OCRResponse):
         super().__init__()
         self.markdown: OCRResponse = markdown
         self.markdown_content: str = convert_ocr_response_to_markdown(self.markdown)
 
     def forward(self) -> str:
         return self.markdown_content
 
+
 class GetPagesContentTool(Tool):
     name = ToolNames.GET_PAGES_CONTENT.value
     description = f"Returns the content of the pages. You can use {ToolNames.GET_TABLE_OF_CONTENTS.value} to get the table of contents of the document including the number of pages. Expects a list of page numbers as integers as input."
     inputs = {
         "page_numbers": {
             "type": "array",
-            "description": "The page numbers to get the content of."
+            "description": "The page numbers to get the content of.",
         },
     }
     output_type = "string"
 
     def __init__(self, markdown: OCRResponse):
         super().__init__()
         self.markdown: OCRResponse = markdown
 
     def forward(self, page_numbers: list[int]) -> str:
         return get_markdown_by_page_numbers(self.markdown, page_numbers)
 
+
 class FindInMarkdownTool(Tool):
     name = ToolNames.FIND_IN_MARKDOWN.value
     description = f"Finds the page numbers of the document that contain the search queries. If you are looking for a specific information, you can use this tool to find the page numbers of the document that contain the information and then use {ToolNames.GET_PAGES_CONTENT.value} to get the content of the pages."
     inputs = {
         "search_queries": {
             "type": "array",
-            "description": "The search queries to find in the document. List of strings."
+            "description": "The search queries to find in the document. List of strings.",
         }
     }
    output_type = "array"
 
     def __init__(self, markdown: OCRResponse):
         super().__init__()
         self.markdown: OCRResponse = markdown
 
     def forward(self, search_queries: list[str]) -> list[int]:
         return find_in_markdown(self.markdown, search_queries)
 
 
 def create_agent(markdown: OCRResponse, model_id="deepseek/deepseek-chat"):
     """This agent is just a test and will not be used as is by the main agent."""
 
     model = LiteLLMModel(model_id=model_id)

@@ -101,6 +112,8 @@ def create_agent(markdown: OCRResponse, model_id="deepseek/deepseek-chat"):
         name="markdown_agent",
         description="""A team member that can analyse a markdown.""",
     )
-    markdown_agent.prompt_templates["managed_agent"][
+    markdown_agent.prompt_templates["managed_agent"][
+        "task"
+    ] += """You can navigate to .txt online files."""
 
     return markdown_agent
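For context, a minimal usage sketch of this module (not part of the commit). It assumes MISTRAL_API_KEY and a DeepSeek key for LiteLLM are configured, and that a local "report.pdf" exists; both names are illustrative only.

# Hypothetical usage sketch of analyse_markdown_agent (assumptions noted above).
import asyncio
from pathlib import Path

from deepengineer.deepsearch.analyse_markdown_agent import create_agent
from deepengineer.webcrawler.pdf_utils import convert_pdf_to_markdown_async

# OCR the PDF into the OCRResponse structure the tools expect.
ocr_response = asyncio.run(convert_pdf_to_markdown_async(Path("report.pdf")))

# Build the test agent and ask a question; run() is the smolagents CodeAgent entry point.
agent = create_agent(ocr_response, model_id="deepseek/deepseek-chat")
print(agent.run("Which pages mention the cooling system?"))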
src/deepengineer/deepsearch/draw_agent.py
CHANGED

@@ -1,158 +0,0 @@
-from io import BytesIO
-from time import sleep
-
-import helium
-from dotenv import load_dotenv
-from PIL import Image
-from selenium import webdriver
-from selenium.webdriver.common.by import By
-from selenium.webdriver.common.keys import Keys
-
-from smolagents import CodeAgent, tool
-from smolagents.agents import ActionStep
-
-# Load environment variables
-load_dotenv()
-
-@tool
-def search_item_ctrl_f(text: str, nth_result: int = 1) -> str:
-    """
-    Searches for text on the current page via Ctrl + F and jumps to the nth occurrence.
-    Args:
-        text: The text to search for
-        nth_result: Which occurrence to jump to (default: 1)
-    """
-    elements = driver.find_elements(By.XPATH, f"//*[contains(text(), '{text}')]")
-    if nth_result > len(elements):
-        raise Exception(f"Match n°{nth_result} not found (only {len(elements)} matches found)")
-    result = f"Found {len(elements)} matches for '{text}'."
-    elem = elements[nth_result - 1]
-    driver.execute_script("arguments[0].scrollIntoView(true);", elem)
-    result += f"Focused on element {nth_result} of {len(elements)}"
-    return result
-
-@tool
-def go_back() -> None:
-    """Goes back to previous page."""
-    driver.back()
-
-@tool
-def close_popups() -> str:
-    """
-    Closes any visible modal or pop-up on the page. Use this to dismiss pop-up windows!
-    This does not work on cookie consent banners.
-    """
-    webdriver.ActionChains(driver).send_keys(Keys.ESCAPE).perform()
-
-
-# Configure Chrome options
-chrome_options = webdriver.ChromeOptions()
-chrome_options.add_argument("--force-device-scale-factor=1")
-chrome_options.add_argument("--window-size=1000,1350")
-chrome_options.add_argument("--disable-pdf-viewer")
-chrome_options.add_argument("--window-position=0,0")
-
-# Initialize the browser
-driver = helium.start_chrome(headless=False, options=chrome_options)
-
-# Set up screenshot callback
-def save_screenshot(memory_step: ActionStep, agent: CodeAgent) -> None:
-    sleep(1.0)  # Let JavaScript animations happen before taking the screenshot
-    driver = helium.get_driver()
-    current_step = memory_step.step_number
-    if driver is not None:
-        for previous_memory_step in agent.memory.steps:  # Remove previous screenshots for lean processing
-            if isinstance(previous_memory_step, ActionStep) and previous_memory_step.step_number <= current_step - 2:
-                previous_memory_step.observations_images = None
-        png_bytes = driver.get_screenshot_as_png()
-        image = Image.open(BytesIO(png_bytes))
-        print(f"Captured a browser screenshot: {image.size} pixels")
-        memory_step.observations_images = [image.copy()]  # Create a copy to ensure it persists
-
-    # Update observations with current URL
-    url_info = f"Current url: {driver.current_url}"
-    memory_step.observations = (
-        url_info if memory_step.observations is None else memory_step.observations + "\n" + url_info
-    )
-
-from smolagents import InferenceClientModel
-
-# Initialize the model
-model_id = "Qwen/Qwen2-VL-72B-Instruct"  # You can change this to your preferred VLM model
-model = InferenceClientModel(model_id=model_id)
-
-# Create the agent
-agent = CodeAgent(
-    tools=[go_back, close_popups, search_item_ctrl_f],
-    model=model,
-    additional_authorized_imports=["helium"],
-    step_callbacks=[save_screenshot],
-    max_steps=20,
-    verbosity_level=2,
-)
-
-# Import helium for the agent
-agent.python_executor("from helium import *", agent.state)
-
-helium_instructions = """
-You can use helium to access websites. Don't bother about the helium driver, it's already managed.
-We've already ran "from helium import *"
-Then you can go to pages!
-Code:
-```py
-go_to('github.com/trending')
-```<end_code>
-
-You can directly click clickable elements by inputting the text that appears on them.
-Code:
-```py
-click("Top products")
-```<end_code>
-
-If it's a link:
-Code:
-```py
-click(Link("Top products"))
-```<end_code>
-
-If you try to interact with an element and it's not found, you'll get a LookupError.
-In general stop your action after each button click to see what happens on your screenshot.
-Never try to login in a page.
-
-To scroll up or down, use scroll_down or scroll_up with as an argument the number of pixels to scroll from.
-Code:
-```py
-scroll_down(num_pixels=1200)  # This will scroll one viewport down
-```<end_code>
-
-When you have pop-ups with a cross icon to close, don't try to click the close icon by finding its element or targeting an 'X' element (this most often fails).
-Just use your built-in tool `close_popups` to close them:
-Code:
-```py
-close_popups()
-```<end_code>
-
-You can use .exists() to check for the existence of an element. For example:
-Code:
-```py
-if Text('Accept cookies?').exists():
-    click('I accept')
-```<end_code>
-"""
-
-search_request = """
-Please navigate to https://en.wikipedia.org/wiki/Chicago and give me a sentence containing the word "1992" that mentions a construction accident.
-"""
-
-agent_output = agent.run(search_request + helium_instructions)
-print("Final output:")
-print(agent_output)
-
-github_request = """
-I'm trying to find how hard I have to work to get a repo in github.com/trending.
-Can you navigate to the profile for the top author of the top trending repo, and give me their total number of commits over the last year?
-"""
-
-agent_output = agent.run(github_request + helium_instructions)
-print("Final output:")
-print(agent_output)
src/deepengineer/deepsearch/scawl_web_agent.py
CHANGED

@@ -1,13 +1,23 @@
+import asyncio
+from enum import Enum
+
+from smolagents import CodeAgent, LiteLLMModel, Tool
+
 from deepengineer.webcrawler.async_search import (
+    SearchResponse,
+    arxiv_search_async,
+    linkup_search_async,
+    pubmed_search_async,
+    scientific_search_async,
 )
-from deepengineer.webcrawler.pdf_utils import get_table_of_contents_per_page_markdown, convert_ocr_response_to_markdown, get_markdown_by_page_numbers, find_in_markdown
-from enum import Enum
-import asyncio
-from deepengineer.webcrawler.async_search import SearchResponse
 from deepengineer.webcrawler.crawl_database import DataBase
+from deepengineer.webcrawler.pdf_utils import (
+    convert_ocr_response_to_markdown,
+    find_in_markdown,
+    get_markdown_by_page_numbers,
+    get_table_of_contents_per_page_markdown,
+)
+
 
 class ToolNames(Enum):
     # Search tools

@@ -16,37 +26,43 @@ class ToolNames(Enum):
     PUBMED_SEARCH = "pubmed_search"
     SCIENCEDIRECT_SEARCH = "sciencedirect_search"
     SCIENTIFIC_SEARCH = "scientific_search"
+
     # Exploring link tools
     GET_TABLE_OF_CONTENTS = "get_table_of_contents_of_url"
     GET_MARKDOWN = "get_markdown_of_url"
     GET_PAGES_CONTENT = "get_pages_content"
     FIND_IN_MARKDOWN = "find_in_markdown"
 
+
+def filter_search_results(
+    search_response: SearchResponse, max_nb_results: int = 5
+) -> SearchResponse:
     search_response.search_results = search_response.search_results[:max_nb_results]
     return search_response.to_string()
 
 
 class SearchTool(Tool):
     name = ToolNames.SEARCH_TOOL.value
-    description =
+    description = """Search the web using Linkup API. Good for deep research with sourced answers.
     Linkup also provides an answer. This answer is not always correct, so you might want to check the sources.
     """
     inputs = {
         "search_query": {
             "type": "string",
-            "description": "The search query to execute"
+            "description": "The search query to execute",
         },
     }
     output_type = "object"
 
     def forward(self, search_query: str) -> str:
-        result = asyncio.run(
+        result = asyncio.run(
+            linkup_search_async(
+                search_query=search_query,
+            )
+        )
         return filter_search_results(result)
 
+
 class ArxivSearchTool(Tool):
     name = ToolNames.ARXIV_SEARCH.value
     description = """Search arXiv for academic papers and preprints with Linkup API.

@@ -55,15 +71,16 @@ class ArxivSearchTool(Tool):
     inputs = {
         "search_query": {
             "type": "string",
-            "description": "The search query to execute on arXiv"
+            "description": "The search query to execute on arXiv",
         }
     }
     output_type = "object"
 
     def forward(self, search_query: str) -> str:
         result = asyncio.run(arxiv_search_async(search_query))
         return filter_search_results(result)
 
+
 class PubmedSearchTool(Tool):
     name = ToolNames.PUBMED_SEARCH.value
     description = """Search PubMed for medical and scientific literature with Linkup API.

@@ -72,15 +89,16 @@ class PubmedSearchTool(Tool):
     inputs = {
         "search_query": {
             "type": "string",
-            "description": "The search query to execute on PubMed"
+            "description": "The search query to execute on PubMed",
         }
     }
     output_type = "object"
 
     def forward(self, search_query: str) -> str:
         result = asyncio.run(pubmed_search_async(search_query))
         return filter_search_results(result)
 
+
 class ScientificSearchTool(Tool):
     name = ToolNames.SCIENTIFIC_SEARCH.value
     description = """Search across multiple scientific domains: Wikipedia, arXiv, PubMed, and ScienceDirect.

@@ -89,16 +107,19 @@ class ScientificSearchTool(Tool):
     inputs = {
         "search_query": {
             "type": "string",
-            "description": "The search query to execute across scientific domains"
+            "description": "The search query to execute across scientific domains",
         }
     }
     output_type = "object"
+
     def forward(self, search_query: str) -> dict:
         result = asyncio.run(scientific_search_async(search_query))
         return filter_search_results(result)
 
+
 URL_EXPLAINATION = """The URL can be be converted to a markdown. If the URL points to a PDF, the pdf is converted to markdown, otherwise the URL is crawled and the markdown is extracted. This markdown is split into pages that are numbered. You can use the page numbers to get the content of the pages."""
 
+
 class GetTableOfContentsTool(Tool):
     name = ToolNames.GET_TABLE_OF_CONTENTS.value
     description = f"""Returns all of the titles in the document along with the page number they are on.

@@ -107,55 +128,51 @@ class GetTableOfContentsTool(Tool):
     inputs = {
         "url": {
             "type": "string",
-            "description": "The URL to get the table of contents of."
+            "description": "The URL to get the table of contents of.",
         }
     }
     output_type = "string"
 
     def __init__(self, database: DataBase):
         super().__init__()
         self.database: DataBase = database
 
     def forward(self, url: str) -> str:
         markdown = self.database.get_markdown_of_url(url)
         table_of_contents: str = get_table_of_contents_per_page_markdown(markdown)
         return table_of_contents
 
+
 class GetMarkdownTool(Tool):
     name = ToolNames.GET_MARKDOWN.value
     description = f"Returns in markdown entire content of the url. Beware this might be too long to be useful, except for small documents, use {ToolNames.GET_PAGES_CONTENT.value} instead. You can also use {ToolNames.GET_TABLE_OF_CONTENTS.value} first to get the table of contents of the document including the number of pages."
     inputs = {
-        "url": {
-            "type": "string",
-            "description": "The URL to get the markdown of."
-        }
+        "url": {"type": "string", "description": "The URL to get the markdown of."}
     }
     output_type = "string"
 
     def __init__(self, database: DataBase):
         super().__init__()
         self.database: DataBase = database
 
     def forward(self, url: str) -> str:
         markdown = self.database.get_markdown_of_url(url)
         markdown_content: str = convert_ocr_response_to_markdown(markdown)
         return markdown_content
 
+
 class GetPagesContentTool(Tool):
     name = ToolNames.GET_PAGES_CONTENT.value
     description = f"Returns the content of the pages. You can use {ToolNames.GET_TABLE_OF_CONTENTS.value} to get the table of contents of the url including the number of pages. Expects a list of page numbers as integers as input. {URL_EXPLAINATION}"
     inputs = {
-        "url": {
-            "type": "string",
-            "description": "The URL to get the content of."
-        },
+        "url": {"type": "string", "description": "The URL to get the content of."},
         "page_numbers": {
             "type": "array",
-            "description": "The page numbers to get the content of."
+            "description": "The page numbers to get the content of.",
         },
     }
     output_type = "string"
 
     def __init__(self, database: DataBase):
         super().__init__()
         self.database: DataBase = database

@@ -164,32 +181,31 @@ class GetPagesContentTool(Tool):
         markdown = self.database.get_markdown_of_url(url)
         return get_markdown_by_page_numbers(markdown, page_numbers)
 
+
 class FindInMarkdownTool(Tool):
     name = ToolNames.FIND_IN_MARKDOWN.value
     description = f"Finds the page numbers of the url that contain the search queries. If you are looking for a specific information, you can use this tool to find the page numbers of the url that contain the information and then use {ToolNames.GET_PAGES_CONTENT.value} to get the content of the pages. {URL_EXPLAINATION}"
     inputs = {
-        "url": {
-            "type": "string",
-            "description": "The URL to find in."
-        },
+        "url": {"type": "string", "description": "The URL to find in."},
         "search_queries": {
             "type": "array",
-            "description": "The search queries to find in the url. List of strings."
-        }
+            "description": "The search queries to find in the url. List of strings.",
+        },
     }
     output_type = "array"
 
     def __init__(self, database: DataBase):
         super().__init__()
         self.database: DataBase = database
 
     def forward(self, url: str, search_queries: list[str]) -> list[int]:
         markdown = self.database.get_markdown_of_url(url)
         return find_in_markdown(markdown, search_queries)
 
+
 def create_web_search_agent(model_id="deepseek/deepseek-chat"):
     """Create a web search agent with search, crawling, and PDF analysis capabilities."""
 
     model = LiteLLMModel(model_id=model_id)
     database = DataBase()

@@ -204,7 +220,7 @@ def create_web_search_agent(model_id="deepseek/deepseek-chat"):
         GetPagesContentTool(database),
         FindInMarkdownTool(database),
     ]
-
+
     web_search_agent = CodeAgent(
         model=model,
         tools=WEB_SEARCH_TOOLS,

@@ -214,5 +230,5 @@ def create_web_search_agent(model_id="deepseek/deepseek-chat"):
         name="web_search_agent",
         description="""A team member that can search the web, crawl URLs, download PDFs, and analyze documents.""",
     )
-
+
     return web_search_agent
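As a side note, a minimal sketch of how the agent assembled in this module might be driven (not part of the commit). It assumes the LINKUP, MISTRAL and DeepSeek/LiteLLM API keys are configured; the query is illustrative.

# Hypothetical usage sketch of create_web_search_agent (assumptions noted above).
from deepengineer.deepsearch.scawl_web_agent import create_web_search_agent

agent = create_web_search_agent(model_id="deepseek/deepseek-chat")
# The agent can chain the search tools and the URL-exploration tools defined above.
result = agent.run("Find a recent arXiv paper on sodium fast reactors and summarise it.")
print(result)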
src/deepengineer/webcrawler/async_crawl.py
CHANGED

@@ -1,15 +1,17 @@
+from pathlib import Path
+
 import aiofiles
-import httpx
 import crawl4ai
-import
+import httpx
+
 
 async def crawl4ai_extract_markdown_of_url_async(url: str) -> str:
     """Extract markdown content from a URL using crawl4ai."""
     async with crawl4ai.AsyncWebCrawler() as crawler:
         result = await crawler.arun(url=url)
         return result.markdown
+
 
 async def download_pdf_async(url: str, output_path: Path) -> str:
     """Download a PDF file from a URL."""
     timeout = httpx.Timeout(30.0, connect=10.0)

@@ -20,6 +22,7 @@ async def download_pdf_async(url: str, output_path: Path) -> str:
         await f.write(response.content)
     return output_path
 
+
 async def download_pdf_or_arxiv_pdf_async(url: str, output_path: Path) -> str:
     """Download a PDF from arXiv by converting the abstract URL to PDF URL. Works also for non arXiv URLs."""
     # Extract the arXiv ID from the URL

@@ -29,6 +32,5 @@ async def download_pdf_or_arxiv_pdf_async(url: str, output_path: Path) -> str:
     else:
         # If it's already a PDF URL, use it as is
         pdf_url = url
 
-    return await download_pdf_async(pdf_url, output_path)
+    return await download_pdf_async(pdf_url, output_path)
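For context, a minimal sketch of the two crawl helpers above (not part of the commit). The URLs and the output filename are illustrative; crawl4ai and httpx must be installed.

# Hypothetical usage sketch of the async_crawl helpers (assumptions noted above).
import asyncio
from pathlib import Path

from deepengineer.webcrawler.async_crawl import (
    crawl4ai_extract_markdown_of_url_async,
    download_pdf_or_arxiv_pdf_async,
)

# Crawl an HTML page and get its markdown rendering.
markdown = asyncio.run(
    crawl4ai_extract_markdown_of_url_async("https://en.wikipedia.org/wiki/Chicago")
)
print(markdown[:500])

# Download an arXiv paper; an abs/ URL is rewritten to the pdf/ URL before download.
pdf_path = asyncio.run(
    download_pdf_or_arxiv_pdf_async(
        "https://arxiv.org/abs/1706.03762", Path("attention.pdf")
    )
)
print(pdf_path)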
src/deepengineer/webcrawler/async_search.py
CHANGED

@@ -1,61 +1,67 @@
 import os
-import asyncio
-import requests
-from pydantic import BaseModel, Field
-from typing import List, Optional, Literal
 from enum import Enum
+from typing import Literal
 
+import requests
 from linkup import LinkupClient, LinkupSourcedAnswer
+from pydantic import BaseModel, Field
 from tavily import AsyncTavilyClient
 
-from langchain_community.retrievers import ArxivRetriever
-from langchain_community.utilities.pubmed import PubMedAPIWrapper
 
 class SearchResult(BaseModel):
     """Represents a single search result from any search API."""
+
     title: str = Field(..., description="Title of the search result")
     url: str = Field(..., description="URL of the result")
     content: str = Field(..., description="Summary/snippet of content")
-    raw_content:
+    raw_content: str | None = Field(None, description="Full page content if available")
 
 
 class SearchResponse(BaseModel):
     """Represents a search response from any search API."""
+
     query: str = Field(..., description="The original search query")
-    answer: str | None = Field(
+    answer: str | None = Field(
+        None, description="Direct answer from the search API if available"
+    )
+    search_results: list[SearchResult] = Field(
+        default_factory=list, description="List of search results"
+    )
 
     def to_string(self):
         """Convert search response to a formatted string suitable for LLM consumption."""
         result_parts = []
 
         # Add the query
         result_parts.append(f"Search Query: {self.query}\n")
 
         # Add the direct answer if available
         if self.answer:
             result_parts.append(f"Direct Answer: {self.answer}\n")
 
         # Add search results
         if self.search_results:
             result_parts.append(f"Found {len(self.search_results)} search results:\n")
 
             for i, result in enumerate(self.search_results, 1):
                 result_parts.append(f"\n--- Result {i} ---")
                 result_parts.append(f"Title: {result.title}")
                 result_parts.append(f"URL: {result.url}")
                 result_parts.append(f"Content: {result.content[:2000]}...")
                 result_parts.append("")  # Empty line for separation
         else:
             result_parts.append("No search results found.")
 
         return "\n".join(result_parts)
 
 
 class ScientificDomains(str, Enum):
     wikipedia = "wikipedia.org"
     arxiv = "arxiv.org"
     pubmed = "pubmed.ncbi.nlm.nih.gov"
     sciencedirect = "sciencedirect.com"
 
 
 def get_tavily_usage():
     url = "https://api.tavily.com/usage"
     headers = {"Authorization": f"Bearer {os.getenv('TAVILY_API_KEY')}"}

@@ -71,14 +77,14 @@ async def tavily_search_async(
     include_answer: Literal["basic", "advanced"] | None = "advanced",
     include_raw_content: Literal["text", "markdown"] | None = "markdown",
     include_images: bool = False,
-    search_depth: Literal[
+    search_depth: Literal["basic", "advanced"] | None = "basic",
     include_domains: list[ScientificDomains] = None,
 ) -> SearchResponse:
     """
     Performs concurrent web searches with the Tavily API
     """
     tavily_async_client = AsyncTavilyClient()
 
     search_response = await tavily_async_client.search(
         query=search_query,
         search_depth=search_depth,

@@ -88,29 +94,29 @@ async def tavily_search_async(
         include_images=include_images,
         include_domains=include_domains,
     )
 
     search_results = [
         SearchResult(
-            title=result.get(
-            url=result.get(
-            content=result.get(
-            raw_content=result.get(
+            title=result.get("title", ""),
+            url=result.get("url", ""),
+            content=result.get("content", ""),
+            raw_content=result.get("raw_content"),
         )
-        for result in search_response.get(
+        for result in search_response.get("results", [])
     ]
 
     # Convert to our Pydantic models
     responses: SearchResponse = SearchResponse(
         query=search_query,
-        answer=search_response.get(
-        search_results=search_results
+        answer=search_response.get("answer", None),
+        search_results=search_results,
     )
     return responses
 
 
 def get_linkup_balance():
     url = "https://api.linkup.so/v1/credits/balance"
 
     headers = {"Authorization": f"Bearer {os.getenv('LINKUP_API_KEY')}"}
 
     response = requests.request("GET", url, headers=headers)

@@ -122,14 +128,16 @@ def get_linkup_balance():
 async def linkup_search_async(
     search_query: str,
     depth: Literal["standard", "deep"] = "standard",
-    output_type: Literal[
+    output_type: Literal[
+        "searchResults", "sourcedAnswer", "structured"
+    ] = "sourcedAnswer",
     include_images: bool = False,
     include_domains: list[ScientificDomains] = None,
 ) -> SearchResponse:
     """
     Performs concurrent web searches using the Linkup API.
     """
 
     client = LinkupClient()
     search_response: LinkupSourcedAnswer = await client.async_search(
         query=search_query,

@@ -138,7 +146,7 @@ async def linkup_search_async(
         include_images=include_images,
         include_domains=include_domains,
     )
-
+
     search_results = [
         SearchResult(
             title=result.name,

@@ -151,37 +159,48 @@ async def linkup_search_async(
 
     # Convert to our Pydantic models
     responses: SearchResponse = SearchResponse(
-        query=search_query,
-        answer=search_response.answer,
-        search_results=search_results
+        query=search_query, answer=search_response.answer, search_results=search_results
     )
     return responses
 
 
 async def arxiv_search_async(
     search_query: str,
 ) -> SearchResponse:
-    response = await linkup_search_async(
+    response = await linkup_search_async(
+        search_query, include_domains=[ScientificDomains.arxiv]
+    )
     return response
 
 
 async def pubmed_search_async(
     search_query: str,
 ) -> SearchResponse:
-    response = await linkup_search_async(
+    response = await linkup_search_async(
+        search_query, include_domains=[ScientificDomains.pubmed]
+    )
     return response
 
 
 async def sciencedirect_search_async(
     search_query: str,
 ) -> SearchResponse:
-    response = await linkup_search_async(
+    response = await linkup_search_async(
+        search_query, include_domains=[ScientificDomains.sciencedirect]
+    )
     return response
 
 
 async def scientific_search_async(
     search_query: str,
 ) -> SearchResponse:
-    response = await linkup_search_async(
+    response = await linkup_search_async(
+        search_query,
+        include_domains=[
+            ScientificDomains.wikipedia,
+            ScientificDomains.arxiv,
+            ScientificDomains.pubmed,
+            ScientificDomains.sciencedirect,
+        ],
+    )
+    return response
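For context, a minimal sketch of the search helpers above (not part of the commit). It assumes LINKUP_API_KEY is set; the query is illustrative.

# Hypothetical usage sketch of async_search (assumptions noted above).
import asyncio

from deepengineer.webcrawler.async_search import arxiv_search_async

response = asyncio.run(arxiv_search_async("transformer architectures for OCR"))
# SearchResponse.to_string() renders the direct answer and the sources for LLM consumption.
print(response.to_string())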
src/deepengineer/webcrawler/crawl_database.py
CHANGED

@@ -1,16 +1,23 @@
-from deepengineer.webcrawler.utils import sanitize_filename
-from deepengineer.common_path import DATA_DIR
-from deepengineer.webcrawler.async_search import SearchResult, SearchResponse
 import asyncio
+
 from mistralai import OCRResponse
-
-from deepengineer.
-from deepengineer.webcrawler.
-
-class DataBase():
+
+from deepengineer.common_path import DATA_DIR
+from deepengineer.webcrawler.async_crawl import (
+    crawl4ai_extract_markdown_of_url_async,
+    download_pdf_or_arxiv_pdf_async,
+)
+from deepengineer.webcrawler.pdf_utils import (
+    convert_pdf_to_markdown_async,
+    convert_raw_markdown_to_ocr_response,
+)
+from deepengineer.webcrawler.utils import sanitize_filename
+
+
+class DataBase:
     def __init__(self):
         self.urls_to_markdown: dict[str, OCRResponse] = {}
-
+
     @staticmethod
     def preprocess_url(url: str) -> str:
        """Preprocess the url to make it a valid url."""

@@ -18,25 +25,25 @@ class DataBase():
             return url.replace("arxiv.org/abs/", "arxiv.org/pdf/")
         else:
             return url
-
+
     def crawl_url(self, url: str) -> str:
         """Crawl the url, if the url is a pdf, download the pdf and save and return the markdown."""
         url = self.preprocess_url(url)
         if "pdf" in url:
             output_path = (DATA_DIR / sanitize_filename(url)).with_suffix(".pdf")
-            pdf_path = asyncio.run(
+            pdf_path = asyncio.run(
+                download_pdf_or_arxiv_pdf_async(url, output_path=output_path)
+            )
             ocr_response = asyncio.run(convert_pdf_to_markdown_async(pdf_path))
         else:
             markdown = asyncio.run(crawl4ai_extract_markdown_of_url_async(url))
             ocr_response = convert_raw_markdown_to_ocr_response(markdown)
         self.urls_to_markdown[url] = ocr_response
         return ocr_response
-
-
+
     def get_markdown_of_url(self, url: str) -> OCRResponse:
         url = self.preprocess_url(url)
         if url in self.urls_to_markdown:
             return self.urls_to_markdown[url]
         else:
             return self.crawl_url(url)
-
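
A short usage sketch of the reworked DataBase cache, using the arXiv URL and calls already exercised in tests/webcrawler/test_crawl_database.py below. Note that crawl_url and get_markdown_of_url call asyncio.run internally, so this is meant for synchronous code.

from deepengineer.webcrawler.crawl_database import DataBase

db = DataBase()
# Abstract URLs are rewritten to the PDF URL by preprocess_url, so both forms hit the same cache entry.
db.crawl_url("https://arxiv.org/abs/2105.00643")
ocr_response = db.get_markdown_of_url("https://arxiv.org/pdf/2105.00643")
print(len(ocr_response.pages))
print(ocr_response.pages[0].markdown)
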
src/deepengineer/webcrawler/pdf_utils.py
CHANGED

@@ -1,16 +1,10 @@
 import os
 from pathlib import Path
-from pypdf import PdfReader, PdfWriter
-import io
-from pathlib import Path
-from mistralai import Mistral
-import os
-from litellm import completion
 
-from
-import yaml
-from tenacity import retry, stop_after_attempt, wait_fixed, RetryError
+from litellm import completion
 from litellm.exceptions import BadRequestError
+from mistralai import Mistral
+from mistralai.models import OCRPageObject, OCRResponse, OCRUsageInfo
 
 # Define the size limit in bytes
 MAX_SIZE_BYTES = 49 * 1024 * 1024

@@ -20,7 +14,6 @@ async def convert_pdf_to_markdown_async(
     pdf_path: Path,
     with_image_description: bool = False,
 ) -> OCRResponse:
-
     mistral_client = Mistral(api_key=os.getenv("MISTRAL_API_KEY"))
 
     uploaded_pdf = await mistral_client.files.upload_async(

@@ -31,7 +24,9 @@ async def convert_pdf_to_markdown_async(
         purpose="ocr",
     )
 
-    signed_url = await mistral_client.files.get_signed_url_async(
+    signed_url = await mistral_client.files.get_signed_url_async(
+        file_id=uploaded_pdf.id
+    )
 
     ocr_response = await mistral_client.ocr.process_async(
         model="mistral-ocr-latest",

@@ -42,27 +37,33 @@
     return ocr_response
 
 
-def convert_ocr_response_to_markdown(
-    ocr_response: OCRResponse
-) -> str:
+def convert_ocr_response_to_markdown(ocr_response: OCRResponse) -> str:
     markdowns: list[str] = []
     for page in ocr_response.pages:
         page_description = page.markdown
         markdowns.append(page_description)
-
+
     return "\n\n".join(markdowns)
 
-
+
+def get_markdown_by_page_numbers(
+    markdown: OCRResponse, page_numbers: list[int], get_full_content: bool = False
+) -> str:
     markdowns: list[str] = []
     page_numbers_to_get = set(page_numbers)
     if get_full_content:
         page_numbers_to_get = set(range(len(markdown.pages)))
 
     for page_number in page_numbers_to_get:
-        markdowns.append(
+        markdowns.append(
+            f"*Page {page_number}*\n{markdown.pages[page_number].markdown}"
+        )
     return "\n\n".join(markdowns)
 
-
+
+def find_in_markdown(
+    markdown: OCRResponse, search_queries: list[str] | str
+) -> list[int]:
     """
     Find the page numbers of the pdf that contain the search query.
 

@@ -82,12 +83,13 @@ def find_in_markdown(markdown: OCRResponse, search_queries: list[str] | str) ->
             page_numbers.append(page_number)
     return page_numbers
 
+
 def get_table_of_contents_per_page_markdown(markdown: OCRResponse) -> str:
     """
     Get the table of contents of the pdf.
-
+
     Finds all the titles of the pdf to reconstruct the table of contents.
-
+
     Args:
         markdown (OCRResponse): The markdown of the pdf.
 

@@ -102,15 +104,26 @@ def get_table_of_contents_per_page_markdown(markdown: OCRResponse) -> str:
             if line.startswith("#"):
                 title_to_page_number[line] = page_number
 
-    table_of_contents = "\n".join(
+    table_of_contents = "\n".join(
+        [
+            f"{title} - Page {page_number}"
+            for title, page_number in title_to_page_number.items()
+        ]
+    )
     return table_of_contents
 
+
 def convert_raw_markdown_to_ocr_response(raw_markdown: str) -> OCRResponse:
     pages = raw_markdown.split("# ")
     usage_info_empty = OCRUsageInfo(pages_processed=0)
-    return OCRResponse(
-
-
+    return OCRResponse(
+        pages=[
+            OCRPageObject(index=i, markdown="# " + page, images=[], dimensions=None)
+            for i, page in enumerate(pages)
+        ],
+        usage_info=usage_info_empty,
+        model="",
+    )
 
 
 def get_images_from_pdf(pdf_path: Path, image_ids: list[str]) -> list[str]:

@@ -141,4 +154,3 @@ def get_images_from_pdf(pdf_path: Path, image_ids: list[str]) -> list[str]:
     except BadRequestError:
         output = ""
     return output
-
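
A small sketch tying the markdown helpers above together on an in-memory document, with no Mistral API call. The sample markdown string is made up; the function calls follow the signatures shown in this diff.

from deepengineer.webcrawler.pdf_utils import (
    convert_raw_markdown_to_ocr_response,
    find_in_markdown,
    get_markdown_by_page_numbers,
    get_table_of_contents_per_page_markdown,
)

# Each "# " heading starts a new pseudo-page in the synthetic OCRResponse.
raw_markdown = "# Introduction\nContext of the study.\n\n# Results\nMeasured thermal neutron flux."
ocr_response = convert_raw_markdown_to_ocr_response(raw_markdown)

# Rebuild a table of contents ("<title> - Page <n>") from the headings.
print(get_table_of_contents_per_page_markdown(ocr_response))

# Locate the pages that contain a query, then dump only those pages.
pages = find_in_markdown(ocr_response, ["thermal neutron"])
print(get_markdown_by_page_numbers(ocr_response, pages))
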
src/deepengineer/webcrawler/testing.py
CHANGED

@@ -6,15 +6,19 @@ TAVILY_RESPONSE_FILE = DATA_DIR / "answers" / "tavily_response.json"
 
 
 def load_linkup_response() -> SearchResponse:
-    with open(LINKUP_RESPONSE_FILE
+    with open(LINKUP_RESPONSE_FILE) as f:
         return SearchResponse.model_validate_json(f.read())
 
+
 def load_tavily_response() -> SearchResponse:
-    with open(TAVILY_RESPONSE_FILE
+    with open(TAVILY_RESPONSE_FILE) as f:
         return SearchResponse.model_validate_json(f.read())
 
+
 URL_WIKIPEDIA = "https://en.wikipedia.org/wiki/Graphite-moderated_reactor"
 URL_PDF = "https://arxiv.org/pdf/1301.1699.pdf"
 ARXIV_URL = "https://arxiv.org/abs/1301.1699"
 PUBMED_URL = "https://pubmed.ncbi.nlm.nih.gov/34100000/"
-SCIENCEDIRECT_URL =
+SCIENCEDIRECT_URL = (
+    "https://www.sciencedirect.com/science/article/abs/pii/0168900289901964"
+)
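
The loaders above replay SearchResponse fixtures saved in the data answers directory, which keeps tests off the paid search APIs. A minimal sketch; the printed fields are the SearchResponse attributes used in the tests.

from deepengineer.webcrawler.testing import URL_WIKIPEDIA, load_linkup_response

# Replay a recorded Linkup SearchResponse instead of calling the API.
response = load_linkup_response()
print(response.query)
print(response.answer)
print(URL_WIKIPEDIA)
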
src/deepengineer/webcrawler/utils.py
CHANGED

@@ -1,6 +1,7 @@
 import re
 import unicodedata
 
+
 def sanitize_filename(filename, replacement_char="_", max_length=255):
     """
     Sanitizes a string to be suitable for use as a filename.

@@ -31,26 +32,32 @@ def sanitize_filename(filename, replacement_char="_", max_length=255):
     # 1. Replace spaces with the replacement_char
     # This is done early to ensure spaces are handled before other replacements
     # to avoid issues with double replacement characters in subsequent steps.
-    cleaned_filename = filename.replace(
+    cleaned_filename = filename.replace(" ", replacement_char)
 
     # 2. Convert to NFKD and encode to ASCII to handle accented characters
     # This transforms 'crème brûlée' into 'creme brulee'
-    cleaned_filename =
+    cleaned_filename = (
+        unicodedata.normalize("NFKD", cleaned_filename)
+        .encode("ascii", "ignore")
+        .decode("utf-8")
+    )
 
     # 3. Remove characters that are not alphanumeric, hyphen, underscore, or period.
     # Replace them with the specified replacement_char.
     # The regex pattern `[^a-zA-Z0-9\-_.]` matches any character that is NOT
     # (a-z, A-Z, 0-9, hyphen, underscore, or period).
-    cleaned_filename = re.sub(r
+    cleaned_filename = re.sub(r"[^a-zA-Z0-9\-_.]", replacement_char, cleaned_filename)
 
     # 4. Replace multiple consecutive replacement_char characters with a single one
-    cleaned_filename = re.sub(
+    cleaned_filename = re.sub(
+        re.escape(replacement_char) + r"+", replacement_char, cleaned_filename
+    )
 
     # 5. Trim leading/trailing replacement_char characters
     cleaned_filename = cleaned_filename.strip(replacement_char)
 
     # 6. Ensure the filename doesn't start with a period (hidden file on some systems)
-    if cleaned_filename.startswith(
+    if cleaned_filename.startswith("."):
         cleaned_filename = replacement_char + cleaned_filename[1:]
 
     # 7. Truncate to max_length

@@ -58,12 +65,12 @@ def sanitize_filename(filename, replacement_char="_", max_length=255):
     if len(cleaned_filename) > max_length:
         # Try to keep the file extension if present
         name, ext = "", ""
-        if
-            parts = cleaned_filename.rsplit(
+        if "." in cleaned_filename:
+            parts = cleaned_filename.rsplit(".", 1)
             name, ext = parts[0], "." + parts[1]
 
         if len(name) > max_length - len(ext):
-            cleaned_filename = name[:max_length - len(ext)] + ext
+            cleaned_filename = name[: max_length - len(ext)] + ext
         else:
             cleaned_filename = cleaned_filename[:max_length]
 
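
The reformatted sanitize_filename keeps its behavior: spaces and disallowed characters become the replacement character, accents are transliterated, and runs of the replacement character collapse. The expected outputs below are the ones asserted in tests/webcrawler/test_utils.py.

from deepengineer.webcrawler.utils import sanitize_filename

print(sanitize_filename("My Document!@#$%^&*.txt"))  # -> "My_Document_.txt"
print(sanitize_filename("Düsseldorf_Report_2023.pdf"))  # -> "Dusseldorf_Report_2023.pdf"
print(sanitize_filename("file_name_with_________many_underscores.txt"))  # -> "file_name_with_many_underscores.txt"
print(sanitize_filename(" "))  # -> "untitled_file"
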
tests/deepsearch/test_pdf_agent.py
CHANGED

@@ -1,9 +1,17 @@
-
-from mistralai import OCRResponse
+import pytest
 from deepengineer.common_path import DATA_DIR
+from deepengineer.deepsearch.analyse_markdown_agent import (
+    FindInMarkdownTool,
+    GetMarkdownTool,
+    GetPagesContentTool,
+    GetTableOfContentsTool,
+    create_agent,
+)
+from mistralai import OCRResponse
+
 
 def load_mock_ocr_response() -> OCRResponse:
-    with open(DATA_DIR / "report_thermal_neutron.json"
+    with open(DATA_DIR / "report_thermal_neutron.json") as f:
         return OCRResponse.model_validate_json(f.read())
 
 

@@ -13,13 +21,16 @@ def test_pdf_agent():
     assert pdf_agent is not None
     assert pdf_agent.name == "markdown_agent"
     assert pdf_agent.tools is not None
-    assert len(pdf_agent.tools) == 4 + 1
-
-
+    assert len(pdf_agent.tools) == 4 + 1  # +1 for the final answer
+
     GetTableOfContentsTool(ocr_response).forward()
     GetMarkdownTool(ocr_response).forward()
-    GetPagesContentTool(ocr_response).forward([1,2,3])
+    GetPagesContentTool(ocr_response).forward([1, 2, 3])
     FindInMarkdownTool(ocr_response).forward(["thermal neutron", "neutron"])
-    # pdf_agent.run("Give me a summary of the document.")
 
 
+@pytest.mark.skip(reason="This test is too expensive to run on CI")
+def test_run_pdf_agent():
+    ocr_response = load_mock_ocr_response()
+    pdf_agent = create_agent(ocr_response)
+    assert pdf_agent.run("Give me a summary of the document.") is not None
tests/deepsearch/test_web_agent.py
CHANGED

@@ -1,5 +1,17 @@
+import pytest
 from deepengineer.deepsearch.scawl_web_agent import create_web_search_agent
 
+
 def test_create_web_search_agent():
+    create_web_search_agent()
+
+
+@pytest.mark.skip(reason="This test is too expensive to run on CI")
+def test_run_web_search_agent():
     agent = create_web_search_agent()
-
+    assert (
+        agent.run(
+            "Est il possible de faire un réacteur thermique avec du graphite et du plomb?"
+        )
+        is not None
+    )
tests/webcrawler/test_async_crawl.py
CHANGED

@@ -1,12 +1,12 @@
 import pytest
+from deepengineer.common_path import DATA_DIR
 from deepengineer.webcrawler.async_crawl import (
     crawl4ai_extract_markdown_of_url_async,
     download_pdf_async,
     download_pdf_or_arxiv_pdf_async,
 )
-from
-
-from deepengineer.common_path import DATA_DIR
+from deepengineer.webcrawler.testing import ARXIV_URL, URL_PDF, URL_WIKIPEDIA
+
 
 @pytest.mark.asyncio
 async def test_crawl4ai_extract_markdown_of_url_async():

@@ -14,6 +14,7 @@ async def test_crawl4ai_extract_markdown_of_url_async():
     assert isinstance(markdown, str)
     assert "Graphite-moderated reactor" in markdown
 
+
 @pytest.mark.asyncio
 async def test_download_pdf_async():
     output_path = DATA_DIR / "temp.pdf"

@@ -22,6 +23,7 @@ async def test_download_pdf_async():
     assert pdf_path == output_path
     assert output_path.exists()
 
+
 @pytest.mark.asyncio
 async def test_arxiv_download_pdf_async():
     output_path = DATA_DIR / "temp.pdf"
tests/webcrawler/test_async_search.py
CHANGED

@@ -1,28 +1,25 @@
-import
+import numpy as np
 import pytest
 from deepengineer.webcrawler.async_search import (
-    tavily_search_async,
     SearchResponse,
+    arxiv_search_async,
+    get_linkup_balance,
     get_tavily_usage,
     linkup_search_async,
-
-    arxiv_search_async
+    tavily_search_async,
 )
-import numpy as np
 
 
 @pytest.mark.expensive
 @pytest.mark.asyncio
 async def test_tavily_search_async():
-
     usage_before = get_tavily_usage()
     print(usage_before)
-
 
     response = await tavily_search_async(
         search_query="Would it be possible to make a thermal reactor with graphite and lead?",
     )
+
     print(response.answer)
     assert response is not None
     assert isinstance(response, SearchResponse)

@@ -43,10 +40,10 @@ async def test_tavily_search_async():
     print(usage_after)
     assert usage_after == usage_before + 1
 
+
 @pytest.mark.expensive
 @pytest.mark.asyncio
 async def test_linkup_search_async():
-
     balance_before = get_linkup_balance()
     print(balance_before)
 

@@ -69,7 +66,8 @@ async def test_linkup_search_async():
     balance_after = get_linkup_balance()
     print(balance_after)
     assert np.isclose(balance_after, balance_before - 0.005)
-
+
+
 @pytest.mark.expensive
 @pytest.mark.asyncio
 async def test_arxiv_search_async():

@@ -78,22 +76,17 @@ async def test_arxiv_search_async():
     response = await arxiv_search_async(
         search_query="Would it be possible to make a thermal reactor with graphite and lead?",
     )
-
+
     assert response is not None
     assert isinstance(response, SearchResponse)
     assert response.query is not None
     assert response.answer is not None
     assert response.search_results is not None
     assert len(response.search_results) >= 10
-    assert any(
+    assert any(
+        result.url.startswith("https://arxiv.org/abs/")
+        for result in response.search_results
+    )
 
     balance_after = get_linkup_balance()
     assert np.isclose(balance_after, balance_before - 0.005)
-
-
-
-
-
-
-
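
These tests rely on the custom expensive marker (together with the skip decorators elsewhere) to keep paid API calls off CI. If the marker is not already registered in the project configuration, a conftest.py along these lines would silence pytest's unknown-marker warning and let a "-m 'not expensive'" run deselect them; this is a sketch, not something added by this commit.

# conftest.py (sketch)
def pytest_configure(config):
    # Register the custom marker so "-m 'not expensive'" can deselect paid-API tests.
    config.addinivalue_line(
        "markers", "expensive: tests that call paid external search APIs"
    )
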
tests/webcrawler/test_crawl_database.py
CHANGED

@@ -1,24 +1,44 @@
 from deepengineer.webcrawler.crawl_database import DataBase
 
+
 def test_crawl_database_arxiv_pdf():
     db = DataBase()
     db.crawl_url("https://arxiv.org/pdf/2105.00643")
     assert db.get_markdown_of_url("https://arxiv.org/pdf/2105.00643") is not None
     assert db.get_markdown_of_url("https://arxiv.org/abs/2105.00643") is not None
-    assert
+    assert (
+        db.get_markdown_of_url("https://arxiv.org/pdf/2105.00643").pages[0].markdown
+        is not None
+    )
     assert len(db.get_markdown_of_url("https://arxiv.org/pdf/2105.00643").pages) == 20
-
+
+
 def test_crawl_database_arxiv_link():
     db = DataBase()
     db.crawl_url("https://arxiv.org/abs/2105.00643")
     assert db.get_markdown_of_url("https://arxiv.org/abs/2105.00643") is not None
     assert db.get_markdown_of_url("https://arxiv.org/pdf/2105.00643") is not None
-    assert
+    assert (
+        db.get_markdown_of_url("https://arxiv.org/abs/2105.00643").pages[0].markdown
+        is not None
+    )
     assert len(db.get_markdown_of_url("https://arxiv.org/abs/2105.00643").pages) == 20
-
+
+
 def test_crawl_database_wikipedia_url():
     db = DataBase()
     db.crawl_url("https://en.wikipedia.org/wiki/Deep_learning")
-    assert
-
-
+    assert (
+        db.get_markdown_of_url("https://en.wikipedia.org/wiki/Deep_learning")
+        is not None
+    )
+    assert (
+        db.get_markdown_of_url("https://en.wikipedia.org/wiki/Deep_learning")
+        .pages[0]
+        .markdown
+        is not None
+    )
+    assert (
+        len(db.get_markdown_of_url("https://en.wikipedia.org/wiki/Deep_learning").pages)
+        >= 40
+    )
tests/webcrawler/test_pdfs_utils.py
CHANGED

@@ -1,10 +1,17 @@
-from deepengineer.webcrawler.pdf_utils import convert_pdf_to_markdown_async, convert_ocr_response_to_markdown, find_in_markdown, get_table_of_contents_per_page_markdown, get_markdown_by_page_numbers
-from mistralai import OCRResponse
-from deepengineer.common_path import DATA_DIR
 import pytest
+from deepengineer.common_path import DATA_DIR
+from deepengineer.webcrawler.pdf_utils import (
+    convert_ocr_response_to_markdown,
+    convert_pdf_to_markdown_async,
+    find_in_markdown,
+    get_markdown_by_page_numbers,
+    get_table_of_contents_per_page_markdown,
+)
+from mistralai import OCRResponse
+
 
 def load_mock_ocr_response() -> OCRResponse:
-    with open(DATA_DIR / "report_thermal_neutron.json"
+    with open(DATA_DIR / "report_thermal_neutron.json") as f:
         return OCRResponse.model_validate_json(f.read())
 
 

@@ -18,18 +25,20 @@ async def test_convert_pdf_to_markdown_async():
     assert isinstance(ocr_response, OCRResponse)
     assert len(ocr_response.pages) == 16
     assert "where each cylinder represent" in markdown
-
+
 
 def test_table_of_contents_per_page_pdf():
     ocr_response = load_mock_ocr_response()
     table_of_contents = get_table_of_contents_per_page_markdown(ocr_response)
     assert "References - Page 15" in table_of_contents
 
+
 def test_find_in_pdf():
     ocr_response = load_mock_ocr_response()
     page_numbers = find_in_markdown(ocr_response, "where each cylinder represent")
     assert page_numbers == [7]
 
+
 def test_get_markdown_by_page_numbers():
     ocr_response = load_mock_ocr_response()
     page_numbers = [7, 15]

@@ -37,4 +46,4 @@ def test_get_markdown_by_page_numbers():
     assert "Page 7" in markdown
     assert "Page 15" in markdown
     assert "References" in markdown
-    assert "where each cylinder represent" in markdown
+    assert "where each cylinder represent" in markdown
tests/webcrawler/test_utils.py
CHANGED

@@ -1,28 +1,37 @@
 from deepengineer.webcrawler.utils import sanitize_filename
 
-def test_sanitize_filename():
 
+def test_sanitize_filename():
     assert sanitize_filename("My Document!@#$%^&*.txt") == "My_Document_.txt"
-    assert
-
-
+    assert (
+        sanitize_filename(" Another file with spaces & special_chars ")
+        == "Another_file_with_spaces_special_chars"
+    )
+    assert (
+        sanitize_filename("Düsseldorf_Report_2023.pdf") == "Dusseldorf_Report_2023.pdf"
+    )
+    assert (
+        sanitize_filename("File with an é, ö, ü, ç, ñ.docx")
+        == "File_with_an_e_o_u_c_n.docx"
+    )
     assert sanitize_filename("Очень важное дело.xlsx") == "_xlsx"
-    assert
+    assert (
+        sanitize_filename(
+            "My.Super.Duper.File.Name.with.lots.of.dots.and.A@#!!%@#$%^&*.txt"
+        )
+        == "My.Super.Duper.File.Name.with.lots.of.dots.and.A_.txt"
+    )
     assert sanitize_filename("........hidden_file.txt") == "_.......hidden_file.txt"
-    assert
+    assert (
+        sanitize_filename(
+            "A very long file name that exceeds typical operating system limits and needs to be truncated gracefully.zip"
+        )
+        == "A_very_long_file_name_that_exceeds_typical_operating_system_limits_and_needs_to_be_truncated_gracefully.zip"
+    )
     assert sanitize_filename(" ") == "untitled_file"
     assert sanitize_filename("!") == "untitled_file"
     assert sanitize_filename(" .some_hidden_file.txt ") == "_some_hidden_file.txt"
-    assert
-
-
-
-
-
-
-
-
-
-
-
+    assert (
+        sanitize_filename("file_name_with_________many_underscores.txt")
+        == "file_name_with_many_underscores.txt"
+    )