Charles Azam committed on
Commit
bb62e6b
·
1 Parent(s): ce79b68

feat: crawl web agent

Browse files
docs/webcrawler.py DELETED
@@ -1,3 +0,0 @@
1
- from dotenv import load_dotenv
2
-
3
- load_dotenv()
 
 
 
 
src/deepengineer/deepsearch/analyse_markdown_agent.py CHANGED
@@ -6,7 +6,6 @@ from smolagents import CodeAgent, tool, Tool, LiteLLMModel
6
  from deepengineer.webcrawler.pdf_utils import get_markdown_by_page_numbers, get_table_of_contents_per_page_markdown, find_in_markdown, convert_ocr_response_to_markdown
7
  from mistralai import OCRResponse
8
  from enum import Enum
9
- from pathlib import Path
10
 
11
  class ToolNames(Enum):
12
  GET_TABLE_OF_CONTENTS = "get_table_of_contents"
 
6
  from deepengineer.webcrawler.pdf_utils import get_markdown_by_page_numbers, get_table_of_contents_per_page_markdown, find_in_markdown, convert_ocr_response_to_markdown
7
  from mistralai import OCRResponse
8
  from enum import Enum
 
9
 
10
  class ToolNames(Enum):
11
  GET_TABLE_OF_CONTENTS = "get_table_of_contents"
src/deepengineer/deepsearch/scawl_web_agent.py CHANGED
@@ -1,4 +1,4 @@
1
- from smolagents import CodeAgent, Tool, LiteLLMModel, tool
2
  from deepengineer.webcrawler.async_search import (
3
  linkup_search_async, arxiv_search_async,
4
  pubmed_search_async, scientific_search_async,
@@ -8,7 +8,7 @@ from mistralai import OCRResponse
8
  from enum import Enum
9
  import asyncio
10
  from deepengineer.webcrawler.async_search import SearchResponse
11
-
12
 
13
  class ToolNames(Enum):
14
  # Search tools
@@ -115,32 +115,43 @@ class GetTableOfContentsTool(Tool):
115
  }
116
  output_type = "string"
117
 
118
- def __init__(self, markdown: OCRResponse):
119
  super().__init__()
120
- self.markdown: OCRResponse = markdown
121
- self.table_of_contents: str = get_table_of_contents_per_page_markdown(self.markdown)
122
 
123
  def forward(self, url: str) -> str:
124
- return self.table_of_contents
 
 
125
 
126
  class GetMarkdownTool(Tool):
127
  name = ToolNames.GET_MARKDOWN.value
128
- description = f"Returns the markdown entire content of the document. Beware this might be too long to be useful, except for small documents, use {ToolNames.GET_PAGES_CONTENT.value} instead. You can use {ToolNames.GET_TABLE_OF_CONTENTS.value} to get the table of contents of the document including the number of pages."
129
- inputs = {}
 
 
 
 
 
130
  output_type = "string"
131
 
132
- def __init__(self, markdown: OCRResponse):
133
  super().__init__()
134
- self.markdown: OCRResponse = markdown
135
- self.markdown_content: str = convert_ocr_response_to_markdown(self.markdown)
136
 
137
- def forward(self) -> str:
138
- return self.markdown_content
 
 
139
 
140
  class GetPagesContentTool(Tool):
141
  name = ToolNames.GET_PAGES_CONTENT.value
142
- description = f"Returns the content of the pages. You can use {ToolNames.GET_TABLE_OF_CONTENTS.value} to get the table of contents of the document including the number of pages. Expects a list of page numbers as integers as input."
143
  inputs = {
 
 
 
 
144
  "page_numbers": {
145
  "type": "array",
146
  "description": "The page numbers to get the content of."
@@ -148,30 +159,36 @@ class GetPagesContentTool(Tool):
148
  }
149
  output_type = "string"
150
 
151
- def __init__(self, markdown: OCRResponse):
152
  super().__init__()
153
- self.markdown: OCRResponse = markdown
154
 
155
- def forward(self, page_numbers: list[int]) -> str:
156
- return get_markdown_by_page_numbers(self.markdown, page_numbers)
 
157
 
158
  class FindInMarkdownTool(Tool):
159
  name = ToolNames.FIND_IN_MARKDOWN.value
160
- description = f"Finds the page numbers of the document that contain the search queries. If you are looking for a specific information, you can use this tool to find the page numbers of the document that contain the information and then use {ToolNames.GET_PAGES_CONTENT.value} to get the content of the pages."
161
  inputs = {
 
 
 
 
162
  "search_queries": {
163
  "type": "array",
164
- "description": "The search queries to find in the document. List of strings."
165
  }
166
  }
167
  output_type = "array"
168
 
169
- def __init__(self, markdown: OCRResponse):
170
  super().__init__()
171
- self.markdown: OCRResponse = markdown
172
 
173
- def forward(self, search_queries: list[str]) -> list[int]:
174
- return find_in_markdown(self.markdown, search_queries)
 
175
 
176
  def create_web_search_agent(model_id="deepseek/deepseek-chat"):
177
  """Create a web search agent with search, crawling, and PDF analysis capabilities."""
@@ -180,17 +197,17 @@ def create_web_search_agent(model_id="deepseek/deepseek-chat"):
180
 
181
  # Web search and crawling tools
182
  WEB_SEARCH_TOOLS = [
183
- TavilySearchTool(),
184
- LinkupSearchTool(),
185
  ArxivSearchTool(),
186
  PubmedSearchTool(),
187
  ScientificSearchTool(),
188
- CrawlUrlTool(),
189
- DownloadPdfTool(),
190
- ArxivDownloadPdfTool(),
 
191
  ]
192
 
193
- web_search_agent = CodeAgent(
194
  model=model,
195
  tools=WEB_SEARCH_TOOLS,
196
  max_steps=20,
@@ -200,58 +217,4 @@ def create_web_search_agent(model_id="deepseek/deepseek-chat"):
200
  description="""A team member that can search the web, crawl URLs, download PDFs, and analyze documents.""",
201
  )
202
 
203
- web_search_agent.prompt_templates["managed_agent"]["task"] += """
204
- You can search the web using various APIs (Tavily, Linkup, arXiv, PubMed, ScienceDirect).
205
- You can crawl URLs to extract markdown content.
206
- You can download PDFs from URLs or arXiv and store them in the data/pdfs directory.
207
- For PDF analysis, you'll need to first download the PDF and then use the markdown analysis tools.
208
- """
209
-
210
- return web_search_agent
211
-
212
- def create_web_search_agent_with_pdf_analysis(markdown: OCRResponse, model_id="deepseek/deepseek-chat"):
213
- """Create a web search agent that also includes PDF analysis capabilities."""
214
-
215
- model = LiteLLMModel(model_id=model_id)
216
-
217
- # Web search and crawling tools
218
- WEB_SEARCH_TOOLS = [
219
- TavilySearchTool(),
220
- LinkupSearchTool(),
221
- ArxivSearchTool(),
222
- PubmedSearchTool(),
223
- ScientificSearchTool(),
224
- CrawlUrlTool(),
225
- DownloadPdfTool(),
226
- ArxivDownloadPdfTool(),
227
- ]
228
-
229
- # PDF analysis tools (if markdown is provided)
230
- PDF_ANALYSIS_TOOLS = [
231
- GetTableOfContentsTool(markdown),
232
- GetMarkdownTool(markdown),
233
- GetPagesContentTool(markdown),
234
- FindInMarkdownTool(markdown),
235
- ]
236
-
237
- all_tools = WEB_SEARCH_TOOLS + PDF_ANALYSIS_TOOLS
238
-
239
- web_search_agent = CodeAgent(
240
- model=model,
241
- tools=all_tools,
242
- max_steps=20,
243
- verbosity_level=2,
244
- planning_interval=4,
245
- name="web_search_agent_with_pdf_analysis",
246
- description="""A team member that can search the web, crawl URLs, download PDFs, and analyze the provided PDF document.""",
247
- additional_authorized_imports=["numpy", "matplotlib", "scipy", "sympy", "pandas", ],
248
- )
249
-
250
- web_search_agent.prompt_templates["managed_agent"]["task"] += """
251
- You can search the web using various APIs (Linkup, arXiv, PubMed, ScienceDirect).
252
- You can crawl URLs to extract markdown content.
253
- You can download PDFs from URLs or arXiv and store them in the data/pdfs directory.
254
- You can analyze the provided PDF document using the markdown analysis tools.
255
- """
256
-
257
  return web_search_agent
 
1
+ from smolagents import CodeAgent, Tool, LiteLLMModel, tool, ToolCallingAgent
2
  from deepengineer.webcrawler.async_search import (
3
  linkup_search_async, arxiv_search_async,
4
  pubmed_search_async, scientific_search_async,
 
8
  from enum import Enum
9
  import asyncio
10
  from deepengineer.webcrawler.async_search import SearchResponse
11
+ from deepengineer.webcrawler.crawl_database import DataBase
12
 
13
  class ToolNames(Enum):
14
  # Search tools
 
115
  }
116
  output_type = "string"
117
 
118
+ def __init__(self, database: DataBase):
119
  super().__init__()
120
+ self.database: DataBase = database
 
121
 
122
  def forward(self, url: str) -> str:
123
+ markdown = self.database.get_markdown_of_url(url)
124
+ table_of_contents: str = get_table_of_contents_per_page_markdown(markdown)
125
+ return table_of_contents
126
 
127
  class GetMarkdownTool(Tool):
128
  name = ToolNames.GET_MARKDOWN.value
129
+ description = f"Returns in markdown entire content of the url. Beware this might be too long to be useful, except for small documents, use {ToolNames.GET_PAGES_CONTENT.value} instead. You can also use {ToolNames.GET_TABLE_OF_CONTENTS.value} first to get the table of contents of the document including the number of pages."
130
+ inputs = {
131
+ "url": {
132
+ "type": "string",
133
+ "description": "The URL to get the markdown of."
134
+ }
135
+ }
136
  output_type = "string"
137
 
138
+ def __init__(self, database: DataBase):
139
  super().__init__()
140
+ self.database: DataBase = database
 
141
 
142
+ def forward(self, url: str) -> str:
143
+ markdown = self.database.get_markdown_of_url(url)
144
+ markdown_content: str = convert_ocr_response_to_markdown(markdown)
145
+ return markdown_content
146
 
147
  class GetPagesContentTool(Tool):
148
  name = ToolNames.GET_PAGES_CONTENT.value
149
+ description = f"Returns the content of the pages. You can use {ToolNames.GET_TABLE_OF_CONTENTS.value} to get the table of contents of the url including the number of pages. Expects a list of page numbers as integers as input. {URL_EXPLAINATION}"
150
  inputs = {
151
+ "url": {
152
+ "type": "string",
153
+ "description": "The URL to get the content of."
154
+ },
155
  "page_numbers": {
156
  "type": "array",
157
  "description": "The page numbers to get the content of."
 
159
  }
160
  output_type = "string"
161
 
162
+ def __init__(self, database: DataBase):
163
  super().__init__()
164
+ self.database: DataBase = database
165
 
166
+ def forward(self, url: str, page_numbers: list[int]) -> str:
167
+ markdown = self.database.get_markdown_of_url(url)
168
+ return get_markdown_by_page_numbers(markdown, page_numbers)
169
 
170
  class FindInMarkdownTool(Tool):
171
  name = ToolNames.FIND_IN_MARKDOWN.value
172
+ description = f"Finds the page numbers of the url that contain the search queries. If you are looking for a specific information, you can use this tool to find the page numbers of the url that contain the information and then use {ToolNames.GET_PAGES_CONTENT.value} to get the content of the pages. {URL_EXPLAINATION}"
173
  inputs = {
174
+ "url": {
175
+ "type": "string",
176
+ "description": "The URL to find in."
177
+ },
178
  "search_queries": {
179
  "type": "array",
180
+ "description": "The search queries to find in the url. List of strings."
181
  }
182
  }
183
  output_type = "array"
184
 
185
+ def __init__(self, database: DataBase):
186
  super().__init__()
187
+ self.database: DataBase = database
188
 
189
+ def forward(self, url: str, search_queries: list[str]) -> list[int]:
190
+ markdown = self.database.get_markdown_of_url(url)
191
+ return find_in_markdown(markdown, search_queries)
192
 
193
  def create_web_search_agent(model_id="deepseek/deepseek-chat"):
194
  """Create a web search agent with search, crawling, and PDF analysis capabilities."""
 
197
 
198
  # Web search and crawling tools
199
  WEB_SEARCH_TOOLS = [
200
+ SearchTool(),
 
201
  ArxivSearchTool(),
202
  PubmedSearchTool(),
203
  ScientificSearchTool(),
204
+ GetTableOfContentsTool(),
205
+ GetMarkdownTool(),
206
+ GetPagesContentTool(),
207
+ FindInMarkdownTool(),
208
  ]
209
 
210
+ web_search_agent = ToolCallingAgent(
211
  model=model,
212
  tools=WEB_SEARCH_TOOLS,
213
  max_steps=20,
 
217
  description="""A team member that can search the web, crawl URLs, download PDFs, and analyze documents.""",
218
  )
219
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
220
  return web_search_agent
tests/webcrawler/{test_pdf_agent.py → deepsearch/test_pdf_agent.py} RENAMED
File without changes
tests/webcrawler/test_sync_search_speed.py DELETED
@@ -1,75 +0,0 @@
1
- from deepengineer.webcrawler.async_search import linkup_search_async, SearchResponse, SearchResult, ScientificDomains
2
- from linkup import LinkupClient, LinkupSourcedAnswer
3
- from typing import Literal
4
- import time
5
- import asyncio
6
-
7
- def _linkup_search_sync(
8
- search_query: str,
9
- depth: Literal["standard", "deep"] = "standard",
10
- output_type: Literal['searchResults', 'sourcedAnswer', 'structured'] = "sourcedAnswer",
11
- include_images: bool = False,
12
- include_domains: list[ScientificDomains] = None,
13
-
14
- ) -> SearchResponse:
15
- client = LinkupClient()
16
- search_response: LinkupSourcedAnswer = client.search(
17
- query=search_query,
18
- depth=depth,
19
- output_type=output_type,
20
- include_images=include_images,
21
- include_domains=include_domains,
22
- )
23
-
24
- search_results = [
25
- SearchResult(
26
- title=result.name,
27
- url=result.url,
28
- content=result.snippet,
29
- raw_content=None,
30
- )
31
- for result in search_response.sources
32
- ]
33
-
34
- # Convert to our Pydantic models
35
- responses: SearchResponse = SearchResponse(
36
- query=search_query,
37
- answer=search_response.answer,
38
- search_results=search_results
39
- )
40
- return responses
41
-
42
- def linkup_search_speed_test():
43
-
44
- """
45
-
46
- Conclusion: no need to rewrite the async version to sync version. It takes roughly 6 seconds in both cases
47
- """
48
-
49
- print("Testing linkup search speed asynchronously...")
50
- start_time = time.time()
51
- for i in range(5):
52
- start_loop_time = time.time()
53
- output = asyncio.run(linkup_search_async(
54
- search_query="Would it be possible to make a thermal reactor with graphite and lead?",
55
- ))
56
- print(output.answer[:10])
57
- end_loop_time = time.time()
58
- print(f"Time taken for loop {i}: {end_loop_time - start_loop_time} seconds")
59
-
60
-
61
- print("Testing linkup search speed syncronoulsy...")
62
- start_time = time.time()
63
- for i in range(5):
64
- start_loop_time = time.time()
65
- _linkup_search_sync(
66
- search_query="Would it be possible to make a thermal reactor with graphite and lead?",
67
- )
68
- end_loop_time = time.time()
69
- print(f"Time taken for loop {i}: {end_loop_time - start_loop_time} seconds")
70
-
71
- end_time = time.time()
72
- print(f"Total time taken: {end_time - start_time} seconds")
73
-
74
-
75
-