Spaces:
Runtime error
Runtime error
File size: 10,595 Bytes
b5fafa1 6c0aeb9 b5fafa1 7f58cad b5fafa1 7f58cad bb62e6b b5fafa1 6c0aeb9 0159aaf 7f58cad 892c58b 7f58cad 7c1f478 7f58cad b5fafa1 e040f4f 7f58cad b5fafa1 e040f4f 7c1f478 e040f4f 7f58cad 6c0aeb9 e040f4f b5fafa1 e040f4f 7f58cad b5fafa1 7f58cad 892c58b 0159aaf b5fafa1 7c1f478 0159aaf b5fafa1 7c1f478 7f58cad b5fafa1 6c0aeb9 7f58cad e040f4f 7f58cad b5fafa1 7f58cad 892c58b 0159aaf b5fafa1 7c1f478 0159aaf 7f58cad 7c1f478 7f58cad b5fafa1 6c0aeb9 7f58cad e040f4f 7f58cad b5fafa1 7f58cad 892c58b 0159aaf b5fafa1 7c1f478 0159aaf 7f58cad 7c1f478 7f58cad b5fafa1 6c0aeb9 7f58cad e040f4f 7f58cad b5fafa1 7f58cad b5fafa1 0159aaf 7f58cad 0159aaf 7f58cad 7c1f478 7f58cad b5fafa1 e040f4f 7f58cad b5fafa1 6c0aeb9 e040f4f 0159aaf e040f4f 7f58cad b5fafa1 7f58cad b5fafa1 0159aaf bb62e6b b5fafa1 e040f4f 0159aaf bb62e6b 7f58cad b5fafa1 6c0aeb9 7f58cad bb62e6b b5fafa1 bb62e6b 7f58cad b5fafa1 0159aaf bb62e6b b5fafa1 bb62e6b 0159aaf bb62e6b 7f58cad b5fafa1 6c0aeb9 7f58cad bb62e6b 7f58cad b5fafa1 7f58cad b5fafa1 7f58cad b5fafa1 0159aaf bb62e6b 7f58cad bb62e6b 0159aaf bb62e6b 7f58cad b5fafa1 6c0aeb9 7f58cad bb62e6b 7f58cad b5fafa1 7f58cad b5fafa1 7f58cad b5fafa1 0159aaf bb62e6b b5fafa1 bb62e6b 0159aaf bb62e6b 7f58cad b5fafa1 dd7fa38 892c58b dd7fa38 7f58cad b5fafa1 7f58cad dd7fa38 7f58cad 0159aaf e003639 b5fafa1 7c1f478 e003639 7f58cad e003639 7f58cad 43d9fe2 e003639 b5fafa1 7f58cad |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 |
import asyncio
from enum import Enum
from smolagents import CodeAgent, LiteLLMModel
from deepengineer.webcrawler.async_search import (
SearchResponse,
arxiv_search_async,
linkup_search_async,
pubmed_search_async,
scientific_search_async,
)
from deepengineer.webcrawler.crawl_database import DataBase
from deepengineer.webcrawler.pdf_utils import (
convert_ocr_response_to_markdown,
find_in_markdown,
get_markdown_by_page_numbers,
get_table_of_contents_per_page_markdown,
)
from deepengineer.logging_tools import LoggingTool
import queue
class ToolNames(Enum):
    """Canonical string identifiers for every tool exposed to the web search agent.
    Centralizing the names in one Enum lets the tool classes set
    ``name = ToolNames.X.value`` and lets tool descriptions cross-reference
    each other without hard-coding duplicate strings.
    """
    # Search tools
    SEARCH_TOOL = "web_search_tool"
    ARXIV_SEARCH = "arxiv_search"
    PUBMED_SEARCH = "pubmed_search"
    # NOTE(review): no tool class in this file uses SCIENCEDIRECT_SEARCH —
    # confirm it is referenced elsewhere before assuming it is dead.
    SCIENCEDIRECT_SEARCH = "sciencedirect_search"
    SCIENTIFIC_SEARCH = "scientific_search"
    # Exploring link tools
    GET_TABLE_OF_CONTENTS = "get_table_of_contents_of_url"
    GET_MARKDOWN = "get_markdown_of_url"
    GET_PAGES_CONTENT = "get_pages_content"
    FIND_IN_MARKDOWN = "find_in_markdown"
def filter_search_results(
    search_response: SearchResponse, max_nb_results: int = 5
) -> str:
    """Keep only the first ``max_nb_results`` results and render the response.

    Note: ``search_response`` is mutated in place (its ``search_results``
    list is truncated) before being rendered.

    Args:
        search_response: The search response whose results are trimmed.
        max_nb_results: Maximum number of results to keep.

    Returns:
        The string rendering of the truncated response, as produced by
        ``search_response.to_string()``. (The previous ``-> SearchResponse``
        annotation was wrong: the function returns a string.)
    """
    search_response.search_results = search_response.search_results[:max_nb_results]
    return search_response.to_string()
class SearchTool(LoggingTool):
    """Agent tool: general-purpose web search through the Linkup API."""
    # name/description/inputs/output_type follow the smolagents Tool contract.
    name = ToolNames.SEARCH_TOOL.value
    description = """Search the web using Linkup API. Good for deep research with sourced answers.
    Linkup also provides an answer. This answer is not always correct, so you might want to check the sources.
    """
    inputs = {
        "search_query": {
            "type": "string",
            "description": "The search query to execute",
        },
    }
    # NOTE(review): forward() returns a str, but output_type is "object" —
    # confirm which one the smolagents runtime expects here.
    output_type = "object"
    def __init__(self, log_queue: queue.Queue | None = None):
        """Pass the optional log queue to the LoggingTool base class."""
        super().__init__(log_queue=log_queue)
    def forward(self, search_query: str) -> str:
        """Run the Linkup search synchronously and return the top results as a string."""
        self.push_log(f"π Searching web for: {search_query}")
        # asyncio.run drives the async client from this synchronous tool entry point.
        result = asyncio.run(
            linkup_search_async(
                search_query=search_query,
            )
        )
        return filter_search_results(result)
class ArxivSearchTool(LoggingTool):
    """Agent tool: search arXiv for papers and preprints through the Linkup API."""
    # name/description/inputs/output_type follow the smolagents Tool contract.
    name = ToolNames.ARXIV_SEARCH.value
    description = """Search arXiv for academic papers and preprints with Linkup API.
    Linkup also provides an answer. This answer is not always correct, so you might want to check the sources.
    """
    inputs = {
        "search_query": {
            "type": "string",
            "description": "The search query to execute on arXiv",
        }
    }
    # NOTE(review): forward() returns a str, but output_type is "object" —
    # confirm which one the smolagents runtime expects here.
    output_type = "object"
    def __init__(self, log_queue: queue.Queue | None = None):
        """Pass the optional log queue to the LoggingTool base class."""
        super().__init__(log_queue=log_queue)
    def forward(self, search_query: str) -> str:
        """Run the arXiv search synchronously and return the top results as a string."""
        self.push_log(f"π Searching arXiv for: {search_query}")
        result = asyncio.run(arxiv_search_async(search_query))
        return filter_search_results(result)
class PubmedSearchTool(LoggingTool):
    """Agent tool: search PubMed for medical/scientific literature through the Linkup API."""
    # name/description/inputs/output_type follow the smolagents Tool contract.
    name = ToolNames.PUBMED_SEARCH.value
    description = """Search PubMed for medical and scientific literature with Linkup API.
    Linkup also provides an answer. This answer is not always correct, so you might want to check the sources.
    """
    inputs = {
        "search_query": {
            "type": "string",
            "description": "The search query to execute on PubMed",
        }
    }
    # NOTE(review): forward() returns a str, but output_type is "object" —
    # confirm which one the smolagents runtime expects here.
    output_type = "object"
    def __init__(self, log_queue: queue.Queue | None = None):
        """Pass the optional log queue to the LoggingTool base class."""
        super().__init__(log_queue=log_queue)
    def forward(self, search_query: str) -> str:
        """Run the PubMed search synchronously and return the top results as a string."""
        self.push_log(f"π Searching PubMed for: {search_query}")
        result = asyncio.run(pubmed_search_async(search_query))
        return filter_search_results(result)
class ScientificSearchTool(LoggingTool):
    """Agent tool: combined search over Wikipedia, arXiv, PubMed, and ScienceDirect."""
    # name/description/inputs/output_type follow the smolagents Tool contract.
    name = ToolNames.SCIENTIFIC_SEARCH.value
    description = """Search across multiple scientific domains: Wikipedia, arXiv, PubMed, and ScienceDirect.
    Linkup also provides an answer. This answer is not always correct, so you might want to check the sources.
    """
    inputs = {
        "search_query": {
            "type": "string",
            "description": "The search query to execute across scientific domains",
        }
    }
    output_type = "object"
    def __init__(self, log_queue: queue.Queue | None = None):
        """Pass the optional log queue to the LoggingTool base class."""
        super().__init__(log_queue=log_queue)
    def forward(self, search_query: str) -> str:
        """Run the multi-domain search synchronously and return the top results.

        Fix: the return annotation previously said ``dict``, but the method
        returns the string produced by ``filter_search_results`` — exactly
        like every sibling search tool — so it is now annotated ``-> str``.
        """
        self.push_log(f"π Searching scientific domains for: {search_query}")
        result = asyncio.run(scientific_search_async(search_query))
        return filter_search_results(result)
URL_EXPLAINATION = """The URL can be be converted to a markdown. If the URL points to a PDF, the pdf is converted to markdown, otherwise the URL is crawled and the markdown is extracted. This markdown is split into pages that are numbered. You can use the page numbers to get the content of the pages."""
class GetTableOfContentsTool(LoggingTool):
    """Agent tool: list a document's titles (by URL) along with their page numbers."""
    # name/description/inputs/output_type follow the smolagents Tool contract.
    name = ToolNames.GET_TABLE_OF_CONTENTS.value
    description = f"""Returns all of the titles in the url along with the page number they are on.
    {URL_EXPLAINATION}
    """
    inputs = {
        "url": {
            "type": "string",
            "description": "The URL to get the table of contents of.",
        }
    }
    output_type = "string"
    def __init__(self, database: DataBase, log_queue: queue.Queue | None = None):
        """Keep a reference to the crawl database; presumably it fetches/caches
        per-URL markdown — TODO confirm against DataBase.get_markdown_of_url."""
        super().__init__(log_queue=log_queue)
        self.database: DataBase = database
    def forward(self, url: str) -> str:
        """Fetch the URL's markdown from the database and build a per-page table of contents."""
        self.push_log(f"π Getting table of contents for url: {url}")
        markdown = self.database.get_markdown_of_url(url)
        table_of_contents: str = get_table_of_contents_per_page_markdown(markdown)
        return table_of_contents
class GetMarkdownTool(LoggingTool):
    """Agent tool: return the full markdown content of a URL's document."""
    # name/description/inputs/output_type follow the smolagents Tool contract.
    name = ToolNames.GET_MARKDOWN.value
    description = f"Returns in markdown entire content of the url. Beware this might be too long to be useful, except for small documents, use {ToolNames.GET_PAGES_CONTENT.value} instead. You can also use {ToolNames.GET_TABLE_OF_CONTENTS.value} first to get the table of contents of the document including the number of pages."
    inputs = {
        "url": {"type": "string", "description": "The URL to get the markdown of."}
    }
    output_type = "string"
    def __init__(self, database: DataBase, log_queue: queue.Queue | None = None):
        """Keep a reference to the crawl database used to resolve URLs to markdown."""
        super().__init__(log_queue=log_queue)
        self.database: DataBase = database
    def forward(self, url: str) -> str:
        """Resolve the URL via the database and render its full content as one string."""
        self.push_log(f"π Getting markdown for url: {url}")
        # NOTE(review): despite the local name, get_markdown_of_url apparently
        # returns an OCR response object (it is fed to
        # convert_ocr_response_to_markdown below) — confirm.
        markdown = self.database.get_markdown_of_url(url)
        markdown_content: str = convert_ocr_response_to_markdown(markdown)
        return markdown_content
class GetPagesContentTool(LoggingTool):
    """Agent tool: return the content of specific pages of a URL's document."""
    # name/description/inputs/output_type follow the smolagents Tool contract.
    name = ToolNames.GET_PAGES_CONTENT.value
    description = f"Returns the content of the pages. You can use {ToolNames.GET_TABLE_OF_CONTENTS.value} to get the table of contents of the url including the number of pages. Expects a list of page numbers as integers as input. {URL_EXPLAINATION}"
    inputs = {
        "url": {"type": "string", "description": "The URL to get the content of."},
        "page_numbers": {
            "type": "array",
            "description": "The page numbers to get the content of.",
        },
    }
    output_type = "string"
    def __init__(self, database: DataBase, log_queue: queue.Queue | None = None):
        """Keep a reference to the crawl database used to resolve URLs to markdown."""
        super().__init__(log_queue=log_queue)
        self.database: DataBase = database
    def forward(self, url: str, page_numbers: list[int]) -> str:
        """Resolve the URL via the database and return only the requested pages."""
        self.push_log(f"π Getting content of pages {page_numbers} for url: {url}")
        markdown = self.database.get_markdown_of_url(url)
        return get_markdown_by_page_numbers(markdown, page_numbers)
class FindInMarkdownTool(LoggingTool):
    """Agent tool: find which pages of a URL's document contain given search strings."""
    # name/description/inputs/output_type follow the smolagents Tool contract.
    name = ToolNames.FIND_IN_MARKDOWN.value
    description = f"Finds the page numbers of the url that contain the search queries. If you are looking for a specific information, you can use this tool to find the page numbers of the url that contain the information and then use {ToolNames.GET_PAGES_CONTENT.value} to get the content of the pages. {URL_EXPLAINATION}"
    inputs = {
        "url": {"type": "string", "description": "The URL to find in."},
        "search_queries": {
            "type": "array",
            "description": "The search queries to find in the url. List of strings.",
        },
    }
    output_type = "array"
    def __init__(self, database: DataBase, log_queue: queue.Queue | None = None):
        """Keep a reference to the crawl database used to resolve URLs to markdown."""
        super().__init__(log_queue=log_queue)
        self.database: DataBase = database
    def forward(self, url: str, search_queries: list[str]) -> list[int]:
        """Resolve the URL via the database and return matching page numbers."""
        self.push_log(f"π Finding {search_queries} in url: {url}")
        markdown = self.database.get_markdown_of_url(url)
        return find_in_markdown(markdown, search_queries)
def create_web_search_agent(
    model_id: str = "deepseek/deepseek-reasoner",
    database: DataBase | None = None,
    log_queue: queue.Queue | None = None,
) -> CodeAgent:
    """Create a web search agent with search, crawling, and PDF analysis capabilities.

    Args:
        model_id: LiteLLM model identifier backing the agent.
        database: Shared crawl database for the document-exploration tools;
            a fresh ``DataBase()`` is created when omitted.
        log_queue: Optional queue that receives progress messages from every tool.

    Returns:
        A configured ``CodeAgent`` named ``web_search_agent``.
    """
    model = LiteLLMModel(model_id=model_id)
    if database is None:
        database = DataBase()
    # Web search tools first, then the document-exploration tools, which all
    # share the same crawl database. (Renamed from UPPER_SNAKE_CASE: this is
    # a local variable, not a module-level constant.)
    tools = [
        SearchTool(log_queue=log_queue),
        ArxivSearchTool(log_queue=log_queue),
        PubmedSearchTool(log_queue=log_queue),
        ScientificSearchTool(log_queue=log_queue),
        GetTableOfContentsTool(database=database, log_queue=log_queue),
        GetMarkdownTool(database=database, log_queue=log_queue),
        GetPagesContentTool(database=database, log_queue=log_queue),
        FindInMarkdownTool(database=database, log_queue=log_queue),
    ]
    web_search_agent = CodeAgent(
        model=model,
        tools=tools,
        max_steps=20,
        verbosity_level=2,
        planning_interval=4,
        name="web_search_agent",
        description="""A team member that will search the internet to answer your question.
    Ask him for all your questions that require browsing the web. It can also search arXiv, PubMed, and ScienceDirect, download the documents and extract the relevant information.
    Provide him as much context as possible, especially if you need to search on a specific website!
    And don't hesitate to provide him with a complex search task.
    Your request must be a real sentence, not a google search! Like "Find me this information (...)" rather than a few keywords.""",
    )
    return web_search_agent
|