File size: 10,595 Bytes
b5fafa1
 
 
6c0aeb9
b5fafa1
7f58cad
b5fafa1
 
 
 
 
7f58cad
bb62e6b
b5fafa1
 
 
 
 
 
6c0aeb9
0159aaf
7f58cad
892c58b
7f58cad
 
7c1f478
7f58cad
 
 
 
b5fafa1
e040f4f
 
 
7f58cad
 
 
b5fafa1
 
 
 
e040f4f
7c1f478
e040f4f
7f58cad
6c0aeb9
e040f4f
b5fafa1
e040f4f
 
7f58cad
 
 
b5fafa1
7f58cad
 
 
892c58b
0159aaf
 
b5fafa1
7c1f478
0159aaf
b5fafa1
 
 
 
 
7c1f478
7f58cad
b5fafa1
6c0aeb9
7f58cad
e040f4f
 
 
7f58cad
 
 
b5fafa1
7f58cad
 
 
892c58b
0159aaf
 
b5fafa1
7c1f478
0159aaf
7f58cad
7c1f478
7f58cad
b5fafa1
6c0aeb9
7f58cad
e040f4f
 
 
7f58cad
 
 
b5fafa1
7f58cad
 
 
892c58b
0159aaf
 
b5fafa1
7c1f478
0159aaf
7f58cad
7c1f478
7f58cad
b5fafa1
6c0aeb9
7f58cad
e040f4f
 
 
7f58cad
 
 
b5fafa1
7f58cad
 
 
b5fafa1
0159aaf
 
 
7f58cad
0159aaf
7f58cad
7c1f478
7f58cad
b5fafa1
e040f4f
7f58cad
b5fafa1
6c0aeb9
e040f4f
0159aaf
e040f4f
 
7f58cad
 
 
b5fafa1
7f58cad
 
 
b5fafa1
0159aaf
 
bb62e6b
b5fafa1
e040f4f
0159aaf
bb62e6b
 
 
7f58cad
b5fafa1
6c0aeb9
7f58cad
bb62e6b
 
b5fafa1
bb62e6b
7f58cad
b5fafa1
0159aaf
 
bb62e6b
b5fafa1
bb62e6b
0159aaf
bb62e6b
 
 
7f58cad
b5fafa1
6c0aeb9
7f58cad
bb62e6b
7f58cad
b5fafa1
7f58cad
 
b5fafa1
7f58cad
 
 
b5fafa1
0159aaf
 
bb62e6b
7f58cad
bb62e6b
0159aaf
bb62e6b
 
7f58cad
b5fafa1
6c0aeb9
7f58cad
bb62e6b
7f58cad
b5fafa1
7f58cad
 
b5fafa1
 
7f58cad
 
b5fafa1
0159aaf
 
bb62e6b
b5fafa1
bb62e6b
0159aaf
bb62e6b
 
7f58cad
b5fafa1
dd7fa38
892c58b
 
 
dd7fa38
7f58cad
b5fafa1
7f58cad
dd7fa38
 
7f58cad
 
 
0159aaf
 
 
 
 
 
 
 
e003639
b5fafa1
7c1f478
e003639
7f58cad
e003639
 
 
7f58cad
43d9fe2
 
 
 
 
e003639
b5fafa1
7f58cad
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
import asyncio
from enum import Enum

from smolagents import CodeAgent, LiteLLMModel

from deepengineer.webcrawler.async_search import (
    SearchResponse,
    arxiv_search_async,
    linkup_search_async,
    pubmed_search_async,
    scientific_search_async,
)
from deepengineer.webcrawler.crawl_database import DataBase
from deepengineer.webcrawler.pdf_utils import (
    convert_ocr_response_to_markdown,
    find_in_markdown,
    get_markdown_by_page_numbers,
    get_table_of_contents_per_page_markdown,
)
from deepengineer.logging_tools import LoggingTool
import queue


class ToolNames(Enum):
    # Search tools
    SEARCH_TOOL = "web_search_tool"
    ARXIV_SEARCH = "arxiv_search"
    PUBMED_SEARCH = "pubmed_search"
    SCIENCEDIRECT_SEARCH = "sciencedirect_search"
    SCIENTIFIC_SEARCH = "scientific_search"

    # Exploring link tools
    GET_TABLE_OF_CONTENTS = "get_table_of_contents_of_url"
    GET_MARKDOWN = "get_markdown_of_url"
    GET_PAGES_CONTENT = "get_pages_content"
    FIND_IN_MARKDOWN = "find_in_markdown"


def filter_search_results(
    search_response: SearchResponse, max_nb_results: int = 5
) -> SearchResponse:
    search_response.search_results = search_response.search_results[:max_nb_results]
    return search_response.to_string()


class SearchTool(LoggingTool):
    name = ToolNames.SEARCH_TOOL.value
    description = """Search the web using Linkup API. Good for deep research with sourced answers.
    Linkup also provides an answer. This answer is not always correct, so you might want to check the sources.
    """
    inputs = {
        "search_query": {
            "type": "string",
            "description": "The search query to execute",
        },
    }
    output_type = "object"

    def __init__(self, log_queue: queue.Queue | None = None):
        super().__init__(log_queue=log_queue)

    def forward(self, search_query: str) -> str:
        self.push_log(f"πŸ” Searching web for: {search_query}")
        result = asyncio.run(
            linkup_search_async(
                search_query=search_query,
            )
        )
        return filter_search_results(result)


class ArxivSearchTool(LoggingTool):
    name = ToolNames.ARXIV_SEARCH.value
    description = """Search arXiv for academic papers and preprints with Linkup API.
    Linkup also provides an answer. This answer is not always correct, so you might want to check the sources.
    """
    inputs = {
        "search_query": {
            "type": "string",
            "description": "The search query to execute on arXiv",
        }
    }
    output_type = "object"

    def __init__(self, log_queue: queue.Queue | None = None):
        super().__init__(log_queue=log_queue)

    def forward(self, search_query: str) -> str:
        self.push_log(f"πŸ” Searching arXiv for: {search_query}")
        result = asyncio.run(arxiv_search_async(search_query))
        return filter_search_results(result)


class PubmedSearchTool(LoggingTool):
    name = ToolNames.PUBMED_SEARCH.value
    description = """Search PubMed for medical and scientific literature with Linkup API.
    Linkup also provides an answer. This answer is not always correct, so you might want to check the sources.
    """
    inputs = {
        "search_query": {
            "type": "string",
            "description": "The search query to execute on PubMed",
        }
    }
    output_type = "object"

    def __init__(self, log_queue: queue.Queue | None = None):
        super().__init__(log_queue=log_queue)

    def forward(self, search_query: str) -> str:
        self.push_log(f"πŸ” Searching PubMed for: {search_query}")
        result = asyncio.run(pubmed_search_async(search_query))
        return filter_search_results(result)


class ScientificSearchTool(LoggingTool):
    name = ToolNames.SCIENTIFIC_SEARCH.value
    description = """Search across multiple scientific domains: Wikipedia, arXiv, PubMed, and ScienceDirect.
    Linkup also provides an answer. This answer is not always correct, so you might want to check the sources.
    """
    inputs = {
        "search_query": {
            "type": "string",
            "description": "The search query to execute across scientific domains",
        }
    }
    output_type = "object"

    def __init__(self, log_queue: queue.Queue | None = None):
        super().__init__(log_queue=log_queue)

    def forward(self, search_query: str) -> dict:
        self.push_log(f"πŸ” Searching scientific domains for: {search_query}")
        result = asyncio.run(scientific_search_async(search_query))
        return filter_search_results(result)


URL_EXPLAINATION = """The URL can be be converted to a markdown. If the URL points to a PDF, the pdf is converted to markdown, otherwise the URL is crawled and the markdown is extracted. This markdown is split into pages that are numbered. You can use the page numbers to get the content of the pages."""


class GetTableOfContentsTool(LoggingTool):
    name = ToolNames.GET_TABLE_OF_CONTENTS.value
    description = f"""Returns all of the titles in the url along with the page number they are on.
    {URL_EXPLAINATION}
    """
    inputs = {
        "url": {
            "type": "string",
            "description": "The URL to get the table of contents of.",
        }
    }
    output_type = "string"

    def __init__(self, database: DataBase, log_queue: queue.Queue | None = None):
        super().__init__(log_queue=log_queue)
        self.database: DataBase = database

    def forward(self, url: str) -> str:
        self.push_log(f"πŸ” Getting table of contents for url: {url}")
        markdown = self.database.get_markdown_of_url(url)
        table_of_contents: str = get_table_of_contents_per_page_markdown(markdown)
        return table_of_contents


class GetMarkdownTool(LoggingTool):
    name = ToolNames.GET_MARKDOWN.value
    description = f"Returns in markdown entire content of the url. Beware this might be too long to be useful, except for small documents, use {ToolNames.GET_PAGES_CONTENT.value} instead. You can also use {ToolNames.GET_TABLE_OF_CONTENTS.value} first to get the table of contents of the document including the number of pages."
    inputs = {
        "url": {"type": "string", "description": "The URL to get the markdown of."}
    }
    output_type = "string"

    def __init__(self, database: DataBase, log_queue: queue.Queue | None = None):
        super().__init__(log_queue=log_queue)
        self.database: DataBase = database

    def forward(self, url: str) -> str:
        self.push_log(f"πŸ” Getting markdown for url: {url}")
        markdown = self.database.get_markdown_of_url(url)
        markdown_content: str = convert_ocr_response_to_markdown(markdown)
        return markdown_content


class GetPagesContentTool(LoggingTool):
    name = ToolNames.GET_PAGES_CONTENT.value
    description = f"Returns the content of the pages. You can use {ToolNames.GET_TABLE_OF_CONTENTS.value} to get the table of contents of the url including the number of pages. Expects a list of page numbers as integers as input. {URL_EXPLAINATION}"
    inputs = {
        "url": {"type": "string", "description": "The URL to get the content of."},
        "page_numbers": {
            "type": "array",
            "description": "The page numbers to get the content of.",
        },
    }
    output_type = "string"

    def __init__(self, database: DataBase, log_queue: queue.Queue | None = None):
        super().__init__(log_queue=log_queue)
        self.database: DataBase = database

    def forward(self, url: str, page_numbers: list[int]) -> str:
        self.push_log(f"πŸ” Getting content of pages {page_numbers} for url: {url}")
        markdown = self.database.get_markdown_of_url(url)
        return get_markdown_by_page_numbers(markdown, page_numbers)


class FindInMarkdownTool(LoggingTool):
    name = ToolNames.FIND_IN_MARKDOWN.value
    description = f"Finds the page numbers of the url that contain the search queries. If you are looking for a specific information, you can use this tool to find the page numbers of the url that contain the information and then use {ToolNames.GET_PAGES_CONTENT.value} to get the content of the pages. {URL_EXPLAINATION}"
    inputs = {
        "url": {"type": "string", "description": "The URL to find in."},
        "search_queries": {
            "type": "array",
            "description": "The search queries to find in the url. List of strings.",
        },
    }
    output_type = "array"

    def __init__(self, database: DataBase, log_queue: queue.Queue | None = None):
        super().__init__(log_queue=log_queue)
        self.database: DataBase = database

    def forward(self, url: str, search_queries: list[str]) -> list[int]:
        self.push_log(f"πŸ” Finding {search_queries} in url: {url}")
        markdown = self.database.get_markdown_of_url(url)
        return find_in_markdown(markdown, search_queries)


def create_web_search_agent(
    model_id="deepseek/deepseek-reasoner",
    database: DataBase | None = None,
    log_queue: queue.Queue | None = None,
):
    """Create a web search agent with search, crawling, and PDF analysis capabilities."""

    model = LiteLLMModel(model_id=model_id)
    if database is None:
        database = DataBase()

    # Web search and crawling tools
    WEB_SEARCH_TOOLS = [
        SearchTool(log_queue=log_queue),
        ArxivSearchTool(log_queue=log_queue),
        PubmedSearchTool(log_queue=log_queue),
        ScientificSearchTool(log_queue=log_queue),
        GetTableOfContentsTool(database=database, log_queue=log_queue),
        GetMarkdownTool(database=database, log_queue=log_queue),
        GetPagesContentTool(database=database, log_queue=log_queue),
        FindInMarkdownTool(database=database, log_queue=log_queue),
    ]

    web_search_agent = CodeAgent(
        model=model,
        tools=WEB_SEARCH_TOOLS,
        max_steps=20,
        verbosity_level=2,
        planning_interval=4,
        name="web_search_agent",
        description="""A team member that will search the internet to answer your question.
    Ask him for all your questions that require browsing the web. It can also search arXiv, PubMed, and ScienceDirect, download the documents and extract the relevant information.
    Provide him as much context as possible, especially if you need to search on a specific website!
    And don't hesitate to provide him with a complex search task.
    Your request must be a real sentence, not a google search! Like "Find me this information (...)" rather than a few keywords.""",
    )

    return web_search_agent