Charles Azam committed on
Commit
e003639
·
1 Parent(s): 1adf00c

feat: init search agents

Browse files
src/deepengineer/deepsearch/analyse_markdown_agent.py ADDED
@@ -0,0 +1,52 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
def create_agent(model_id="o1"):
    """Build a manager ``CodeAgent`` that delegates web browsing to a managed search agent.

    Args:
        model_id: Model identifier stored in ``model_params``.
            NOTE(review): ``model_params`` is never handed to ``LiteLLMModel`` below, so this
            argument currently has no effect on the model that is actually used -- confirm
            whether ``LiteLLMModel(**model_params)`` was intended.

    Returns:
        The manager ``CodeAgent``, with the ``search_agent`` tool-calling agent registered
        as its only managed agent.
    """
    # NOTE(review): built but unused; kept as-is to preserve current behaviour.
    model_params = dict(
        model_id=model_id,
        custom_role_conversions=custom_role_conversions,
        max_completion_tokens=8192,
    )
    if model_id == "o1":
        model_params.update(reasoning_effort="high")

    # The model is hard-coded, independent of ``model_id``.
    llm = LiteLLMModel(model_id="deepseek/deepseek-chat")

    max_text_chars = 100000
    text_browser = SimpleTextBrowser(**BROWSER_CONFIG)

    # Search tool, browser navigation tools sharing one browser instance, and a text inspector.
    browsing_tools = [
        GoogleSearchTool(provider="serper"),
        VisitTool(text_browser),
        PageUpTool(text_browser),
        PageDownTool(text_browser),
        FinderTool(text_browser),
        FindNextTool(text_browser),
        ArchiveSearchTool(text_browser),
        TextInspectorTool(llm, max_text_chars),
    ]

    search_agent_description = """A team member that will search the internet to answer your question.
Ask him for all your questions that require browsing the web.
Provide him as much context as possible, in particular if you need to search on a specific timeframe!
And don't hesitate to provide him with a complex search task, like finding a difference between two webpages.
Your request must be a real sentence, not a google search! Like "Find me this information (...)" rather than a few keywords.
"""
    search_agent = ToolCallingAgent(
        model=llm,
        tools=browsing_tools,
        max_steps=20,
        verbosity_level=2,
        planning_interval=4,
        name="search_agent",
        description=search_agent_description,
        provide_run_summary=True,
    )

    # Extra guidance appended to the task template the search agent receives when it is managed.
    extra_task_guidance = """You can navigate to .txt online files.
If a non-html page is in another format, especially .pdf or a Youtube video, use tool 'inspect_file_as_text' to inspect it.
Additionally, if after some searching you find out that you need more information to answer the question, you can use `final_answer` with your request for clarification as argument to request for more information."""
    search_agent.prompt_templates["managed_agent"]["task"] += extra_task_guidance

    return CodeAgent(
        model=llm,
        tools=[visualizer, TextInspectorTool(llm, max_text_chars)],
        max_steps=12,
        verbosity_level=2,
        additional_authorized_imports=["*"],
        planning_interval=4,
        managed_agents=[search_agent],
    )
src/deepengineer/deepsearch/draw_agent.py ADDED
@@ -0,0 +1,158 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from io import BytesIO
2
+ from time import sleep
3
+
4
+ import helium
5
+ from dotenv import load_dotenv
6
+ from PIL import Image
7
+ from selenium import webdriver
8
+ from selenium.webdriver.common.by import By
9
+ from selenium.webdriver.common.keys import Keys
10
+
11
+ from smolagents import CodeAgent, tool
12
+ from smolagents.agents import ActionStep
13
+
14
+ # Load environment variables
15
+ load_dotenv()
16
+
17
@tool
def search_item_ctrl_f(text: str, nth_result: int = 1) -> str:
    """
    Searches for text on the current page via Ctrl + F and jumps to the nth occurrence.
    Args:
        text: The text to search for
        nth_result: Which occurrence to jump to (default: 1)
    """
    # NOTE: the docstring above is left untouched because the @tool decorator consumes it.
    # nth_result is 1-based; 0 or a negative value would otherwise index from the END of the
    # match list and silently focus the wrong element.
    if nth_result < 1:
        raise Exception(f"Match n°{nth_result} is invalid: nth_result must be 1 or greater")

    # XPath 1.0 string literals have no escape character, so pick a quoting style that the
    # quotes inside `text` cannot break; when both quote kinds occur, stitch the pieces
    # together with concat().
    if "'" not in text:
        xpath_literal = f"'{text}'"
    elif '"' not in text:
        xpath_literal = f'"{text}"'
    else:
        quoted_parts = [f"'{part}'" for part in text.split("'")]
        xpath_literal = "concat(" + ", \"'\", ".join(quoted_parts) + ")"

    elements = driver.find_elements(By.XPATH, f"//*[contains(text(), {xpath_literal})]")
    if nth_result > len(elements):
        raise Exception(f"Match n°{nth_result} not found (only {len(elements)} matches found)")
    result = f"Found {len(elements)} matches for '{text}'."
    elem = elements[nth_result - 1]
    # Scroll the chosen match into the viewport.
    driver.execute_script("arguments[0].scrollIntoView(true);", elem)
    result += f"Focused on element {nth_result} of {len(elements)}"
    return result
33
+
34
@tool
def go_back() -> None:
    """Goes back to previous page."""
    # Step one entry back in the history of the module-level browser driver.
    driver.back()
    return None
38
+
39
@tool
def close_popups() -> str:
    """
    Closes any visible modal or pop-up on the page. Use this to dismiss pop-up windows!
    This does not work on cookie consent banners.
    """
    # Dismissal is done by sending the Escape key to the page rather than by locating a close button.
    webdriver.ActionChains(driver).send_keys(Keys.ESCAPE).perform()
    # The signature promises a string, so return a short confirmation instead of None.
    return "Sent the Escape key to the page to dismiss pop-ups."
46
+
47
+
48
# Configure Chrome options (flags are applied in the order listed)
chrome_options = webdriver.ChromeOptions()
for chrome_flag in (
    "--force-device-scale-factor=1",
    "--window-size=1000,1350",
    "--disable-pdf-viewer",
    "--window-position=0,0",
):
    chrome_options.add_argument(chrome_flag)

# Initialize the browser: a visible (non-headless) Chrome window driven through helium
driver = helium.start_chrome(options=chrome_options, headless=False)
57
+
58
+ # Set up screenshot callback
59
def save_screenshot(memory_step: ActionStep, agent: CodeAgent) -> None:
    """Attach a fresh browser screenshot and the current URL to ``memory_step``.

    Screenshots stored on action steps that are two or more steps older than the current
    one are dropped first. When helium reports no active driver the step is left untouched.

    Args:
        memory_step: The step being recorded; its ``observations_images`` and
            ``observations`` attributes are updated in place.
        agent: The agent whose ``memory.steps`` are scanned for older screenshots.
    """
    sleep(1.0)  # Let JavaScript animations happen before taking the screenshot
    driver = helium.get_driver()
    if driver is None:
        # No browser session: nothing to capture, and reading driver.current_url would fail.
        return

    current_step = memory_step.step_number
    # Remove previous screenshots for lean processing
    for previous_memory_step in agent.memory.steps:
        if isinstance(previous_memory_step, ActionStep) and previous_memory_step.step_number <= current_step - 2:
            previous_memory_step.observations_images = None

    png_bytes = driver.get_screenshot_as_png()
    image = Image.open(BytesIO(png_bytes))
    print(f"Captured a browser screenshot: {image.size} pixels")
    memory_step.observations_images = [image.copy()]  # Create a copy to ensure it persists

    # Update observations with current URL
    url_info = f"Current url: {driver.current_url}"
    memory_step.observations = (
        url_info if memory_step.observations is None else memory_step.observations + "\n" + url_info
    )
77
+
78
# NOTE(review): this import sits mid-file; by convention it belongs with the imports at the top.
from smolagents import InferenceClientModel

# Initialize the model
model_id = "Qwen/Qwen2-VL-72B-Instruct"  # You can change this to your preferred VLM model
model = InferenceClientModel(model_id=model_id)

# Create the agent: it receives the three browser tools, is allowed to import helium in the
# code it writes, and has save_screenshot registered as a step callback.
agent = CodeAgent(
    tools=[go_back, close_popups, search_item_ctrl_f],
    model=model,
    additional_authorized_imports=["helium"],
    step_callbacks=[save_screenshot],
    max_steps=20,
    verbosity_level=2,
)

# Import helium for the agent, so the generated code can call helium functions directly
agent.python_executor("from helium import *", agent.state)
96
+
97
# Usage notes for helium that are appended to every request sent to the agent.
helium_instructions = """
You can use helium to access websites. Don't bother about the helium driver, it's already managed.
We've already ran "from helium import *"
Then you can go to pages!
Code:
```py
go_to('github.com/trending')
```<end_code>

You can directly click clickable elements by inputting the text that appears on them.
Code:
```py
click("Top products")
```<end_code>

If it's a link:
Code:
```py
click(Link("Top products"))
```<end_code>

If you try to interact with an element and it's not found, you'll get a LookupError.
In general stop your action after each button click to see what happens on your screenshot.
Never try to login in a page.

To scroll up or down, use scroll_down or scroll_up with as an argument the number of pixels to scroll from.
Code:
```py
scroll_down(num_pixels=1200) # This will scroll one viewport down
```<end_code>

When you have pop-ups with a cross icon to close, don't try to click the close icon by finding its element or targeting an 'X' element (this most often fails).
Just use your built-in tool `close_popups` to close them:
Code:
```py
close_popups()
```<end_code>

You can use .exists() to check for the existence of an element. For example:
Code:
```py
if Text('Accept cookies?').exists():
    click('I accept')
```<end_code>
"""
142
+
143
search_request = """
Please navigate to https://en.wikipedia.org/wiki/Chicago and give me a sentence containing the word "1992" that mentions a construction accident.
"""

github_request = """
I'm trying to find how hard I have to work to get a repo in github.com/trending.
Can you navigate to the profile for the top author of the top trending repo, and give me their total number of commits over the last year?
"""

# Run both demo tasks in order on the same agent; each prompt is the request text followed
# by the helium usage notes.
for demo_request in (search_request, github_request):
    agent_output = agent.run(demo_request + helium_instructions)
    print("Final output:")
    print(agent_output)
src/deepengineer/deepsearch/main_agent.py ADDED
File without changes
src/deepengineer/deepsearch/scawl_web_agent.py ADDED
@@ -0,0 +1,52 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
def create_agent(model_id="o1"):
    """Build a manager ``CodeAgent`` that delegates web browsing to a managed search agent.

    Args:
        model_id: Model identifier stored in ``model_params``.
            NOTE(review): ``model_params`` is never handed to ``LiteLLMModel`` below, so this
            argument currently has no effect on the model that is actually used -- confirm
            whether ``LiteLLMModel(**model_params)`` was intended.

    Returns:
        The manager ``CodeAgent``, with the ``search_agent`` tool-calling agent registered
        as its only managed agent.
    """
    # NOTE(review): built but unused; kept as-is to preserve current behaviour.
    model_params = dict(
        model_id=model_id,
        custom_role_conversions=custom_role_conversions,
        max_completion_tokens=8192,
    )
    if model_id == "o1":
        model_params.update(reasoning_effort="high")

    # The model is hard-coded, independent of ``model_id``.
    llm = LiteLLMModel(model_id="deepseek/deepseek-chat")

    max_text_chars = 100000
    text_browser = SimpleTextBrowser(**BROWSER_CONFIG)

    # Search tool, browser navigation tools sharing one browser instance, and a text inspector.
    browsing_tools = [
        GoogleSearchTool(provider="serper"),
        VisitTool(text_browser),
        PageUpTool(text_browser),
        PageDownTool(text_browser),
        FinderTool(text_browser),
        FindNextTool(text_browser),
        ArchiveSearchTool(text_browser),
        TextInspectorTool(llm, max_text_chars),
    ]

    search_agent_description = """A team member that will search the internet to answer your question.
Ask him for all your questions that require browsing the web.
Provide him as much context as possible, in particular if you need to search on a specific timeframe!
And don't hesitate to provide him with a complex search task, like finding a difference between two webpages.
Your request must be a real sentence, not a google search! Like "Find me this information (...)" rather than a few keywords.
"""
    search_agent = ToolCallingAgent(
        model=llm,
        tools=browsing_tools,
        max_steps=20,
        verbosity_level=2,
        planning_interval=4,
        name="search_agent",
        description=search_agent_description,
        provide_run_summary=True,
    )

    # Extra guidance appended to the task template the search agent receives when it is managed.
    extra_task_guidance = """You can navigate to .txt online files.
If a non-html page is in another format, especially .pdf or a Youtube video, use tool 'inspect_file_as_text' to inspect it.
Additionally, if after some searching you find out that you need more information to answer the question, you can use `final_answer` with your request for clarification as argument to request for more information."""
    search_agent.prompt_templates["managed_agent"]["task"] += extra_task_guidance

    return CodeAgent(
        model=llm,
        tools=[visualizer, TextInspectorTool(llm, max_text_chars)],
        max_steps=12,
        verbosity_level=2,
        additional_authorized_imports=["*"],
        planning_interval=4,
        managed_agents=[search_agent],
    )