Spaces:
Runtime error
Runtime error
Charles Azam
committed on
Commit
·
e003639
1
Parent(s):
1adf00c
feat: init search agents
Browse files
src/deepengineer/deepsearch/analyse_markdown_agent.py
ADDED
|
@@ -0,0 +1,52 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
def create_agent(model_id="o1"):
    """Build a manager agent that delegates web research to a search agent.

    Args:
        model_id: LiteLLM model identifier used for every agent and tool
            created here. When it is exactly ``"o1"``, the extra
            ``reasoning_effort="high"`` parameter is sent as well.

    Returns:
        The manager ``CodeAgent``; the web-browsing ``ToolCallingAgent`` is
        registered on it as a managed agent named ``search_agent``.
    """
    # NOTE(review): none of the names used below (LiteLLMModel,
    # custom_role_conversions, SimpleTextBrowser, BROWSER_CONFIG, the *Tool
    # classes, visualizer, ToolCallingAgent, CodeAgent) is imported or defined
    # in the visible file - they must be provided at module level.
    model_params = {
        "model_id": model_id,
        "custom_role_conversions": custom_role_conversions,
        "max_completion_tokens": 8192,
    }
    if model_id == "o1":
        model_params["reasoning_effort"] = "high"
    # The model used to be hard-coded to "deepseek/deepseek-chat", which left
    # `model_params` dead and silently ignored the `model_id` argument.
    model = LiteLLMModel(**model_params)

    text_limit = 100000
    browser = SimpleTextBrowser(**BROWSER_CONFIG)
    # All page-navigation tools share the same browser instance.
    WEB_TOOLS = [
        GoogleSearchTool(provider="serper"),
        VisitTool(browser),
        PageUpTool(browser),
        PageDownTool(browser),
        FinderTool(browser),
        FindNextTool(browser),
        ArchiveSearchTool(browser),
        TextInspectorTool(model, text_limit),
    ]
    text_webbrowser_agent = ToolCallingAgent(
        model=model,
        tools=WEB_TOOLS,
        max_steps=20,
        verbosity_level=2,
        planning_interval=4,
        name="search_agent",
        description="""A team member that will search the internet to answer your question.
Ask him for all your questions that require browsing the web.
Provide him as much context as possible, in particular if you need to search on a specific timeframe!
And don't hesitate to provide him with a complex search task, like finding a difference between two webpages.
Your request must be a real sentence, not a google search! Like "Find me this information (...)" rather than a few keywords.
""",
        provide_run_summary=True,
    )
    # Extend the task prompt the search agent receives when called by the manager.
    text_webbrowser_agent.prompt_templates["managed_agent"]["task"] += """You can navigate to .txt online files.
If a non-html page is in another format, especially .pdf or a Youtube video, use tool 'inspect_file_as_text' to inspect it.
Additionally, if after some searching you find out that you need more information to answer the question, you can use `final_answer` with your request for clarification as argument to request for more information."""

    manager_agent = CodeAgent(
        model=model,
        tools=[visualizer, TextInspectorTool(model, text_limit)],
        max_steps=12,
        verbosity_level=2,
        # NOTE(review): "*" lets agent-generated code import any module;
        # restrict this list if the agent may process untrusted content.
        additional_authorized_imports=["*"],
        planning_interval=4,
        managed_agents=[text_webbrowser_agent],
    )

    return manager_agent
|
src/deepengineer/deepsearch/draw_agent.py
ADDED
|
@@ -0,0 +1,158 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from io import BytesIO
|
| 2 |
+
from time import sleep
|
| 3 |
+
|
| 4 |
+
import helium
|
| 5 |
+
from dotenv import load_dotenv
|
| 6 |
+
from PIL import Image
|
| 7 |
+
from selenium import webdriver
|
| 8 |
+
from selenium.webdriver.common.by import By
|
| 9 |
+
from selenium.webdriver.common.keys import Keys
|
| 10 |
+
|
| 11 |
+
from smolagents import CodeAgent, tool
|
| 12 |
+
from smolagents.agents import ActionStep
|
| 13 |
+
|
| 14 |
+
# Load environment variables
|
| 15 |
+
load_dotenv()
|
| 16 |
+
|
| 17 |
+
@tool
def search_item_ctrl_f(text: str, nth_result: int = 1) -> str:
    """
    Searches for text on the current page via Ctrl + F and jumps to the nth occurrence.
    Args:
        text: The text to search for
        nth_result: Which occurrence to jump to (default: 1)
    """
    # The docstring above is consumed by the @tool decorator as the tool's
    # description, so its wording is left untouched.
    if nth_result < 1:
        # 0 or a negative value would otherwise index from the end of the list.
        raise ValueError(f"nth_result must be >= 1 (got {nth_result})")

    # XPath 1.0 string literals have no escape character: choose a quote style
    # the text does not contain, or stitch the pieces together with concat().
    if "'" not in text:
        literal = f"'{text}'"
    elif '"' not in text:
        literal = f'"{text}"'
    else:
        pieces = ", \"'\", ".join(f"'{part}'" for part in text.split("'"))
        literal = f"concat({pieces})"

    elements = driver.find_elements(By.XPATH, f"//*[contains(text(), {literal})]")
    if nth_result > len(elements):
        raise Exception(f"Match n°{nth_result} not found (only {len(elements)} matches found)")

    # nth_result is 1-based; scroll the chosen match into view.
    elem = elements[nth_result - 1]
    driver.execute_script("arguments[0].scrollIntoView(true);", elem)
    return f"Found {len(elements)} matches for '{text}'. Focused on element {nth_result} of {len(elements)}"
|
| 33 |
+
|
| 34 |
+
@tool
def go_back() -> None:
    """Goes back to previous page."""
    # Calls back() on the module-level `driver` started further down in this file.
    driver.back()
|
| 38 |
+
|
| 39 |
+
@tool
def close_popups() -> str:
    """
    Closes any visible modal or pop-up on the page. Use this to dismiss pop-up windows!
    This does not work on cookie consent banners.
    """
    # Send a single Escape key press through the module-level `driver`.
    webdriver.ActionChains(driver).send_keys(Keys.ESCAPE).perform()
    # The signature promises a string; previously the function fell off the
    # end and returned None.
    return "Pressed Escape to dismiss any open pop-up."
|
| 46 |
+
|
| 47 |
+
|
| 48 |
+
# Chrome command-line flags, applied in this order
chrome_options = webdriver.ChromeOptions()
for _chrome_flag in (
    "--force-device-scale-factor=1",
    "--window-size=1000,1350",
    "--disable-pdf-viewer",
    "--window-position=0,0",
):
    chrome_options.add_argument(_chrome_flag)

# Start a visible (non-headless) Chrome session through helium
driver = helium.start_chrome(headless=False, options=chrome_options)
|
| 57 |
+
|
| 58 |
+
# Set up screenshot callback
|
| 59 |
+
def save_screenshot(memory_step: ActionStep, agent: CodeAgent) -> None:
    """Step callback: attach a browser screenshot and the current URL to a step.

    Args:
        memory_step: The step that has just finished; its
            ``observations_images`` and ``observations`` are updated in place.
        agent: The running agent; screenshots stored on its older
            ``ActionStep`` entries are cleared.

    Does nothing (after the initial pause) when helium reports no active driver.
    """
    sleep(1.0)  # Let JavaScript animations happen before taking the screenshot
    driver = helium.get_driver()
    if driver is None:
        # No browser to capture; also avoids reading `current_url` from None.
        return

    current_step = memory_step.step_number
    # Drop screenshots from steps at least two behind the current one (lean processing).
    for previous_memory_step in agent.memory.steps:
        if isinstance(previous_memory_step, ActionStep) and previous_memory_step.step_number <= current_step - 2:
            previous_memory_step.observations_images = None

    png_bytes = driver.get_screenshot_as_png()
    image = Image.open(BytesIO(png_bytes))
    print(f"Captured a browser screenshot: {image.size} pixels")
    memory_step.observations_images = [image.copy()]  # Create a copy to ensure it persists

    # Append the current URL to the step's textual observations.
    url_info = f"Current url: {driver.current_url}"
    memory_step.observations = (
        url_info if memory_step.observations is None else memory_step.observations + "\n" + url_info
    )
|
| 77 |
+
|
| 78 |
+
from smolagents import InferenceClientModel
|
| 79 |
+
|
| 80 |
+
# Vision-language model that drives the browsing agent (swap in another VLM id if desired)
model_id = "Qwen/Qwen2-VL-72B-Instruct"
model = InferenceClientModel(model_id=model_id)

# Browsing agent: the three custom tools above, with a screenshot taken after every step
agent = CodeAgent(
    model=model,
    tools=[go_back, close_popups, search_item_ctrl_f],
    step_callbacks=[save_screenshot],
    additional_authorized_imports=["helium"],
    max_steps=20,
    verbosity_level=2,
)

# Pre-run the helium star-import inside the agent's executor so generated code can call helium directly
agent.python_executor("from helium import *", agent.state)
|
| 96 |
+
|
| 97 |
+
helium_instructions = """
|
| 98 |
+
You can use helium to access websites. Don't bother about the helium driver, it's already managed.
|
| 99 |
+
We've already ran "from helium import *"
|
| 100 |
+
Then you can go to pages!
|
| 101 |
+
Code:
|
| 102 |
+
```py
|
| 103 |
+
go_to('github.com/trending')
|
| 104 |
+
```<end_code>
|
| 105 |
+
|
| 106 |
+
You can directly click clickable elements by inputting the text that appears on them.
|
| 107 |
+
Code:
|
| 108 |
+
```py
|
| 109 |
+
click("Top products")
|
| 110 |
+
```<end_code>
|
| 111 |
+
|
| 112 |
+
If it's a link:
|
| 113 |
+
Code:
|
| 114 |
+
```py
|
| 115 |
+
click(Link("Top products"))
|
| 116 |
+
```<end_code>
|
| 117 |
+
|
| 118 |
+
If you try to interact with an element and it's not found, you'll get a LookupError.
|
| 119 |
+
In general stop your action after each button click to see what happens on your screenshot.
|
| 120 |
+
Never try to login in a page.
|
| 121 |
+
|
| 122 |
+
To scroll up or down, use scroll_down or scroll_up with as an argument the number of pixels to scroll from.
|
| 123 |
+
Code:
|
| 124 |
+
```py
|
| 125 |
+
scroll_down(num_pixels=1200) # This will scroll one viewport down
|
| 126 |
+
```<end_code>
|
| 127 |
+
|
| 128 |
+
When you have pop-ups with a cross icon to close, don't try to click the close icon by finding its element or targeting an 'X' element (this most often fails).
|
| 129 |
+
Just use your built-in tool `close_popups` to close them:
|
| 130 |
+
Code:
|
| 131 |
+
```py
|
| 132 |
+
close_popups()
|
| 133 |
+
```<end_code>
|
| 134 |
+
|
| 135 |
+
You can use .exists() to check for the existence of an element. For example:
|
| 136 |
+
Code:
|
| 137 |
+
```py
|
| 138 |
+
if Text('Accept cookies?').exists():
|
| 139 |
+
click('I accept')
|
| 140 |
+
```<end_code>
|
| 141 |
+
"""
|
| 142 |
+
|
| 143 |
+
# Demo task 1: find a specific sentence on a Wikipedia page
search_request = """
Please navigate to https://en.wikipedia.org/wiki/Chicago and give me a sentence containing the word "1992" that mentions a construction accident.
"""

# Demo task 2: multi-page navigation on GitHub
github_request = """
I'm trying to find how hard I have to work to get a repo in github.com/trending.
Can you navigate to the profile for the top author of the top trending repo, and give me their total number of commits over the last year?
"""

# Run both tasks in order, each followed by the helium usage instructions
for _request in (search_request, github_request):
    agent_output = agent.run(_request + helium_instructions)
    print("Final output:")
    print(agent_output)
|
src/deepengineer/deepsearch/main_agent.py
ADDED
|
File without changes
|
src/deepengineer/deepsearch/scawl_web_agent.py
ADDED
|
@@ -0,0 +1,52 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
def create_agent(model_id="o1"):
    """Build a manager agent that delegates web research to a search agent.

    Args:
        model_id: LiteLLM model identifier used for every agent and tool
            created here. When it is exactly ``"o1"``, the extra
            ``reasoning_effort="high"`` parameter is sent as well.

    Returns:
        The manager ``CodeAgent``; the web-browsing ``ToolCallingAgent`` is
        registered on it as a managed agent named ``search_agent``.
    """
    # NOTE(review): none of the names used below (LiteLLMModel,
    # custom_role_conversions, SimpleTextBrowser, BROWSER_CONFIG, the *Tool
    # classes, visualizer, ToolCallingAgent, CodeAgent) is imported or defined
    # in the visible file - they must be provided at module level.
    model_params = {
        "model_id": model_id,
        "custom_role_conversions": custom_role_conversions,
        "max_completion_tokens": 8192,
    }
    if model_id == "o1":
        model_params["reasoning_effort"] = "high"
    # The model used to be hard-coded to "deepseek/deepseek-chat", which left
    # `model_params` dead and silently ignored the `model_id` argument.
    model = LiteLLMModel(**model_params)

    text_limit = 100000
    browser = SimpleTextBrowser(**BROWSER_CONFIG)
    # All page-navigation tools share the same browser instance.
    WEB_TOOLS = [
        GoogleSearchTool(provider="serper"),
        VisitTool(browser),
        PageUpTool(browser),
        PageDownTool(browser),
        FinderTool(browser),
        FindNextTool(browser),
        ArchiveSearchTool(browser),
        TextInspectorTool(model, text_limit),
    ]
    text_webbrowser_agent = ToolCallingAgent(
        model=model,
        tools=WEB_TOOLS,
        max_steps=20,
        verbosity_level=2,
        planning_interval=4,
        name="search_agent",
        description="""A team member that will search the internet to answer your question.
Ask him for all your questions that require browsing the web.
Provide him as much context as possible, in particular if you need to search on a specific timeframe!
And don't hesitate to provide him with a complex search task, like finding a difference between two webpages.
Your request must be a real sentence, not a google search! Like "Find me this information (...)" rather than a few keywords.
""",
        provide_run_summary=True,
    )
    # Extend the task prompt the search agent receives when called by the manager.
    text_webbrowser_agent.prompt_templates["managed_agent"]["task"] += """You can navigate to .txt online files.
If a non-html page is in another format, especially .pdf or a Youtube video, use tool 'inspect_file_as_text' to inspect it.
Additionally, if after some searching you find out that you need more information to answer the question, you can use `final_answer` with your request for clarification as argument to request for more information."""

    manager_agent = CodeAgent(
        model=model,
        tools=[visualizer, TextInspectorTool(model, text_limit)],
        max_steps=12,
        verbosity_level=2,
        # NOTE(review): "*" lets agent-generated code import any module;
        # restrict this list if the agent may process untrusted content.
        additional_authorized_imports=["*"],
        planning_interval=4,
        managed_agents=[text_webbrowser_agent],
    )

    return manager_agent
|