frdel committed on
Commit
7a28f98
·
2 Parent(s): 7473741be6802d

Merge branch 'pr/440' into development

Browse files
docker/run/fs/ins/install_playwright.sh CHANGED
@@ -9,10 +9,9 @@ uv pip install playwright
9
  # install chromium with dependencies
10
  # for kali-based
11
  # if [ "$@" = "hacking" ]; then
12
- apt-get install -y fonts-unifont libnss3 libnspr4 libatk1.0-0 libatspi2.0-0 libxcomposite1 libxdamage1 libatk-bridge2.0-0
13
- playwright install chromium-headless-shell
14
  # else
15
  # # for debian based
16
- # playwright install --with-deps chromium-headless-shell
17
  # fi
18
-
 
9
  # install chromium with dependencies
10
  # for kali-based
11
  # if [ "$@" = "hacking" ]; then
12
+ apt-get install -y fonts-unifont libnss3 libnspr4 libatk1.0-0 libatspi2.0-0 libxcomposite1 libxdamage1 libatk-bridge2.0-0 libcups2
13
+ playwright install chromium
14
  # else
15
  # # for debian based
16
+ # playwright install --with-deps chromium
17
  # fi
 
prompts/default/browser_agent.system.md CHANGED
@@ -12,6 +12,18 @@ If you are waiting for instructions: you should end the task and mark as done
12
  Your responses must always be formatted as a JSON object
13
  The response JSON must contain at least the following fields: "title", "response", "page_summary"
14
 
 
 
 
 
 
 
 
 
 
 
 
 
15
  ## Response fields
16
  * title (type: str) - The title of the current web page
17
  * response (type: str) - Your response to your superior's last request
 
12
  Your responses must always be formatted as a JSON object
13
  The response JSON must contain at least the following fields: "title", "response", "page_summary"
14
 
15
+ ## Task Completion
16
+ When you have completed the assigned task OR are waiting for further instructions:
17
+ 1. Use the "Complete task" action to mark the task as complete
18
+ 2. Provide the required parameters: title, response, and page_summary
19
+ 3. Do NOT continue taking actions after calling "Complete task"
20
+
21
+ ## Important Notes
22
+ - Always call "Complete task" when your objective is achieved
23
+ - If you navigate to a website and no further actions are requested, call "Complete task" immediately
24
+ - If you complete any requested interaction (clicking, typing, etc.), call "Complete task"
25
+ - Never leave a task running indefinitely - always conclude with "Complete task"
26
+
27
  ## Response fields
28
  * title (type: str) - The title of the current web page
29
  * response (type: str) - Your response to your superior's last request
python/tools/browser_agent.py CHANGED
@@ -1,17 +1,20 @@
1
  import asyncio
2
  import json
3
  import time
 
4
  from agent import Agent, InterventionException
5
 
6
  import models
7
  from python.helpers.tool import Tool, Response
8
  from python.helpers import files, defer, persist_chat, strings
9
  from python.helpers.browser_use import browser_use
 
 
10
  from python.extensions.message_loop_start._10_iteration_no import get_iter_no
11
  from pydantic import BaseModel
12
  import uuid
13
  from python.helpers.dirty_json import DirtyJson
14
- from langchain_core.messages import SystemMessage
15
 
16
  class State:
17
  @staticmethod
@@ -21,49 +24,47 @@ class State:
21
 
22
  def __init__(self, agent: Agent):
23
  self.agent = agent
24
- self.context = None
25
- self.task = None
26
- self.use_agent = None
27
- self.browser = None
28
  self.iter_no = 0
29
 
30
-
31
  def __del__(self):
32
  self.kill_task()
33
 
34
  async def _initialize(self):
35
- if self.context:
36
  return
37
 
38
- self.browser = browser_use.Browser(
39
- config=browser_use.BrowserConfig(
40
  headless=True,
41
  disable_security=True,
 
 
 
 
 
42
  )
43
  )
44
 
45
- # Await the coroutine to get the browser context
46
- self.context = await self.browser.new_context()
47
-
48
- # override async methods to create hooks
49
  self.override_hooks()
50
 
51
- # Add init script to the context - this will be applied to all new pages
52
- await self.context._initialize_session()
53
- pw_context = self.context.session.context # type: ignore
54
- js_override = files.get_abs_path("lib/browser/init_override.js")
55
- await pw_context.add_init_script(path=js_override) # type: ignore
56
 
57
  def start_task(self, task: str):
58
  if self.task and self.task.is_alive():
59
  self.kill_task()
60
 
61
- if not self.task:
62
- self.task = defer.DeferredTask(
63
- thread_name="BrowserAgent" + self.agent.context.id
64
- )
65
- if self.agent.context.task:
66
- self.agent.context.task.add_child_task(self.task, terminate_thread=True)
67
  self.task.start_task(self._run_task, task)
68
  return self.task
69
 
@@ -71,24 +72,23 @@ class State:
71
  if self.task:
72
  self.task.kill(terminate_thread=True)
73
  self.task = None
74
- self.context = None
75
- self.use_agent = None
76
- self.browser = None
77
- self.iter_no = 0
 
 
 
 
 
 
 
 
 
78
 
79
  async def _run_task(self, task: str):
80
-
81
- agent = self.agent
82
-
83
  await self._initialize()
84
 
85
- class CustomSystemPrompt(browser_use.SystemPrompt):
86
- def get_system_message(self) -> SystemMessage:
87
- existing_rules = super().get_system_message().text()
88
- new_rules = agent.read_prompt("prompts/browser_agent.system.md")
89
- return SystemMessage(content=f"{existing_rules}\n{new_rules}".strip())
90
-
91
- # Model of task result
92
  class DoneResult(BaseModel):
93
  title: str
94
  response: str
@@ -97,18 +97,16 @@ class State:
97
  # Initialize controller
98
  controller = browser_use.Controller()
99
 
100
- # we overwrite done() in this example to demonstrate the validator
101
- @controller.registry.action("Done with task", param_model=DoneResult)
102
- async def done(params: DoneResult):
103
  result = browser_use.ActionResult(
104
- is_done=True, extracted_content=params.model_dump_json()
 
 
105
  )
106
  return result
107
 
108
- # @controller.action("Ask user for information")
109
- # def ask_user(question: str) -> str:
110
- # return "..."
111
-
112
  model = models.get_model(
113
  type=models.ModelType.CHAT,
114
  provider=self.agent.config.browser_model.provider,
@@ -118,27 +116,29 @@ class State:
118
 
119
  self.use_agent = browser_use.Agent(
120
  task=task,
121
- browser_context=self.context,
122
  llm=model,
123
  use_vision=self.agent.config.browser_model.vision,
124
- system_prompt_class=CustomSystemPrompt,
125
  controller=controller,
 
126
  )
127
 
128
  self.iter_no = get_iter_no(self.agent)
129
 
130
- # orig_err_hnd = self.use_agent._handle_step_error
131
- # def new_err_hnd(*args, **kwargs):
132
- # if isinstance(args[0], InterventionException):
133
- # raise args[0]
134
- # return orig_err_hnd(*args, **kwargs)
135
- # self.use_agent._handle_step_error = new_err_hnd
136
-
137
- result = await self.use_agent.run()
138
- return result
 
 
139
 
140
  def override_hooks(self):
141
- # override async function to create a hook
142
  def override_hook(func):
143
  async def wrapper(*args, **kwargs):
144
  await self.agent.wait_if_paused()
@@ -147,14 +147,24 @@ class State:
147
  return await func(*args, **kwargs)
148
  return wrapper
149
 
150
- if self.context:
151
- self.context.get_state = override_hook(self.context.get_state)
152
- self.context.get_session = override_hook(self.context.get_session)
153
- self.context.remove_highlights = override_hook(self.context.remove_highlights)
154
 
155
  async def get_page(self):
 
 
 
 
 
 
 
 
 
 
156
  if self.use_agent:
157
- return await self.use_agent.browser_context.get_current_page()
 
 
158
 
159
 
160
  class BrowserAgent(Tool):
@@ -165,8 +175,17 @@ class BrowserAgent(Tool):
165
  await self.prepare_state(reset=reset)
166
  task = self.state.start_task(message)
167
 
168
- # wait for browser agent to finish and update progress
 
 
 
169
  while not task.is_ready():
 
 
 
 
 
 
170
  await self.agent.handle_intervention()
171
  await asyncio.sleep(1)
172
  try:
@@ -177,22 +196,43 @@ class BrowserAgent(Tool):
177
  screenshot = update.get("screenshot", None)
178
  if screenshot:
179
  self.log.update(screenshot=screenshot)
180
- except Exception as e:
181
  pass
182
 
183
- # collect result
184
- result = await task.result()
185
- answer = result.final_result()
186
  try:
187
- if answer and isinstance(answer, str) and answer.strip():
188
- answer_data = DirtyJson.parse_string(answer)
189
- answer_text = strings.dict_to_text(answer_data) # type: ignore
190
- else:
191
- answer_text = str(answer) if answer else "No result returned"
192
  except Exception as e:
193
- answer_text = str(answer) if answer else f"Error processing result: {str(e)}"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
194
  self.log.update(answer=answer_text)
195
- return Response(message=answer, break_loop=False)
196
 
197
  def get_log_object(self):
198
  return self.agent.context.log.log(
@@ -202,9 +242,6 @@ class BrowserAgent(Tool):
202
  kvps=self.args,
203
  )
204
 
205
- # async def after_execution(self, response, **kwargs):
206
- # await self.agent.hist_add_tool_result(self.name, response.message)
207
-
208
  async def get_update(self):
209
  await self.prepare_state()
210
 
@@ -212,7 +249,6 @@ class BrowserAgent(Tool):
212
  agent = self.agent
213
  ua = self.state.use_agent
214
  page = await self.state.get_page()
215
- ctx = self.state.context
216
 
217
  if ua and page:
218
  try:
@@ -223,14 +259,6 @@ class BrowserAgent(Tool):
223
 
224
  log = []
225
 
226
- # dom_service = browser_use.DomService(page)
227
- # dom_state = await browser_use.utils.time_execution_sync('get_clickable_elements')(
228
- # dom_service.get_clickable_elements
229
- # )()
230
- # elements = dom_state.element_tree
231
- # selector_map = dom_state.selector_map
232
- # el_text = elements.clickable_elements_to_string()
233
-
234
  for message in ua.message_manager.get_messages():
235
  if message.type == "system":
236
  continue
@@ -265,7 +293,7 @@ class BrowserAgent(Tool):
265
  if self.state.task:
266
  await self.state.task.execute_inside(_get_update)
267
 
268
- except Exception as e:
269
  pass
270
 
271
  return result
 
1
  import asyncio
2
  import json
3
  import time
4
+ from typing import Optional
5
  from agent import Agent, InterventionException
6
 
7
  import models
8
  from python.helpers.tool import Tool, Response
9
  from python.helpers import files, defer, persist_chat, strings
10
  from python.helpers.browser_use import browser_use
11
+ from python.helpers.print_style import PrintStyle
12
+
13
  from python.extensions.message_loop_start._10_iteration_no import get_iter_no
14
  from pydantic import BaseModel
15
  import uuid
16
  from python.helpers.dirty_json import DirtyJson
17
+
18
 
19
  class State:
20
  @staticmethod
 
24
 
25
  def __init__(self, agent: Agent):
26
  self.agent = agent
27
+ self.browser_session: Optional[browser_use.BrowserSession] = None
28
+ self.task: Optional[defer.DeferredTask] = None
29
+ self.use_agent: Optional[browser_use.Agent] = None
 
30
  self.iter_no = 0
31
 
 
32
  def __del__(self):
33
  self.kill_task()
34
 
35
  async def _initialize(self):
36
+ if self.browser_session:
37
  return
38
 
39
+ self.browser_session = browser_use.BrowserSession(
40
+ browser_profile=browser_use.BrowserProfile(
41
  headless=True,
42
  disable_security=True,
43
+ chromium_sandbox=False,
44
+ minimum_wait_page_load_time=1.0,
45
+ wait_for_network_idle_page_load_time=2.0,
46
+ maximum_wait_page_load_time=10.0,
47
+ args=['--headless=new'],
48
  )
49
  )
50
 
51
+ await self.browser_session.start()
 
 
 
52
  self.override_hooks()
53
 
54
+ # Add init script to the browser session
55
+ if self.browser_session.browser_context:
56
+ js_override = files.get_abs_path("lib/browser/init_override.js")
57
+ await self.browser_session.browser_context.add_init_script(path=js_override)
 
58
 
59
  def start_task(self, task: str):
60
  if self.task and self.task.is_alive():
61
  self.kill_task()
62
 
63
+ self.task = defer.DeferredTask(
64
+ thread_name="BrowserAgent" + self.agent.context.id
65
+ )
66
+ if self.agent.context.task:
67
+ self.agent.context.task.add_child_task(self.task, terminate_thread=True)
 
68
  self.task.start_task(self._run_task, task)
69
  return self.task
70
 
 
72
  if self.task:
73
  self.task.kill(terminate_thread=True)
74
  self.task = None
75
+ if self.browser_session:
76
+ try:
77
+ import asyncio
78
+ loop = asyncio.new_event_loop()
79
+ asyncio.set_event_loop(loop)
80
+ loop.run_until_complete(self.browser_session.close())
81
+ loop.close()
82
+ except Exception as e:
83
+ PrintStyle().error(f"Error closing browser session: {e}")
84
+ finally:
85
+ self.browser_session = None
86
+ self.use_agent = None
87
+ self.iter_no = 0
88
 
89
  async def _run_task(self, task: str):
 
 
 
90
  await self._initialize()
91
 
 
 
 
 
 
 
 
92
  class DoneResult(BaseModel):
93
  title: str
94
  response: str
 
97
  # Initialize controller
98
  controller = browser_use.Controller()
99
 
100
+ # Register custom completion action with proper ActionResult fields
101
+ @controller.registry.action("Complete task", param_model=DoneResult)
102
+ async def complete_task(params: DoneResult):
103
  result = browser_use.ActionResult(
104
+ is_done=True,
105
+ success=True,
106
+ extracted_content=params.model_dump_json()
107
  )
108
  return result
109
 
 
 
 
 
110
  model = models.get_model(
111
  type=models.ModelType.CHAT,
112
  provider=self.agent.config.browser_model.provider,
 
116
 
117
  self.use_agent = browser_use.Agent(
118
  task=task,
119
+ browser_session=self.browser_session,
120
  llm=model,
121
  use_vision=self.agent.config.browser_model.vision,
122
+ extend_system_message=self.agent.read_prompt("prompts/browser_agent.system.md"),
123
  controller=controller,
124
+ enable_memory=False, # Disable memory to avoid state conflicts
125
  )
126
 
127
  self.iter_no = get_iter_no(self.agent)
128
 
129
+ try:
130
+ result = await self.use_agent.run(max_steps=50)
131
+ return result
132
+ finally:
133
+ if self.browser_session:
134
+ try:
135
+ await self.browser_session.close()
136
+ except Exception as e:
137
+ PrintStyle().error(f"Error closing browser session in task cleanup: {e}")
138
+ finally:
139
+ self.browser_session = None
140
 
141
  def override_hooks(self):
 
142
  def override_hook(func):
143
  async def wrapper(*args, **kwargs):
144
  await self.agent.wait_if_paused()
 
147
  return await func(*args, **kwargs)
148
  return wrapper
149
 
150
+ if self.browser_session and hasattr(self.browser_session, 'remove_highlights'):
151
+ self.browser_session.remove_highlights = override_hook(self.browser_session.remove_highlights)
 
 
152
 
153
  async def get_page(self):
154
+ if self.use_agent and self.browser_session:
155
+ try:
156
+ return await self.use_agent.browser_session.get_current_page()
157
+ except Exception:
158
+ # Browser session might be closed or invalid
159
+ return None
160
+ return None
161
+
162
+ async def get_selector_map(self):
163
+ """Get the selector map for the current page state."""
164
  if self.use_agent:
165
+ await self.use_agent.browser_session.get_state_summary(cache_clickable_elements_hashes=True)
166
+ return await self.use_agent.browser_session.get_selector_map()
167
+ return {}
168
 
169
 
170
  class BrowserAgent(Tool):
 
175
  await self.prepare_state(reset=reset)
176
  task = self.state.start_task(message)
177
 
178
+ # wait for browser agent to finish and update progress with timeout
179
+ timeout_seconds = 300 # 5 minute timeout
180
+ start_time = time.time()
181
+
182
  while not task.is_ready():
183
+ # Check for timeout to prevent infinite waiting
184
+ if time.time() - start_time > timeout_seconds:
185
+ PrintStyle().warning(f"Browser agent task timeout after {timeout_seconds} seconds, forcing completion")
186
+ self.state.kill_task()
187
+ break
188
+
189
  await self.agent.handle_intervention()
190
  await asyncio.sleep(1)
191
  try:
 
196
  screenshot = update.get("screenshot", None)
197
  if screenshot:
198
  self.log.update(screenshot=screenshot)
199
+ except Exception:
200
  pass
201
 
202
+ # collect result with error handling
 
 
203
  try:
204
+ result = await task.result()
205
+ PrintStyle().debug(f"Browser agent task completed, is_done: {result['is_done']}")
 
 
 
206
  except Exception as e:
207
+ PrintStyle().error(f"Error getting browser agent task result: {str(e)}")
208
+ # Return a timeout response if task.result() fails
209
+ answer_text = f"Browser agent task failed to return result: {str(e)}"
210
+ self.log.update(answer=answer_text)
211
+ return Response(message=answer_text, break_loop=False)
212
+ finally:
213
+ # Stop any further browser access after task completion
214
+ self.state.kill_task()
215
+
216
+ # Check if task completed successfully
217
+ if result['is_done']:
218
+ answer = result['final_result']
219
+ try:
220
+ if answer and isinstance(answer, str) and answer.strip():
221
+ answer_data = DirtyJson.parse_string(answer)
222
+ answer_text = strings.dict_to_text(answer_data) # type: ignore
223
+ else:
224
+ answer_text = str(answer) if answer else "Task completed successfully"
225
+ except Exception as e:
226
+ answer_text = str(answer) if answer else f"Task completed with parse error: {str(e)}"
227
+ else:
228
+ # Task hit max_steps without calling done()
229
+ urls = result['urls']
230
+ current_url = urls[-1] if urls else "unknown"
231
+ answer_text = (f"Task reached step limit without completion. Last page: {current_url}. "
232
+ f"The browser agent may need clearer instructions on when to finish.")
233
+
234
  self.log.update(answer=answer_text)
235
+ return Response(message=answer_text, break_loop=False)
236
 
237
  def get_log_object(self):
238
  return self.agent.context.log.log(
 
242
  kvps=self.args,
243
  )
244
 
 
 
 
245
  async def get_update(self):
246
  await self.prepare_state()
247
 
 
249
  agent = self.agent
250
  ua = self.state.use_agent
251
  page = await self.state.get_page()
 
252
 
253
  if ua and page:
254
  try:
 
259
 
260
  log = []
261
 
 
 
 
 
 
 
 
 
262
  for message in ua.message_manager.get_messages():
263
  if message.type == "system":
264
  continue
 
293
  if self.state.task:
294
  await self.state.task.execute_inside(_get_update)
295
 
296
+ except Exception:
297
  pass
298
 
299
  return result
requirements.txt CHANGED
@@ -1,10 +1,10 @@
1
  a2wsgi==1.10.8
2
  ansio==0.0.1
3
- beautifulsoup4==4.12.3
4
- browser-use==0.1.40
5
  docker==7.1.0
6
  duckduckgo-search==6.1.12
7
- faiss-cpu==1.8.0.post1
8
  fastmcp==2.3.4
9
  flask[async]==3.0.3
10
  flask-basicauth==0.2.0
@@ -13,12 +13,12 @@ GitPython==3.1.43
13
  inputimeout==1.0.4
14
  langchain-anthropic==0.3.3
15
  langchain-community==0.3.19
16
- langchain-google-genai==2.0.8
17
  langchain-groq==0.2.2
18
  langchain-huggingface==0.1.2
19
  langchain-mistralai==0.2.4
20
- langchain-ollama==0.2.2
21
- langchain-openai==0.3.1
22
  openai-whisper==20240930
23
  lxml_html_clean==0.3.1
24
  markdown==3.7
 
1
  a2wsgi==1.10.8
2
  ansio==0.0.1
3
+ beautifulsoup4==4.13.4
4
+ browser-use==0.2.5
5
  docker==7.1.0
6
  duckduckgo-search==6.1.12
7
+ faiss-cpu==1.11.0
8
  fastmcp==2.3.4
9
  flask[async]==3.0.3
10
  flask-basicauth==0.2.0
 
13
  inputimeout==1.0.4
14
  langchain-anthropic==0.3.3
15
  langchain-community==0.3.19
16
+ langchain-google-genai==2.1.2
17
  langchain-groq==0.2.2
18
  langchain-huggingface==0.1.2
19
  langchain-mistralai==0.2.4
20
+ langchain-ollama==0.3.0
21
+ langchain-openai==0.3.11
22
  openai-whisper==20240930
23
  lxml_html_clean==0.3.1
24
  markdown==3.7