Vision and Browsing Tools

#2
Files changed (1) hide show
  1. tools.py +19 -0
tools.py CHANGED
@@ -89,3 +89,22 @@ def close_popups() -> str:
89
  Closes any visible modal or pop-up on the page. Use this to dismiss pop-up windows! This does not work on cookie consent banners.
90
  """
91
  webdriver.ActionChains(driver).send_keys(Keys.ESCAPE).perform()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
89
  Closes any visible modal or pop-up on the page. Use this to dismiss pop-up windows! This does not work on cookie consent banners.
90
  """
91
  webdriver.ActionChains(driver).send_keys(Keys.ESCAPE).perform()
92
+
93
+
94
def save_screenshot(step_log: ActionStep, agent: CodeAgent) -> None:
    """Attach a fresh browser screenshot and the current URL to ``step_log``.

    Does nothing beyond the initial one-second wait when
    ``helium.get_driver()`` returns ``None``.

    Args:
        step_log: The step being finalized. Its ``observations_images`` is
            replaced with the new screenshot and its ``observations`` gets a
            "Current url: ..." line appended.
        agent: The agent whose ``logs`` are scanned. Screenshots are dropped
            from every ``ActionStep`` entry that is at least two steps older
            than ``step_log``.
    """
    sleep(1.0)  # Let JavaScript animations happen before taking the screenshot
    driver = helium.get_driver()
    if driver is None:
        return

    current_step = step_log.step_number

    # Remove previous screenshots from logs for lean processing. The check
    # must be made on each logged entry (not on the current step), otherwise
    # nothing is ever pruned.
    for previous_step in agent.logs:
        if isinstance(previous_step, ActionStep) and previous_step.step_number <= current_step - 2:
            previous_step.observations_images = None

    png_bytes = driver.get_screenshot_as_png()
    image = Image.open(BytesIO(png_bytes))
    print(f"Captured a browser screenshot: {image.size} pixels")
    step_log.observations_images = [image.copy()]  # Create a copy to ensure it persists, important!

    # Update observations with current URL. Read the current step's own
    # observations so the URL is appended to (or starts) the right text.
    url_info = f"Current url: {driver.current_url}"
    step_log.observations = url_info if step_log.observations is None else step_log.observations + "\n" + url_info