"""Drive a Chrome browser with a code agent and answer a question about a web page.

The script starts a (non-headless) Chrome session through helium, registers a
step callback that attaches a fresh browser screenshot and the current URL to
each agent step, and then asks the agent to find a sentence on the Chicago
Wikipedia page.
"""

from io import BytesIO
from time import sleep
import os
import sys

# Add the parent directory to the Python path so modules can be found.
# This must run before the project-local imports (`agents`, `prompts`) below.
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

import helium
from dotenv import load_dotenv
from PIL import Image
from selenium import webdriver
from smolagents import CodeAgent
from smolagents.agents import ActionStep

from agents.agent import MyAgent
from prompts.helium import HELIUM_PROMPT

load_dotenv()

# Configure Chrome options
chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument("--force-device-scale-factor=1")
chrome_options.add_argument("--window-size=1000,1350")
chrome_options.add_argument("--disable-pdf-viewer")
chrome_options.add_argument("--window-position=0,0")

# Initialize the browser
driver = helium.start_chrome(headless=False, options=chrome_options)


def save_screenshot(memory_step: ActionStep, agent: CodeAgent) -> None:
    """Attach a browser screenshot and the current URL to an agent step.

    Intended to be used as a step callback. Screenshots stored on action
    steps that are two or more steps older than ``memory_step`` are dropped
    so that only the most recent images are kept in the agent's memory.

    Args:
        memory_step: The step that has just finished; its
            ``observations_images`` and ``observations`` are updated in place.
        agent: The agent whose ``memory.steps`` are pruned of old screenshots.

    Returns:
        None. If helium has no active driver, the step is left untouched.
    """
    sleep(1.0)  # Let JavaScript animations happen before taking the screenshot

    browser = helium.get_driver()
    if browser is None:
        # Without a driver there is nothing to capture and no URL to report;
        # bail out instead of dereferencing None further down.
        return

    current_step = memory_step.step_number

    # Remove previous screenshots for lean processing
    for earlier_step in agent.memory.steps:
        if (
            isinstance(earlier_step, ActionStep)
            and earlier_step.step_number <= current_step - 2
        ):
            earlier_step.observations_images = None

    png_bytes = browser.get_screenshot_as_png()
    image = Image.open(BytesIO(png_bytes))
    print(f"Captured a browser screenshot: {image.size} pixels")
    # Create a copy to ensure it persists
    memory_step.observations_images = [image.copy()]

    # Update observations with current URL
    url_info = f"Current url: {browser.current_url}"
    if memory_step.observations is None:
        memory_step.observations = url_info
    else:
        memory_step.observations = memory_step.observations + "\n" + url_info


video_agent = MyAgent(
    api_key=os.getenv("GEMINI_API_KEY"),
    temperature=0.0,
    add_base_tools=False,
    additional_authorized_imports=["helium"],
    step_callbacks=[save_screenshot],
    max_steps=20,
    verbosity_level=2,
)

# Run the helium star-import inside the agent's Python executor up front.
# NOTE(review): presumably so generated code can call helium functions
# directly — confirm against the executor's behavior.
video_agent.agent.python_executor("from helium import *", video_agent.agent.state)

search_request = """
Please navigate to https://en.wikipedia.org/wiki/Chicago and give me a sentence containing the word "1992" that mentions a construction accident.
"""

agent_output = video_agent(search_request + HELIUM_PROMPT)
print("Final output:")
print(agent_output)