# Final_Assignment_Template
#
# Sleeping
#
# File size: 2,620 Bytes

from io import BytesIO
from time import sleep
import os
import sys

# Make the directory one level above this file's folder importable, so the
# project-local imports further down resolve when this file is run directly.
_here = os.path.dirname(os.path.abspath(__file__))
_parent_dir = os.path.dirname(_here)
sys.path.append(_parent_dir)

import helium
from dotenv import load_dotenv
from PIL import Image
from selenium import webdriver

from smolagents import CodeAgent
from smolagents.agents import ActionStep
from agents.agent import MyAgent
from prompts.helium import HELIUM_PROMPT

# Load variables from a .env file into the process environment
load_dotenv()

# Chrome command-line switches, applied in the order listed
chrome_options = webdriver.ChromeOptions()
for _switch in (
    "--force-device-scale-factor=1",
    "--window-size=1000,1350",
    "--disable-pdf-viewer",
    "--window-position=0,0",
):
    chrome_options.add_argument(_switch)

# Start a visible (non-headless) Chrome session through helium
driver = helium.start_chrome(headless=False, options=chrome_options)


def save_screenshot(memory_step: ActionStep, agent: CodeAgent) -> None:
    """Attach a fresh browser screenshot and the current URL to an agent step.

    Intended for use as an agent step callback. Screenshots stored on steps
    that are two or more steps older than ``memory_step`` are dropped, so only
    the most recent images are kept in the agent's memory.

    If helium reports no active browser driver, the step is left untouched.

    Args:
        memory_step: The step to annotate. Its ``observations_images`` and
            ``observations`` attributes are updated in place.
        agent: The agent whose ``memory.steps`` are pruned of old screenshots.
    """
    sleep(1.0)  # Let JavaScript animations happen before taking the screenshot
    driver = helium.get_driver()
    if driver is None:
        # No browser session: there is nothing to capture, and reading
        # ``driver.current_url`` further down would raise AttributeError.
        return

    current_step = memory_step.step_number

    # Remove previous screenshots for lean processing: only the current step
    # and the one right before it keep their images.
    for previous_memory_step in agent.memory.steps:
        if (
            isinstance(previous_memory_step, ActionStep)
            and previous_memory_step.step_number <= current_step - 2
        ):
            previous_memory_step.observations_images = None

    png_bytes = driver.get_screenshot_as_png()
    image = Image.open(BytesIO(png_bytes))
    print(f"Captured a browser screenshot: {image.size} pixels")
    # Create a copy to ensure it persists
    memory_step.observations_images = [image.copy()]

    # Update observations with current URL
    url_info = f"Current url: {driver.current_url}"
    memory_step.observations = (
        url_info
        if memory_step.observations is None
        else memory_step.observations + "\n" + url_info
    )


# Settings for the browsing agent, gathered in one place for readability
_agent_settings = {
    "api_key": os.getenv("GEMINI_API_KEY"),
    "temperature": 0.0,
    "add_base_tools": False,
    "additional_authorized_imports": ["helium"],
    "step_callbacks": [save_screenshot],  # runs after each agent step
    "max_steps": 20,
    "verbosity_level": 2,
}
video_agent = MyAgent(**_agent_settings)

# Run a star-import of helium inside the agent's Python executor, so helium's
# names are available to the code the agent executes.
_run_in_agent = video_agent.agent.python_executor
_agent_state = video_agent.agent.state
_run_in_agent("from helium import *", _agent_state)


# Task given to the agent; the helium usage instructions are appended below
search_request = """
Please navigate to https://en.wikipedia.org/wiki/Chicago and give me a sentence containing the word "1992" that mentions a construction accident.
"""

_full_prompt = search_request + HELIUM_PROMPT
agent_output = video_agent(_full_prompt)

# Header line followed by the agent's answer, each on its own line
print("Final output:", agent_output, sep="\n")