""" Show how to use sample_images to add image context for your task """ import asyncio import base64 from pathlib import Path from typing import Any from dotenv import load_dotenv from browser_use import Agent from browser_use.llm import ChatOpenAI from browser_use.llm.messages import ContentPartImageParam, ContentPartTextParam, ImageURL # Load environment variables load_dotenv() def image_to_base64(image_path: str) -> str: """ Convert image file to base64 string. Args: image_path: Path to the image file Returns: Base64 encoded string of the image Raises: FileNotFoundError: If image file doesn't exist IOError: If image file cannot be read """ image_file = Path(image_path) if not image_file.exists(): raise FileNotFoundError(f'Image file not found: {image_path}') try: with open(image_file, 'rb') as f: encoded_string = base64.b64encode(f.read()) return encoded_string.decode('utf-8') except OSError as e: raise OSError(f'Failed to read image file: {e}') def create_sample_images() -> list[ContentPartTextParam | ContentPartImageParam]: """ Create image context for the agent. Returns: list of content parts containing text and image data """ # Image path - replace with your actual image path image_path = 'sample_image.png' # Image context configuration image_context: list[dict[str, Any]] = [ { 'type': 'text', 'value': ( 'The following image explains the google layout. ' 'The image highlights several buttons with red boxes, ' 'and next to them are corresponding labels in red text.\n' 'Each label corresponds to a button as follows:\n' 'Label 1 is the "image" button.' ), }, {'type': 'image', 'value': image_to_base64(image_path)}, ] # Convert to content parts content_parts = [] for item in image_context: if item['type'] == 'text': content_parts.append(ContentPartTextParam(text=item['value'])) elif item['type'] == 'image': content_parts.append( ContentPartImageParam( image_url=ImageURL( url=f'data:image/jpeg;base64,{item["value"]}', media_type='image/jpeg', ), ) ) return content_parts async def main() -> None: """ Main function to run the browser agent with image context. """ # Task configuration task_str = 'goto https://www.google.com/ and click image button' # Initialize the language model model = ChatOpenAI(model='gpt-4.1') # Create sample images for context try: sample_images = create_sample_images() except (FileNotFoundError, OSError) as e: print(f'Error loading sample images: {e}') print('Continuing without sample images...') sample_images = [] # Initialize and run the agent agent = Agent(task=task_str, llm=model, sample_images=sample_images) await agent.run() if __name__ == '__main__': asyncio.run(main())