HMM / browser-use-main /examples /features /add_image_context.py
Speedofmastery's picture
Merge Landrun + Browser-Use + Chromium with AI agent support (without binary files)
d7b3d84
"""
Show how to use sample_images to add image context for your task
"""
import asyncio
import base64
from pathlib import Path
from typing import Any
from dotenv import load_dotenv
from browser_use import Agent
from browser_use.llm import ChatOpenAI
from browser_use.llm.messages import ContentPartImageParam, ContentPartTextParam, ImageURL
# Load environment variables via python-dotenv before any client is created
# (presumably this supplies the API key that ChatOpenAI needs - confirm against your .env)
load_dotenv()
def image_to_base64(image_path: str) -> str:
    """
    Convert an image file to a base64 string.

    Args:
        image_path: Path to the image file

    Returns:
        Base64 encoding of the file's raw bytes, as a UTF-8 string

    Raises:
        FileNotFoundError: If the image file doesn't exist
        OSError: If the image file exists but cannot be read
    """
    try:
        # Read directly rather than checking exists() first: a separate
        # existence check can go stale before the file is opened.
        raw_bytes = Path(image_path).read_bytes()
    except (FileNotFoundError, NotADirectoryError) as e:
        raise FileNotFoundError(f'Image file not found: {image_path}') from e
    except OSError as e:
        # Chain the underlying error so its errno and traceback are preserved.
        raise OSError(f'Failed to read image file: {e}') from e
    return base64.b64encode(raw_bytes).decode('utf-8')
def create_sample_images() -> list[ContentPartTextParam | ContentPartImageParam]:
    """
    Create image context for the agent.

    Builds a text part describing the annotated screenshot, followed by the
    screenshot itself embedded as a base64 data URL.

    Returns:
        list of content parts containing text and image data

    Raises:
        FileNotFoundError: If the sample image file doesn't exist
        OSError: If the sample image file cannot be read
    """
    # Image path - replace with your actual image path
    image_path = 'sample_image.png'
    encoded_image = image_to_base64(image_path)

    # Text that tells the model how to interpret the annotated image
    description = (
        'The following image explains the google layout. '
        'The image highlights several buttons with red boxes, '
        'and next to them are corresponding labels in red text.\n'
        'Each label corresponds to a button as follows:\n'
        'Label 1 is the "image" button.'
    )

    content_parts: list[ContentPartTextParam | ContentPartImageParam] = [
        ContentPartTextParam(text=description),
        ContentPartImageParam(
            image_url=ImageURL(
                # The declared MIME type must match the file's real format.
                # The sample is a PNG, so it is labelled image/png here;
                # update both values if image_path points to another format.
                url=f'data:image/png;base64,{encoded_image}',
                media_type='image/png',
            ),
        ),
    ]
    return content_parts
async def main() -> None:
    """
    Run the browser agent on a Google task, attaching image context when available.
    """
    # Language model that drives the agent
    llm = ChatOpenAI(model='gpt-4.1')

    # Image context is optional: if the sample image cannot be loaded,
    # report the problem and run the agent without it.
    try:
        context_images = create_sample_images()
    except (FileNotFoundError, OSError) as load_error:
        print(f'Error loading sample images: {load_error}')
        print('Continuing without sample images...')
        context_images = []

    # Build the agent with the task description and run it to completion
    browser_agent = Agent(
        task='goto https://www.google.com/ and click image button',
        llm=llm,
        sample_images=context_images,
    )
    await browser_agent.run()
# Start the async entry point only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    asyncio.run(main())