# NOTE(review): removed web-scrape artifact ("Spaces: / Sleeping / Sleeping")
# that is not part of this example's source code.
"""
OpenAI Computer Use Assistant (CUA) Integration

This example demonstrates how to integrate OpenAI's Computer Use Assistant as a fallback
action when standard browser actions are insufficient to achieve the desired goal.
The CUA can perform complex computer interactions that might be difficult to achieve
through regular browser-use actions.
"""
import asyncio
import base64
import os
import sys
from io import BytesIO

from PIL import Image

# Make the repository root importable when running this example directly.
sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))

from dotenv import load_dotenv

load_dotenv()

from openai import AsyncOpenAI
from pydantic import BaseModel, Field

from browser_use import Agent, ChatOpenAI, Tools
from browser_use.agent.views import ActionResult
from browser_use.browser import BrowserSession
class OpenAICUAAction(BaseModel):
    """Parameters for OpenAI Computer Use Assistant action."""

    # NOTE: the field name and its description are part of the tool schema the
    # LLM sees when deciding to call this action — changing either changes how
    # the agent invokes the fallback.
    description: str = Field(..., description='Description of your next goal')
| async def handle_model_action(browser_session: BrowserSession, action) -> ActionResult: | |
| """ | |
| Given a computer action (e.g., click, double_click, scroll, etc.), | |
| execute the corresponding operation using CDP. | |
| """ | |
| action_type = action.type | |
| ERROR_MSG: str = 'Could not execute the CUA action.' | |
| if not browser_session.agent_focus: | |
| return ActionResult(error='No active browser session') | |
| try: | |
| match action_type: | |
| case 'click': | |
| x, y = action.x, action.y | |
| button = action.button | |
| print(f"Action: click at ({x}, {y}) with button '{button}'") | |
| # Not handling things like middle click, etc. | |
| if button != 'left' and button != 'right': | |
| button = 'left' | |
| # Use CDP to click | |
| await browser_session.agent_focus.cdp_client.send.Input.dispatchMouseEvent( | |
| params={ | |
| 'type': 'mousePressed', | |
| 'x': x, | |
| 'y': y, | |
| 'button': button, | |
| 'clickCount': 1, | |
| }, | |
| session_id=browser_session.agent_focus.session_id, | |
| ) | |
| await browser_session.agent_focus.cdp_client.send.Input.dispatchMouseEvent( | |
| params={ | |
| 'type': 'mouseReleased', | |
| 'x': x, | |
| 'y': y, | |
| 'button': button, | |
| }, | |
| session_id=browser_session.agent_focus.session_id, | |
| ) | |
| msg = f'Clicked at ({x}, {y}) with button {button}' | |
| return ActionResult(extracted_content=msg, include_in_memory=True, long_term_memory=msg) | |
| case 'scroll': | |
| x, y = action.x, action.y | |
| scroll_x, scroll_y = action.scroll_x, action.scroll_y | |
| print(f'Action: scroll at ({x}, {y}) with offsets (scroll_x={scroll_x}, scroll_y={scroll_y})') | |
| # Move mouse to position first | |
| await browser_session.agent_focus.cdp_client.send.Input.dispatchMouseEvent( | |
| params={ | |
| 'type': 'mouseMoved', | |
| 'x': x, | |
| 'y': y, | |
| }, | |
| session_id=browser_session.agent_focus.session_id, | |
| ) | |
| # Execute scroll using JavaScript | |
| await browser_session.agent_focus.cdp_client.send.Runtime.evaluate( | |
| params={ | |
| 'expression': f'window.scrollBy({scroll_x}, {scroll_y})', | |
| }, | |
| session_id=browser_session.agent_focus.session_id, | |
| ) | |
| msg = f'Scrolled at ({x}, {y}) with offsets (scroll_x={scroll_x}, scroll_y={scroll_y})' | |
| return ActionResult(extracted_content=msg, include_in_memory=True, long_term_memory=msg) | |
| case 'keypress': | |
| keys = action.keys | |
| for k in keys: | |
| print(f"Action: keypress '{k}'") | |
| # A simple mapping for common keys; expand as needed. | |
| key_code = k | |
| if k.lower() == 'enter': | |
| key_code = 'Enter' | |
| elif k.lower() == 'space': | |
| key_code = 'Space' | |
| # Use CDP to send key | |
| await browser_session.agent_focus.cdp_client.send.Input.dispatchKeyEvent( | |
| params={ | |
| 'type': 'keyDown', | |
| 'key': key_code, | |
| }, | |
| session_id=browser_session.agent_focus.session_id, | |
| ) | |
| await browser_session.agent_focus.cdp_client.send.Input.dispatchKeyEvent( | |
| params={ | |
| 'type': 'keyUp', | |
| 'key': key_code, | |
| }, | |
| session_id=browser_session.agent_focus.session_id, | |
| ) | |
| msg = f'Pressed keys: {keys}' | |
| return ActionResult(extracted_content=msg, include_in_memory=True, long_term_memory=msg) | |
| case 'type': | |
| text = action.text | |
| print(f'Action: type text: {text}') | |
| # Type text character by character | |
| for char in text: | |
| await browser_session.agent_focus.cdp_client.send.Input.dispatchKeyEvent( | |
| params={ | |
| 'type': 'char', | |
| 'text': char, | |
| }, | |
| session_id=browser_session.agent_focus.session_id, | |
| ) | |
| msg = f'Typed text: {text}' | |
| return ActionResult(extracted_content=msg, include_in_memory=True, long_term_memory=msg) | |
| case 'wait': | |
| print('Action: wait') | |
| await asyncio.sleep(2) | |
| msg = 'Waited for 2 seconds' | |
| return ActionResult(extracted_content=msg, include_in_memory=True, long_term_memory=msg) | |
| case 'screenshot': | |
| # Nothing to do as screenshot is taken at each turn | |
| print('Action: screenshot') | |
| return ActionResult(error=ERROR_MSG) | |
| # Handle other actions here | |
| case _: | |
| print(f'Unrecognized action: {action}') | |
| return ActionResult(error=ERROR_MSG) | |
| except Exception as e: | |
| print(f'Error handling action {action}: {e}') | |
| return ActionResult(error=ERROR_MSG) | |
tools = Tools()


# NOTE(review): the original code defined this coroutine but never registered
# it on `tools`, so the Agent could not actually invoke the fallback. The
# decorator below exposes it as a callable action with OpenAICUAAction params.
@tools.action(
    'Use OpenAI Computer Use Assistant (CUA) as a fallback for complex interactions that standard browser actions cannot achieve',
    param_model=OpenAICUAAction,
)
async def openai_cua_fallback(params: OpenAICUAAction, browser_session: BrowserSession):
    """
    Fallback action that uses OpenAI's Computer Use Assistant to perform complex
    computer interactions when standard browser actions are insufficient.

    Takes a screenshot, rescales it to the viewport size, asks the
    computer-use-preview model for one ``computer_call``, and executes it via
    ``handle_model_action``.
    """
    print(f'π― CUA Action Starting - Goal: {params.description}')
    try:
        # Get browser state summary: viewport geometry and screenshot are both
        # required to build the CUA request.
        state = await browser_session.get_browser_state_summary()
        page_info = state.page_info
        if not page_info:
            raise Exception('Page info not found - cannot execute CUA action')
        print(f'π Viewport size: {page_info.viewport_width}x{page_info.viewport_height}')
        screenshot_b64 = state.screenshot
        if not screenshot_b64:
            raise Exception('Screenshot not found - cannot execute CUA action')
        print(f'πΈ Screenshot captured (base64 length: {len(screenshot_b64)} chars)')
        # Debug: Check screenshot dimensions
        image = Image.open(BytesIO(base64.b64decode(screenshot_b64)))
        print(f'π Screenshot actual dimensions: {image.size[0]}x{image.size[1]}')
        # Rescale the screenshot to the viewport size so coordinates returned
        # by the model map 1:1 onto CDP input coordinates.
        image = image.resize((page_info.viewport_width, page_info.viewport_height))
        # Save as PNG to bytes buffer
        buffer = BytesIO()
        image.save(buffer, format='PNG')
        buffer.seek(0)
        # Convert to base64
        screenshot_b64 = base64.b64encode(buffer.getvalue()).decode('utf-8')
        print(f'πΈ Rescaled screenshot to viewport size: {page_info.viewport_width}x{page_info.viewport_height}')
        client = AsyncOpenAI(api_key=os.getenv('OPENAI_API_KEY'))
        print('π Sending request to OpenAI CUA...')
        prompt = f"""
You will be given an action to execute and screenshot of the current screen.
Output one computer_call object that will achieve this goal.
Goal: {params.description}
"""
        response = await client.responses.create(
            model='computer-use-preview',
            tools=[
                {
                    'type': 'computer_use_preview',
                    'display_width': page_info.viewport_width,
                    'display_height': page_info.viewport_height,
                    'environment': 'browser',
                }
            ],
            input=[
                {
                    'role': 'user',
                    'content': [
                        {'type': 'input_text', 'text': prompt},
                        {
                            'type': 'input_image',
                            'detail': 'auto',
                            'image_url': f'data:image/png;base64,{screenshot_b64}',
                        },
                    ],
                }
            ],
            truncation='auto',
            temperature=0.1,
        )
        print(f'π₯ CUA response received: {response}')
        # The model may interleave reasoning items; only execute the first
        # computer_call it proposed.
        computer_calls = [item for item in response.output if item.type == 'computer_call']
        computer_call = computer_calls[0] if computer_calls else None
        if not computer_call:
            raise Exception('No computer calls found in CUA response')
        action = computer_call.action
        print(f'π¬ Executing CUA action: {action.type} - {action}')
        action_result = await handle_model_action(browser_session, action)
        # Give the page a brief moment to react before returning control.
        await asyncio.sleep(0.1)
        print('β CUA action completed successfully')
        return action_result
    except Exception as e:
        msg = f'Error executing CUA action: {e}'
        print(f'β {msg}')
        return ActionResult(error=msg)
async def main():
    """Build an agent wired up with the CUA fallback tool and run the demo task."""
    # A task whose final step deliberately requires the CUA fallback:
    # clicking a link inside a cross-site iframe.
    task = """
    Go to https://csreis.github.io/tests/cross-site-iframe.html
    Click on "Go cross-site, complex page" using index
    Use the OpenAI CUA fallback to click on "Tree is open..." link.
    """
    # Language model driving the agent itself (separate from the CUA model).
    chat_model = ChatOpenAI(
        model='o4-mini',
        temperature=1.0,
    )
    session = BrowserSession()
    # The module-level `tools` instance carries the CUA fallback action.
    agent = Agent(
        task=task,
        llm=chat_model,
        tools=tools,
        browser_session=session,
    )
    print('π Starting agent with CUA fallback support...')
    print(f'Task: {task}')
    print('-' * 50)
    try:
        run_result = await agent.run()
        print(f'\nβ Task completed! Result: {run_result}')
    except Exception as exc:
        print(f'\nβ Error running agent: {exc}')
    finally:
        # Always tear the browser down, even when the run fails.
        await session.kill()
        print('\nπ§Ή Browser session closed')
if __name__ == '__main__':
    # Intro banner: explain when the CUA fallback is worth reaching for.
    # A single joined print emits exactly the same lines as the original
    # one-print-per-line version.
    intro = [
        'π§ OpenAI Computer Use Assistant (CUA) Integration Example',
        '=' * 60,
        '',
        "This example shows how to integrate OpenAI's CUA as a fallback action",
        'when standard browser-use actions cannot achieve the desired goal.',
        '',
        'CUA is particularly useful for:',
        'β’ Complex mouse interactions (drag & drop, precise clicking)',
        'β’ Keyboard shortcuts and key combinations',
        'β’ Actions that require pixel-perfect precision',
        "β’ Custom UI elements that don't respond to standard actions",
        '',
        'Make sure you have OPENAI_API_KEY set in your environment!',
        '',
    ]
    print('\n'.join(intro))
    # Fail fast when the OpenAI API key is missing.
    if not os.getenv('OPENAI_API_KEY'):
        print('β Error: OPENAI_API_KEY environment variable not set')
        print('Please set your OpenAI API key to use CUA integration')
        sys.exit(1)
    # Run the example
    asyncio.run(main())