Spaces:

R-Kentaren
/

suna

Paused

App Files Files Community

suna / backend /agent /tools /browser_tool.py

R-Kentaren

Upload folder using huggingface_hub

4efde5d verified 6 months ago

raw

history blame contribute delete

22 kB

	from agentpress.tool import ToolResult, openapi_schema, usage_example
	from agentpress.thread_manager import ThreadManager
	from sandbox.tool_base import SandboxToolsBase
	from utils.logger import logger
	from utils.s3_upload_utils import upload_base64_image
	import asyncio
	import json
	import base64
	import io
	import traceback
	from PIL import Image
	from utils.config import config

	class BrowserTool(SandboxToolsBase):
	    """
	    Browser Tool for browser automation using local Stagehand API.

	    This tool provides browser automation capabilities using a local Stagehand API server,
	    replacing the sandbox browser tool functionality.

	    Every request is issued as a ``curl`` command executed inside the sandbox
	    against ``http://localhost:8004/api``.

	    Only 4 core functions that can handle everything:
	    - browser_navigate_to: Navigate to URLs
	    - browser_act: Perform any action (click, type, scroll, dropdowns etc.)
	    - browser_extract_content: Extract content from pages
	    - browser_screenshot: Take screenshots
	    """

	    def __init__(self, project_id: str, thread_id: str, thread_manager: ThreadManager):
	        """Store the thread id and delegate the rest of the setup to the base class.

	        Args:
	            project_id: Forwarded to ``SandboxToolsBase.__init__``.
	            thread_id: Thread that ``browser_state`` messages are added to.
	            thread_manager: Forwarded to ``SandboxToolsBase.__init__``; also used
	                by this class (via ``self.thread_manager``) to persist results.
	        """
	        super().__init__(project_id, thread_manager)
	        self.thread_id = thread_id

	    def _validate_base64_image(self, base64_string: str, max_size_mb: int = 10) -> tuple[bool, str]:
	        """
	        Comprehensive validation of base64 image data.

	        The checks run in order: non-empty input, optional ``data:`` URL prefix
	        removal, base64 alphabet, length multiple of 4, strict decoding, decoded
	        size limit, and finally PIL verification plus a format allow-list.

	        Args:
	            base64_string (str): The base64 encoded image data
	            max_size_mb (int): Maximum allowed image size in megabytes

	        Returns:
	            tuple[bool, str]: (is_valid, error_message)
	        """
	        import re  # function-scoped, as in the original implementation

	        try:
	            # Check if data exists and has reasonable length
	            if not base64_string or len(base64_string) < 10:
	                return False, "Base64 string is empty or too short"

	            # Remove data URL prefix if present (data:image/jpeg;base64,...)
	            if base64_string.startswith('data:'):
	                _, separator, payload = base64_string.partition(',')
	                if not separator:
	                    return False, "Invalid data URL format"
	                base64_string = payload

	            # Base64 alphabet: A-Z, a-z, 0-9, +, /, = (padding).
	            # fullmatch (rather than match + '$') also rejects a trailing newline.
	            if not re.fullmatch(r'[A-Za-z0-9+/]*={0,2}', base64_string):
	                return False, "Invalid base64 characters detected"

	            # Check if base64 string length is valid (must be multiple of 4)
	            if len(base64_string) % 4 != 0:
	                return False, "Invalid base64 string length"

	            # Attempt to decode base64
	            try:
	                image_data = base64.b64decode(base64_string, validate=True)
	            except Exception as e:
	                return False, f"Base64 decoding failed: {str(e)}"

	            # Check decoded data size
	            if len(image_data) == 0:
	                return False, "Decoded image data is empty"

	            # Check if decoded data size exceeds limit
	            max_size_bytes = max_size_mb * 1024 * 1024
	            if len(image_data) > max_size_bytes:
	                return False, f"Image size ({len(image_data)} bytes) exceeds limit ({max_size_bytes} bytes)"

	            # Validate that decoded data is actually a valid image using PIL
	            try:
	                with Image.open(io.BytesIO(image_data)) as img:
	                    img.verify()

	                    # Check if image format is supported
	                    supported_formats = {'JPEG', 'PNG', 'GIF', 'BMP', 'WEBP', 'TIFF'}
	                    if img.format not in supported_formats:
	                        return False, f"Unsupported image format: {img.format}"

	                    return True, "Image validation successful"

	            except Exception as e:
	                return False, f"Image validation failed: {str(e)}"

	        except Exception as e:
	            # Anything unexpected (e.g. a non-str argument) is reported, never raised.
	            return False, f"Image validation error: {str(e)}"

	    async def _debug_sandbox_services(self) -> str:
	        """Debug method to check what services are running in the sandbox.

	        Runs a process listing and a listening-port check (ports 8003/8004)
	        inside the sandbox and formats both outputs into one text report.

	        Returns:
	            str: The report, or an ``"Error getting debug info: ..."`` message
	            if anything raised.
	        """
	        try:
	            await self._ensure_sandbox()

	            # Plain '|' is required: an escaped '\|' would reach the shell as a
	            # literal character and break both the pipeline and the -E alternation.
	            ps_cmd = "ps aux | grep -E '(python|uvicorn|stagehand|node)' | grep -v grep"
	            ps_response = await self.sandbox.process.exec(ps_cmd, timeout=10)
	            processes = ps_response.result if ps_response.exit_code == 0 else "Failed to get process list"

	            # Try netstat first, then ss, then fall back to a fixed message.
	            netstat_cmd = "netstat -tlnp 2>/dev/null | grep -E ':(8003|8004)' || ss -tlnp 2>/dev/null | grep -E ':(8003|8004)' || echo 'No netstat/ss available'"
	            port_response = await self.sandbox.process.exec(netstat_cmd, timeout=10)
	            ports = port_response.result if port_response.exit_code == 0 else "Failed to get port list"

	            debug_info = f"""
	=== Sandbox Services Debug Info ===
	Running processes:
	{processes}

	Listening ports:
	{ports}

	=== End Debug Info ===
	"""
	            return debug_info

	        except Exception as e:
	            return f"Error getting debug info: {e}"

	    async def _check_stagehand_api_health(self) -> bool:
	        """Check if the Stagehand API server is running and accessible.

	        Issues ``GET /api`` via curl inside the sandbox. If the server answers
	        with JSON whose ``status`` is not ``"healthy"``, a ``POST /api/init``
	        carrying ``config.GEMINI_API_KEY`` is sent to (re)initialise it.

	        Returns:
	            bool: True if the server reported healthy or the init call exited
	            with code 0; False otherwise (including on any exception).
	        """
	        import shlex

	        try:
	            await self._ensure_sandbox()

	            # Simple health check curl command
	            curl_cmd = "curl -s -X GET 'http://localhost:8004/api' -H 'Content-Type: application/json'"
	            logger.debug(f"Checking Stagehand API health with: {curl_cmd}")

	            response = await self.sandbox.process.exec(curl_cmd, timeout=10)
	            if response.exit_code != 0:
	                logger.warning(f"Stagehand API server health check failed with exit code {response.exit_code}")
	                return False

	            try:
	                result = json.loads(response.result)
	            except json.JSONDecodeError:
	                logger.warning(f"Stagehand API server responded but with invalid JSON: {response.result}")
	                return False

	            if result.get("status") == "healthy":
	                logger.debug("✅ Stagehand API server is running and healthy")
	                return True

	            # If the browser api is not healthy, we need to restart the browser api.
	            # The payload is serialised with json.dumps and shell-quoted so that a
	            # key containing quotes cannot break the JSON or the command line.
	            init_payload = json.dumps({"api_key": config.GEMINI_API_KEY})
	            init_cmd = (
	                "curl -X POST 'http://localhost:8004/api/init' "
	                f"-H 'Content-Type: application/json' -d {shlex.quote(init_payload)}"
	            )
	            response = await self.sandbox.process.exec(init_cmd, timeout=90)
	            if response.exit_code == 0:
	                logger.debug("Stagehand API server restarted successfully")
	                return True

	            logger.warning(f"Stagehand API server restart failed: {response.result}")
	            return False

	        except Exception as e:
	            logger.error(f"Error checking Stagehand API health: {e}")
	            return False

	    def _build_curl_command(self, endpoint: str, params: dict = None, method: str = "POST") -> str:
	        """Build the shell-safe curl command for one Stagehand API call.

	        For ``GET`` with params, the params are URL-encoded into the query
	        string; for any other method, params are sent as a JSON body.
	        """
	        import shlex
	        from urllib.parse import urlencode

	        url = f"http://localhost:8004/api/{endpoint}"  # Fixed localhost as curl runs inside container
	        send_as_query = method == "GET" and bool(params)
	        if send_as_query:
	            url = f"{url}?{urlencode(params)}"

	        curl_cmd = f"curl -s -X {method} {shlex.quote(url)} -H 'Content-Type: application/json'"
	        if params and not send_as_query:
	            # shlex.quote keeps payloads containing single quotes
	            # (e.g. "click the 'login' button") from breaking the command.
	            curl_cmd += f" -d {shlex.quote(json.dumps(params))}"
	        return curl_cmd

	    async def _process_screenshot(self, result: dict) -> None:
	        """Replace ``screenshot_base64`` in ``result`` with an uploaded ``image_url``.

	        The raw base64 payload is always removed from ``result``, even when
	        validation or upload fails, so it is never persisted with the message.
	        On failure ``image_validation_error`` or ``image_upload_error`` is set.
	        """
	        if "screenshot_base64" not in result:
	            return

	        screenshot_data = result.pop("screenshot_base64")
	        try:
	            is_valid, validation_message = self._validate_base64_image(screenshot_data)
	            if is_valid:
	                logger.debug(f"Screenshot validation passed: {validation_message}")
	                image_url = await upload_base64_image(screenshot_data)
	                result["image_url"] = image_url
	                logger.debug(f"Uploaded screenshot to {image_url}")
	            else:
	                logger.warning(f"Screenshot validation failed: {validation_message}")
	                result["image_validation_error"] = validation_message
	        except Exception as e:
	            logger.error(f"Failed to process screenshot: {e}")
	            result["image_upload_error"] = str(e)

	    def _build_clean_result(self, result: dict, message_id) -> dict:
	        """Reduce the raw API result to the fields returned to the agent.

	        Copies ``url``, ``title``, ``action`` and ``image_url`` when truthy,
	        surfaces screenshot problems under ``screenshot_issue`` and attaches
	        ``message_id``. ``success`` defaults to True when the API omits it.
	        """
	        clean_result = {
	            "success": result.get("success", True),
	            "message": result.get("message", "Stagehand action completed successfully")
	        }

	        # Include only data that actually comes from browserApi.ts
	        # (image_url is screenshot_base64 converted to an uploaded URL).
	        for key in ("url", "title", "action", "image_url"):
	            if result.get(key):
	                clean_result[key] = result[key]

	        # Include any error context that's useful for the agent
	        if result.get("image_validation_error"):
	            clean_result["screenshot_issue"] = f"Screenshot processing issue: {result['image_validation_error']}"
	        if result.get("image_upload_error"):
	            clean_result["screenshot_issue"] = f"Screenshot upload issue: {result['image_upload_error']}"

	        clean_result["message_id"] = message_id
	        return clean_result

	    async def _execute_stagehand_api(self, endpoint: str, params: dict = None, method: str = "POST") -> ToolResult:
	        """Execute a Stagehand action through the sandbox API.

	        Steps: ensure the sandbox exists, health-check the API server, run the
	        curl command inside the sandbox, convert any returned screenshot into
	        an uploaded image URL, store the full result as a ``browser_state``
	        message on the thread, and return a trimmed result to the agent.

	        Args:
	            endpoint: Path segment appended to ``http://localhost:8004/api/``.
	            params: Request parameters; sent as a query string for ``GET`` and
	                as a JSON body otherwise.
	            method: HTTP method passed to ``curl -X``.

	        Returns:
	            ToolResult: ``success_response`` with the trimmed result when its
	            ``success`` field is truthy, otherwise ``fail_response``. Exceptions
	            are caught and reported as a failure response, never raised.
	        """
	        try:
	            # Ensure sandbox is initialized
	            await self._ensure_sandbox()

	            # Check if Stagehand API server is running
	            if not await self._check_stagehand_api_health():
	                error_msg = "Stagehand API server is not running. Please ensure the Stagehand API server is running."

	                # Add debug information
	                debug_info = await self._debug_sandbox_services()
	                error_msg += f"\n\nDebug information:\n{debug_info}"

	                logger.error(error_msg)
	                return self.fail_response(error_msg)

	            # Build the curl command to call the local Stagehand API
	            curl_cmd = self._build_curl_command(endpoint, params, method)
	            logger.debug(f"\033[95mExecuting curl command:\033[0m\n{curl_cmd}")

	            response = await self.sandbox.process.exec(curl_cmd, timeout=30)  # Execute curl inside sandbox

	            if response.exit_code != 0:
	                # Check if it's a connection error (exit code 7)
	                if response.exit_code == 7:
	                    error_msg = f"Stagehand API server is not available on port 8004. Please ensure the Stagehand API server is running. Error: {response}"
	                    logger.error(error_msg)
	                    return self.fail_response(error_msg)

	                logger.error(f"Stagehand API request failed: {response}")
	                return self.fail_response(f"Stagehand API request failed: {response}")

	            try:
	                result = json.loads(response.result)
	            except json.JSONDecodeError as e:
	                logger.error(f"Failed to parse response JSON: {response.result} {e}")
	                return self.fail_response(f"Failed to parse response JSON: {response.result} {e}")

	            # Strip/upload the screenshot first so the debug log below does not
	            # dump the whole base64 payload.
	            await self._process_screenshot(result)
	            logger.debug(f"Stagehand API result: {result}")
	            logger.debug("Stagehand API request completed successfully")

	            result["input"] = params
	            added_message = await self.thread_manager.add_message(
	                thread_id=self.thread_id,
	                type="browser_state",
	                content=result,
	                is_llm_message=False
	            )
	            # Guard against add_message returning nothing: the browser action
	            # itself succeeded and should not be reported as a failure.
	            message_id = added_message.get("message_id") if added_message else None

	            # Prepare clean response for agent (filter out internal metadata)
	            clean_result = self._build_clean_result(result, message_id)

	            if clean_result.get("success"):
	                return self.success_response(clean_result)

	            # Handle error responses with helpful context
	            clean_result["message"] = result.get("error", result.get("message", "Unknown error"))
	            return self.fail_response(clean_result)

	        except Exception as e:
	            logger.error(f"Error executing Stagehand action: {e}")
	            logger.debug(traceback.format_exc())
	            return self.fail_response(f"Error executing Stagehand action: {e}")

	    # Core Functions Only

	    @openapi_schema({
	        "type": "function",
	        "function": {
	            "name": "browser_navigate_to",
	            "description": "Navigate to a specific url",
	            "parameters": {
	                "type": "object",
	                "properties": {
	                    "url": {
	                        "type": "string",
	                        "description": "The url to navigate to"
	                    }
	                },
	                "required": ["url"]
	            }
	        }
	    })
	    @usage_example('''
	<function_calls>
	<invoke name="browser_navigate_to">
	<parameter name="url">https://example.com</parameter>
	</invoke>
	</function_calls>
	''')
	    async def browser_navigate_to(self, url: str) -> ToolResult:
	        """Navigate to a URL using Stagehand (``navigate`` endpoint)."""
	        logger.debug(f"Browser navigating to: {url}")
	        return await self._execute_stagehand_api("navigate", {"url": url})

	    @openapi_schema({
	        "type": "function",
	        "function": {
	            "name": "browser_act",
	            "description": "Perform any browser action using natural language description. CRITICAL: This tool automatically provides a screenshot with every action. For data entry actions (filling forms, entering text, selecting options), you MUST review the provided screenshot to verify that displayed values exactly match what was intended. Report mismatches immediately. CRITICAL FILE UPLOAD RULE: ANY action that involves clicking, interacting with, or locating upload buttons, file inputs, resume upload sections, or any element that might trigger a choose file dialog MUST include the filePath parameter with filePath. This includes actions like 'click upload button', 'locate resume section', 'find file input' etc. Always err on the side of caution - if there's any possibility the action might lead to a file dialog, include filePath. This prevents accidental file dialog triggers without proper file handling.",
	            "parameters": {
	                "type": "object",
	                "properties": {
	                    "action": {
	                        "type": "string",
	                        "description": "The action to perform. Examples: 'click the login button', 'fill in the email field with %email%', 'scroll down to see more content', 'select option 2 from the dropdown', 'press Enter', 'go back', 'wait 5 seconds', 'click at coordinates 100,200', 'select United States from the country dropdown'"
	                    },
	                    "variables": {
	                        "type": "object",
	                        "description": "Variables to use in the action. Variables in the action string are referenced using %variable_name%. These variables are NOT shared with LLM providers for security.",
	                        "additionalProperties": {"type": "string"},
	                        "default": {}
	                    },
	                    "iframes": {
	                        "type": "boolean",
	                        "description": "Whether to include iframe content in the action. Set to true if the target element is inside an iframe.",
	                        "default": True
	                    },
	                    "filePath": {
	                        "type": "string",
	                        "description": "CRITICAL: REQUIRED for ANY action that might involve file uploads. This includes: clicking upload buttons, locating resume sections, finding file inputs, scrolling to upload areas, or any action that could potentially trigger a file dialog. Always include this parameter when dealing with upload-related elements to prevent accidental file dialog triggers. The tool will automatically handle the file upload after the action is performed.",
	                    }
	                },
	                "required": ["action"]
	            }
	        }
	    })
	    @usage_example('''
	<function_calls>
	<invoke name="browser_act">
	<parameter name="action">fill in the login form with %username% and %password%</parameter>
	<parameter name="variables">{"username": "john.doe", "password": "secret123"}</parameter>
	<parameter name="iframes">true</parameter>
	</invoke>
	</function_calls>

	<function_calls>
	<invoke name="browser_act">
	<parameter name="action">click on upload resume button</parameter>
	<parameter name="filePath">/workspace/downloads/document.pdf</parameter>
	</invoke>
	</function_calls>
	''')
	    async def browser_act(self, action: str, variables: dict = None, iframes: bool = False, filePath: str = None) -> ToolResult:
	        """Perform any browser action using Stagehand (``act`` endpoint).

	        Args:
	            action: Natural-language description of the action.
	            variables: Values substituted for ``%name%`` placeholders in ``action``;
	                only their presence (never their content) is logged here.
	            iframes: Whether iframe content is included.
	            filePath: Path of a file to upload; only sent when truthy. The schema
	                declares it as a string.
	        """
	        # NOTE(review): the schema advertises iframes default=True while the
	        # Python default is False -- confirm which one is intended.
	        logger.debug(f"Browser acting: {action} (variables={'***' if variables else None}, iframes={iframes}), filePath={filePath}")
	        params = {"action": action, "iframes": iframes, "variables": variables}
	        if filePath:
	            params["filePath"] = filePath
	        return await self._execute_stagehand_api("act", params)

	    @openapi_schema({
	        "type": "function",
	        "function": {
	            "name": "browser_extract_content",
	            "description": "Extract structured content from the current page using Stagehand",
	            "parameters": {
	                "type": "object",
	                "properties": {
	                    "instruction": {
	                        "type": "string",
	                        "description": "What content to extract (e.g., 'extract all product prices', 'get the main heading', 'extract apartment listings with address and price')"
	                    },
	                    "iframes": {
	                        "type": "boolean",
	                        "description": "Whether to include iframe content in the extraction. Set to true if the target content is inside an iframe.",
	                        "default": True
	                    }
	                },
	                "required": ["instruction"]
	            }
	        }
	    })
	    @usage_example('''
	<function_calls>
	<invoke name="browser_extract_content">
	<parameter name="instruction">extract all product names and prices from the main product list</parameter>
	<parameter name="iframes">true</parameter>
	</invoke>
	</function_calls>
	''')
	    async def browser_extract_content(self, instruction: str, iframes: bool = False) -> ToolResult:
	        """Extract structured content from the current page using Stagehand (``extract`` endpoint)."""
	        # NOTE(review): schema default for iframes is True, Python default is False -- confirm.
	        logger.debug(f"Browser extracting: {instruction} (iframes={iframes})")
	        params = {"instruction": instruction, "iframes": iframes}
	        return await self._execute_stagehand_api("extract", params)

	    @openapi_schema({
	        "type": "function",
	        "function": {
	            "name": "browser_screenshot",
	            "description": "Take a screenshot of the current page",
	            "parameters": {
	                "type": "object",
	                "properties": {
	                    "name": {
	                        "type": "string",
	                        "description": "Name for the screenshot",
	                        "default": "screenshot"
	                    }
	                }
	            }
	        }
	    })
	    @usage_example('''
	<function_calls>
	<invoke name="browser_screenshot">
	<parameter name="name">page_screenshot</parameter>
	</invoke>
	</function_calls>
	''')
	    async def browser_screenshot(self, name: str = "screenshot") -> ToolResult:
	        """Take a screenshot using Stagehand (``screenshot`` endpoint)."""
	        logger.debug(f"Browser taking screenshot: {name}")
	        return await self._execute_stagehand_api("screenshot", {"name": name})