import base64
import io
import json
import re
import shlex
import traceback
from urllib.parse import urlencode

from PIL import Image

from agentpress.tool import ToolResult, openapi_schema, xml_schema
from agentpress.thread_manager import ThreadManager
from sandbox.tool_base import SandboxToolsBase
from utils.logger import logger
from utils.s3_upload_utils import upload_base64_image
|
|
|
|
class SandboxBrowserTool(SandboxToolsBase):
    """Tool for executing tasks in a Daytona sandbox with browser-use capabilities.

    Every public ``browser_*`` method is a thin wrapper that forwards its
    arguments to ``_execute_browser_action``, which runs ``curl`` inside the
    sandbox against ``http://localhost:8003/api/automation/<endpoint>``.
    """

    # Image formats accepted by ``_validate_base64_image``.
    _SUPPORTED_IMAGE_FORMATS = frozenset({'JPEG', 'PNG', 'GIF', 'BMP', 'WEBP', 'TIFF'})
    # Largest accepted image width/height, in pixels.
    _MAX_IMAGE_DIMENSION = 8192
    # Standard base64 alphabet followed by at most two padding characters.
    _BASE64_PATTERN = re.compile(r'[A-Za-z0-9+/]*={0,2}')

    def __init__(self, project_id: str, thread_id: str, thread_manager: ThreadManager):
        """Create the tool for one project and one conversation thread.

        Args:
            project_id (str): Passed through to ``SandboxToolsBase``.
            thread_id (str): Thread that receives the ``browser_state`` messages.
            thread_manager (ThreadManager): Passed through to ``SandboxToolsBase``.
        """
        super().__init__(project_id, thread_manager)
        self.thread_id = thread_id

    def _validate_base64_image(self, base64_string: str, max_size_mb: int = 10) -> tuple[bool, str]:
        """Check that a base64 string decodes to an acceptable image.

        The checks run from cheapest to most expensive: string shape, base64
        alphabet and length, decoded size (computed from the string length so
        an oversized payload is never decoded), then image integrity, format
        and dimensions via Pillow.

        Args:
            base64_string (str): The base64 encoded image data, optionally
                prefixed with a ``data:...,`` URL header.
            max_size_mb (int): Maximum allowed decoded size in megabytes.

        Returns:
            tuple[bool, str]: ``(is_valid, message)``. The message explains the
            failure, or is ``"Valid image"`` on success. The method reports
            problems through the return value rather than raising.
        """
        try:
            if not base64_string or len(base64_string) < 10:
                return False, "Base64 string is empty or too short"

            # Drop a "data:<mime>;base64," prefix; only the payload is validated.
            if base64_string.startswith('data:'):
                _, separator, payload = base64_string.partition(',')
                if not separator:
                    return False, "Invalid data URL format"
                base64_string = payload
                if not base64_string:
                    # A data URL with nothing after the comma decodes to zero bytes.
                    return False, "Decoded image data is empty"

            # fullmatch (unlike match + "$") also rejects a trailing newline.
            if not self._BASE64_PATTERN.fullmatch(base64_string):
                return False, "Invalid base64 characters detected"

            if len(base64_string) % 4 != 0:
                return False, "Invalid base64 string length"

            # Every 4 characters encode 3 bytes, minus one byte per "=" pad, so
            # the size limit can be enforced before allocating the decoded data.
            max_size_bytes = max_size_mb * 1024 * 1024
            padding = len(base64_string) - len(base64_string.rstrip('='))
            decoded_size = len(base64_string) // 4 * 3 - padding
            if decoded_size > max_size_bytes:
                return False, f"Image size ({decoded_size} bytes) exceeds limit ({max_size_bytes} bytes)"

            try:
                image_data = base64.b64decode(base64_string, validate=True)
            except ValueError as e:  # binascii.Error is a ValueError subclass
                return False, f"Base64 decoding failed: {str(e)}"

            if len(image_data) == 0:
                return False, "Decoded image data is empty"

            try:
                with Image.open(io.BytesIO(image_data)) as img:
                    # Format and size come from the header parsed by open(); read
                    # them before verify(), after which the object should not be
                    # used to load image data.
                    image_format = img.format
                    width, height = img.size
                    img.verify()
            except Exception as e:
                return False, f"Invalid image data: {str(e)}"

            if image_format not in self._SUPPORTED_IMAGE_FORMATS:
                return False, f"Unsupported image format: {image_format}"

            max_dimension = self._MAX_IMAGE_DIMENSION
            if width > max_dimension or height > max_dimension:
                return False, f"Image dimensions ({width}x{height}) exceed limit ({max_dimension}x{max_dimension})"

            if width < 1 or height < 1:
                return False, f"Invalid image dimensions: {width}x{height}"

            logger.debug(f"Valid image detected: {image_format}, {width}x{height}, {len(image_data)} bytes")
            return True, "Valid image"

        except Exception as e:
            # Boundary handler: validation must never raise into the caller.
            logger.error(f"Unexpected error during base64 image validation: {e}")
            return False, f"Validation error: {str(e)}"

    @staticmethod
    def _build_curl_command(endpoint: str, params: dict = None, method: str = "POST") -> str:
        """Build the shell-safe ``curl`` command line for one automation request.

        GET requests carry ``params`` as a URL-encoded query string; every
        other method sends them as a JSON body. All dynamic parts are passed
        through ``shlex.quote`` so quotes or shell metacharacters inside user
        supplied values cannot terminate the argument.
        """
        url = f"http://localhost:8003/api/automation/{endpoint}"
        as_query = bool(params) and method == "GET"
        if as_query:
            url = f"{url}?{urlencode(params)}"

        command = f"curl -s -X {shlex.quote(method)} {shlex.quote(url)} -H 'Content-Type: application/json'"
        if params and not as_query:
            command += f" -d {shlex.quote(json.dumps(params))}"
        return command

    async def _execute_browser_action(self, endpoint: str, params: dict = None, method: str = "POST") -> ToolResult:
        """Execute a browser automation action through the API

        Runs ``curl`` inside the sandbox, stores the full JSON result as a
        ``browser_state`` message on the thread, and returns a trimmed summary.

        Args:
            endpoint (str): The API endpoint to call
            params (dict, optional): Parameters to send. Defaults to None.
            method (str, optional): HTTP method to use. Defaults to "POST".

        Returns:
            ToolResult: Success response when the API result has a truthy
            ``success`` field, otherwise a failure response. Errors are
            reported as failure responses rather than raised.
        """
        try:
            await self._ensure_sandbox()

            curl_cmd = self._build_curl_command(endpoint, params, method)
            logger.debug("\033[95mExecuting curl command:\033[0m")
            logger.debug(f"{curl_cmd}")

            response = await self.sandbox.process.exec(curl_cmd, timeout=30)

            if response.exit_code != 0:
                logger.error(f"Browser automation request failed 2: {response}")
                return self.fail_response(f"Browser automation request failed 2: {response}")

            # Only the parse itself is guarded, so a decode error raised by later
            # steps is not misreported as a malformed API response.
            try:
                result = json.loads(response.result)
            except json.JSONDecodeError as e:
                logger.error(f"Failed to parse response JSON: {response.result} {e}")
                return self.fail_response(f"Failed to parse response JSON: {response.result} {e}")

            if not isinstance(result, dict):
                logger.error(f"Unexpected browser automation response: {response.result}")
                return self.fail_response(f"Unexpected browser automation response: {response.result}")

            result.setdefault("content", "")
            result.setdefault("role", "assistant")

            logger.info("Browser automation request completed successfully")

            if "screenshot_base64" in result:
                try:
                    screenshot_data = result["screenshot_base64"]
                    is_valid, validation_message = self._validate_base64_image(screenshot_data)

                    if is_valid:
                        logger.debug(f"Screenshot validation passed: {validation_message}")
                        image_url = await upload_base64_image(screenshot_data)
                        result["image_url"] = image_url
                        logger.debug(f"Uploaded screenshot to {image_url}")
                    else:
                        logger.warning(f"Screenshot validation failed: {validation_message}")
                        result["image_validation_error"] = validation_message

                    # The raw payload is dropped once it was uploaded or rejected.
                    del result["screenshot_base64"]

                except Exception as e:
                    # NOTE(review): on an upload error the raw base64 stays in
                    # ``result`` and is stored with the message below — confirm
                    # that this fallback is intended.
                    logger.error(f"Failed to process screenshot: {e}")
                    result["image_upload_error"] = str(e)

            added_message = await self.thread_manager.add_message(
                thread_id=self.thread_id,
                type="browser_state",
                content=result,
                is_llm_message=False
            )

            # Trimmed view of the result that is handed back to the caller.
            summary = {}
            if result.get("success"):
                summary["success"] = result["success"]
                summary["message"] = result.get("message", "Browser action completed successfully")
            else:
                summary["success"] = False
                summary["message"] = result.get("message", "Browser action failed")

            if added_message and 'message_id' in added_message:
                summary['message_id'] = added_message['message_id']
            for source_key, target_key in (
                ("url", "url"),
                ("title", "title"),
                ("element_count", "elements_found"),
            ):
                if result.get(source_key):
                    summary[target_key] = result[source_key]
            if result.get("pixels_below"):
                summary["scrollable_content"] = result["pixels_below"] > 0
            for key in ("ocr_text", "image_url"):
                if result.get(key):
                    summary[key] = result[key]

            if summary.get("success"):
                return self.success_response(summary)
            return self.fail_response(summary)

        except Exception as e:
            logger.error(f"Error executing browser action: {e}")
            logger.debug(traceback.format_exc())
            return self.fail_response(f"Error executing browser action: {e}")

    async def _scroll(self, direction: str, amount: int = None) -> ToolResult:
        """Shared implementation of ``browser_scroll_down`` / ``browser_scroll_up``.

        ``direction`` is ``"down"`` or ``"up"``; ``amount`` is only sent when given.
        """
        params = {}
        if amount is not None:
            params["amount"] = amount
            logger.debug(f"\033[95mScrolling {direction} by {amount} pixels\033[0m")
        else:
            logger.debug(f"\033[95mScrolling {direction} one page\033[0m")

        return await self._execute_browser_action(f"scroll_{direction}", params)

    @openapi_schema({
        "type": "function",
        "function": {
            "name": "browser_navigate_to",
            "description": "Navigate to a specific url",
            "parameters": {
                "type": "object",
                "properties": {
                    "url": {
                        "type": "string",
                        "description": "The url to navigate to"
                    }
                },
                "required": ["url"]
            }
        }
    })
    @xml_schema(
        tag_name="browser-navigate-to",
        mappings=[
            {"param_name": "url", "node_type": "content", "path": "."}
        ],
        example='''
        <function_calls>
        <invoke name="browser_navigate_to">
        <parameter name="url">https://example.com</parameter>
        </invoke>
        </function_calls>
        '''
    )
    async def browser_navigate_to(self, url: str) -> ToolResult:
        """Navigate to a specific url

        Args:
            url (str): The url to navigate to

        Returns:
            ToolResult: Result of the ``navigate_to`` action
        """
        return await self._execute_browser_action("navigate_to", {"url": url})

    @openapi_schema({
        "type": "function",
        "function": {
            "name": "browser_go_back",
            "description": "Navigate back in browser history",
            "parameters": {
                "type": "object",
                "properties": {}
            }
        }
    })
    @xml_schema(
        tag_name="browser-go-back",
        mappings=[],
        example='''
        <function_calls>
        <invoke name="browser_go_back">
        </invoke>
        </function_calls>
        '''
    )
    async def browser_go_back(self) -> ToolResult:
        """Navigate back in browser history

        Returns:
            ToolResult: Result of the ``go_back`` action
        """
        logger.debug("\033[95mNavigating back in browser history\033[0m")
        return await self._execute_browser_action("go_back", {})

    @openapi_schema({
        "type": "function",
        "function": {
            "name": "browser_wait",
            "description": "Wait for the specified number of seconds",
            "parameters": {
                "type": "object",
                "properties": {
                    "seconds": {
                        "type": "integer",
                        "description": "Number of seconds to wait (default: 3)"
                    }
                }
            }
        }
    })
    @xml_schema(
        tag_name="browser-wait",
        mappings=[
            {"param_name": "seconds", "node_type": "content", "path": "."}
        ],
        example='''
        <function_calls>
        <invoke name="browser_wait">
        <parameter name="seconds">5</parameter>
        </invoke>
        </function_calls>
        '''
    )
    async def browser_wait(self, seconds: int = 3) -> ToolResult:
        """Wait for the specified number of seconds

        Args:
            seconds (int, optional): Number of seconds to wait. Defaults to 3.

        Returns:
            ToolResult: Result of the ``wait`` action
        """
        logger.debug(f"\033[95mWaiting for {seconds} seconds\033[0m")
        return await self._execute_browser_action("wait", {"seconds": seconds})

    @openapi_schema({
        "type": "function",
        "function": {
            "name": "browser_click_element",
            "description": "Click on an element by index",
            "parameters": {
                "type": "object",
                "properties": {
                    "index": {
                        "type": "integer",
                        "description": "The index of the element to click"
                    }
                },
                "required": ["index"]
            }
        }
    })
    @xml_schema(
        tag_name="browser-click-element",
        mappings=[
            {"param_name": "index", "node_type": "content", "path": "."}
        ],
        example='''
        <function_calls>
        <invoke name="browser_click_element">
        <parameter name="index">2</parameter>
        </invoke>
        </function_calls>
        '''
    )
    async def browser_click_element(self, index: int) -> ToolResult:
        """Click on an element by index

        Args:
            index (int): The index of the element to click

        Returns:
            ToolResult: Result of the ``click_element`` action
        """
        logger.debug(f"\033[95mClicking element with index: {index}\033[0m")
        return await self._execute_browser_action("click_element", {"index": index})

    @openapi_schema({
        "type": "function",
        "function": {
            "name": "browser_input_text",
            "description": "Input text into an element",
            "parameters": {
                "type": "object",
                "properties": {
                    "index": {
                        "type": "integer",
                        "description": "The index of the element to input text into"
                    },
                    "text": {
                        "type": "string",
                        "description": "The text to input"
                    }
                },
                "required": ["index", "text"]
            }
        }
    })
    @xml_schema(
        tag_name="browser-input-text",
        mappings=[
            {"param_name": "index", "node_type": "attribute", "path": "."},
            {"param_name": "text", "node_type": "content", "path": "."}
        ],
        example='''
        <function_calls>
        <invoke name="browser_input_text">
        <parameter name="index">2</parameter>
        <parameter name="text">Hello, world!</parameter>
        </invoke>
        </function_calls>
        '''
    )
    async def browser_input_text(self, index: int, text: str) -> ToolResult:
        """Input text into an element

        Args:
            index (int): The index of the element to input text into
            text (str): The text to input

        Returns:
            ToolResult: Result of the ``input_text`` action
        """
        logger.debug(f"\033[95mInputting text into element {index}: {text}\033[0m")
        return await self._execute_browser_action("input_text", {"index": index, "text": text})

    @openapi_schema({
        "type": "function",
        "function": {
            "name": "browser_send_keys",
            "description": "Send keyboard keys such as Enter, Escape, or keyboard shortcuts",
            "parameters": {
                "type": "object",
                "properties": {
                    "keys": {
                        "type": "string",
                        "description": "The keys to send (e.g., 'Enter', 'Escape', 'Control+a')"
                    }
                },
                "required": ["keys"]
            }
        }
    })
    @xml_schema(
        tag_name="browser-send-keys",
        mappings=[
            {"param_name": "keys", "node_type": "content", "path": "."}
        ],
        example='''
        <function_calls>
        <invoke name="browser_send_keys">
        <parameter name="keys">Enter</parameter>
        </invoke>
        </function_calls>
        '''
    )
    async def browser_send_keys(self, keys: str) -> ToolResult:
        """Send keyboard keys

        Args:
            keys (str): The keys to send (e.g., 'Enter', 'Escape', 'Control+a')

        Returns:
            ToolResult: Result of the ``send_keys`` action
        """
        logger.debug(f"\033[95mSending keys: {keys}\033[0m")
        return await self._execute_browser_action("send_keys", {"keys": keys})

    @openapi_schema({
        "type": "function",
        "function": {
            "name": "browser_switch_tab",
            "description": "Switch to a different browser tab",
            "parameters": {
                "type": "object",
                "properties": {
                    "page_id": {
                        "type": "integer",
                        "description": "The ID of the tab to switch to"
                    }
                },
                "required": ["page_id"]
            }
        }
    })
    @xml_schema(
        tag_name="browser-switch-tab",
        mappings=[
            {"param_name": "page_id", "node_type": "content", "path": "."}
        ],
        example='''
        <function_calls>
        <invoke name="browser_switch_tab">
        <parameter name="page_id">1</parameter>
        </invoke>
        </function_calls>
        '''
    )
    async def browser_switch_tab(self, page_id: int) -> ToolResult:
        """Switch to a different browser tab

        Args:
            page_id (int): The ID of the tab to switch to

        Returns:
            ToolResult: Result of the ``switch_tab`` action
        """
        logger.debug(f"\033[95mSwitching to tab: {page_id}\033[0m")
        return await self._execute_browser_action("switch_tab", {"page_id": page_id})

    @openapi_schema({
        "type": "function",
        "function": {
            "name": "browser_close_tab",
            "description": "Close a browser tab",
            "parameters": {
                "type": "object",
                "properties": {
                    "page_id": {
                        "type": "integer",
                        "description": "The ID of the tab to close"
                    }
                },
                "required": ["page_id"]
            }
        }
    })
    @xml_schema(
        tag_name="browser-close-tab",
        mappings=[
            {"param_name": "page_id", "node_type": "content", "path": "."}
        ],
        example='''
        <function_calls>
        <invoke name="browser_close_tab">
        <parameter name="page_id">1</parameter>
        </invoke>
        </function_calls>
        '''
    )
    async def browser_close_tab(self, page_id: int) -> ToolResult:
        """Close a browser tab

        Args:
            page_id (int): The ID of the tab to close

        Returns:
            ToolResult: Result of the ``close_tab`` action
        """
        logger.debug(f"\033[95mClosing tab: {page_id}\033[0m")
        return await self._execute_browser_action("close_tab", {"page_id": page_id})

    @openapi_schema({
        "type": "function",
        "function": {
            "name": "browser_scroll_down",
            "description": "Scroll down the page",
            "parameters": {
                "type": "object",
                "properties": {
                    "amount": {
                        "type": "integer",
                        "description": "Pixel amount to scroll (if not specified, scrolls one page)"
                    }
                }
            }
        }
    })
    @xml_schema(
        tag_name="browser-scroll-down",
        mappings=[
            {"param_name": "amount", "node_type": "content", "path": "."}
        ],
        example='''
        <function_calls>
        <invoke name="browser_scroll_down">
        <parameter name="amount">500</parameter>
        </invoke>
        </function_calls>
        '''
    )
    async def browser_scroll_down(self, amount: int = None) -> ToolResult:
        """Scroll down the page

        Args:
            amount (int, optional): Pixel amount to scroll. If None, scrolls one page.

        Returns:
            ToolResult: Result of the ``scroll_down`` action
        """
        return await self._scroll("down", amount)

    @openapi_schema({
        "type": "function",
        "function": {
            "name": "browser_scroll_up",
            "description": "Scroll up the page",
            "parameters": {
                "type": "object",
                "properties": {
                    "amount": {
                        "type": "integer",
                        "description": "Pixel amount to scroll (if not specified, scrolls one page)"
                    }
                }
            }
        }
    })
    @xml_schema(
        tag_name="browser-scroll-up",
        mappings=[
            {"param_name": "amount", "node_type": "content", "path": "."}
        ],
        example='''
        <function_calls>
        <invoke name="browser_scroll_up">
        <parameter name="amount">500</parameter>
        </invoke>
        </function_calls>
        '''
    )
    async def browser_scroll_up(self, amount: int = None) -> ToolResult:
        """Scroll up the page

        Args:
            amount (int, optional): Pixel amount to scroll. If None, scrolls one page.

        Returns:
            ToolResult: Result of the ``scroll_up`` action
        """
        return await self._scroll("up", amount)

    @openapi_schema({
        "type": "function",
        "function": {
            "name": "browser_scroll_to_text",
            "description": "Scroll to specific text on the page",
            "parameters": {
                "type": "object",
                "properties": {
                    "text": {
                        "type": "string",
                        "description": "The text to scroll to"
                    }
                },
                "required": ["text"]
            }
        }
    })
    @xml_schema(
        tag_name="browser-scroll-to-text",
        mappings=[
            {"param_name": "text", "node_type": "content", "path": "."}
        ],
        example='''
        <function_calls>
        <invoke name="browser_scroll_to_text">
        <parameter name="text">Contact Us</parameter>
        </invoke>
        </function_calls>
        '''
    )
    async def browser_scroll_to_text(self, text: str) -> ToolResult:
        """Scroll to specific text on the page

        Args:
            text (str): The text to scroll to

        Returns:
            ToolResult: Result of the ``scroll_to_text`` action
        """
        logger.debug(f"\033[95mScrolling to text: {text}\033[0m")
        return await self._execute_browser_action("scroll_to_text", {"text": text})

    @openapi_schema({
        "type": "function",
        "function": {
            "name": "browser_get_dropdown_options",
            "description": "Get all options from a dropdown element",
            "parameters": {
                "type": "object",
                "properties": {
                    "index": {
                        "type": "integer",
                        "description": "The index of the dropdown element"
                    }
                },
                "required": ["index"]
            }
        }
    })
    @xml_schema(
        tag_name="browser-get-dropdown-options",
        mappings=[
            {"param_name": "index", "node_type": "content", "path": "."}
        ],
        example='''
        <function_calls>
        <invoke name="browser_get_dropdown_options">
        <parameter name="index">2</parameter>
        </invoke>
        </function_calls>
        '''
    )
    async def browser_get_dropdown_options(self, index: int) -> ToolResult:
        """Get all options from a dropdown element

        Args:
            index (int): The index of the dropdown element

        Returns:
            ToolResult: Result of the ``get_dropdown_options`` action
        """
        logger.debug(f"\033[95mGetting options from dropdown with index: {index}\033[0m")
        return await self._execute_browser_action("get_dropdown_options", {"index": index})

    @openapi_schema({
        "type": "function",
        "function": {
            "name": "browser_select_dropdown_option",
            "description": "Select an option from a dropdown by text",
            "parameters": {
                "type": "object",
                "properties": {
                    "index": {
                        "type": "integer",
                        "description": "The index of the dropdown element"
                    },
                    "text": {
                        "type": "string",
                        "description": "The text of the option to select"
                    }
                },
                "required": ["index", "text"]
            }
        }
    })
    @xml_schema(
        tag_name="browser-select-dropdown-option",
        mappings=[
            {"param_name": "index", "node_type": "attribute", "path": "."},
            {"param_name": "text", "node_type": "content", "path": "."}
        ],
        example='''
        <function_calls>
        <invoke name="browser_select_dropdown_option">
        <parameter name="index">2</parameter>
        <parameter name="text">Option 1</parameter>
        </invoke>
        </function_calls>
        '''
    )
    async def browser_select_dropdown_option(self, index: int, text: str) -> ToolResult:
        """Select an option from a dropdown by text

        Args:
            index (int): The index of the dropdown element
            text (str): The text of the option to select

        Returns:
            ToolResult: Result of the ``select_dropdown_option`` action
        """
        logger.debug(f"\033[95mSelecting option '{text}' from dropdown with index: {index}\033[0m")
        return await self._execute_browser_action("select_dropdown_option", {"index": index, "text": text})

    @openapi_schema({
        "type": "function",
        "function": {
            "name": "browser_drag_drop",
            "description": "Perform drag and drop operation between elements or coordinates",
            "parameters": {
                "type": "object",
                "properties": {
                    "element_source": {
                        "type": "string",
                        "description": "The source element selector"
                    },
                    "element_target": {
                        "type": "string",
                        "description": "The target element selector"
                    },
                    "coord_source_x": {
                        "type": "integer",
                        "description": "The source X coordinate"
                    },
                    "coord_source_y": {
                        "type": "integer",
                        "description": "The source Y coordinate"
                    },
                    "coord_target_x": {
                        "type": "integer",
                        "description": "The target X coordinate"
                    },
                    "coord_target_y": {
                        "type": "integer",
                        "description": "The target Y coordinate"
                    }
                }
            }
        }
    })
    @xml_schema(
        tag_name="browser-drag-drop",
        mappings=[
            {"param_name": "element_source", "node_type": "attribute", "path": "."},
            {"param_name": "element_target", "node_type": "attribute", "path": "."},
            {"param_name": "coord_source_x", "node_type": "attribute", "path": "."},
            {"param_name": "coord_source_y", "node_type": "attribute", "path": "."},
            {"param_name": "coord_target_x", "node_type": "attribute", "path": "."},
            {"param_name": "coord_target_y", "node_type": "attribute", "path": "."}
        ],
        example='''
        <function_calls>
        <invoke name="browser_drag_drop">
        <parameter name="element_source">#draggable</parameter>
        <parameter name="element_target">#droppable</parameter>
        </invoke>
        </function_calls>
        '''
    )
    async def browser_drag_drop(self, element_source: str = None, element_target: str = None,
                                coord_source_x: int = None, coord_source_y: int = None,
                                coord_target_x: int = None, coord_target_y: int = None) -> ToolResult:
        """Perform drag and drop operation between elements or coordinates

        Element selectors take precedence: when both selectors are given the
        coordinates are ignored. Otherwise all four coordinates are required.

        Args:
            element_source (str, optional): The source element selector
            element_target (str, optional): The target element selector
            coord_source_x (int, optional): The source X coordinate
            coord_source_y (int, optional): The source Y coordinate
            coord_target_x (int, optional): The target X coordinate
            coord_target_y (int, optional): The target Y coordinate

        Returns:
            ToolResult: Result of the ``drag_drop`` action, or a failure
            response when neither a selector pair nor a full coordinate set
            was supplied.
        """
        params = {}

        if element_source and element_target:
            params["element_source"] = element_source
            params["element_target"] = element_target
            logger.debug(f"\033[95mDragging from element '{element_source}' to '{element_target}'\033[0m")
        elif all(coord is not None for coord in [coord_source_x, coord_source_y, coord_target_x, coord_target_y]):
            params["coord_source_x"] = coord_source_x
            params["coord_source_y"] = coord_source_y
            params["coord_target_x"] = coord_target_x
            params["coord_target_y"] = coord_target_y
            logger.debug(f"\033[95mDragging from coordinates ({coord_source_x}, {coord_source_y}) to ({coord_target_x}, {coord_target_y})\033[0m")
        else:
            return self.fail_response("Must provide either element selectors or coordinates for drag and drop")

        return await self._execute_browser_action("drag_drop", params)

    @openapi_schema({
        "type": "function",
        "function": {
            "name": "browser_click_coordinates",
            "description": "Click at specific X,Y coordinates on the page",
            "parameters": {
                "type": "object",
                "properties": {
                    "x": {
                        "type": "integer",
                        "description": "The X coordinate to click"
                    },
                    "y": {
                        "type": "integer",
                        "description": "The Y coordinate to click"
                    }
                },
                "required": ["x", "y"]
            }
        }
    })
    @xml_schema(
        tag_name="browser-click-coordinates",
        mappings=[
            {"param_name": "x", "node_type": "attribute", "path": "."},
            {"param_name": "y", "node_type": "attribute", "path": "."}
        ],
        example='''
        <function_calls>
        <invoke name="browser_click_coordinates">
        <parameter name="x">100</parameter>
        <parameter name="y">200</parameter>
        </invoke>
        </function_calls>
        '''
    )
    async def browser_click_coordinates(self, x: int, y: int) -> ToolResult:
        """Click at specific X,Y coordinates on the page

        Args:
            x (int): The X coordinate to click
            y (int): The Y coordinate to click

        Returns:
            ToolResult: Result of the ``click_coordinates`` action
        """
        logger.debug(f"\033[95mClicking at coordinates: ({x}, {y})\033[0m")
        return await self._execute_browser_action("click_coordinates", {"x": x, "y": y})