Spaces:
Running
Running
import asyncio
import base64
import io
import os
import time
from typing import Any, Dict, Optional, Tuple, List

from PIL import Image

from openspace.utils.logging import Logger

logger = Logger.get_logger(__name__)

try:
    from anthropic import (
        Anthropic,
        AnthropicBedrock,
        AnthropicVertex,
        APIError,
        APIResponseValidationError,
        APIStatusError,
    )
    from anthropic.types.beta import (
        BetaMessageParam,
        BetaTextBlockParam,
    )
    ANTHROPIC_AVAILABLE = True
except ImportError:
    logger.warning("Anthropic SDK not available. Install with: pip install anthropic")
    ANTHROPIC_AVAILABLE = False

# Import utility functions
from .anthropic_utils import (
    APIProvider,
    PROVIDER_TO_DEFAULT_MODEL_NAME,
    COMPUTER_USE_BETA_FLAG,
    PROMPT_CACHING_BETA_FLAG,
    get_system_prompt,
    inject_prompt_caching,
    maybe_filter_to_n_most_recent_images,
    response_to_params,
)
# API retry configuration
# Maximum number of attempts made with the primary API key before giving up.
API_RETRY_TIMES = 10
# Pause between two consecutive attempts, in seconds.
API_RETRY_INTERVAL = 5
class AnthropicGUIClient:
    """
    Anthropic LLM client for GUI operations.

    Uses Claude Sonnet 4.5 with the computer-use-2025-01-24 API.

    Features:
    - Vision-based screen understanding
    - Automatic screenshot resizing (configurable display size)
    - Coordinate scaling from the model's display space to PyAutoGUI space
    """

    def __init__(
        self,
        model: str = "claude-sonnet-4-5",
        platform: str = "Ubuntu",
        api_key: Optional[str] = None,
        provider: str = "anthropic",
        max_tokens: int = 4096,
        screen_size: Tuple[int, int] = (1920, 1080),
        display_size: Tuple[int, int] = (1024, 768),  # Computer use display size
        pyautogui_size: Optional[Tuple[int, int]] = None,  # PyAutoGUI working size
        only_n_most_recent_images: int = 3,
        enable_prompt_caching: bool = True,
        backup_api_key: Optional[str] = None,
    ):
        """
        Initialize Anthropic GUI Client for Claude Sonnet 4.5.

        Args:
            model: Model name (only "claude-sonnet-4-5" supported; any other
                value is replaced with it after a warning)
            platform: Platform type (Ubuntu, Windows, or macOS)
            api_key: Anthropic API key (defaults to ANTHROPIC_API_KEY env var)
            provider: API provider (only "anthropic" supported; any other
                value is replaced with it after a warning)
            max_tokens: Maximum tokens for response
            screen_size: Actual screenshot resolution (width, height) - physical pixels
            display_size: Display size for computer use tool (width, height).
                Screenshots are resized to this size before being sent to the API
            pyautogui_size: PyAutoGUI working size (logical pixels). If None,
                assumed same as screen_size. On Retina/HiDPI displays this may
                be screen_size / 2
            only_n_most_recent_images: Number of recent screenshots to keep in history
            enable_prompt_caching: Whether to enable prompt caching for cost optimization
            backup_api_key: Backup API key (defaults to ANTHROPIC_API_KEY_BACKUP env var)

        Raises:
            RuntimeError: If the Anthropic SDK could not be imported.
            ValueError: If no API key was passed and ANTHROPIC_API_KEY is unset.
        """
        if not ANTHROPIC_AVAILABLE:
            raise RuntimeError("Anthropic SDK not installed. Install with: pip install anthropic")

        # Only support claude-sonnet-4-5
        if model != "claude-sonnet-4-5":
            logger.warning(f"Model '{model}' not supported. Using 'claude-sonnet-4-5'")
            model = "claude-sonnet-4-5"
        self.model = model
        self.platform = platform

        self.api_key = api_key or os.environ.get("ANTHROPIC_API_KEY")
        if not self.api_key:
            raise ValueError(
                "Anthropic API key not provided. Set ANTHROPIC_API_KEY env var or pass api_key parameter"
            )
        # Backup API key for failover
        self.backup_api_key = backup_api_key or os.environ.get("ANTHROPIC_API_KEY_BACKUP")

        # Only support anthropic provider
        if provider != "anthropic":
            logger.warning(f"Provider '{provider}' not supported. Using 'anthropic'")
            provider = "anthropic"
        self.provider = APIProvider(provider)

        self.max_tokens = max_tokens
        self.screen_size = screen_size
        self.display_size = display_size
        # Default to screen_size if not specified
        self.pyautogui_size = pyautogui_size or screen_size
        self.only_n_most_recent_images = only_n_most_recent_images
        self.enable_prompt_caching = enable_prompt_caching

        # Message history
        self.messages: List[BetaMessageParam] = []

        # Per-axis factor that maps model coordinates (display_size space)
        # directly onto PyAutoGUI logical pixels (pyautogui_size space).
        self.resize_factor = (
            self.pyautogui_size[0] / display_size[0],  # x scale factor
            self.pyautogui_size[1] / display_size[1],  # y scale factor
        )

        logger.info(
            f"Initialized AnthropicGUIClient:\n"
            f" Model: {model}\n"
            f" Platform: {platform}\n"
            f" Screen Size (physical): {screen_size}\n"
            f" PyAutoGUI Size (logical): {self.pyautogui_size}\n"
            f" Display Size (LLM): {display_size}\n"
            f" Resize Factor (LLM->PyAutoGUI): {self.resize_factor}\n"
            f" Prompt Caching: {enable_prompt_caching}"
        )

    def _create_client(self, api_key: Optional[str] = None):
        """Create Anthropic client (only supports anthropic provider).

        Args:
            api_key: Key to use; falls back to the primary key when omitted.
        """
        key = api_key or self.api_key
        return Anthropic(api_key=key, max_retries=4)

    def _resize_screenshot(self, screenshot_bytes: bytes) -> bytes:
        """
        Resize screenshot to display size for Computer Use API.

        For computer-use-2025-01-24, the screenshot must be resized to the
        display_width_px x display_height_px specified in the tool definition.

        Args:
            screenshot_bytes: Encoded image readable by PIL.

        Returns:
            PNG-encoded bytes of the image resized to ``self.display_size``.
        """
        screenshot_image = Image.open(io.BytesIO(screenshot_bytes))
        resized_image = screenshot_image.resize(self.display_size, Image.Resampling.LANCZOS)
        output_buffer = io.BytesIO()
        resized_image.save(output_buffer, format='PNG')
        return output_buffer.getvalue()

    def _scale_coordinates(self, x: int, y: int) -> Tuple[int, int]:
        """
        Scale coordinates from display size to PyAutoGUI working size.

        The API returns coordinates in display_size (e.g., 1024x768). They are
        multiplied by ``self.resize_factor`` (pyautogui_size / display_size)
        and truncated to integers so they can be used in PyAutoGUI calls.

        Args:
            x, y: Coordinates in display size space

        Returns:
            Scaled (x, y) coordinates in PyAutoGUI space
        """
        scaled_x = int(x * self.resize_factor[0])
        scaled_y = int(y * self.resize_factor[1])
        return scaled_x, scaled_y

    @staticmethod
    def _tool_result_block(
        tool_use_id: str,
        result: str,
        screenshot_bytes: Optional[bytes] = None,
    ) -> Dict[str, Any]:
        """Build one ``tool_result`` content block (image first, then text)."""
        content_list: List[Dict[str, Any]] = []
        # Screenshot goes first for consistency with the initial user message.
        if screenshot_bytes is not None:
            screenshot_b64 = base64.b64encode(screenshot_bytes).decode('utf-8')
            content_list.append({
                "type": "image",
                "source": {
                    "type": "base64",
                    "media_type": "image/png",
                    "data": screenshot_b64,
                },
            })
        content_list.append({"type": "text", "text": result})
        return {
            "type": "tool_result",
            "tool_use_id": tool_use_id,
            "content": content_list,
        }

    def _append_pending_tool_results(self, screenshot_bytes: bytes) -> None:
        """
        Answer every ``tool_use`` block of the last assistant turn.

        The Messages API expects a ``tool_result`` for each ``tool_use`` block
        in the user turn that follows it, so all results are placed in ONE user
        message. The fresh screenshot is attached to the last result only,
        since it shows the state after all requested actions.

        NOTE(review): when the last assistant turn contains no tool_use block,
        nothing is appended and the new screenshot is not sent - confirm that
        callers stop (or reset) after such a turn.
        """
        if not self.messages or self.messages[-1]["role"] != "assistant":
            return
        last_content = self.messages[-1]["content"]
        if not isinstance(last_content, list):
            return
        tool_use_ids = [
            block["id"] for block in last_content
            if block.get("type") == "tool_use"
        ]
        if not tool_use_ids:
            return
        final_index = len(tool_use_ids) - 1
        results = [
            self._tool_result_block(
                tool_use_id,
                "Success",
                screenshot_bytes if index == final_index else None,
            )
            for index, tool_use_id in enumerate(tool_use_ids)
        ]
        self.messages.append({"role": "user", "content": results})

    def _count_images(self) -> Tuple[int, int]:
        """Return (all images in history, images nested inside tool_results)."""
        top_level = 0
        in_tool_results = 0
        for message in self.messages:
            content = message.get("content")
            if not isinstance(content, list):
                continue
            for item in content:
                if not isinstance(item, dict):
                    continue
                item_type = item.get("type")
                if item_type == "image":
                    top_level += 1
                elif item_type == "tool_result":
                    in_tool_results += sum(
                        1
                        for nested in (item.get("content") or [])
                        if isinstance(nested, dict) and nested.get("type") == "image"
                    )
        return top_level + in_tool_results, in_tool_results

    @staticmethod
    def _is_payload_too_large(error_msg: str) -> bool:
        """Tell whether an API error message looks like the 25MB/HTTP 413 limit."""
        markers = (
            "25000000",
            "Member must have length less than or equal to",
            "request_too_large",
            "413",
        )
        return any(marker in error_msg for marker in markers)

    async def plan_action(
        self,
        task_description: str,
        screenshot: bytes,
        action_history: Optional[List[Dict[str, Any]]] = None,
    ) -> Tuple[Optional[str], List[str]]:
        """
        Plan next action based on task and current screenshot.

        Includes prompt caching, error handling, and backup API key support.

        Args:
            task_description: Task to accomplish
            screenshot: Current screenshot (PNG bytes)
            action_history: Previous actions (currently unused; the
                conversation is tracked in ``self.messages``)

        Returns:
            Tuple of (reasoning, list of pyautogui commands). On any API
            failure the result is ``(None, ["FAIL"])``.
        """
        # Resize screenshot
        resized_screenshot = self._resize_screenshot(screenshot)
        screenshot_b64 = base64.b64encode(resized_screenshot).decode('utf-8')

        # Initialize messages with first task + screenshot
        if not self.messages:
            # IMPORTANT: Image should come BEFORE text for better model understanding
            # This matches OSWorld's implementation which has proven effectiveness
            self.messages.append({
                "role": "user",
                "content": [
                    {
                        "type": "image",
                        "source": {
                            "type": "base64",
                            "media_type": "image/png",
                            "data": screenshot_b64,
                        },
                    },
                    {"type": "text", "text": task_description},
                ]
            })

        # Filter images BEFORE adding new screenshot to control message size
        # This is critical to avoid exceeding the 25MB API limit
        if self.only_n_most_recent_images and len(self.messages) > 1:
            # Reserve 1 slot for the screenshot we're about to add
            maybe_filter_to_n_most_recent_images(
                self.messages,
                max(1, self.only_n_most_recent_images - 1),
                min_removal_threshold=1,  # More aggressive filtering
            )

        # Add tool results (with the new screenshot) for the previous action
        self._append_pending_tool_results(resized_screenshot)

        # Define tools and betas for claude-sonnet-4-5 with computer-use-2025-01-24
        tools = [{
            'name': 'computer',
            'type': 'computer_20250124',
            'display_width_px': self.display_size[0],
            'display_height_px': self.display_size[1],
            'display_number': 1
        }]
        betas = [COMPUTER_USE_BETA_FLAG]

        # Prepare system prompt with optional caching
        system = BetaTextBlockParam(
            type="text",
            text=get_system_prompt(self.platform)
        )
        # Enable prompt caching if supported and enabled
        if self.enable_prompt_caching:
            betas.append(PROMPT_CACHING_BETA_FLAG)
            inject_prompt_caching(self.messages)
            system["cache_control"] = {"type": "ephemeral"}  # type: ignore

        # Model name - use claude-sonnet-4-5 directly
        model_name = "claude-sonnet-4-5"
        # Enable thinking for complex computer use tasks
        extra_body = {"thinking": {"type": "enabled", "budget_tokens": 2048}}

        # Log request details for debugging
        total_images, tool_result_images = self._count_images()
        logger.info(
            f"Anthropic API request:\n"
            f" Model: {model_name}\n"
            f" Display Size: {self.display_size}\n"
            f" Betas: {betas}\n"
            f" Images: {total_images} ({tool_result_images} in tool_results)\n"
            f" Messages: {len(self.messages)}"
        )

        # Shared by the primary and the backup call. ``self.messages`` is
        # passed by reference, so in-place image filtering done between
        # attempts is picked up by later calls.
        request_kwargs = dict(
            max_tokens=self.max_tokens,
            messages=self.messages,
            model=model_name,
            system=[system],
            tools=tools,
            betas=betas,
            extra_body=extra_body,
        )

        # Try API call with retry and backup
        client = self._create_client()
        response = None
        try:
            # Retry loop with automatic image count reduction on 25MB error
            for attempt in range(API_RETRY_TIMES):
                try:
                    # NOTE(review): this SDK call is synchronous and blocks the
                    # event loop while in flight; consider the async client.
                    response = client.beta.messages.create(**request_kwargs)
                    logger.info(f"API call succeeded on attempt {attempt + 1}")
                    break
                except (APIError, APIStatusError, APIResponseValidationError) as e:
                    error_msg = str(e)
                    logger.warning(
                        f"Anthropic API error (attempt {attempt+1}/{API_RETRY_TIMES}): {error_msg}"
                    )
                    # Handle 25MB payload limit error (including HTTP 413)
                    if self._is_payload_too_large(error_msg):
                        logger.warning("Detected 25MB limit error, reducing image count")
                        current_count = self.only_n_most_recent_images
                        new_count = max(1, current_count // 2)
                        self.only_n_most_recent_images = new_count
                        maybe_filter_to_n_most_recent_images(
                            self.messages,
                            new_count,
                            min_removal_threshold=1,  # Aggressive filtering when hitting limit
                        )
                        logger.info(f"Image count reduced from {current_count} to {new_count}")
                    if attempt < API_RETRY_TIMES - 1:
                        # Non-blocking pause: this is a coroutine, so a plain
                        # time.sleep() would freeze every other task.
                        await asyncio.sleep(API_RETRY_INTERVAL)
                    else:
                        # Out of attempts: hand over to the backup-key handler.
                        raise
        except (APIError, APIStatusError, APIResponseValidationError) as e:
            logger.error(f"Primary API key failed: {e}")
            # Try backup API key if available
            if not self.backup_api_key:
                return None, ["FAIL"]
            logger.warning("Retrying with backup API key...")
            try:
                backup_client = self._create_client(self.backup_api_key)
                response = backup_client.beta.messages.create(**request_kwargs)
                logger.info("Successfully used backup API key")
            except Exception as backup_e:
                logger.error(f"Backup API key also failed: {backup_e}")
                return None, ["FAIL"]
        except Exception as e:
            logger.error(f"Unexpected error: {e}")
            return None, ["FAIL"]

        if not response:
            return None, ["FAIL"]

        # Parse response using utility function
        response_params = response_to_params(response)

        # Extract reasoning and commands. The last text/thinking block wins.
        reasoning = ""
        commands = []
        for block in response_params:
            block_type = block.get("type")
            if block_type == "text":
                reasoning = block.get("text", "")
            elif block_type == "thinking":
                reasoning = block.get("thinking", "")
            elif block_type == "tool_use":
                tool_input = block.get("input", {})
                command = self._parse_computer_tool_use(tool_input)
                if command:
                    commands.append(command)
                else:
                    logger.warning(f"Failed to parse tool_use: {tool_input}")

        # Store assistant response
        self.messages.append({
            "role": "assistant",
            "content": response_params
        })
        logger.info(f"Parsed {len(commands)} commands from response")
        return reasoning, commands

    def _add_tool_result(
        self,
        tool_use_id: str,
        result: str,
        screenshot_bytes: Optional[bytes] = None
    ):
        """
        Add a single tool result to message history as a new user message.

        IMPORTANT: Put screenshot BEFORE text for consistency with initial message.

        Args:
            tool_use_id: Id of the tool_use block being answered.
            result: Text describing the outcome.
            screenshot_bytes: Optional PNG bytes attached ahead of the text.
        """
        self.messages.append({
            "role": "user",
            "content": [self._tool_result_block(tool_use_id, result, screenshot_bytes)],
        })

    def _parse_computer_tool_use(self, tool_input: Dict[str, Any]) -> Optional[str]:
        """
        Parse Anthropic computer tool use to pyautogui command.

        Args:
            tool_input: Tool input from Anthropic (action, coordinate, text, etc.)

        Returns:
            PyAutoGUI command string, a control marker (SCREENSHOT, DONE,
            FAIL), or None when the action is missing, unknown, or lacks the
            arguments it needs.
        """
        action = tool_input.get("action")
        if not action:
            return None

        # Action conversion
        action_conversion = {
            "left click": "click",
            "right click": "right_click"
        }
        action = action_conversion.get(action, action)

        # Control actions that do not translate into a PyAutoGUI call.
        if action == "screenshot":
            # Screenshot is automatically handled by the system
            return "SCREENSHOT"
        if action == "done":
            return "DONE"
        if action == "fail":
            return "FAIL"

        text = tool_input.get("text")
        coordinate = tool_input.get("coordinate")
        start_coordinate = tool_input.get("start_coordinate")
        scroll_direction = tool_input.get("scroll_direction")
        scroll_amount = tool_input.get("scroll_amount")
        if scroll_amount is None:
            # Missing or explicit null: use the default step.
            scroll_amount = 5

        # Scale coordinates from display space to PyAutoGUI space
        if coordinate:
            coordinate = self._scale_coordinates(coordinate[0], coordinate[1])
        if start_coordinate:
            start_coordinate = self._scale_coordinates(start_coordinate[0], start_coordinate[1])

        # Pointer actions that share the "optional (x, y) argument" shape.
        pointer_functions = {
            "left_click": "click",
            "click": "click",
            "right_click": "rightClick",
            "double_click": "doubleClick",
            "middle_click": "middleClick",
            "triple_click": "tripleClick",
            "left_mouse_down": "mouseDown",
            "left_mouse_up": "mouseUp",
        }

        # Build commands
        command = ""
        if action in pointer_functions:
            function_name = pointer_functions[action]
            if coordinate:
                x, y = coordinate
                command = f"pyautogui.{function_name}({x}, {y})"
            else:
                command = f"pyautogui.{function_name}()"
        elif action == "mouse_move":
            if coordinate:
                x, y = coordinate
                command = f"pyautogui.moveTo({x}, {y}, duration=0.5)"
        elif action == "left_click_drag":
            if coordinate:
                x, y = coordinate
                command = f"pyautogui.dragTo({x}, {y}, duration=0.5)"
                if start_coordinate:
                    # computer_20250124 sends the drag origin explicitly;
                    # without it the drag starts at the current pointer position.
                    start_x, start_y = start_coordinate
                    command = f"pyautogui.moveTo({start_x}, {start_y}); {command}"
        elif action == "key":
            if text:
                # Key conversion
                key_conversion = {
                    "page_down": "pagedown",
                    "page_up": "pageup",
                    "super_l": "win",
                    "super": "command",
                    "escape": "esc"
                }
                normalized = [part.strip().lower() for part in text.split('+')]
                converted_keys = [key_conversion.get(key, key) for key in normalized]
                # Press in order, release in reverse order. repr() keeps the
                # generated code valid even for quote or backslash keys.
                presses = [f"pyautogui.keyDown({key!r})" for key in converted_keys]
                releases = [f"pyautogui.keyUp({key!r})" for key in reversed(converted_keys)]
                command = "; ".join(presses + releases)
        elif action == "type":
            if text:
                command = f"pyautogui.typewrite({repr(text)}, interval=0.01)"
        elif action == "scroll":
            if scroll_direction in ("up", "down"):
                function_name = "scroll"
                scroll_value = scroll_amount if scroll_direction == "up" else -scroll_amount
            elif scroll_direction in ("left", "right"):
                function_name = "hscroll"
                scroll_value = scroll_amount if scroll_direction == "right" else -scroll_amount
            else:
                function_name = None
                scroll_value = None
            if function_name is not None:
                if coordinate:
                    x, y = coordinate
                    command = f"pyautogui.{function_name}({scroll_value}, {x}, {y})"
                else:
                    command = f"pyautogui.{function_name}({scroll_value})"
        elif action == "wait":
            # Wait for specified duration
            duration = tool_input.get("duration", 1)
            command = f"pyautogui.sleep({duration})"

        return command if command else None

    def reset(self):
        """Reset message history."""
        self.messages = []
        logger.info("Reset AnthropicGUIClient message history")