# Source: OpenSpace/openspace/grounding/backends/gui/anthropic_client.py
# Uploaded by darkfire514 ("Upload 160 files", commit 399b80c, verified).
import base64
import io
import os
import time
from typing import Any, Dict, List, Optional, Tuple

from PIL import Image

from openspace.utils.logging import Logger

logger = Logger.get_logger(__name__)

# The Anthropic SDK is treated as optional at import time: a missing SDK only
# produces a warning here and flips ANTHROPIC_AVAILABLE, which
# AnthropicGUIClient.__init__ checks before doing anything else.
try:
    from anthropic import (
        Anthropic,
        AnthropicBedrock,
        AnthropicVertex,
        APIError,
        APIResponseValidationError,
        APIStatusError,
    )
    from anthropic.types.beta import (
        BetaMessageParam,
        BetaTextBlockParam,
    )
    ANTHROPIC_AVAILABLE = True
except ImportError:
    logger.warning("Anthropic SDK not available. Install with: pip install anthropic")
    ANTHROPIC_AVAILABLE = False

# Import utility functions
from .anthropic_utils import (
    APIProvider,
    PROVIDER_TO_DEFAULT_MODEL_NAME,
    COMPUTER_USE_BETA_FLAG,
    PROMPT_CACHING_BETA_FLAG,
    get_system_prompt,
    inject_prompt_caching,
    maybe_filter_to_n_most_recent_images,
    response_to_params,
)

# API retry configuration
API_RETRY_TIMES = 10  # maximum attempts with the primary API key
API_RETRY_INTERVAL = 5  # seconds to wait between attempts
class AnthropicGUIClient:
    """
    Anthropic LLM Client for GUI operations.

    Uses Claude Sonnet 4.5 with computer-use-2025-01-24 API.

    Features:
    - Vision-based screen understanding
    - Automatic screenshot resizing (configurable display size)
    - Coordinate scaling from the display size shown to the model to the
      PyAutoGUI working size used for execution
    """

    # Spoken-style action names normalized to the names handled by the parser.
    _ACTION_ALIASES = {
        "left click": "click",
        "right click": "right_click",
    }

    # Pointer actions that map 1:1 onto a pyautogui function taking an
    # optional (x, y) position.
    _POINTER_ACTIONS = {
        "click": "click",
        "left_click": "click",
        "right_click": "rightClick",
        "double_click": "doubleClick",
        "triple_click": "tripleClick",
        "middle_click": "middleClick",
        "left_mouse_down": "mouseDown",
        "left_mouse_up": "mouseUp",
    }

    # Key names as sent by the model -> key names used in pyautogui commands.
    _KEY_ALIASES = {
        "page_down": "pagedown",
        "page_up": "pageup",
        "super_l": "win",
        "super": "command",
        "escape": "esc",
    }

    # Substrings of an API error message that indicate an oversized request.
    _PAYLOAD_TOO_LARGE_MARKERS = (
        "25000000",
        "Member must have length less than or equal to",
        "request_too_large",
        "413",
    )

    def __init__(
        self,
        model: str = "claude-sonnet-4-5",
        platform: str = "Ubuntu",
        api_key: Optional[str] = None,
        provider: str = "anthropic",
        max_tokens: int = 4096,
        screen_size: Tuple[int, int] = (1920, 1080),
        display_size: Tuple[int, int] = (1024, 768),  # Computer use display size
        pyautogui_size: Optional[Tuple[int, int]] = None,  # PyAutoGUI working size
        only_n_most_recent_images: int = 3,
        enable_prompt_caching: bool = True,
        backup_api_key: Optional[str] = None,
    ):
        """
        Initialize Anthropic GUI Client for Claude Sonnet 4.5.

        Args:
            model: Model name (only "claude-sonnet-4-5" supported; any other
                value is replaced with it after a warning)
            platform: Platform type (Ubuntu, Windows, or macOS)
            api_key: Anthropic API key (defaults to ANTHROPIC_API_KEY env var)
            provider: API provider (only "anthropic" supported; any other
                value is replaced with it after a warning)
            max_tokens: Maximum tokens for response
            screen_size: Actual screenshot resolution (width, height) - physical pixels
            display_size: Display size for computer use tool (width, height).
                Screenshots will be resized to this size before sending to API
            pyautogui_size: PyAutoGUI working size (logical pixels). If None,
                assumed same as screen_size. On Retina/HiDPI displays, this
                may be screen_size / 2
            only_n_most_recent_images: Number of recent screenshots to keep in history
            enable_prompt_caching: Whether to enable prompt caching for cost optimization
            backup_api_key: Backup API key (defaults to ANTHROPIC_API_KEY_BACKUP env var)

        Raises:
            RuntimeError: If the Anthropic SDK could not be imported.
            ValueError: If no API key is passed and ANTHROPIC_API_KEY is unset.
        """
        if not ANTHROPIC_AVAILABLE:
            raise RuntimeError("Anthropic SDK not installed. Install with: pip install anthropic")

        # Only support claude-sonnet-4-5
        if model != "claude-sonnet-4-5":
            logger.warning(f"Model '{model}' not supported. Using 'claude-sonnet-4-5'")
            model = "claude-sonnet-4-5"
        self.model = model
        self.platform = platform

        self.api_key = api_key or os.environ.get("ANTHROPIC_API_KEY")
        if not self.api_key:
            raise ValueError("Anthropic API key not provided. Set ANTHROPIC_API_KEY env var or pass api_key parameter")

        # Backup API key for failover
        self.backup_api_key = backup_api_key or os.environ.get("ANTHROPIC_API_KEY_BACKUP")

        # Only support anthropic provider
        if provider != "anthropic":
            logger.warning(f"Provider '{provider}' not supported. Using 'anthropic'")
            provider = "anthropic"
        self.provider = APIProvider(provider)

        self.max_tokens = max_tokens
        self.screen_size = screen_size
        self.display_size = display_size
        self.pyautogui_size = pyautogui_size or screen_size  # Default to screen_size if not specified
        self.only_n_most_recent_images = only_n_most_recent_images
        self.enable_prompt_caching = enable_prompt_caching

        # Message history
        self.messages: List[BetaMessageParam] = []

        # Per-axis factor that maps model coordinates (display_size space)
        # directly onto PyAutoGUI logical pixels (pyautogui_size space).
        self.resize_factor = (
            self.pyautogui_size[0] / display_size[0],  # x scale factor
            self.pyautogui_size[1] / display_size[1],  # y scale factor
        )

        logger.info(
            f"Initialized AnthropicGUIClient:\n"
            f" Model: {model}\n"
            f" Platform: {platform}\n"
            f" Screen Size (physical): {screen_size}\n"
            f" PyAutoGUI Size (logical): {self.pyautogui_size}\n"
            f" Display Size (LLM): {display_size}\n"
            f" Resize Factor (LLM->PyAutoGUI): {self.resize_factor}\n"
            f" Prompt Caching: {enable_prompt_caching}"
        )

    def _create_client(self, api_key: Optional[str] = None):
        """Create Anthropic client (only supports anthropic provider).

        Args:
            api_key: Key to use; falls back to the primary key when omitted.
        """
        return Anthropic(api_key=api_key or self.api_key, max_retries=4)

    def _resize_screenshot(self, screenshot_bytes: bytes) -> bytes:
        """
        Resize screenshot to display size for Computer Use API.

        For computer-use-2025-01-24, the screenshot must be resized to the
        display_width_px x display_height_px specified in the tool definition.

        Args:
            screenshot_bytes: Encoded image bytes readable by PIL.

        Returns:
            PNG-encoded bytes of the image resized to ``self.display_size``.
        """
        # The context manager releases the source image once the resized copy
        # has been produced.
        with Image.open(io.BytesIO(screenshot_bytes)) as screenshot_image:
            resized_image = screenshot_image.resize(self.display_size, Image.Resampling.LANCZOS)
        output_buffer = io.BytesIO()
        resized_image.save(output_buffer, format="PNG")
        return output_buffer.getvalue()

    def _scale_coordinates(self, x: int, y: int) -> Tuple[int, int]:
        """
        Scale coordinates from display size to the PyAutoGUI working size.

        The API returns coordinates in display_size (e.g., 1024x768). They are
        multiplied by ``self.resize_factor`` (pyautogui_size / display_size)
        and truncated to integers.

        Args:
            x, y: Coordinates in display size space

        Returns:
            Scaled (x, y) coordinates in PyAutoGUI space
        """
        factor_x, factor_y = self.resize_factor
        return int(x * factor_x), int(y * factor_y)

    @staticmethod
    def _is_payload_too_large(error_msg: str) -> bool:
        """Return True if an API error message looks like a request-size (25MB / HTTP 413) error."""
        return any(marker in error_msg for marker in AnthropicGUIClient._PAYLOAD_TOO_LARGE_MARKERS)

    @staticmethod
    def _count_images(messages: List[Dict[str, Any]]) -> Tuple[int, int]:
        """
        Count image blocks in a message list (used for request logging only).

        Returns:
            Tuple of (all images, images nested inside tool_result blocks).
        """
        top_level = 0
        in_tool_results = 0
        for message in messages:
            content = message.get("content")
            if not isinstance(content, list):
                continue
            for item in content:
                if not isinstance(item, dict):
                    continue
                item_type = item.get("type")
                if item_type == "image":
                    top_level += 1
                elif item_type == "tool_result":
                    in_tool_results += sum(
                        1
                        for inner in (item.get("content") or [])
                        if isinstance(inner, dict) and inner.get("type") == "image"
                    )
        return top_level + in_tool_results, in_tool_results

    async def plan_action(
        self,
        task_description: str,
        screenshot: bytes,
        action_history: Optional[List[Dict[str, Any]]] = None,
    ) -> Tuple[Optional[str], List[str]]:
        """
        Plan next action based on task and current screenshot.

        Includes prompt caching, error handling, and backup API key support.

        Args:
            task_description: Task to accomplish (only sent on the first call
                after construction or ``reset()``)
            screenshot: Current screenshot (PNG bytes)
            action_history: Previous actions (accepted for interface
                compatibility; not read by this method)

        Returns:
            Tuple of (reasoning, list of pyautogui commands). On any API
            failure the result is ``(None, ["FAIL"])``.
        """
        # Imported locally so this method does not depend on a module-level
        # import being added elsewhere.
        import asyncio

        resized_screenshot = self._resize_screenshot(screenshot)

        # Initialize messages with first task + screenshot
        if not self.messages:
            # IMPORTANT: Image should come BEFORE text for better model understanding
            # This matches OSWorld's implementation which has proven effectiveness
            self.messages.append({
                "role": "user",
                "content": [
                    {
                        "type": "image",
                        "source": {
                            "type": "base64",
                            "media_type": "image/png",
                            "data": base64.b64encode(resized_screenshot).decode("utf-8"),
                        },
                    },
                    {"type": "text", "text": task_description},
                ],
            })

        # Filter images BEFORE adding new screenshot to control message size
        # This is critical to avoid exceeding the 25MB API limit
        if self.only_n_most_recent_images and len(self.messages) > 1:
            # Reserve 1 slot for the screenshot we're about to add
            maybe_filter_to_n_most_recent_images(
                self.messages,
                max(1, self.only_n_most_recent_images - 1),
                min_removal_threshold=1,  # More aggressive filtering
            )

        # Answer every tool_use block of the previous assistant turn. All of
        # them are answered in a single user message so that no tool_use is
        # left without a matching tool_result.
        if self.messages and self.messages[-1]["role"] == "assistant":
            last_content = self.messages[-1]["content"]
            if isinstance(last_content, list):
                pending_ids = [
                    block["id"] for block in last_content
                    if block.get("type") == "tool_use"
                ]
                if pending_ids:
                    self._add_tool_results(pending_ids, "Success", resized_screenshot)

        # Define tools and betas for claude-sonnet-4-5 with computer-use-2025-01-24
        tools = [{
            "name": "computer",
            "type": "computer_20250124",
            "display_width_px": self.display_size[0],
            "display_height_px": self.display_size[1],
            "display_number": 1,
        }]
        betas = [COMPUTER_USE_BETA_FLAG]

        # Prepare system prompt with optional caching
        system = BetaTextBlockParam(
            type="text",
            text=get_system_prompt(self.platform),
        )

        # Enable prompt caching if supported and enabled
        if self.enable_prompt_caching:
            betas.append(PROMPT_CACHING_BETA_FLAG)
            inject_prompt_caching(self.messages)
            system["cache_control"] = {"type": "ephemeral"}  # type: ignore

        # __init__ coerces self.model to "claude-sonnet-4-5".
        model_name = self.model

        # Enable thinking for complex computer use tasks
        extra_body = {"thinking": {"type": "enabled", "budget_tokens": 2048}}

        # Log request details for debugging
        total_images, tool_result_images = self._count_images(self.messages)
        logger.info(
            f"Anthropic API request:\n"
            f" Model: {model_name}\n"
            f" Display Size: {self.display_size}\n"
            f" Betas: {betas}\n"
            f" Images: {total_images} ({tool_result_images} in tool_results)\n"
            f" Messages: {len(self.messages)}"
        )

        # Shared by the primary and the backup call. ``messages`` is the live
        # history list, so in-place image filtering between attempts is
        # reflected in the next attempt.
        request = dict(
            max_tokens=self.max_tokens,
            messages=self.messages,
            model=model_name,
            system=[system],
            tools=tools,
            betas=betas,
            extra_body=extra_body,
        )

        # Try API call with retry and backup
        client = self._create_client()
        response = None
        try:
            # Retry loop with automatic image count reduction on 25MB error
            for attempt in range(API_RETRY_TIMES):
                try:
                    response = client.beta.messages.create(**request)
                    logger.info(f"API call succeeded on attempt {attempt + 1}")
                    break
                except (APIError, APIStatusError, APIResponseValidationError) as e:
                    error_msg = str(e)
                    logger.warning(f"Anthropic API error (attempt {attempt+1}/{API_RETRY_TIMES}): {error_msg}")

                    # Handle 25MB payload limit error (including HTTP 413)
                    if self._is_payload_too_large(error_msg):
                        logger.warning("Detected 25MB limit error, reducing image count")
                        current_count = self.only_n_most_recent_images or 0
                        new_count = max(1, current_count // 2)
                        self.only_n_most_recent_images = new_count
                        maybe_filter_to_n_most_recent_images(
                            self.messages,
                            new_count,
                            min_removal_threshold=1,  # Aggressive filtering when hitting limit
                        )
                        logger.info(f"Image count reduced from {current_count} to {new_count}")

                    if attempt >= API_RETRY_TIMES - 1:
                        raise
                    # Non-blocking back-off: this is a coroutine, so a
                    # blocking sleep would stall the whole event loop.
                    await asyncio.sleep(API_RETRY_INTERVAL)
        except (APIError, APIStatusError, APIResponseValidationError) as e:
            logger.error(f"Primary API key failed: {e}")
            # Try backup API key if available
            if not self.backup_api_key:
                return None, ["FAIL"]
            logger.warning("Retrying with backup API key...")
            try:
                backup_client = self._create_client(self.backup_api_key)
                response = backup_client.beta.messages.create(**request)
                logger.info("Successfully used backup API key")
            except Exception as backup_e:
                logger.error(f"Backup API key also failed: {backup_e}")
                return None, ["FAIL"]
        except Exception as e:
            logger.error(f"Unexpected error: {e}")
            return None, ["FAIL"]

        if not response:
            return None, ["FAIL"]

        # Parse response using utility function
        response_params = response_to_params(response)

        # Extract reasoning and commands. When several text/thinking blocks
        # are present, the last one wins.
        reasoning = ""
        commands = []
        for block in response_params:
            block_type = block.get("type")
            if block_type == "text":
                reasoning = block.get("text", "")
            elif block_type == "thinking":
                reasoning = block.get("thinking", "")
            elif block_type == "tool_use":
                tool_input = block.get("input", {})
                command = self._parse_computer_tool_use(tool_input)
                if command:
                    commands.append(command)
                else:
                    logger.warning(f"Failed to parse tool_use: {tool_input}")

        # Store assistant response
        self.messages.append({
            "role": "assistant",
            "content": response_params,
        })

        logger.info(f"Parsed {len(commands)} commands from response")
        return reasoning, commands

    def _add_tool_results(
        self,
        tool_use_ids: List[str],
        result: str,
        screenshot_bytes: Optional[bytes] = None,
    ):
        """
        Append ONE user message holding a tool_result for each given tool_use id.

        The screenshot (if any) is attached only to the last tool_result, and
        is placed BEFORE the text for consistency with the initial message.

        Args:
            tool_use_ids: Ids of the tool_use blocks being answered, in order.
            result: Text result reported for every tool_use.
            screenshot_bytes: Optional PNG bytes of the screen after the actions.
        """
        if not tool_use_ids:
            return

        image_block = None
        if screenshot_bytes is not None:
            image_block = {
                "type": "image",
                "source": {
                    "type": "base64",
                    "media_type": "image/png",
                    "data": base64.b64encode(screenshot_bytes).decode("utf-8"),
                },
            }

        last_index = len(tool_use_ids) - 1
        tool_results = []
        for index, tool_use_id in enumerate(tool_use_ids):
            content_list = []
            if image_block is not None and index == last_index:
                content_list.append(image_block)
            content_list.append({"type": "text", "text": result})
            tool_results.append({
                "type": "tool_result",
                "tool_use_id": tool_use_id,
                "content": content_list,
            })

        self.messages.append({
            "role": "user",
            "content": tool_results,
        })

    def _add_tool_result(
        self,
        tool_use_id: str,
        result: str,
        screenshot_bytes: Optional[bytes] = None
    ):
        """
        Add tool result to message history.

        IMPORTANT: Put screenshot BEFORE text for consistency with initial message.

        Args:
            tool_use_id: Id of the tool_use block being answered.
            result: Text result for the tool call.
            screenshot_bytes: Optional PNG bytes to attach to the result.
        """
        self._add_tool_results([tool_use_id], result, screenshot_bytes)

    def _parse_computer_tool_use(self, tool_input: Dict[str, Any]) -> Optional[str]:
        """
        Parse Anthropic computer tool use to pyautogui command.

        Args:
            tool_input: Tool input from Anthropic (action, coordinate, text, etc.)

        Returns:
            PyAutoGUI command string (several statements are joined with
            "; "), a control marker ("SCREENSHOT", "DONE", "FAIL"), or None
            when the input cannot be turned into a command.
        """
        action = tool_input.get("action")
        if not action:
            return None
        action = self._ACTION_ALIASES.get(action, action)

        # Control markers carry no pyautogui command.
        if action == "screenshot":
            # Screenshot is automatically handled by the system
            return "SCREENSHOT"
        if action == "done":
            return "DONE"
        if action == "fail":
            return "FAIL"

        text = tool_input.get("text")

        # Scale coordinates from display space to PyAutoGUI space
        coordinate = tool_input.get("coordinate")
        if coordinate:
            coordinate = self._scale_coordinates(coordinate[0], coordinate[1])

        pointer_fn = self._POINTER_ACTIONS.get(action)
        if pointer_fn is not None:
            if coordinate:
                x, y = coordinate
                return f"pyautogui.{pointer_fn}({x}, {y})"
            return f"pyautogui.{pointer_fn}()"

        if action == "mouse_move":
            if not coordinate:
                return None
            x, y = coordinate
            return f"pyautogui.moveTo({x}, {y}, duration=0.5)"

        if action == "left_click_drag":
            if not coordinate:
                return None
            x, y = coordinate
            drag = f"pyautogui.dragTo({x}, {y}, duration=0.5)"
            # computer_20250124 supplies the drag origin separately; without
            # it the drag starts from the current pointer position.
            start = tool_input.get("start_coordinate")
            if start:
                start_x, start_y = self._scale_coordinates(start[0], start[1])
                return f"pyautogui.moveTo({start_x}, {start_y}); {drag}"
            return drag

        if action == "key":
            if not text:
                return None
            tokens = (part.strip().lower() for part in text.split("+"))
            keys = [self._KEY_ALIASES.get(token, token) for token in tokens if token]
            if not keys:
                return None
            # Press in order, release in reverse. repr() keeps the generated
            # literal valid even for keys such as "'" or "\\".
            presses = [f"pyautogui.keyDown({key!r})" for key in keys]
            releases = [f"pyautogui.keyUp({key!r})" for key in reversed(keys)]
            return "; ".join(presses + releases)

        if action == "type":
            if not text:
                return None
            return f"pyautogui.typewrite({text!r}, interval=0.01)"

        if action == "scroll":
            scroll_amount = tool_input.get("scroll_amount")
            if scroll_amount is None:
                scroll_amount = 5
            scroll_direction = tool_input.get("scroll_direction")
            if scroll_direction in ("up", "down"):
                scroll_fn = "scroll"
                scroll_value = scroll_amount if scroll_direction == "up" else -scroll_amount
            elif scroll_direction in ("left", "right"):
                scroll_fn = "hscroll"
                scroll_value = scroll_amount if scroll_direction == "right" else -scroll_amount
            else:
                return None
            if coordinate:
                x, y = coordinate
                return f"pyautogui.{scroll_fn}({scroll_value}, {x}, {y})"
            return f"pyautogui.{scroll_fn}({scroll_value})"

        if action == "wait":
            # Wait for specified duration
            duration = tool_input.get("duration", 1)
            return f"pyautogui.sleep({duration})"

        # Unknown action: the caller logs the unparsed input.
        return None

    def reset(self):
        """Reset message history."""
        self.messages = []
        logger.info("Reset AnthropicGUIClient message history")