Spaces:

darkfire514
/

OpenSpace

Running

App Files Files Community

OpenSpace / openspace /grounding /backends /gui /anthropic_client.py

darkfire514

Upload 160 files

399b80c verified 2 months ago

raw

history blame contribute delete

22.7 kB

	import asyncio
	import base64
	import io
	import os
	import time
	from typing import Any, Dict, Optional, Tuple, List

	from PIL import Image

	from openspace.utils.logging import Logger

	# Module-level logger obtained from the project's logging helper.
	logger = Logger.get_logger(__name__)

	# The Anthropic SDK is optional at import time: a missing package is logged
	# and recorded in ANTHROPIC_AVAILABLE instead of failing the module import.
	# AnthropicGUIClient.__init__ checks the flag and raises RuntimeError when the
	# SDK is absent.
	try:
	from anthropic import (
	Anthropic,
	AnthropicBedrock,
	AnthropicVertex,
	APIError,
	APIResponseValidationError,
	APIStatusError,
	)
	from anthropic.types.beta import (
	BetaMessageParam,
	BetaTextBlockParam,
	)
	# True only when both SDK imports above succeeded.
	ANTHROPIC_AVAILABLE = True
	except ImportError:
	logger.warning("Anthropic SDK not available. Install with: pip install anthropic")
	# The SDK names imported above stay undefined in this case.
	ANTHROPIC_AVAILABLE = False

	# Import utility functions
	from .anthropic_utils import (
	APIProvider,
	PROVIDER_TO_DEFAULT_MODEL_NAME,
	COMPUTER_USE_BETA_FLAG,
	PROMPT_CACHING_BETA_FLAG,
	get_system_prompt,
	inject_prompt_caching,
	maybe_filter_to_n_most_recent_images,
	response_to_params,
	)

	# API retry configuration
	# Maximum number of attempts made with the primary API key by the retry loop
	# around the Anthropic request.
	API_RETRY_TIMES = 10
	# Pause between two consecutive attempts of that retry loop.
	API_RETRY_INTERVAL = 5 # seconds


	class AnthropicGUIClient:
	    """
	    Anthropic LLM Client for GUI operations.
	    Uses Claude Sonnet 4.5 with computer-use-2025-01-24 API.

	    Features:
	    - Vision-based screen understanding
	    - Automatic screenshot resizing (configurable display size)
	    - Coordinate scaling between the display size shown to the model and
	      the PyAutoGUI working size
	    - Retry with image-count reduction on oversized requests, plus an
	      optional backup API key

	    The client keeps the conversation in ``self.messages``; one instance is
	    meant to drive one task at a time (``plan_action`` calls must not be
	    interleaved on the same instance).
	    """

	    # Token budget requested for extended thinking. It is only sent when
	    # max_tokens is strictly larger than this budget.
	    _THINKING_BUDGET_TOKENS = 2048

	    # Alternative spellings of actions, normalised before dispatch.
	    _ACTION_ALIASES = {
	        "left click": "click",
	        "right click": "right_click",
	    }

	    # Computer-tool click actions -> pyautogui function name.
	    _CLICK_FUNCTIONS = {
	        "click": "click",
	        "left_click": "click",
	        "right_click": "rightClick",
	        "double_click": "doubleClick",
	        "middle_click": "middleClick",
	        "triple_click": "tripleClick",
	    }

	    # Key names emitted by the model -> pyautogui key names.
	    # NOTE(review): "super" -> "command" and "super_l" -> "win" are kept from
	    # the original table; whether they suit every platform should be confirmed.
	    _KEY_ALIASES = {
	        "page_down": "pagedown",
	        "page_up": "pageup",
	        "super_l": "win",
	        "super": "command",
	        "escape": "esc",
	    }

	    def __init__(
	        self,
	        model: str = "claude-sonnet-4-5",
	        platform: str = "Ubuntu",
	        api_key: Optional[str] = None,
	        provider: str = "anthropic",
	        max_tokens: int = 4096,
	        screen_size: Tuple[int, int] = (1920, 1080),
	        display_size: Tuple[int, int] = (1024, 768),  # Computer use display size
	        pyautogui_size: Optional[Tuple[int, int]] = None,  # PyAutoGUI working size
	        only_n_most_recent_images: int = 3,
	        enable_prompt_caching: bool = True,
	        backup_api_key: Optional[str] = None,
	    ):
	        """
	        Initialize Anthropic GUI Client for Claude Sonnet 4.5.

	        Args:
	            model: Model name (only "claude-sonnet-4-5" supported; any other
	                value is replaced with a warning)
	            platform: Platform type (Ubuntu, Windows, or macOS)
	            api_key: Anthropic API key (defaults to ANTHROPIC_API_KEY env var)
	            provider: API provider (only "anthropic" supported; any other
	                value is replaced with a warning)
	            max_tokens: Maximum tokens for response
	            screen_size: Actual screenshot resolution (width, height) - physical pixels
	            display_size: Display size for computer use tool (width, height).
	                Screenshots will be resized to this size before sending to API
	            pyautogui_size: PyAutoGUI working size (logical pixels). If None,
	                assumed same as screen_size. On Retina/HiDPI displays, this
	                may be screen_size / 2
	            only_n_most_recent_images: Number of recent screenshots to keep in history
	            enable_prompt_caching: Whether to enable prompt caching for cost optimization
	            backup_api_key: Backup API key (defaults to ANTHROPIC_API_KEY_BACKUP env var)

	        Raises:
	            RuntimeError: If the Anthropic SDK is not installed.
	            ValueError: If no API key is available or display_size is not
	                strictly positive.
	        """
	        if not ANTHROPIC_AVAILABLE:
	            raise RuntimeError("Anthropic SDK not installed. Install with: pip install anthropic")

	        # Only support claude-sonnet-4-5
	        if model != "claude-sonnet-4-5":
	            logger.warning(f"Model '{model}' not supported. Using 'claude-sonnet-4-5'")
	            model = "claude-sonnet-4-5"

	        self.model = model
	        self.platform = platform
	        self.api_key = api_key or os.environ.get("ANTHROPIC_API_KEY")
	        if not self.api_key:
	            raise ValueError("Anthropic API key not provided. Set ANTHROPIC_API_KEY env var or pass api_key parameter")

	        # Backup API key for failover
	        self.backup_api_key = backup_api_key or os.environ.get("ANTHROPIC_API_KEY_BACKUP")

	        # Only support anthropic provider
	        if provider != "anthropic":
	            logger.warning(f"Provider '{provider}' not supported. Using 'anthropic'")
	            provider = "anthropic"

	        # The resize factor divides by display_size, so reject degenerate sizes
	        # with a clear message instead of a ZeroDivisionError.
	        if display_size[0] <= 0 or display_size[1] <= 0:
	            raise ValueError(f"display_size must be positive, got {display_size}")

	        self.provider = APIProvider(provider)
	        self.max_tokens = max_tokens
	        self.screen_size = screen_size
	        self.display_size = display_size
	        self.pyautogui_size = pyautogui_size or screen_size  # Default to screen_size if not specified
	        self.only_n_most_recent_images = only_n_most_recent_images
	        self.enable_prompt_caching = enable_prompt_caching

	        # Message history
	        self.messages: List[BetaMessageParam] = []

	        # Scale factor applied to coordinates returned by the model:
	        # display_size (what the model sees) -> pyautogui_size (what
	        # PyAutoGUI addresses). screen_size is only logged.
	        self.resize_factor = (
	            self.pyautogui_size[0] / display_size[0],  # x scale factor
	            self.pyautogui_size[1] / display_size[1],  # y scale factor
	        )

	        logger.info(
	            f"Initialized AnthropicGUIClient:\n"
	            f" Model: {model}\n"
	            f" Platform: {platform}\n"
	            f" Screen Size (physical): {screen_size}\n"
	            f" PyAutoGUI Size (logical): {self.pyautogui_size}\n"
	            f" Display Size (LLM): {display_size}\n"
	            f" Resize Factor (LLM->PyAutoGUI): {self.resize_factor}\n"
	            f" Prompt Caching: {enable_prompt_caching}"
	        )

	    def _create_client(self, api_key: Optional[str] = None):
	        """Create Anthropic client (only supports anthropic provider)."""
	        key = api_key or self.api_key
	        return Anthropic(api_key=key, max_retries=4)

	    def _resize_screenshot(self, screenshot_bytes: bytes) -> bytes:
	        """
	        Resize screenshot to display size for Computer Use API.

	        For computer-use-2025-01-24, the screenshot must be resized to the
	        display_width_px x display_height_px specified in the tool definition.

	        Args:
	            screenshot_bytes: Encoded image bytes readable by Pillow.

	        Returns:
	            PNG-encoded bytes of the image resized to ``self.display_size``.
	        """
	        # The context manager releases the decoder once the resized copy exists.
	        with Image.open(io.BytesIO(screenshot_bytes)) as screenshot_image:
	            resized_image = screenshot_image.resize(self.display_size, Image.Resampling.LANCZOS)

	        output_buffer = io.BytesIO()
	        resized_image.save(output_buffer, format="PNG")
	        return output_buffer.getvalue()

	    def _scale_coordinates(self, x: int, y: int) -> Tuple[int, int]:
	        """
	        Scale coordinates from display size to the PyAutoGUI working size.

	        The API returns coordinates in display_size (e.g., 1024x768). They are
	        multiplied by ``self.resize_factor`` (pyautogui_size / display_size)
	        and truncated to integers.

	        Args:
	            x, y: Coordinates in display size space

	        Returns:
	            Scaled (x, y) in PyAutoGUI coordinate space
	        """
	        scaled_x = int(x * self.resize_factor[0])
	        scaled_y = int(y * self.resize_factor[1])
	        return scaled_x, scaled_y

	    @staticmethod
	    def _is_payload_too_large(error_msg: str) -> bool:
	        """Return True when an API error message looks like the 25MB / HTTP 413 request-size limit."""
	        markers = (
	            "25000000",
	            "Member must have length less than or equal to",
	            "request_too_large",
	            "413",
	        )
	        return any(marker in error_msg for marker in markers)

	    @staticmethod
	    def _coerce_number(value: Any, default: Any) -> Any:
	        """
	        Return ``value`` as a finite number, or ``default`` when it is not one.

	        Model output ends up interpolated into generated code, so only real
	        numbers (or strings that parse as numbers) are let through.
	        """
	        if isinstance(value, bool):
	            return default
	        if not isinstance(value, (int, float)):
	            try:
	                value = float(value)
	            except (TypeError, ValueError):
	                return default
	        # Reject NaN / infinities: their repr is not a valid Python literal.
	        if value != value or value in (float("inf"), float("-inf")):
	            return default
	        return value

	    def _normalize_keys(self, text: str) -> List[str]:
	        """Split an ``a+b+c`` key chord into lower-cased pyautogui key names, dropping empty parts."""
	        names = (part.strip().lower() for part in text.split("+"))
	        return [self._KEY_ALIASES.get(name, name) for name in names if name]

	    def _count_images(self) -> Tuple[int, int]:
	        """Return (total images in history, images nested inside tool_result blocks)."""
	        top_level = 0
	        in_tool_results = 0
	        for message in self.messages:
	            content = message.get("content")
	            if not isinstance(content, list):
	                continue
	            for item in content:
	                if not isinstance(item, dict):
	                    continue
	                item_type = item.get("type")
	                if item_type == "image":
	                    top_level += 1
	                elif item_type == "tool_result":
	                    nested = item.get("content", [])
	                    if isinstance(nested, list):
	                        in_tool_results += sum(
	                            1
	                            for part in nested
	                            if isinstance(part, dict) and part.get("type") == "image"
	                        )
	        return top_level + in_tool_results, in_tool_results

	    def _pending_tool_use_ids(self) -> List[str]:
	        """Return the ids of all tool_use blocks in the last message if it is an assistant turn."""
	        if not self.messages or self.messages[-1]["role"] != "assistant":
	            return []
	        last_content = self.messages[-1]["content"]
	        if not isinstance(last_content, list):
	            return []
	        return [
	            block["id"]
	            for block in last_content
	            if isinstance(block, dict) and block.get("type") == "tool_use"
	        ]

	    async def _create_message(self, client: Any, request_kwargs: Dict[str, Any]) -> Any:
	        """
	        Run the synchronous ``client.beta.messages.create`` call in the default
	        executor so the event loop is not blocked while waiting on the network.
	        """
	        loop = asyncio.get_running_loop()
	        return await loop.run_in_executor(
	            None, lambda: client.beta.messages.create(**request_kwargs)
	        )

	    async def _request_with_retries(self, client: Any, request_kwargs: Dict[str, Any]) -> Any:
	        """
	        Call the API up to API_RETRY_TIMES times with the given client.

	        On an error that looks like the request-size limit, the number of
	        screenshots kept in history is halved (at least 1) before retrying.
	        The last failure is re-raised to the caller.
	        """
	        for attempt in range(API_RETRY_TIMES):
	            try:
	                response = await self._create_message(client, request_kwargs)
	            except (APIError, APIStatusError, APIResponseValidationError) as e:
	                error_msg = str(e)
	                logger.warning(f"Anthropic API error (attempt {attempt+1}/{API_RETRY_TIMES}): {error_msg}")

	                # Handle 25MB payload limit error (including HTTP 413)
	                if self._is_payload_too_large(error_msg):
	                    logger.warning("Detected 25MB limit error, reducing image count")
	                    current_count = self.only_n_most_recent_images or 1
	                    new_count = max(1, current_count // 2)
	                    self.only_n_most_recent_images = new_count

	                    # request_kwargs["messages"] is self.messages, so the retry
	                    # sends whatever this call leaves in the history list.
	                    maybe_filter_to_n_most_recent_images(
	                        self.messages,
	                        new_count,
	                        min_removal_threshold=1,  # Aggressive filtering when hitting limit
	                    )
	                    logger.info(f"Image count reduced from {current_count} to {new_count}")

	                if attempt >= API_RETRY_TIMES - 1:
	                    raise
	                # Non-blocking pause: plan_action is a coroutine.
	                await asyncio.sleep(API_RETRY_INTERVAL)
	            else:
	                logger.info(f"API call succeeded on attempt {attempt + 1}")
	                return response
	        return None

	    async def plan_action(
	        self,
	        task_description: str,
	        screenshot: bytes,
	        action_history: Optional[List[Dict[str, Any]]] = None,
	    ) -> Tuple[Optional[str], List[str]]:
	        """
	        Plan next action based on task and current screenshot.
	        Includes prompt caching, error handling, and backup API key support.

	        Args:
	            task_description: Task to accomplish
	            screenshot: Current screenshot (encoded image bytes)
	            action_history: Previous actions (accepted for interface
	                compatibility; not used by this implementation)

	        Returns:
	            Tuple of (reasoning, list of pyautogui commands). Commands may also
	            be the control markers "DONE", "FAIL" or "SCREENSHOT". When the
	            API cannot be reached, ``(None, ["FAIL"])`` is returned.
	        """
	        resized_screenshot = self._resize_screenshot(screenshot)

	        # Initialize messages with first task + screenshot
	        if not self.messages:
	            screenshot_b64 = base64.b64encode(resized_screenshot).decode("utf-8")
	            # IMPORTANT: Image should come BEFORE text for better model understanding
	            self.messages.append({
	                "role": "user",
	                "content": [
	                    {
	                        "type": "image",
	                        "source": {
	                            "type": "base64",
	                            "media_type": "image/png",
	                            "data": screenshot_b64,
	                        },
	                    },
	                    {"type": "text", "text": task_description},
	                ],
	            })

	        # Filter images BEFORE adding new screenshot to control message size
	        # This is critical to avoid exceeding the 25MB API limit
	        if self.only_n_most_recent_images and len(self.messages) > 1:
	            # Reserve 1 slot for the screenshot we're about to add
	            maybe_filter_to_n_most_recent_images(
	                self.messages,
	                max(1, self.only_n_most_recent_images - 1),
	                min_removal_threshold=1,  # More aggressive filtering
	            )

	        # Answer every tool_use of the previous assistant turn. All results go
	        # into one user message so no tool_use id is left without a result.
	        # NOTE(review): when the last assistant turn has no tool_use, nothing
	        # is appended and the new screenshot is not sent - confirm intended.
	        pending_ids = self._pending_tool_use_ids()
	        if pending_ids:
	            self._add_tool_results(pending_ids, "Success", resized_screenshot)

	        # Define tools and betas for claude-sonnet-4-5 with computer-use-2025-01-24
	        tools = [{
	            "name": "computer",
	            "type": "computer_20250124",
	            "display_width_px": self.display_size[0],
	            "display_height_px": self.display_size[1],
	            "display_number": 1,
	        }]
	        betas = [COMPUTER_USE_BETA_FLAG]

	        # Prepare system prompt with optional caching
	        system = BetaTextBlockParam(
	            type="text",
	            text=get_system_prompt(self.platform),
	        )

	        # Enable prompt caching if supported and enabled
	        if self.enable_prompt_caching:
	            betas.append(PROMPT_CACHING_BETA_FLAG)
	            inject_prompt_caching(self.messages)
	            system["cache_control"] = {"type": "ephemeral"}  # type: ignore

	        # __init__ pins self.model to "claude-sonnet-4-5".
	        model_name = self.model

	        # Enable thinking for complex computer use tasks. The thinking budget
	        # has to stay below max_tokens, so it is skipped for small limits
	        # instead of sending a request the API would reject.
	        extra_body: Dict[str, Any] = {}
	        if self.max_tokens > self._THINKING_BUDGET_TOKENS:
	            extra_body["thinking"] = {
	                "type": "enabled",
	                "budget_tokens": self._THINKING_BUDGET_TOKENS,
	            }
	        else:
	            logger.warning(
	                f"max_tokens={self.max_tokens} does not exceed the thinking budget "
	                f"({self._THINKING_BUDGET_TOKENS}); extended thinking disabled"
	            )

	        # Log request details for debugging
	        total_images, tool_result_images = self._count_images()
	        logger.info(
	            f"Anthropic API request:\n"
	            f" Model: {model_name}\n"
	            f" Display Size: {self.display_size}\n"
	            f" Betas: {betas}\n"
	            f" Images: {total_images} ({tool_result_images} in tool_results)\n"
	            f" Messages: {len(self.messages)}"
	        )

	        request_kwargs: Dict[str, Any] = {
	            "max_tokens": self.max_tokens,
	            "messages": self.messages,
	            "model": model_name,
	            "system": [system],
	            "tools": tools,
	            "betas": betas,
	            "extra_body": extra_body,
	        }

	        # Try API call with retry and backup
	        client = self._create_client()
	        response = None

	        try:
	            response = await self._request_with_retries(client, request_kwargs)

	        except (APIError, APIStatusError, APIResponseValidationError) as e:
	            logger.error(f"Primary API key failed: {e}")

	            # Try backup API key if available (single attempt)
	            if not self.backup_api_key:
	                return None, ["FAIL"]

	            logger.warning("Retrying with backup API key...")
	            try:
	                backup_client = self._create_client(self.backup_api_key)
	                response = await self._create_message(backup_client, request_kwargs)
	                logger.info("Successfully used backup API key")
	            except Exception as backup_e:
	                logger.error(f"Backup API key also failed: {backup_e}")
	                return None, ["FAIL"]

	        except Exception as e:
	            logger.error(f"Unexpected error: {e}")
	            return None, ["FAIL"]

	        if not response:
	            return None, ["FAIL"]

	        # Parse response using utility function
	        response_params = response_to_params(response)

	        # Extract reasoning and commands. The last text/thinking block wins,
	        # as before.
	        reasoning = ""
	        commands = []

	        for block in response_params:
	            block_type = block.get("type")

	            if block_type == "text":
	                reasoning = block.get("text", "")
	            elif block_type == "thinking":
	                reasoning = block.get("thinking", "")
	            elif block_type == "tool_use":
	                tool_input = block.get("input", {})
	                command = self._parse_computer_tool_use(tool_input)
	                if command:
	                    commands.append(command)
	                else:
	                    logger.warning(f"Failed to parse tool_use: {tool_input}")

	        # Store assistant response
	        self.messages.append({
	            "role": "assistant",
	            "content": response_params,
	        })

	        logger.info(f"Parsed {len(commands)} commands from response")

	        return reasoning, commands

	    def _add_tool_results(
	        self,
	        tool_use_ids: List[str],
	        result: str,
	        screenshot_bytes: Optional[bytes] = None,
	    ):
	        """
	        Append ONE user message holding a tool_result for every given id.

	        Each result carries the ``result`` text. The screenshot, when given,
	        is attached only to the last result so it is stored once, and it is
	        placed before the text (same ordering as the initial message).
	        """
	        if not tool_use_ids:
	            return

	        image_block = None
	        if screenshot_bytes is not None:
	            image_block = {
	                "type": "image",
	                "source": {
	                    "type": "base64",
	                    "media_type": "image/png",
	                    "data": base64.b64encode(screenshot_bytes).decode("utf-8"),
	                },
	            }

	        last_index = len(tool_use_ids) - 1
	        tool_result_content = []
	        for index, tool_use_id in enumerate(tool_use_ids):
	            content_list = []
	            if image_block is not None and index == last_index:
	                content_list.append(image_block)
	            content_list.append({"type": "text", "text": result})
	            tool_result_content.append({
	                "type": "tool_result",
	                "tool_use_id": tool_use_id,
	                "content": content_list,
	            })

	        self.messages.append({
	            "role": "user",
	            "content": tool_result_content,
	        })

	    def _add_tool_result(
	        self,
	        tool_use_id: str,
	        result: str,
	        screenshot_bytes: Optional[bytes] = None,
	    ):
	        """
	        Add a single tool result to message history.
	        IMPORTANT: The screenshot is put BEFORE the text for consistency with
	        the initial message.
	        """
	        self._add_tool_results([tool_use_id], result, screenshot_bytes)

	    def _parse_computer_tool_use(self, tool_input: Dict[str, Any]) -> Optional[str]:
	        """
	        Parse Anthropic computer tool use to pyautogui command.

	        Numeric fields and key names come from the model and are interpolated
	        into a code string, so numbers are coerced and strings are emitted
	        through ``repr``.

	        Args:
	            tool_input: Tool input from Anthropic (action, coordinate, text, etc.)

	        Returns:
	            PyAutoGUI command string, a control marker ("SCREENSHOT", "DONE",
	            "FAIL"), or None when the input cannot be turned into a command
	            (missing action, unknown action, missing or malformed fields).
	        """
	        action = tool_input.get("action")
	        if not action:
	            return None
	        action = self._ACTION_ALIASES.get(action, action)

	        # Control markers need no further parsing.
	        if action == "screenshot":
	            # Screenshot is automatically handled by the system
	            return "SCREENSHOT"
	        if action == "done":
	            return "DONE"
	        if action == "fail":
	            return "FAIL"

	        text = tool_input.get("text")
	        coordinate = tool_input.get("coordinate")
	        start_coordinate = tool_input.get("start_coordinate")

	        # Scale model coordinates to PyAutoGUI space. A coordinate that is
	        # present but malformed makes the whole tool call unparseable rather
	        # than silently acting at the current pointer position.
	        try:
	            if coordinate:
	                coordinate = self._scale_coordinates(coordinate[0], coordinate[1])
	            if start_coordinate:
	                start_coordinate = self._scale_coordinates(start_coordinate[0], start_coordinate[1])
	        except (TypeError, ValueError, IndexError, KeyError):
	            return None

	        if action in self._CLICK_FUNCTIONS:
	            function_name = self._CLICK_FUNCTIONS[action]
	            if coordinate:
	                x, y = coordinate
	                return f"pyautogui.{function_name}({x}, {y})"
	            return f"pyautogui.{function_name}()"

	        if action == "mouse_move":
	            if not coordinate:
	                return None
	            x, y = coordinate
	            return f"pyautogui.moveTo({x}, {y}, duration=0.5)"

	        if action == "left_click_drag":
	            if not coordinate:
	                return None
	            x, y = coordinate
	            drag = f"pyautogui.dragTo({x}, {y}, duration=0.5)"
	            if start_coordinate:
	                # Start the drag where the model asked, not wherever the
	                # pointer happens to be.
	                start_x, start_y = start_coordinate
	                return f"pyautogui.moveTo({start_x}, {start_y}); {drag}"
	            return drag

	        if action in ("left_mouse_down", "left_mouse_up"):
	            function_name = "mouseDown" if action == "left_mouse_down" else "mouseUp"
	            if coordinate:
	                x, y = coordinate
	                return f"pyautogui.{function_name}({x}, {y})"
	            return f"pyautogui.{function_name}()"

	        if action in ("key", "hold_key"):
	            if not text:
	                return None
	            keys = self._normalize_keys(text)
	            if not keys:
	                return None
	            # Press in order, release in reverse order.
	            steps = [f"pyautogui.keyDown({key!r})" for key in keys]
	            if action == "hold_key":
	                hold_for = max(0, self._coerce_number(tool_input.get("duration", 1), 1))
	                steps.append(f"pyautogui.sleep({hold_for})")
	            steps.extend(f"pyautogui.keyUp({key!r})" for key in reversed(keys))
	            return "; ".join(steps)

	        if action == "type":
	            if not text:
	                return None
	            return f"pyautogui.typewrite({repr(text)}, interval=0.01)"

	        if action == "scroll":
	            scroll_direction = tool_input.get("scroll_direction")
	            scroll_amount = int(self._coerce_number(tool_input.get("scroll_amount", 5), 5))
	            if scroll_direction in ("up", "down"):
	                function_name = "scroll"
	                scroll_value = scroll_amount if scroll_direction == "up" else -scroll_amount
	            elif scroll_direction in ("left", "right"):
	                function_name = "hscroll"
	                scroll_value = scroll_amount if scroll_direction == "right" else -scroll_amount
	            else:
	                return None
	            if coordinate:
	                x, y = coordinate
	                return f"pyautogui.{function_name}({scroll_value}, {x}, {y})"
	            return f"pyautogui.{function_name}({scroll_value})"

	        if action == "wait":
	            # Wait for specified duration (default 1; never negative)
	            duration = max(0, self._coerce_number(tool_input.get("duration", 1), 1))
	            return f"pyautogui.sleep({duration})"

	        # Unknown action
	        return None

	    def reset(self):
	        """Reset message history."""
	        self.messages = []
	        logger.info("Reset AnthropicGUIClient message history")