from typing import List, cast
from enum import Enum
from datetime import datetime
from openspace.utils.logging import Logger
logger = Logger.get_logger(__name__)
# Optional dependency: the Anthropic SDK beta types.
# ANTHROPIC_AVAILABLE records whether the import succeeded so that the helpers
# in this module can turn themselves into no-ops when the SDK is missing.
try:
    from anthropic.types.beta import (
        BetaCacheControlEphemeralParam,
        BetaContentBlockParam,
        BetaImageBlockParam,
        BetaMessage,
        BetaMessageParam,
        BetaTextBlock,
        BetaTextBlockParam,
        BetaToolResultBlockParam,
        BetaToolUseBlockParam,
    )

    ANTHROPIC_AVAILABLE = True
except ImportError:
    from typing import Any

    ANTHROPIC_AVAILABLE = False

    # The helper functions in this module mention these names in their
    # signatures, and Python evaluates annotations when each ``def`` statement
    # runs (unless postponed evaluation is enabled). Without placeholders the
    # module would raise NameError on import and the ANTHROPIC_AVAILABLE guards
    # could never be reached. Every runtime use of these names is behind such
    # a guard, so ``Any`` is sufficient here.
    BetaCacheControlEphemeralParam = Any  # type: ignore
    BetaContentBlockParam = Any  # type: ignore
    BetaImageBlockParam = Any  # type: ignore
    BetaMessage = Any  # type: ignore
    BetaMessageParam = Any  # type: ignore
    BetaTextBlock = Any  # type: ignore
    BetaTextBlockParam = Any  # type: ignore
    BetaToolResultBlockParam = Any  # type: ignore
    BetaToolUseBlockParam = Any  # type: ignore
# Beta flags
# Identifier of the computer-use beta revision, chosen for claude-sonnet-4-5.
# NOTE(review): presumably sent as a beta identifier on API requests — confirm at the call sites.
COMPUTER_USE_BETA_FLAG = "computer-use-2025-01-24"
# Identifier of the prompt-caching beta.
PROMPT_CACHING_BETA_FLAG = "prompt-caching-2024-07-31"
class APIProvider(Enum):
    """Backends through which the model can be reached.

    Only the direct Anthropic API is enabled; each member's value is the
    provider's lowercase string name.
    """

    ANTHROPIC = "anthropic"
    # Currently disabled providers:
    # BEDROCK = "bedrock"
    # VERTEX = "vertex"
# Provider to model name mapping (simplified for claude-sonnet-4-5 only).
# Keys are (APIProvider member, model alias) tuples; values are model name strings.
# NOTE(review): the value is presumably the name passed to that provider's API — confirm at the call sites.
PROVIDER_TO_DEFAULT_MODEL_NAME: dict = {
    (APIProvider.ANTHROPIC, "claude-sonnet-4-5"): "claude-sonnet-4-5",
    # Entries for the currently disabled providers:
    # (APIProvider.BEDROCK, "claude-sonnet-4-5"): "us.anthropic.claude-sonnet-4-5-v1:0",
    # (APIProvider.VERTEX, "claude-sonnet-4-5"): "claude-sonnet-4-5-v1",
}
def get_system_prompt(platform: str = "Ubuntu") -> str:
    """
    Build the system prompt for the given platform.

    The platform name is matched case-insensitively. "windows"/"win32" select
    the Windows prompt, "macos"/"darwin"/"mac" select the macOS prompt, and any
    other value falls back to the Ubuntu prompt. All variants share the same
    general guidance and embed today's date; they differ in the opening line,
    the platform-specific notes, and (macOS only) a closing instruction.

    Args:
        platform: Platform type (Ubuntu, Windows, macOS, or Darwin)

    Returns:
        System prompt string: a leading newline followed by one bullet per line
    """
    platform_key = platform.lower()
    today = datetime.today().strftime('%A, %B %d, %Y')

    # Guidance that is identical for every platform, in prompt order.
    shared_guidance = [
        "* You can use the computer tool to interact with the desktop: take screenshots, click, type, and control applications.",
        "* To accomplish tasks, you MUST use the computer tool to see the screen and take actions.",
        "* To open browser, please just click on the Chrome icon. Note, Chrome is what is installed on your system.",
        "* When viewing a page it can be helpful to zoom out so that you can see everything on the page. Either that, or make sure you scroll down to see everything before deciding something isn't available.",
        "* DO NOT ask users for clarification during task execution. DO NOT stop to request more information from users. Always take action using available tools.",
        "* When using your computer function calls, they take a while to run and send back to you. Where possible/feasible, try to chain multiple of these calls all into one function calls request.",
        f"* The current date is {today}.",
    ]
    shared_ending = [
        "* After each action, the system will provide you with a new screenshot showing the result.",
        "* Continue taking actions until the task is complete.",
    ]

    # Only the macOS prompt carries an extra closing instruction.
    closing = []
    if platform_key in ("windows", "win32"):
        opening = "* You are utilising a Windows virtual machine using x86_64 architecture with internet access."
        platform_notes = [
            "* Home directory of this Windows system is 'C:\\Users\\user'.",
            "* When you want to open some applications on Windows, please use Double Click on it instead of clicking once.",
        ]
    elif platform_key in ("macos", "darwin", "mac"):
        opening = "* You are utilising a macOS system with internet access."
        platform_notes = [
            "* Home directory of this macOS system is typically '/Users/[username]' or can be accessed via '~'.",
            "* On macOS, use Command (⌘) key combinations instead of Ctrl (e.g., Command+C for copy).",
        ]
        closing = [
            "* When the task is completed, simply describe what you've done in your response WITHOUT using the tool again.",
        ]
    else:  # Ubuntu/Linux
        opening = "* You are utilising an Ubuntu virtual machine using x86_64 architecture with internet access."
        platform_notes = [
            "* Home directory of this Ubuntu system is '/home/user'.",
        ]

    prompt_lines = [opening, *shared_guidance, *platform_notes, *shared_ending, *closing]
    return "\n" + "\n".join(prompt_lines) + "\n"
def inject_prompt_caching(messages: List[BetaMessageParam]) -> None:
    """
    Mark the three most recent user turns as prompt-cache breakpoints.

    Walks the history from newest to oldest and only considers user messages
    whose content is a list. The last content block of each of the first three
    such messages receives an ephemeral ``cache_control`` entry. On the fourth
    one the entry is removed (if present) and the walk stops. One cache
    breakpoint is left for tools/system prompt, to be shared across sessions.

    Does nothing when the Anthropic SDK is not available.

    Args:
        messages: Message history (modified in place)
    """
    if not ANTHROPIC_AVAILABLE:
        return

    budget = 3
    for turn in reversed(messages):
        if turn["role"] != "user":
            continue
        blocks = turn["content"]
        if not isinstance(blocks, list):
            continue

        last_block = blocks[-1]
        if budget == 0:
            # Budget exhausted: clear the stale marker on this older turn.
            last_block.pop("cache_control", None)
            # we'll only ever have one extra turn per loop
            break

        budget -= 1
        # Use type ignore to bypass TypedDict check until SDK types are updated
        last_block["cache_control"] = BetaCacheControlEphemeralParam(  # type: ignore
            {"type": "ephemeral"}
        )
def maybe_filter_to_n_most_recent_images(
    messages: List[BetaMessageParam],
    images_to_keep: int,
    min_removal_threshold: int,
) -> None:
    """
    With the assumption that images are screenshots that are of diminishing value as
    the conversation progresses, remove all but the final `images_to_keep` tool_result
    images in place, with a chunk of min_removal_threshold to reduce the amount we
    break the implicit prompt cache.

    Only images inside list-valued ``content`` of ``tool_result`` blocks are
    counted and removed; the oldest images are dropped first. Because the number
    of removals is rounded down to a multiple of ``min_removal_threshold``, more
    than ``images_to_keep`` images may remain after a call.

    Does nothing when the Anthropic SDK is not available, when
    ``images_to_keep`` is None, or when there is nothing to remove.

    Args:
        messages: Message history (modified in place)
        images_to_keep: Number of recent images to keep
        min_removal_threshold: Minimum number of images to remove at once (for cache
            efficiency). Values of 1 or less (including 0 and None) disable chunking.
    """
    if not ANTHROPIC_AVAILABLE or images_to_keep is None:
        return

    def _is_image(item: object) -> bool:
        """Return True for an image content block represented as a dict."""
        return isinstance(item, dict) and item.get("type") == "image"

    tool_result_blocks = cast(
        list[BetaToolResultBlockParam],
        [
            item
            for message in messages
            for item in (
                message["content"] if isinstance(message["content"], list) else []
            )
            if isinstance(item, dict) and item.get("type") == "tool_result"
        ],
    )

    # Count only images that the removal pass below is able to drop, i.e. those
    # in list-valued content. This also tolerates None or string content.
    total_images = sum(
        1
        for tool_result in tool_result_blocks
        if isinstance(tool_result.get("content"), list)
        for content in tool_result["content"]
        if _is_image(content)
    )

    images_to_remove = total_images - images_to_keep
    if images_to_remove <= 0:
        return

    # for better cache behavior, we want to remove in chunks
    # (a threshold of 0 would divide by zero and a negative one would round the
    # count up, so chunking is applied only for thresholds greater than 1)
    if min_removal_threshold and min_removal_threshold > 1:
        images_to_remove -= images_to_remove % min_removal_threshold
    if images_to_remove <= 0:
        return

    for tool_result in tool_result_blocks:
        content_blocks = tool_result.get("content")
        if not isinstance(content_blocks, list):
            continue
        kept = []
        for content in content_blocks:
            if images_to_remove > 0 and _is_image(content):
                images_to_remove -= 1
                continue
            kept.append(content)
        tool_result["content"] = kept
        if images_to_remove <= 0:
            # Every scheduled removal is done; later blocks stay untouched.
            break
def response_to_params(response: BetaMessage) -> List[BetaContentBlockParam]:
    """
    Turn the content blocks of an Anthropic response into request parameters.

    Text, thinking and tool-use blocks are converted explicitly. Any other
    block type is serialised with ``model_dump()``; if that fails a warning is
    logged and the block is skipped. Text blocks that are empty or are not
    ``BetaTextBlock`` instances are dropped.

    Args:
        response: Anthropic API response

    Returns:
        List of content blocks (empty when the SDK is unavailable or the
        response has no content)
    """
    params: List[BetaContentBlockParam] = []
    if not ANTHROPIC_AVAILABLE:
        return params

    for block in response.content or []:
        # The type attribute may be a string or an enum, so compare its str() form.
        block_type = str(getattr(block, "type", ""))

        if block_type == "text":
            # Regular text block
            if isinstance(block, BetaTextBlock) and block.text:
                params.append(BetaTextBlockParam(type="text", text=block.text))
            continue

        if block_type == "thinking":
            # Thinking block (for Claude 4 and Sonnet 3.7)
            thinking_param = {
                "type": "thinking",
                "thinking": getattr(block, "thinking", ""),
            }
            if hasattr(block, "signature"):
                thinking_param["signature"] = getattr(block, "signature", None)
            params.append(cast(BetaContentBlockParam, thinking_param))
            continue

        if block_type == "tool_use":
            # Tool use block - only include required fields to avoid API errors
            # (e.g., 'caller' field is not permitted by Anthropic API)
            tool_use_param = {
                "type": "tool_use",
                "id": block.id,
                "name": block.name,
                "input": block.input,
            }
            params.append(cast(BetaToolUseBlockParam, tool_use_param))
            continue

        # Unknown block type - try to handle generically
        try:
            params.append(cast(BetaContentBlockParam, block.model_dump()))
        except Exception as e:
            logger.warning(f"Failed to parse block type {block_type}: {e}")

    return params