Spaces:

baqr
/

computer_use_ootb

Runtime error

App Files Files Community

computer_use_ootb / computer_use_demo /tools /computer.py

baqr

Upload folder using huggingface_hub

d73c58e verified 10 months ago

raw

history blame contribute delete

25.3 kB

	import subprocess
	import platform
	import pyautogui
	import asyncio
	import base64
	import os
	import time
	if platform.system() == "Darwin":
	import Quartz # uncomment this line if you are on macOS
	from enum import StrEnum
	from pathlib import Path
	from typing import Literal, TypedDict
	from uuid import uuid4
	from screeninfo import get_monitors

	from PIL import ImageGrab, Image
	from functools import partial

	from anthropic.types.beta import BetaToolComputerUse20241022Param

	from .base import BaseAnthropicTool, ToolError, ToolResult
	from .run import run

	OUTPUT_DIR = "./tmp/outputs"

	TYPING_DELAY_MS = 12
	TYPING_GROUP_SIZE = 50

	Action = Literal[
	"key",
	"type",
	"mouse_move",
	"left_click",
	"left_click_drag",
	"right_click",
	"middle_click",
	"double_click",
	"screenshot",
	"cursor_position",
	]


	class Resolution(TypedDict):
	width: int
	height: int


	MAX_SCALING_TARGETS: dict[str, Resolution] = {
	"XGA": Resolution(width=1024, height=768), # 4:3
	"WXGA": Resolution(width=1280, height=800), # 16:10
	"FWXGA": Resolution(width=1366, height=768), # ~16:9
	}


	class ScalingSource(StrEnum):
	COMPUTER = "computer"
	API = "api"


	class ComputerToolOptions(TypedDict):
	display_height_px: int
	display_width_px: int
	display_number: int \| None


	def chunks(s: str, chunk_size: int) -> list[str]:
	return [s[i : i + chunk_size] for i in range(0, len(s), chunk_size)]


	def get_screen_details():
	screens = get_monitors()
	screen_details = []

	# Sort screens by x position to arrange from left to right
	sorted_screens = sorted(screens, key=lambda s: s.x)

	# Loop through sorted screens and assign positions
	primary_index = 0
	for i, screen in enumerate(sorted_screens):
	if i == 0:
	layout = "Left"
	elif i == len(sorted_screens) - 1:
	layout = "Right"
	else:
	layout = "Center"

	if screen.is_primary:
	position = "Primary"
	primary_index = i
	else:
	position = "Secondary"
	screen_info = f"Screen {i + 1}: {screen.width}x{screen.height}, {layout}, {position}"
	screen_details.append(screen_info)

	return screen_details, primary_index


	class ComputerTool(BaseAnthropicTool):
	"""
	A tool that allows the agent to interact with the screen, keyboard, and mouse of the current computer.
	Adapted for Windows using 'pyautogui'.
	"""

	name: Literal["computer"] = "computer"
	api_type: Literal["computer_20241022"] = "computer_20241022"
	width: int
	height: int
	display_num: int \| None

	_screenshot_delay = 2.0
	_scaling_enabled = True

	@property
	def options(self) -> ComputerToolOptions:
	width, height = self.scale_coordinates(
	ScalingSource.COMPUTER, self.width, self.height
	)
	return {
	"display_width_px": width,
	"display_height_px": height,
	"display_number": self.display_num,
	}

	def to_params(self) -> BetaToolComputerUse20241022Param:
	return {"name": self.name, "type": self.api_type, **self.options}

	def __init__(self, selected_screen: int = 0, is_scaling: bool = True):
	super().__init__()

	# Get screen width and height using Windows command
	self.display_num = None
	self.offset_x = 0
	self.offset_y = 0
	self.selected_screen = selected_screen
	self.is_scaling = is_scaling
	self.width, self.height = self.get_screen_size()

	# Path to cliclick
	self.cliclick = "cliclick"
	self.key_conversion = {"Page_Down": "pagedown",
	"Page_Up": "pageup",
	"Super_L": "win",
	"Escape": "esc"}

	self.action_conversion = {"left click": "click",
	"right click": "right_click"}

	system = platform.system() # Detect platform
	if system == "Windows":
	screens = get_monitors()
	sorted_screens = sorted(screens, key=lambda s: s.x)
	if self.selected_screen < 0 or self.selected_screen >= len(screens):
	raise IndexError("Invalid screen index.")
	screen = sorted_screens[self.selected_screen]
	bbox = (screen.x, screen.y, screen.x + screen.width, screen.y + screen.height)

	elif system == "Darwin": # macOS
	max_displays = 32 # Maximum number of displays to handle
	active_displays = Quartz.CGGetActiveDisplayList(max_displays, None, None)[1]
	screens = []
	for display_id in active_displays:
	bounds = Quartz.CGDisplayBounds(display_id)
	screens.append({
	'id': display_id, 'x': int(bounds.origin.x), 'y': int(bounds.origin.y),
	'width': int(bounds.size.width), 'height': int(bounds.size.height),
	'is_primary': Quartz.CGDisplayIsMain(display_id) # Check if this is the primary display
	})
	sorted_screens = sorted(screens, key=lambda s: s['x'])
	if self.selected_screen < 0 or self.selected_screen >= len(screens):
	raise IndexError("Invalid screen index.")
	screen = sorted_screens[self.selected_screen]
	bbox = (screen['x'], screen['y'], screen['x'] + screen['width'], screen['y'] + screen['height'])
	else: # Linux or other OS
	cmd = "xrandr \| grep ' primary' \| awk '{print $4}'"
	try:
	output = subprocess.check_output(cmd, shell=True).decode()
	resolution = output.strip().split()[0]
	width, height = map(int, resolution.split('x'))
	bbox = (0, 0, width, height) # Assuming single primary screen for simplicity
	except subprocess.CalledProcessError:
	raise RuntimeError("Failed to get screen resolution on Linux.")

	self.offset_x = screen['x'] if system == "Darwin" else screen.x
	self.offset_y = screen['y'] if system == "Darwin" else screen.y
	self.bbox = bbox


	async def __call__(
	self,
	*,
	action: Action,
	text: str \| None = None,
	coordinate: tuple[int, int] \| None = None,
	**kwargs,
	):
	print(f"action: {action}, text: {text}, coordinate: {coordinate}")
	action = self.action_conversion.get(action, action)

	if action in ("mouse_move", "left_click_drag"):
	if coordinate is None:
	raise ToolError(f"coordinate is required for {action}")
	if text is not None:
	raise ToolError(f"text is not accepted for {action}")
	if not isinstance(coordinate, (list, tuple)) or len(coordinate) != 2:
	raise ToolError(f"{coordinate} must be a tuple of length 2")
	# if not all(isinstance(i, int) and i >= 0 for i in coordinate):
	if not all(isinstance(i, int) for i in coordinate):
	raise ToolError(f"{coordinate} must be a tuple of non-negative ints")

	if self.is_scaling:
	x, y = self.scale_coordinates(
	ScalingSource.API, coordinate[0], coordinate[1]
	)
	else:
	x, y = coordinate

	# print(f"scaled_coordinates: {x}, {y}")
	# print(f"offset: {self.offset_x}, {self.offset_y}")

	x += self.offset_x
	y += self.offset_y

	print(f"mouse move to {x}, {y}")

	if action == "mouse_move":
	pyautogui.moveTo(x, y)
	return ToolResult(output=f"Moved mouse to ({x}, {y})")
	elif action == "left_click_drag":
	current_x, current_y = pyautogui.position()
	pyautogui.dragTo(x, y, duration=0.5) # Adjust duration as needed
	return ToolResult(output=f"Dragged mouse from ({current_x}, {current_y}) to ({x}, {y})")

	if action in ("key", "type"):
	if text is None:
	raise ToolError(f"text is required for {action}")
	if coordinate is not None:
	raise ToolError(f"coordinate is not accepted for {action}")
	if not isinstance(text, str):
	raise ToolError(output=f"{text} must be a string")

	if action == "key":
	# Handle key combinations
	keys = text.split('+')
	for key in keys:
	key = self.key_conversion.get(key.strip(), key.strip())
	key = key.lower()
	pyautogui.keyDown(key) # Press down each key
	for key in reversed(keys):
	key = self.key_conversion.get(key.strip(), key.strip())
	key = key.lower()
	pyautogui.keyUp(key) # Release each key in reverse order
	return ToolResult(output=f"Pressed keys: {text}")

	elif action == "type":
	pyautogui.typewrite(text, interval=TYPING_DELAY_MS / 1000) # Convert ms to seconds
	screenshot_base64 = (await self.screenshot()).base64_image
	return ToolResult(output=text, base64_image=screenshot_base64)

	if action in (
	"left_click",
	"right_click",
	"double_click",
	"middle_click",
	"screenshot",
	"cursor_position",
	"left_press",
	):
	if text is not None:
	raise ToolError(f"text is not accepted for {action}")
	if coordinate is not None:
	raise ToolError(f"coordinate is not accepted for {action}")

	if action == "screenshot":
	return await self.screenshot()
	elif action == "cursor_position":
	x, y = pyautogui.position()
	x, y = self.scale_coordinates(ScalingSource.COMPUTER, x, y)
	return ToolResult(output=f"X={x},Y={y}")
	else:
	if action == "left_click":
	pyautogui.click()
	elif action == "right_click":
	pyautogui.rightClick()
	elif action == "middle_click":
	pyautogui.middleClick()
	elif action == "double_click":
	pyautogui.doubleClick()
	elif action == "left_press":
	pyautogui.mouseDown()
	time.sleep(1)
	pyautogui.mouseUp()
	return ToolResult(output=f"Performed {action}")

	raise ToolError(f"Invalid action: {action}")


	def sync_call(
	self,
	*,
	action: Action,
	text: str \| None = None,
	coordinate: tuple[int, int] \| None = None,
	**kwargs,
	):
	print(f"action: {action}, text: {text}, coordinate: {coordinate}")
	action = self.action_conversion.get(action, action)

	if action in ("mouse_move", "left_click_drag"):
	if coordinate is None:
	raise ToolError(f"coordinate is required for {action}")
	if text is not None:
	raise ToolError(f"text is not accepted for {action}")
	if not isinstance(coordinate, (list, tuple)) or len(coordinate) != 2:
	raise ToolError(f"{coordinate} must be a tuple of length 2")
	# if not all(isinstance(i, int) and i >= 0 for i in coordinate):
	if not all(isinstance(i, int) for i in coordinate):
	raise ToolError(f"{coordinate} must be a tuple of non-negative ints")

	if self.is_scaling:
	x, y = self.scale_coordinates(
	ScalingSource.API, coordinate[0], coordinate[1]
	)
	else:
	x, y = coordinate

	# print(f"scaled_coordinates: {x}, {y}")
	# print(f"offset: {self.offset_x}, {self.offset_y}")
	x += self.offset_x
	y += self.offset_y

	print(f"mouse move to {x}, {y}")

	if action == "mouse_move":
	pyautogui.moveTo(x, y)
	return ToolResult(output=f"Moved mouse to ({x}, {y})")
	elif action == "left_click_drag":
	current_x, current_y = pyautogui.position()
	pyautogui.dragTo(x, y, duration=0.5) # Adjust duration as needed
	return ToolResult(output=f"Dragged mouse from ({current_x}, {current_y}) to ({x}, {y})")

	if action in ("key", "type"):
	if text is None:
	raise ToolError(f"text is required for {action}")
	if coordinate is not None:
	raise ToolError(f"coordinate is not accepted for {action}")
	if not isinstance(text, str):
	raise ToolError(output=f"{text} must be a string")

	if action == "key":
	# Handle key combinations
	keys = text.split('+')
	for key in keys:
	key = self.key_conversion.get(key.strip(), key.strip())
	key = key.lower()
	pyautogui.keyDown(key) # Press down each key
	for key in reversed(keys):
	key = self.key_conversion.get(key.strip(), key.strip())
	key = key.lower()
	pyautogui.keyUp(key) # Release each key in reverse order
	return ToolResult(output=f"Pressed keys: {text}")

	elif action == "type":
	pyautogui.typewrite(text, interval=TYPING_DELAY_MS / 1000) # Convert ms to seconds
	return ToolResult(output=text)

	if action in (
	"left_click",
	"right_click",
	"double_click",
	"middle_click",
	"screenshot",
	"cursor_position",
	"left_press",
	):
	if text is not None:
	raise ToolError(f"text is not accepted for {action}")
	if coordinate is not None:
	raise ToolError(f"coordinate is not accepted for {action}")
	elif action == "cursor_position":
	x, y = pyautogui.position()
	x, y = self.scale_coordinates(ScalingSource.COMPUTER, x, y)
	return ToolResult(output=f"X={x},Y={y}")
	else:
	if action == "left_click":
	pyautogui.click()
	elif action == "right_click":
	pyautogui.rightClick()
	elif action == "middle_click":
	pyautogui.middleClick()
	elif action == "double_click":
	pyautogui.doubleClick()
	elif action == "left_press":
	pyautogui.mouseDown()
	time.sleep(1)
	pyautogui.mouseUp()
	return ToolResult(output=f"Performed {action}")

	raise ToolError(f"Invalid action: {action}")

	async def screenshot(self):

	import time
	time.sleep(1)

	"""Take a screenshot of the current screen and return a ToolResult with the base64 encoded image."""
	output_dir = Path(OUTPUT_DIR)
	output_dir.mkdir(parents=True, exist_ok=True)
	path = output_dir / f"screenshot_{uuid4().hex}.png"

	ImageGrab.grab = partial(ImageGrab.grab, all_screens=True)

	# Detect platform
	system = platform.system()

	if system == "Windows":
	# Windows: Use screeninfo to get monitor details
	screens = get_monitors()

	# Sort screens by x position to arrange from left to right
	sorted_screens = sorted(screens, key=lambda s: s.x)

	if self.selected_screen < 0 or self.selected_screen >= len(screens):
	raise IndexError("Invalid screen index.")

	screen = sorted_screens[self.selected_screen]
	bbox = (screen.x, screen.y, screen.x + screen.width, screen.y + screen.height)

	elif system == "Darwin": # macOS
	# macOS: Use Quartz to get monitor details
	max_displays = 32 # Maximum number of displays to handle
	active_displays = Quartz.CGGetActiveDisplayList(max_displays, None, None)[1]

	# Get the display bounds (resolution) for each active display
	screens = []
	for display_id in active_displays:
	bounds = Quartz.CGDisplayBounds(display_id)
	screens.append({
	'id': display_id,
	'x': int(bounds.origin.x),
	'y': int(bounds.origin.y),
	'width': int(bounds.size.width),
	'height': int(bounds.size.height),
	'is_primary': Quartz.CGDisplayIsMain(display_id) # Check if this is the primary display
	})

	# Sort screens by x position to arrange from left to right
	sorted_screens = sorted(screens, key=lambda s: s['x'])

	if self.selected_screen < 0 or self.selected_screen >= len(screens):
	raise IndexError("Invalid screen index.")

	screen = sorted_screens[self.selected_screen]
	bbox = (screen['x'], screen['y'], screen['x'] + screen['width'], screen['y'] + screen['height'])

	else: # Linux or other OS
	cmd = "xrandr \| grep ' primary' \| awk '{print $4}'"
	try:
	output = subprocess.check_output(cmd, shell=True).decode()
	resolution = output.strip().split()[0]
	width, height = map(int, resolution.split('x'))
	bbox = (0, 0, width, height) # Assuming single primary screen for simplicity
	except subprocess.CalledProcessError:
	raise RuntimeError("Failed to get screen resolution on Linux.")

	# Take screenshot using the bounding box
	screenshot = ImageGrab.grab(bbox=bbox)

	# Set offsets (for potential future use)
	self.offset_x = screen['x'] if system == "Darwin" else screen.x
	self.offset_y = screen['y'] if system == "Darwin" else screen.y

	print(f"target_dimension {self.target_dimension}")

	if not hasattr(self, 'target_dimension'):
	screenshot = self.padding_image(screenshot)
	self.target_dimension = MAX_SCALING_TARGETS["WXGA"]

	# Resize if target_dimensions are specified
	print(f"offset is {self.offset_x}, {self.offset_y}")
	print(f"target_dimension is {self.target_dimension}")
	screenshot = screenshot.resize((self.target_dimension["width"], self.target_dimension["height"]))

	# Save the screenshot
	screenshot.save(str(path))

	if path.exists():
	# Return a ToolResult instance instead of a dictionary
	return ToolResult(base64_image=base64.b64encode(path.read_bytes()).decode())

	raise ToolError(f"Failed to take screenshot: {path} does not exist.")

	def padding_image(self, screenshot):
	"""Pad the screenshot to 16:10 aspect ratio, when the aspect ratio is not 16:10."""
	_, height = screenshot.size
	new_width = height * 16 // 10

	padding_image = Image.new("RGB", (new_width, height), (255, 255, 255))
	# padding to top left
	padding_image.paste(screenshot, (0, 0))
	return padding_image

	async def shell(self, command: str, take_screenshot=True) -> ToolResult:
	"""Run a shell command and return the output, error, and optionally a screenshot."""
	_, stdout, stderr = await run(command)
	base64_image = None

	if take_screenshot:
	# delay to let things settle before taking a screenshot
	await asyncio.sleep(self._screenshot_delay)
	base64_image = (await self.screenshot()).base64_image

	return ToolResult(output=stdout, error=stderr, base64_image=base64_image)

	def scale_coordinates(self, source: ScalingSource, x: int, y: int):
	"""Scale coordinates to a target maximum resolution."""
	if not self._scaling_enabled:
	return x, y
	ratio = self.width / self.height
	target_dimension = None

	for target_name, dimension in MAX_SCALING_TARGETS.items():
	# allow some error in the aspect ratio - not ratios are exactly 16:9
	if abs(dimension["width"] / dimension["height"] - ratio) < 0.02:
	if dimension["width"] < self.width:
	target_dimension = dimension
	self.target_dimension = target_dimension
	# print(f"target_dimension: {target_dimension}")
	break

	if target_dimension is None:
	# TODO: currently we force the target to be WXGA (16:10), when it cannot find a match
	target_dimension = MAX_SCALING_TARGETS["WXGA"]
	self.target_dimension = MAX_SCALING_TARGETS["WXGA"]

	# should be less than 1
	x_scaling_factor = target_dimension["width"] / self.width
	y_scaling_factor = target_dimension["height"] / self.height
	if source == ScalingSource.API:
	if x > self.width or y > self.height:
	raise ToolError(f"Coordinates {x}, {y} are out of bounds")
	# scale up
	return round(x / x_scaling_factor), round(y / y_scaling_factor)
	# scale down
	return round(x * x_scaling_factor), round(y * y_scaling_factor)

	def get_screen_size(self):
	if platform.system() == "Windows":
	# Use screeninfo to get primary monitor on Windows
	screens = get_monitors()

	# Sort screens by x position to arrange from left to right
	sorted_screens = sorted(screens, key=lambda s: s.x)

	if self.selected_screen is None:
	primary_monitor = next((m for m in get_monitors() if m.is_primary), None)
	return primary_monitor.width, primary_monitor.height
	elif self.selected_screen < 0 or self.selected_screen >= len(screens):
	raise IndexError("Invalid screen index.")
	else:
	screen = sorted_screens[self.selected_screen]
	return screen.width, screen.height

	elif platform.system() == "Darwin":
	# macOS part using Quartz to get screen information
	max_displays = 32 # Maximum number of displays to handle
	active_displays = Quartz.CGGetActiveDisplayList(max_displays, None, None)[1]

	# Get the display bounds (resolution) for each active display
	screens = []
	for display_id in active_displays:
	bounds = Quartz.CGDisplayBounds(display_id)
	screens.append({
	'id': display_id,
	'x': int(bounds.origin.x),
	'y': int(bounds.origin.y),
	'width': int(bounds.size.width),
	'height': int(bounds.size.height),
	'is_primary': Quartz.CGDisplayIsMain(display_id) # Check if this is the primary display
	})

	# Sort screens by x position to arrange from left to right
	sorted_screens = sorted(screens, key=lambda s: s['x'])

	if self.selected_screen is None:
	# Find the primary monitor
	primary_monitor = next((screen for screen in screens if screen['is_primary']), None)
	if primary_monitor:
	return primary_monitor['width'], primary_monitor['height']
	else:
	raise RuntimeError("No primary monitor found.")
	elif self.selected_screen < 0 or self.selected_screen >= len(screens):
	raise IndexError("Invalid screen index.")
	else:
	# Return the resolution of the selected screen
	screen = sorted_screens[self.selected_screen]
	return screen['width'], screen['height']

	else: # Linux or other OS
	cmd = "xrandr \| grep ' primary' \| awk '{print $4}'"
	try:
	output = subprocess.check_output(cmd, shell=True).decode()
	resolution = output.strip().split()[0]
	width, height = map(int, resolution.split('x'))
	return width, height
	except subprocess.CalledProcessError:
	raise RuntimeError("Failed to get screen resolution on Linux.")

	def get_mouse_position(self):
	# TODO: enhance this func
	from AppKit import NSEvent
	from Quartz import CGEventSourceCreate, kCGEventSourceStateCombinedSessionState

	loc = NSEvent.mouseLocation()
	# Adjust for different coordinate system
	return int(loc.x), int(self.height - loc.y)

	def map_keys(self, text: str):
	"""Map text to cliclick key codes if necessary."""
	# For simplicity, return text as is
	# Implement mapping if special keys are needed
	return text