Spaces:

omar0scarf
/

ui-tars-api

Build error

App Files Files Community

ui-tars-api / action_parser.py

omar0scarf

Upload 10 files

3d37441 verified about 1 month ago

raw

history blame contribute delete

11.8 kB

	"""
	UI-TARS Action Parser
	=====================
	Utilities for parsing and executing UI-TARS model outputs
	Compatible with: https://github.com/bytedance/UI-TARS-desktop
	"""

	import re
	from typing import Dict, Any, Optional, List, Tuple
	from dataclasses import dataclass


	@dataclass
	class ParsedAction:
	    """Structured form of a single parsed UI-TARS action."""

	    # Name of the recognised action, e.g. "click" or "type".
	    action_type: str
	    # Values extracted from the action's arguments, keyed by parameter name.
	    parameters: Dict[str, Any]
	    # The action text exactly as it was given to the parser.
	    raw_action: str


	class ActionParser:
	    """Parser for UI-TARS action outputs.

	    Each supported action has a regular expression in ``ACTION_PATTERNS``.
	    Patterns are applied with ``re.match``, i.e. anchored at the start of the
	    action string only; any text after the matched call is ignored.  Point
	    arguments are expected as ``'<|box_start|>(x,y)<|box_end|>'`` wrapped in
	    either single or double quotes.
	    """

	    # Regex fragments shared by the action patterns below.
	    _QUOTE = r'[\'"]'
	    # One point token; captures x and y as two digit groups.
	    _POINT = r'<\|box_start\|>\((\d+),\s*(\d+)\)<\|box_end\|>'
	    # A quoted point, as it appears in start_box=/end_box= arguments.
	    _BOX = _QUOTE + _POINT + _QUOTE
	    # A quoted, non-empty free-text argument; captures the text lazily.
	    _TEXT = _QUOTE + r'(.+?)' + _QUOTE

	    # Action patterns (action name -> regex source string)
	    ACTION_PATTERNS = {
	        'click': r'click\(start_box=' + _BOX + r'\)',
	        'left_double': r'left_double\(start_box=' + _BOX + r'\)',
	        'right_single': r'right_single\(start_box=' + _BOX + r'\)',
	        'drag': r'drag\(start_box=' + _BOX + r',\s*end_box=' + _BOX + r'\)',
	        'type': r'type\(content=' + _TEXT + r'\)',
	        'hotkey': r'hotkey\(key=' + _TEXT + r'\)',
	        'scroll': (
	            r'scroll\(start_box=' + _BOX
	            + r',\s*direction=' + _QUOTE + r'(\w+)' + _QUOTE + r'\)'
	        ),
	        'wait': r'wait',
	        'finished': r'finished\(content=' + _TEXT + r'\)',
	        # Mobile actions
	        'long_press': r'long_press\(start_box=' + _BOX + r'\)',
	        'open_app': r'open_app\(app_name=' + _TEXT + r'\)',
	        'press_home': r'press_home',
	        'press_back': r'press_back',
	    }

	    @classmethod
	    def parse_response(cls, response: str) -> Dict[str, Any]:
	        """
	        Parse the full model response

	        Args:
	            response: Raw model output, optionally containing a
	                "Thought: ..." section followed by an "Action: ..." line

	        Returns:
	            Dictionary with keys 'thought' (str or None), 'action' (the
	            action text that was parsed), 'action_type' and 'parameters'
	        """
	        result = {
	            'thought': None,
	            'action': None,
	            'action_type': None,
	            'parameters': {}
	        }

	        # The thought runs up to the line that starts the action (or to the
	        # end of the response when there is no action line).
	        thought_match = re.search(
	            r'Thought:\s*(.+?)(?=\n\s*Action:|$)', response, re.DOTALL
	        )
	        if thought_match:
	            result['thought'] = thought_match.group(1).strip()

	        # The action is the remainder of the "Action:" line.  Without an
	        # "Action:" prefix the whole response is treated as the action.
	        action_match = re.search(r'Action:\s*(.+?)(?=\n|$)', response, re.DOTALL)
	        if action_match:
	            action_str = action_match.group(1).strip()
	        else:
	            action_str = response.strip()

	        parsed = cls.parse_action(action_str)
	        result['action'] = action_str
	        result['action_type'] = parsed['action_type']
	        result['parameters'] = parsed['parameters']
	        return result

	    @classmethod
	    def parse_action(cls, action_str: str) -> Dict[str, Any]:
	        """
	        Parse an action string

	        Args:
	            action_str: Action string (e.g., "click(start_box='...')")

	        Returns:
	            Dictionary with 'action_type' and 'parameters'.  When no pattern
	            matches, 'action_type' is 'unknown' and 'parameters' holds the
	            input under the key 'raw'.
	        """
	        for action_type, pattern in cls.ACTION_PATTERNS.items():
	            match = re.match(pattern, action_str)
	            if match:
	                return {
	                    'action_type': action_type,
	                    'parameters': cls._extract_parameters(action_type, match.groups())
	                }

	        return {
	            'action_type': 'unknown',
	            'parameters': {'raw': action_str}
	        }

	    @classmethod
	    def _extract_parameters(cls, action_type: str, groups: Tuple) -> Dict[str, Any]:
	        """Map the regex capture groups of an action to named parameters.

	        Coordinate groups are converted to ``int``; text groups are returned
	        unchanged.  Actions without arguments yield an empty dictionary.
	        """
	        params = {}

	        if action_type in ('click', 'left_double', 'right_single', 'long_press'):
	            params['x'] = int(groups[0])
	            params['y'] = int(groups[1])

	        elif action_type == 'drag':
	            params['start_x'] = int(groups[0])
	            params['start_y'] = int(groups[1])
	            params['end_x'] = int(groups[2])
	            params['end_y'] = int(groups[3])

	        elif action_type in ('type', 'finished'):
	            params['content'] = groups[0]

	        elif action_type == 'hotkey':
	            params['key'] = groups[0]

	        elif action_type == 'scroll':
	            params['x'] = int(groups[0])
	            params['y'] = int(groups[1])
	            params['direction'] = groups[2]

	        elif action_type == 'open_app':
	            params['app_name'] = groups[0]

	        return params

	    @staticmethod
	    def convert_coordinates(
	        x_rel: int,
	        y_rel: int,
	        screen_width: int,
	        screen_height: int
	    ) -> Tuple[int, int]:
	        """
	        Convert relative coordinates (0-1000) to absolute screen coordinates

	        Args:
	            x_rel: Relative X coordinate (0-1000)
	            y_rel: Relative Y coordinate (0-1000)
	            screen_width: Screen width in pixels
	            screen_height: Screen height in pixels

	        Returns:
	            Tuple of (x_absolute, y_absolute), each rounded to an integer
	        """
	        x_abs = round(screen_width * x_rel / 1000)
	        y_abs = round(screen_height * y_rel / 1000)
	        return (x_abs, y_abs)

	    @classmethod
	    def get_all_coordinates(cls, action_str: str) -> List[Dict[str, int]]:
	        """
	        Extract all coordinates from an action string

	        Args:
	            action_str: Action string

	        Returns:
	            List of {'x': int, 'y': int} dictionaries, one per
	            '<|box_start|>(x,y)<|box_end|>' token, in order of appearance
	        """
	        return [
	            {'x': int(x), 'y': int(y)}
	            for x, y in re.findall(cls._POINT, action_str)
	        ]


	class ActionExecutor:
	    """
	    Execute parsed actions using pyautogui

	    Note: This requires pyautogui to be installed.  Only the desktop actions
	    (click, left_double, right_single, drag, type, hotkey, scroll, wait,
	    finished) are executed; any other action type is reported as unknown.
	    """

	    # action type -> (pyautogui function name, action name reported in the result)
	    _POINTER_ACTIONS = {
	        'click': ('click', 'click'),
	        'left_double': ('doubleClick', 'double_click'),
	        'right_single': ('rightClick', 'right_click'),
	    }

	    def __init__(self, screen_width: int = 1920, screen_height: int = 1080):
	        """
	        Initialize the executor

	        Args:
	            screen_width: Screen width in pixels
	            screen_height: Screen height in pixels

	        Raises:
	            ImportError: If pyautogui cannot be imported
	        """
	        self.screen_width = screen_width
	        self.screen_height = screen_height
	        self.parser = ActionParser()

	        # Imported lazily so that the parser can be used without pyautogui.
	        try:
	            import pyautogui
	        except ImportError as err:
	            raise ImportError("pyautogui is required for action execution. Install with: pip install pyautogui") from err
	        self.pyautogui = pyautogui
	        self.pyautogui.FAILSAFE = True

	    def _to_screen(self, x_rel: int, y_rel: int) -> Tuple[int, int]:
	        """Convert relative (0-1000) coordinates to pixels on this executor's screen."""
	        return self.parser.convert_coordinates(
	            x_rel, y_rel, self.screen_width, self.screen_height
	        )

	    def execute(self, action_str: str) -> Dict[str, Any]:
	        """
	        Execute an action string

	        Args:
	            action_str: Action string from model

	        Returns:
	            Execution result: a dictionary with 'success' set to True plus
	            action-specific details, or 'success' set to False and an
	            'error' message when the action is unknown or execution raised
	        """
	        parsed = self.parser.parse_action(action_str)
	        action_type = parsed['action_type']
	        params = parsed['parameters']

	        # Any failure while driving pyautogui is reported in the result
	        # instead of propagating to the caller.
	        try:
	            if action_type in self._POINTER_ACTIONS:
	                method_name, reported_name = self._POINTER_ACTIONS[action_type]
	                x, y = self._to_screen(params['x'], params['y'])
	                getattr(self.pyautogui, method_name)(x, y)
	                return {'success': True, 'action': reported_name, 'coordinates': (x, y)}

	            elif action_type == 'drag':
	                start_x, start_y = self._to_screen(params['start_x'], params['start_y'])
	                end_x, end_y = self._to_screen(params['end_x'], params['end_y'])
	                self.pyautogui.moveTo(start_x, start_y)
	                self.pyautogui.dragTo(end_x, end_y)
	                return {'success': True, 'action': 'drag', 'start': (start_x, start_y), 'end': (end_x, end_y)}

	            elif action_type == 'type':
	                # Undo the escaping used inside the quoted model output.
	                content = params['content'].replace('\\n', '\n').replace("\\'", "'").replace('\\"', '"')
	                self.pyautogui.typewrite(content)
	                return {'success': True, 'action': 'type', 'content': content}

	            elif action_type == 'hotkey':
	                # Accept both "ctrl+c" and space-separated "ctrl c" combos
	                # (UI-TARS examples appear to use the latter - confirm
	                # against the model's prompt).
	                keys = [key for key in re.split(r'[+\s]+', params['key']) if key]
	                self.pyautogui.hotkey(*keys)
	                return {'success': True, 'action': 'hotkey', 'keys': keys}

	            elif action_type == 'scroll':
	                x, y = self._to_screen(params['x'], params['y'])
	                self.pyautogui.moveTo(x, y)
	                direction = params['direction']
	                if direction in ('left', 'right'):
	                    # Horizontal directions need hscroll (positive = right);
	                    # pyautogui.scroll only scrolls vertically.
	                    scroll_amount = 300 if direction == 'right' else -300
	                    self.pyautogui.hscroll(scroll_amount)
	                else:
	                    # Positive scrolls up, negative scrolls down.  Any other
	                    # direction keeps the previous fallback of 300 upwards.
	                    scroll_amount = 500 if direction in ('up', 'down') else 300
	                    if direction == 'down':
	                        scroll_amount = -scroll_amount
	                    self.pyautogui.scroll(scroll_amount)
	                return {'success': True, 'action': 'scroll', 'direction': direction, 'coordinates': (x, y)}

	            elif action_type == 'wait':
	                import time
	                time.sleep(5)
	                return {'success': True, 'action': 'wait', 'duration': 5}

	            elif action_type == 'finished':
	                return {'success': True, 'action': 'finished', 'content': params.get('content', '')}

	            else:
	                return {'success': False, 'error': f'Unknown action type: {action_type}'}

	        except Exception as e:
	            return {'success': False, 'error': str(e)}


	# Example usage
	if __name__ == "__main__":
	    # Example response from model: a thought line followed by an action line.
	    # Built from two explicit pieces so the string holds exactly one newline
	    # and no source-indentation whitespace.
	    response = (
	        "Thought: I need to click the search button to find the product\n"
	        "Action: click(start_box='<|box_start|>(500,300)<|box_end|>')"
	    )

	    # Parse the response
	    parsed = ActionParser.parse_response(response)
	    print("Parsed Response:")
	    print(f" Thought: {parsed['thought']}")
	    print(f" Action: {parsed['action']}")
	    print(f" Action Type: {parsed['action_type']}")
	    print(f" Parameters: {parsed['parameters']}")

	    # Convert coordinates
	    x_abs, y_abs = ActionParser.convert_coordinates(500, 300, 1920, 1080)
	    print(f"\nConverted Coordinates: ({x_abs}, {y_abs})")

	    # Example: Execute action (requires pyautogui)
	    # executor = ActionExecutor(1920, 1080)
	    # result = executor.execute(parsed['action'])
	    # print(f"Execution Result: {result}")