ui-tars-api / action_parser.py
omar0scarf's picture
Upload 10 files
3d37441 verified
"""
UI-TARS Action Parser
=====================
Utilities for parsing and executing UI-TARS model outputs
Compatible with: https://github.com/bytedance/UI-TARS-desktop
"""
import re
from typing import Dict, Any, Optional, List, Tuple
from dataclasses import dataclass
@dataclass
class ParsedAction:
    """Container describing one parsed action.

    A plain dataclass with three required fields and no behavior of its own.
    """

    # Name of the action (presumably one of the action names the parser
    # recognises, such as "click" -- confirm against callers).
    action_type: str
    # Parameter name -> value mapping for the action.
    parameters: Dict[str, Any]
    # The action text this record was built from (presumably unmodified).
    raw_action: str
class ActionParser:
    """Parser for UI-TARS action outputs.

    Every method is a classmethod or staticmethod, so the class can be used
    without instantiation. Coordinates captured from box arguments are
    returned exactly as the integers found in the text; use
    ``convert_coordinates`` to map them to screen pixels.
    """

    # One "(x,y)" coordinate pair. The <|box_start|>/<|box_end|> markers are
    # optional (e.g. when special tokens were stripped while decoding the
    # model output) and whitespace around the numbers is tolerated.
    _COORD = r'(?:<\|box_start\|>)?\(\s*(\d+)\s*,\s*(\d+)\s*\)(?:<\|box_end\|>)?'
    # A quoted box argument value, e.g. '<|box_start|>(500,300)<|box_end|>'.
    _BOX = r'[\'"]' + _COORD + r'[\'"]'

    # Action patterns: action name -> regex whose groups are the parameters,
    # in the order expected by ``_extract_parameters``.
    ACTION_PATTERNS = {
        'click': r'click\(start_box=' + _BOX + r'\)',
        'left_double': r'left_double\(start_box=' + _BOX + r'\)',
        'right_single': r'right_single\(start_box=' + _BOX + r'\)',
        'drag': r'drag\(start_box=' + _BOX + r',\s*end_box=' + _BOX + r'\)',
        'type': r'type\(content=[\'"](.+?)[\'"]\)',
        'hotkey': r'hotkey\(key=[\'"](.+?)[\'"]\)',
        'scroll': r'scroll\(start_box=' + _BOX + r',\s*direction=[\'"](\w+)[\'"]\)',
        'wait': r'wait\(\)',
        'finished': r'finished\(content=[\'"](.+?)[\'"]\)',
        # Mobile actions
        'long_press': r'long_press\(start_box=' + _BOX + r'\)',
        'open_app': r'open_app\(app_name=[\'"](.+?)[\'"]\)',
        'press_home': r'press_home\(\)',
        'press_back': r'press_back\(\)',
    }

    @classmethod
    def parse_response(cls, response: str) -> Dict[str, Any]:
        """
        Parse the full model response.

        Args:
            response: Raw model output, normally of the form
                "Thought: ...\\nAction: ...".

        Returns:
            Dictionary with keys ``thought`` (str or None), ``action`` (the
            action text), ``action_type`` and ``parameters`` (as returned by
            ``parse_action``).
        """
        # The thought runs up to the line that starts the action (or to the
        # end of the response when there is no action line).
        thought_match = re.search(
            r'Thought:\s*(.+?)(?=\nAction:|$)', response, re.DOTALL
        )
        # Only the first line after "Action:" is treated as the action.
        action_match = re.search(r'Action:\s*(.+?)(?=\n|$)', response, re.DOTALL)

        # Without an "Action:" prefix, try to parse the whole response.
        action_str = (action_match.group(1) if action_match else response).strip()
        parsed = cls.parse_action(action_str)

        return {
            'thought': thought_match.group(1).strip() if thought_match else None,
            'action': action_str,
            'action_type': parsed['action_type'],
            'parameters': parsed['parameters'],
        }

    @classmethod
    def parse_action(cls, action_str: str) -> Dict[str, Any]:
        """
        Parse an action string.

        Args:
            action_str: Action string (e.g., "click(start_box='...')").
                Surrounding whitespace is ignored.

        Returns:
            Dictionary with ``action_type`` and ``parameters``. Unrecognised
            input yields ``action_type == 'unknown'`` with the original
            string under ``parameters['raw']``.
        """
        text = action_str.strip()
        for action_type, pattern in cls.ACTION_PATTERNS.items():
            # Prefer a match of the whole string: with the non-greedy
            # captures a prefix match stops at the first quote followed by
            # ")", truncating content such as "print('hi')". Fall back to the
            # prefix match so strings with trailing text still parse.
            match = re.fullmatch(pattern, text) or re.match(pattern, text)
            if match:
                return {
                    'action_type': action_type,
                    'parameters': cls._extract_parameters(
                        action_type, match.groups()
                    ),
                }
        return {
            'action_type': 'unknown',
            'parameters': {'raw': action_str},
        }

    @classmethod
    def _extract_parameters(cls, action_type: str, groups: Tuple) -> Dict[str, Any]:
        """Map the regex groups of a matched pattern to named parameters.

        Actions without parameters (wait, press_home, press_back) and
        unrecognised action types produce an empty dictionary.
        """
        if action_type in ('click', 'left_double', 'right_single', 'long_press'):
            return {'x': int(groups[0]), 'y': int(groups[1])}
        if action_type == 'drag':
            return {
                'start_x': int(groups[0]),
                'start_y': int(groups[1]),
                'end_x': int(groups[2]),
                'end_y': int(groups[3]),
            }
        if action_type == 'scroll':
            return {
                'x': int(groups[0]),
                'y': int(groups[1]),
                'direction': groups[2],
            }
        if action_type in ('type', 'finished'):
            return {'content': groups[0]}
        if action_type == 'hotkey':
            return {'key': groups[0]}
        if action_type == 'open_app':
            return {'app_name': groups[0]}
        return {}

    @staticmethod
    def convert_coordinates(
        x_rel: int,
        y_rel: int,
        screen_width: int,
        screen_height: int
    ) -> Tuple[int, int]:
        """
        Convert relative coordinates (0-1000) to absolute screen coordinates.

        Args:
            x_rel: Relative X coordinate (0-1000)
            y_rel: Relative Y coordinate (0-1000)
            screen_width: Screen width in pixels
            screen_height: Screen height in pixels

        Returns:
            Tuple of (x_absolute, y_absolute), each rounded to an integer.
        """
        x_abs = round(screen_width * x_rel / 1000)
        y_abs = round(screen_height * y_rel / 1000)
        return (x_abs, y_abs)

    @classmethod
    def get_all_coordinates(cls, action_str: str) -> List[Dict[str, int]]:
        """
        Extract all coordinates from an action string.

        Uses the same coordinate syntax as the action patterns, so pairs
        written with or without the box markers are both found.

        Args:
            action_str: Action string

        Returns:
            List of ``{'x': int, 'y': int}`` dictionaries in order of
            appearance (empty when no coordinate pair is present).
        """
        return [
            {'x': int(x), 'y': int(y)}
            for x, y in re.findall(cls._COORD, action_str)
        ]
class ActionExecutor:
    """
    Execute parsed actions using pyautogui.

    Note: This requires pyautogui to be installed; it is imported when the
    executor is constructed. Action types without a branch in ``execute``
    (including the mobile actions long_press, open_app, press_home and
    press_back) are reported as unknown rather than executed.
    """

    # Parsed action type -> (pyautogui method name, action name in the result).
    _CLICK_ACTIONS = {
        'click': ('click', 'click'),
        'left_double': ('doubleClick', 'double_click'),
        'right_single': ('rightClick', 'right_click'),
    }
    # Scroll direction -> signed click count. Positive values scroll up
    # (vertical) or right (horizontal), following pyautogui's conventions.
    _VERTICAL_SCROLL = {'up': 500, 'down': -500}
    _HORIZONTAL_SCROLL = {'left': -300, 'right': 300}

    def __init__(self, screen_width: int = 1920, screen_height: int = 1080):
        """
        Initialize the executor.

        Args:
            screen_width: Screen width in pixels
            screen_height: Screen height in pixels

        Raises:
            ImportError: If pyautogui is not installed.
        """
        self.screen_width = screen_width
        self.screen_height = screen_height
        self.parser = ActionParser()
        try:
            import pyautogui
        except ImportError as exc:
            # Chain the original error so the underlying cause stays visible.
            raise ImportError("pyautogui is required for action execution. Install with: pip install pyautogui") from exc
        self.pyautogui = pyautogui
        self.pyautogui.FAILSAFE = True

    def _to_screen(self, x_rel: int, y_rel: int) -> Tuple[int, int]:
        """Convert a parsed relative point to pixels on the configured screen."""
        return self.parser.convert_coordinates(
            x_rel, y_rel, self.screen_width, self.screen_height
        )

    def execute(self, action_str: str) -> Dict[str, Any]:
        """
        Execute an action string.

        Args:
            action_str: Action string from model

        Returns:
            Execution result: a dictionary with ``success`` plus either
            details of the performed action or an ``error`` message. Errors
            raised while performing the action are caught and reported in the
            result instead of propagating.
        """
        parsed = self.parser.parse_action(action_str)
        action_type = parsed['action_type']
        params = parsed['parameters']
        try:
            if action_type in self._CLICK_ACTIONS:
                method_name, label = self._CLICK_ACTIONS[action_type]
                x, y = self._to_screen(params['x'], params['y'])
                getattr(self.pyautogui, method_name)(x, y)
                return {'success': True, 'action': label, 'coordinates': (x, y)}

            if action_type == 'drag':
                start_x, start_y = self._to_screen(params['start_x'], params['start_y'])
                end_x, end_y = self._to_screen(params['end_x'], params['end_y'])
                self.pyautogui.moveTo(start_x, start_y)
                self.pyautogui.dragTo(end_x, end_y)
                return {'success': True, 'action': 'drag', 'start': (start_x, start_y), 'end': (end_x, end_y)}

            if action_type == 'type':
                # The model emits escaped newlines/quotes; turn them back
                # into the literal characters before typing.
                content = params['content'].replace('\\n', '\n').replace("\\'", "'").replace('\\"', '"')
                self.pyautogui.typewrite(content)
                return {'success': True, 'action': 'type', 'content': content}

            if action_type == 'hotkey':
                raw_keys = params['key']
                # Accept both "ctrl+c" and space-separated "ctrl c"; if
                # splitting leaves nothing (e.g. the key is just "+"), pass
                # the raw string through as a single key.
                keys = raw_keys.replace('+', ' ').split() or [raw_keys]
                self.pyautogui.hotkey(*keys)
                return {'success': True, 'action': 'hotkey', 'keys': keys}

            if action_type == 'scroll':
                direction = params['direction']
                # Validate before touching the mouse so an unrecognised
                # direction has no side effects.
                if direction in self._VERTICAL_SCROLL:
                    scroll, amount = self.pyautogui.scroll, self._VERTICAL_SCROLL[direction]
                elif direction in self._HORIZONTAL_SCROLL:
                    # Horizontal directions need hscroll; plain scroll()
                    # would move the view vertically.
                    scroll, amount = self.pyautogui.hscroll, self._HORIZONTAL_SCROLL[direction]
                else:
                    return {'success': False, 'error': f'Unknown scroll direction: {direction}'}
                x, y = self._to_screen(params['x'], params['y'])
                self.pyautogui.moveTo(x, y)
                scroll(amount)
                return {'success': True, 'action': 'scroll', 'direction': direction, 'coordinates': (x, y)}

            if action_type == 'wait':
                import time
                time.sleep(5)
                return {'success': True, 'action': 'wait', 'duration': 5}

            if action_type == 'finished':
                return {'success': True, 'action': 'finished', 'content': params.get('content', '')}

            return {'success': False, 'error': f'Unknown action type: {action_type}'}
        except Exception as e:
            # Report any execution failure (bad key name, pyautogui
            # fail-safe, ...) in the result rather than raising.
            return {'success': False, 'error': str(e)}
# Example usage
if __name__ == "__main__":
    # Sample model output: one thought line followed by one click action.
    response = (
        "Thought: I need to click the search button to find the product\n"
        "Action: click(start_box='<|box_start|>(500,300)<|box_end|>')"
    )

    # Split the response into thought, action text, action type and parameters.
    parsed = ActionParser.parse_response(response)
    print("Parsed Response:")
    for label, key in (
        ("Thought", "thought"),
        ("Action", "action"),
        ("Action Type", "action_type"),
        ("Parameters", "parameters"),
    ):
        print(f" {label}: {parsed[key]}")

    # Map the relative point (500, 300) onto a 1920x1080 screen.
    x_abs, y_abs = ActionParser.convert_coordinates(500, 300, 1920, 1080)
    print(f"\nConverted Coordinates: ({x_abs}, {y_abs})")

    # Example: Execute action (requires pyautogui)
    # executor = ActionExecutor(1920, 1080)
    # result = executor.execute(parsed['action'])
    # print(f"Execution Result: {result}")