""" UI-TARS Action Parser ===================== Utilities for parsing and executing UI-TARS model outputs Compatible with: https://github.com/bytedance/UI-TARS-desktop """ import re from typing import Dict, Any, Optional, List, Tuple from dataclasses import dataclass @dataclass class ParsedAction: """Parsed action structure""" action_type: str parameters: Dict[str, Any] raw_action: str class ActionParser: """Parser for UI-TARS action outputs""" # Action patterns ACTION_PATTERNS = { 'click': r'click\(start_box=[\'"]<\|box_start\|\>\((\d+),(\d+)\)<\|box_end\|>[\'"]\)', 'left_double': r'left_double\(start_box=[\'"]<\|box_start\|\>\((\d+),(\d+)\)<\|box_end\|>[\'"]\)', 'right_single': r'right_single\(start_box=[\'"]<\|box_start\|\>\((\d+),(\d+)\)<\|box_end\|>[\'"]\)', 'drag': r'drag\(start_box=[\'"]<\|box_start\|\>\((\d+),(\d+)\)<\|box_end\|>[\'"],\s*end_box=[\'"]<\|box_start\|\>\((\d+),(\d+)\)<\|box_end\|>[\'"]\)', 'type': r'type\(content=[\'"](.+?)[\'"]\)', 'hotkey': r'hotkey\(key=[\'"](.+?)[\'"]\)', 'scroll': r'scroll\(start_box=[\'"]<\|box_start\|\>\((\d+),(\d+)\)<\|box_end\|>[\'"],\s*direction=[\'"](\w+)[\'"]\)', 'wait': r'wait\(\)', 'finished': r'finished\(content=[\'"](.+?)[\'"]\)', # Mobile actions 'long_press': r'long_press\(start_box=[\'"]<\|box_start\|\>\((\d+),(\d+)\)<\|box_end\|>[\'"]\)', 'open_app': r'open_app\(app_name=[\'"](.+?)[\'"]\)', 'press_home': r'press_home\(\)', 'press_back': r'press_back\(\)', } @classmethod def parse_response(cls, response: str) -> Dict[str, Any]: """ Parse the full model response Args: response: Raw model output Returns: Dictionary with thought and action """ result = { 'thought': None, 'action': None, 'action_type': None, 'parameters': {} } # Extract thought thought_match = re.search(r'Thought:\s*(.+?)(?=\nAction:|$)', response, re.DOTALL) if thought_match: result['thought'] = thought_match.group(1).strip() # Extract action action_match = re.search(r'Action:\s*(.+?)(?=\n|$)', response, re.DOTALL) if action_match: action_str = action_match.group(1).strip() result['action'] = action_str # Parse action type and parameters parsed = cls.parse_action(action_str) result['action_type'] = parsed['action_type'] result['parameters'] = parsed['parameters'] else: # No "Action:" prefix, try to parse the whole response result['action'] = response.strip() parsed = cls.parse_action(result['action']) result['action_type'] = parsed['action_type'] result['parameters'] = parsed['parameters'] return result @classmethod def parse_action(cls, action_str: str) -> Dict[str, Any]: """ Parse an action string Args: action_str: Action string (e.g., "click(start_box='...')") Returns: Dictionary with action_type and parameters """ for action_type, pattern in cls.ACTION_PATTERNS.items(): match = re.match(pattern, action_str) if match: return { 'action_type': action_type, 'parameters': cls._extract_parameters(action_type, match.groups()) } return { 'action_type': 'unknown', 'parameters': {'raw': action_str} } @classmethod def _extract_parameters(cls, action_type: str, groups: Tuple) -> Dict[str, Any]: """Extract parameters based on action type""" params = {} if action_type in ['click', 'left_double', 'right_single', 'long_press']: params['x'] = int(groups[0]) params['y'] = int(groups[1]) elif action_type == 'drag': params['start_x'] = int(groups[0]) params['start_y'] = int(groups[1]) params['end_x'] = int(groups[2]) params['end_y'] = int(groups[3]) elif action_type == 'type': params['content'] = groups[0] elif action_type == 'hotkey': params['key'] = groups[0] elif action_type == 'scroll': params['x'] = int(groups[0]) params['y'] = int(groups[1]) params['direction'] = groups[2] elif action_type == 'finished': params['content'] = groups[0] elif action_type == 'open_app': params['app_name'] = groups[0] return params @staticmethod def convert_coordinates( x_rel: int, y_rel: int, screen_width: int, screen_height: int ) -> Tuple[int, int]: """ Convert relative coordinates (0-1000) to absolute screen coordinates Args: x_rel: Relative X coordinate (0-1000) y_rel: Relative Y coordinate (0-1000) screen_width: Screen width in pixels screen_height: Screen height in pixels Returns: Tuple of (x_absolute, y_absolute) """ x_abs = round(screen_width * x_rel / 1000) y_abs = round(screen_height * y_rel / 1000) return (x_abs, y_abs) @classmethod def get_all_coordinates(cls, action_str: str) -> List[Dict[str, int]]: """ Extract all coordinates from an action string Args: action_str: Action string Returns: List of coordinate dictionaries """ coords = [] pattern = r'<\|box_start\|\>\((\d+),(\d+)\)<\|box_end\|\>' matches = re.findall(pattern, action_str) for match in matches: coords.append({ 'x': int(match[0]), 'y': int(match[1]) }) return coords class ActionExecutor: """ Execute parsed actions using pyautogui Note: This requires pyautogui to be installed """ def __init__(self, screen_width: int = 1920, screen_height: int = 1080): """ Initialize the executor Args: screen_width: Screen width in pixels screen_height: Screen height in pixels """ self.screen_width = screen_width self.screen_height = screen_height self.parser = ActionParser() try: import pyautogui self.pyautogui = pyautogui self.pyautogui.FAILSAFE = True except ImportError: raise ImportError("pyautogui is required for action execution. Install with: pip install pyautogui") def execute(self, action_str: str) -> Dict[str, Any]: """ Execute an action string Args: action_str: Action string from model Returns: Execution result """ parsed = self.parser.parse_action(action_str) action_type = parsed['action_type'] params = parsed['parameters'] try: if action_type == 'click': x, y = self.parser.convert_coordinates( params['x'], params['y'], self.screen_width, self.screen_height ) self.pyautogui.click(x, y) return {'success': True, 'action': 'click', 'coordinates': (x, y)} elif action_type == 'left_double': x, y = self.parser.convert_coordinates( params['x'], params['y'], self.screen_width, self.screen_height ) self.pyautogui.doubleClick(x, y) return {'success': True, 'action': 'double_click', 'coordinates': (x, y)} elif action_type == 'right_single': x, y = self.parser.convert_coordinates( params['x'], params['y'], self.screen_width, self.screen_height ) self.pyautogui.rightClick(x, y) return {'success': True, 'action': 'right_click', 'coordinates': (x, y)} elif action_type == 'drag': start_x, start_y = self.parser.convert_coordinates( params['start_x'], params['start_y'], self.screen_width, self.screen_height ) end_x, end_y = self.parser.convert_coordinates( params['end_x'], params['end_y'], self.screen_width, self.screen_height ) self.pyautogui.moveTo(start_x, start_y) self.pyautogui.dragTo(end_x, end_y) return {'success': True, 'action': 'drag', 'start': (start_x, start_y), 'end': (end_x, end_y)} elif action_type == 'type': content = params['content'].replace('\\n', '\n').replace("\\'", "'").replace('\\"', '"') self.pyautogui.typewrite(content) return {'success': True, 'action': 'type', 'content': content} elif action_type == 'hotkey': keys = params['key'].split('+') self.pyautogui.hotkey(*keys) return {'success': True, 'action': 'hotkey', 'keys': keys} elif action_type == 'scroll': x, y = self.parser.convert_coordinates( params['x'], params['y'], self.screen_width, self.screen_height ) self.pyautogui.moveTo(x, y) direction = params['direction'] scroll_amount = 500 if direction in ['up', 'down'] else 300 if direction in ['down', 'right']: scroll_amount = -scroll_amount self.pyautogui.scroll(scroll_amount) return {'success': True, 'action': 'scroll', 'direction': direction, 'coordinates': (x, y)} elif action_type == 'wait': import time time.sleep(5) return {'success': True, 'action': 'wait', 'duration': 5} elif action_type == 'finished': return {'success': True, 'action': 'finished', 'content': params.get('content', '')} else: return {'success': False, 'error': f'Unknown action type: {action_type}'} except Exception as e: return {'success': False, 'error': str(e)} # Example usage if __name__ == "__main__": # Example response from model response = """Thought: I need to click the search button to find the product Action: click(start_box='<|box_start|>(500,300)<|box_end|>')""" # Parse the response parsed = ActionParser.parse_response(response) print("Parsed Response:") print(f" Thought: {parsed['thought']}") print(f" Action: {parsed['action']}") print(f" Action Type: {parsed['action_type']}") print(f" Parameters: {parsed['parameters']}") # Convert coordinates x_abs, y_abs = ActionParser.convert_coordinates(500, 300, 1920, 1080) print(f"\nConverted Coordinates: ({x_abs}, {y_abs})") # Example: Execute action (requires pyautogui) # executor = ActionExecutor(1920, 1080) # result = executor.execute(parsed['action']) # print(f"Execution Result: {result}")