Spaces:
Build error
Build error
| """ | |
| UI-TARS Action Parser | |
| ===================== | |
| Utilities for parsing and executing UI-TARS model outputs | |
| Compatible with: https://github.com/bytedance/UI-TARS-desktop | |
| """ | |
| import re | |
| from typing import Dict, Any, Optional, List, Tuple | |
| from dataclasses import dataclass | |
@dataclass
class ParsedAction:
    """Parsed action structure.

    A plain record with three fields. The ``@dataclass`` decorator generates
    ``__init__``, ``__repr__`` and ``__eq__`` so instances can be built with
    ``ParsedAction(action_type, parameters, raw_action)``.

    NOTE(review): nothing in the visible code constructs this class, so the
    field meanings below are inferred from the field names — confirm.
    """

    # Name of the action, e.g. 'click' (presumably an ActionParser key).
    action_type: str
    # Parameters extracted from the action string.
    parameters: Dict[str, Any]
    # The action text the record was parsed from (presumably unmodified).
    raw_action: str
class ActionParser:
    """Parser for UI-TARS action outputs.

    Every method is a class method or static method, so the parser works
    both through the class (``ActionParser.parse_response(text)``) and
    through an instance (``ActionParser().parse_action(text)``).
    """

    # Action patterns: action name -> regex applied with ``re.match`` to the
    # start of an action string. The capture groups of the matching pattern
    # are converted to a parameter dict by ``_extract_parameters``.
    ACTION_PATTERNS = {
        'click': r'click\(start_box=[\'"]<\|box_start\|\>\((\d+),(\d+)\)<\|box_end\|>[\'"]\)',
        'left_double': r'left_double\(start_box=[\'"]<\|box_start\|\>\((\d+),(\d+)\)<\|box_end\|>[\'"]\)',
        'right_single': r'right_single\(start_box=[\'"]<\|box_start\|\>\((\d+),(\d+)\)<\|box_end\|>[\'"]\)',
        'drag': r'drag\(start_box=[\'"]<\|box_start\|\>\((\d+),(\d+)\)<\|box_end\|>[\'"],\s*end_box=[\'"]<\|box_start\|\>\((\d+),(\d+)\)<\|box_end\|>[\'"]\)',
        'type': r'type\(content=[\'"](.+?)[\'"]\)',
        'hotkey': r'hotkey\(key=[\'"](.+?)[\'"]\)',
        'scroll': r'scroll\(start_box=[\'"]<\|box_start\|\>\((\d+),(\d+)\)<\|box_end\|>[\'"],\s*direction=[\'"](\w+)[\'"]\)',
        'wait': r'wait\(\)',
        'finished': r'finished\(content=[\'"](.+?)[\'"]\)',
        # Mobile actions
        'long_press': r'long_press\(start_box=[\'"]<\|box_start\|\>\((\d+),(\d+)\)<\|box_end\|>[\'"]\)',
        'open_app': r'open_app\(app_name=[\'"](.+?)[\'"]\)',
        'press_home': r'press_home\(\)',
        'press_back': r'press_back\(\)',
    }

    @classmethod
    def parse_response(cls, response: str) -> Dict[str, Any]:
        """
        Parse the full model response.

        Args:
            response: Raw model output. ``None`` is treated like an empty
                string.

        Returns:
            Dictionary with the keys ``thought`` (text after ``Thought:``, or
            None), ``action`` (the action string that was parsed),
            ``action_type`` and ``parameters``.
        """
        text = response or ""
        result: Dict[str, Any] = {
            'thought': None,
            'action': None,
            'action_type': None,
            'parameters': {},
        }

        # The thought runs until a line starting with "Action:" or the end.
        thought_match = re.search(r'Thought:\s*(.+?)(?=\nAction:|$)', text, re.DOTALL)
        if thought_match:
            result['thought'] = thought_match.group(1).strip()

        # The action is the rest of the line after "Action:". When there is
        # no "Action:" prefix the whole response is treated as the action.
        action_match = re.search(r'Action:\s*(.+?)(?=\n|$)', text, re.DOTALL)
        action_str = action_match.group(1).strip() if action_match else text.strip()

        parsed = cls.parse_action(action_str)
        result['action'] = action_str
        result['action_type'] = parsed['action_type']
        result['parameters'] = parsed['parameters']
        return result

    @classmethod
    def parse_action(cls, action_str: str) -> Dict[str, Any]:
        """
        Parse an action string.

        Args:
            action_str: Action string (e.g., "click(start_box='...')")

        Returns:
            Dictionary with ``action_type`` and ``parameters``. When no
            pattern matches, ``action_type`` is ``'unknown'`` and
            ``parameters`` is ``{'raw': action_str}``.
        """
        for action_type, pattern in cls.ACTION_PATTERNS.items():
            match = re.match(pattern, action_str)
            if match:
                return {
                    'action_type': action_type,
                    'parameters': cls._extract_parameters(action_type, match.groups()),
                }
        return {
            'action_type': 'unknown',
            'parameters': {'raw': action_str},
        }

    @classmethod
    def _extract_parameters(cls, action_type: str, groups: Tuple) -> Dict[str, Any]:
        """Turn the regex capture groups of ``action_type`` into a parameter dict.

        Action types without capture groups (``wait``, ``press_home``,
        ``press_back``) and unrecognised types yield an empty dict.
        """
        if action_type in ('click', 'left_double', 'right_single', 'long_press'):
            return {'x': int(groups[0]), 'y': int(groups[1])}
        if action_type == 'drag':
            return {
                'start_x': int(groups[0]),
                'start_y': int(groups[1]),
                'end_x': int(groups[2]),
                'end_y': int(groups[3]),
            }
        if action_type in ('type', 'finished'):
            return {'content': groups[0]}
        if action_type == 'hotkey':
            return {'key': groups[0]}
        if action_type == 'scroll':
            return {
                'x': int(groups[0]),
                'y': int(groups[1]),
                'direction': groups[2],
            }
        if action_type == 'open_app':
            return {'app_name': groups[0]}
        return {}

    @staticmethod
    def convert_coordinates(
        x_rel: int,
        y_rel: int,
        screen_width: int,
        screen_height: int
    ) -> Tuple[int, int]:
        """
        Convert relative coordinates (0-1000) to absolute screen coordinates.

        Args:
            x_rel: Relative X coordinate (0-1000)
            y_rel: Relative Y coordinate (0-1000)
            screen_width: Screen width in pixels
            screen_height: Screen height in pixels

        Returns:
            Tuple of (x_absolute, y_absolute), each rounded with ``round()``.
            Inputs are not clamped to the 0-1000 range.
        """
        x_abs = round(screen_width * x_rel / 1000)
        y_abs = round(screen_height * y_rel / 1000)
        return (x_abs, y_abs)

    @classmethod
    def get_all_coordinates(cls, action_str: str) -> List[Dict[str, int]]:
        """
        Extract all coordinates from an action string.

        Args:
            action_str: Action string

        Returns:
            One ``{'x': ..., 'y': ...}`` dict per
            ``<|box_start|>(x,y)<|box_end|>`` marker, in order of appearance.
        """
        pattern = r'<\|box_start\|\>\((\d+),(\d+)\)<\|box_end\|\>'
        return [
            {'x': int(x), 'y': int(y)}
            for x, y in re.findall(pattern, action_str)
        ]
class ActionExecutor:
    """
    Execute parsed actions using pyautogui.

    Note: This requires pyautogui to be installed.

    Handled action types: click, left_double, right_single, drag, type,
    hotkey, scroll, wait and finished. Any other type (including 'unknown')
    is reported as a failure without touching the screen.
    """

    # action type -> (pyautogui function name, label reported in the result)
    _POINTER_ACTIONS = {
        'click': ('click', 'click'),
        'left_double': ('doubleClick', 'double_click'),
        'right_single': ('rightClick', 'right_click'),
    }

    # Scroll magnitudes passed to pyautogui, in its "clicks" unit.
    _VERTICAL_SCROLL_CLICKS = 500
    _HORIZONTAL_SCROLL_CLICKS = 300

    def __init__(self, screen_width: int = 1920, screen_height: int = 1080,
                 wait_seconds: float = 5):
        """
        Initialize the executor.

        Args:
            screen_width: Screen width in pixels
            screen_height: Screen height in pixels
            wait_seconds: How long a ``wait()`` action sleeps, in seconds

        Raises:
            ImportError: If pyautogui is not installed.
        """
        self.screen_width = screen_width
        self.screen_height = screen_height
        self.wait_seconds = wait_seconds
        self.parser = ActionParser()
        # Imported lazily so the parser stays usable without pyautogui.
        try:
            import pyautogui
        except ImportError as err:
            raise ImportError("pyautogui is required for action execution. Install with: pip install pyautogui") from err
        self.pyautogui = pyautogui
        self.pyautogui.FAILSAFE = True

    @staticmethod
    def _split_hotkey(key: str) -> List[str]:
        """Split a hotkey spec such as 'ctrl+c', 'ctrl + c' or 'ctrl c' into key names."""
        return [name for name in re.split(r'[+\s]+', key) if name]

    def _to_screen(self, x_rel: int, y_rel: int) -> Tuple[int, int]:
        """Convert relative model coordinates to pixels on this executor's screen."""
        return self.parser.convert_coordinates(
            x_rel, y_rel, self.screen_width, self.screen_height
        )

    def execute(self, action_str: str) -> Dict[str, Any]:
        """
        Execute an action string.

        Args:
            action_str: Action string from model

        Returns:
            Execution result. On success the dict has ``'success': True``,
            an ``'action'`` label and action-specific details. On failure it
            has ``'success': False`` and an ``'error'`` message. Exceptions
            raised while performing the action are caught and reported in
            the same way.
        """
        parsed = self.parser.parse_action(action_str)
        action_type = parsed['action_type']
        params = parsed['parameters']
        try:
            if action_type in self._POINTER_ACTIONS:
                function_name, label = self._POINTER_ACTIONS[action_type]
                x, y = self._to_screen(params['x'], params['y'])
                getattr(self.pyautogui, function_name)(x, y)
                return {'success': True, 'action': label, 'coordinates': (x, y)}

            if action_type == 'drag':
                start = self._to_screen(params['start_x'], params['start_y'])
                end = self._to_screen(params['end_x'], params['end_y'])
                self.pyautogui.moveTo(start[0], start[1])
                self.pyautogui.dragTo(end[0], end[1])
                return {'success': True, 'action': 'drag', 'start': start, 'end': end}

            if action_type == 'type':
                # The model emits escaped newlines and quotes; undo that
                # before typing.
                content = params['content'].replace('\\n', '\n').replace("\\'", "'").replace('\\"', '"')
                self.pyautogui.typewrite(content)
                return {'success': True, 'action': 'type', 'content': content}

            if action_type == 'hotkey':
                keys = self._split_hotkey(params['key'])
                if not keys:
                    return {'success': False, 'error': f"No keys in hotkey: {params['key']!r}"}
                self.pyautogui.hotkey(*keys)
                return {'success': True, 'action': 'hotkey', 'keys': keys}

            if action_type == 'scroll':
                direction = params['direction']
                if direction not in ('up', 'down', 'left', 'right'):
                    return {'success': False, 'error': f'Unknown scroll direction: {direction}'}
                x, y = self._to_screen(params['x'], params['y'])
                self.pyautogui.moveTo(x, y)
                if direction in ('up', 'down'):
                    # pyautogui.scroll: positive scrolls up, negative down.
                    clicks = self._VERTICAL_SCROLL_CLICKS
                    self.pyautogui.scroll(clicks if direction == 'up' else -clicks)
                else:
                    # pyautogui.hscroll: positive scrolls right, negative left.
                    clicks = self._HORIZONTAL_SCROLL_CLICKS
                    self.pyautogui.hscroll(clicks if direction == 'right' else -clicks)
                return {'success': True, 'action': 'scroll', 'direction': direction, 'coordinates': (x, y)}

            if action_type == 'wait':
                import time
                time.sleep(self.wait_seconds)
                return {'success': True, 'action': 'wait', 'duration': self.wait_seconds}

            if action_type == 'finished':
                return {'success': True, 'action': 'finished', 'content': params.get('content', '')}

            return {'success': False, 'error': f'Unknown action type: {action_type}'}
        except Exception as e:
            # Boundary handler: report any execution failure to the caller
            # as a result dict instead of raising.
            return {'success': False, 'error': str(e)}
# Example usage
if __name__ == "__main__":
    # Sample model output: a thought line followed by an action line.
    sample_output = (
        "Thought: I need to click the search button to find the product\n"
        "Action: click(start_box='<|box_start|>(500,300)<|box_end|>')"
    )

    # Parse the sample and print each field of the result.
    outcome = ActionParser.parse_response(sample_output)
    print("Parsed Response:")
    for label, field in (
        ("Thought", 'thought'),
        ("Action", 'action'),
        ("Action Type", 'action_type'),
        ("Parameters", 'parameters'),
    ):
        print(f" {label}: {outcome[field]}")

    # Convert the relative click position to pixels on a 1920x1080 screen.
    absolute = ActionParser.convert_coordinates(500, 300, 1920, 1080)
    print(f"\nConverted Coordinates: ({absolute[0]}, {absolute[1]})")

    # Example: Execute action (requires pyautogui)
    # executor = ActionExecutor(1920, 1080)
    # result = executor.execute(outcome['action'])
    # print(f"Execution Result: {result}")