Spaces:

omar0scarf
/

ui-tars-api

Build error

File size: 11,827 Bytes

3d37441

"""
UI-TARS Action Parser
=====================
Utilities for parsing and executing UI-TARS model outputs
Compatible with: https://github.com/bytedance/UI-TARS-desktop
"""

import re
from typing import Dict, Any, Optional, List, Tuple
from dataclasses import dataclass


@dataclass
class ParsedAction:
    """Parsed action structure.

    Plain record with three fields. It is not referenced by the parser or
    executor code visible in this file, which pass plain dictionaries around
    instead.
    """
    # Action name; presumably one of the ActionParser.ACTION_PATTERNS keys
    # or 'unknown' -- TODO confirm against callers.
    action_type: str
    # Parameter name -> value mapping for the action.
    parameters: Dict[str, Any]
    # Presumably the original, unparsed action text -- TODO confirm.
    raw_action: str


class ActionParser:
    """Parser for UI-TARS action outputs.

    Turns text such as ``click(start_box='<|box_start|>(500,300)<|box_end|>')``
    into an action name plus a parameter dictionary. All methods are class or
    static methods, so the class can be used without instantiation.
    """

    # Regex fragment for one quoted box argument. It captures two integers.
    # The <|box_start|>/<|box_end|> markers are optional because they are
    # special tokens that a decoder may strip, and whitespace around the
    # numbers is tolerated ("(500, 300)").
    _BOX = (
        r'[\'"](?:<\|box_start\|>)?'
        r'\(\s*(\d+)\s*,\s*(\d+)\s*\)'
        r'(?:<\|box_end\|>)?[\'"]'
    )

    # Action patterns (action name -> regex applied with re.match)
    ACTION_PATTERNS = {
        'click': r'click\(start_box=' + _BOX + r'\)',
        'left_double': r'left_double\(start_box=' + _BOX + r'\)',
        'right_single': r'right_single\(start_box=' + _BOX + r'\)',
        'drag': r'drag\(start_box=' + _BOX + r',\s*end_box=' + _BOX + r'\)',
        # ".*?" rather than ".+?" so that an empty string is still a valid
        # content argument.
        'type': r'type\(content=[\'"](.*?)[\'"]\)',
        'hotkey': r'hotkey\(key=[\'"](.+?)[\'"]\)',
        'scroll': r'scroll\(start_box=' + _BOX + r',\s*direction=[\'"](\w+)[\'"]\)',
        'wait': r'wait\(\)',
        # The content argument is optional: both "finished()" and
        # "finished(content='...')" are accepted.
        'finished': r'finished\((?:content=[\'"](.*?)[\'"])?\)',
        # Mobile actions
        'long_press': r'long_press\(start_box=' + _BOX + r'\)',
        'open_app': r'open_app\(app_name=[\'"](.+?)[\'"]\)',
        'press_home': r'press_home\(\)',
        'press_back': r'press_back\(\)',
    }

    @classmethod
    def parse_response(cls, response: str) -> Dict[str, Any]:
        """
        Parse the full model response

        Args:
            response: Raw model output, optionally containing "Thought:" and
                "Action:" sections

        Returns:
            Dictionary with keys 'thought' (text or None), 'action' (the
            action text that was parsed), 'action_type' and 'parameters'
        """
        # The thought runs up to the line that starts the action, or to the
        # end of the response when there is no "Action:" line.
        thought_match = re.search(r'Thought:\s*(.+?)(?=\nAction:|$)', response, re.DOTALL)
        thought = thought_match.group(1).strip() if thought_match else None

        # Only the first line after "Action:" is taken as the action.
        action_match = re.search(r'Action:\s*(.+?)(?=\n|$)', response, re.DOTALL)
        if action_match:
            action_str = action_match.group(1).strip()
        else:
            # No "Action:" prefix, try to parse the whole response
            action_str = response.strip()

        parsed = cls.parse_action(action_str)
        return {
            'thought': thought,
            'action': action_str,
            'action_type': parsed['action_type'],
            'parameters': parsed['parameters'],
        }

    @classmethod
    def parse_action(cls, action_str: str) -> Dict[str, Any]:
        """
        Parse an action string

        Args:
            action_str: Action string (e.g., "click(start_box='...')")

        Returns:
            Dictionary with action_type and parameters. When no pattern
            matches, action_type is 'unknown' and parameters holds the
            unmodified input under 'raw'.
        """
        # Patterns are anchored at the start only, so surrounding whitespace
        # is removed before matching.
        candidate = action_str.strip()
        for action_type, pattern in cls.ACTION_PATTERNS.items():
            match = re.match(pattern, candidate)
            if match:
                return {
                    'action_type': action_type,
                    'parameters': cls._extract_parameters(action_type, match.groups())
                }

        return {
            'action_type': 'unknown',
            'parameters': {'raw': action_str}
        }

    @classmethod
    def _extract_parameters(cls, action_type: str, groups: Tuple) -> Dict[str, Any]:
        """Map the regex capture groups of ``action_type`` to named parameters.

        Action types without arguments (wait, press_home, press_back) yield
        an empty dictionary.
        """
        params = {}

        if action_type in ['click', 'left_double', 'right_single', 'long_press']:
            params['x'] = int(groups[0])
            params['y'] = int(groups[1])

        elif action_type == 'drag':
            params['start_x'] = int(groups[0])
            params['start_y'] = int(groups[1])
            params['end_x'] = int(groups[2])
            params['end_y'] = int(groups[3])

        elif action_type in ['type', 'finished']:
            # The group is None for a bare "finished()"; report that as
            # empty content so consumers always get a string.
            params['content'] = groups[0] if groups[0] is not None else ''

        elif action_type == 'hotkey':
            params['key'] = groups[0]

        elif action_type == 'scroll':
            params['x'] = int(groups[0])
            params['y'] = int(groups[1])
            params['direction'] = groups[2]

        elif action_type == 'open_app':
            params['app_name'] = groups[0]

        return params

    @staticmethod
    def convert_coordinates(
        x_rel: int,
        y_rel: int,
        screen_width: int,
        screen_height: int
    ) -> Tuple[int, int]:
        """
        Convert relative coordinates (0-1000) to absolute screen coordinates

        Args:
            x_rel: Relative X coordinate (0-1000)
            y_rel: Relative Y coordinate (0-1000)
            screen_width: Screen width in pixels
            screen_height: Screen height in pixels

        Returns:
            Tuple of (x_absolute, y_absolute), each rounded to the nearest
            integer
        """
        x_abs = round(screen_width * x_rel / 1000)
        y_abs = round(screen_height * y_rel / 1000)
        return (x_abs, y_abs)

    @classmethod
    def get_all_coordinates(cls, action_str: str) -> List[Dict[str, int]]:
        """
        Extract all coordinates from an action string

        A coordinate pair is recognised when it is wrapped in
        <|box_start|>...<|box_end|> markers, or when it is the bare quoted
        value of a ``*_box=`` argument (markers stripped).

        Args:
            action_str: Action string

        Returns:
            List of coordinate dictionaries with 'x' and 'y' keys, in order
            of appearance
        """
        # The lookbehind keeps bare "(a,b)" text elsewhere in the string,
        # e.g. inside typed content, from being reported as a coordinate.
        pattern = (
            r'(?:<\|box_start\|>|(?<=_box=[\'"]))'
            r'\(\s*(\d+)\s*,\s*(\d+)\s*\)'
            r'(?:<\|box_end\|>|(?=[\'"]))'
        )
        return [
            {'x': int(x), 'y': int(y)}
            for x, y in re.findall(pattern, action_str)
        ]


class ActionExecutor:
    """
    Execute parsed actions using pyautogui

    Note: This requires pyautogui to be installed. The mobile-only actions
    known to the parser (long_press, open_app, press_home, press_back) have
    no desktop implementation here and are reported as unknown actions.
    """

    # Parser action type -> (pyautogui function name, label reported in the
    # result) for actions that only need a single target point.
    _POINTER_ACTIONS = {
        'click': ('click', 'click'),
        'left_double': ('doubleClick', 'double_click'),
        'right_single': ('rightClick', 'right_click'),
    }

    # Scroll amounts handed to pyautogui for vertical / horizontal scrolls.
    _VERTICAL_SCROLL = 500
    _HORIZONTAL_SCROLL = 300

    def __init__(self, screen_width: int = 1920, screen_height: int = 1080):
        """
        Initialize the executor

        Args:
            screen_width: Screen width in pixels
            screen_height: Screen height in pixels

        Raises:
            ImportError: If pyautogui is not installed
        """
        self.screen_width = screen_width
        self.screen_height = screen_height
        self.parser = ActionParser()

        # Imported lazily so the parser half of this module stays usable
        # without pyautogui installed.
        try:
            import pyautogui
        except ImportError as err:
            raise ImportError(
                "pyautogui is required for action execution. Install with: pip install pyautogui"
            ) from err
        self.pyautogui = pyautogui
        self.pyautogui.FAILSAFE = True

    def _to_screen(self, x_rel: int, y_rel: int) -> Tuple[int, int]:
        """Convert relative (0-1000) coordinates to pixels on the configured screen."""
        return self.parser.convert_coordinates(
            x_rel, y_rel, self.screen_width, self.screen_height
        )

    def execute(self, action_str: str) -> Dict[str, Any]:
        """
        Execute an action string

        Args:
            action_str: Action string from model

        Returns:
            Execution result. 'success' is always present; on failure
            'error' holds a message, on success 'action' names what was
            done and further keys describe it. Exceptions raised while
            performing the action are reported as a failure result rather
            than propagated.
        """
        parsed = self.parser.parse_action(action_str)
        action_type = parsed['action_type']
        params = parsed['parameters']

        try:
            if action_type in self._POINTER_ACTIONS:
                func_name, label = self._POINTER_ACTIONS[action_type]
                x, y = self._to_screen(params['x'], params['y'])
                getattr(self.pyautogui, func_name)(x, y)
                return {'success': True, 'action': label, 'coordinates': (x, y)}

            elif action_type == 'drag':
                start_x, start_y = self._to_screen(params['start_x'], params['start_y'])
                end_x, end_y = self._to_screen(params['end_x'], params['end_y'])
                self.pyautogui.moveTo(start_x, start_y)
                self.pyautogui.dragTo(end_x, end_y)
                return {'success': True, 'action': 'drag', 'start': (start_x, start_y), 'end': (end_x, end_y)}

            elif action_type == 'type':
                # The model emits escape sequences as literal text; turn
                # them back into the characters they stand for.
                content = params['content'].replace('\\n', '\n').replace("\\'", "'").replace('\\"', '"')
                self.pyautogui.typewrite(content)
                return {'success': True, 'action': 'type', 'content': content}

            elif action_type == 'hotkey':
                # Accept both "ctrl+c" and the space-separated "ctrl c"
                # form, and ignore stray spaces around the separators.
                keys = params['key'].replace('+', ' ').split()
                if not keys:
                    return {'success': False, 'error': f"No keys in hotkey: {params['key']!r}"}
                self.pyautogui.hotkey(*keys)
                return {'success': True, 'action': 'hotkey', 'keys': keys}

            elif action_type == 'scroll':
                direction = params['direction']
                normalized = direction.lower()
                if normalized not in ('up', 'down', 'left', 'right'):
                    # Refuse rather than silently scrolling somewhere.
                    return {'success': False, 'error': f'Unsupported scroll direction: {direction}'}

                x, y = self._to_screen(params['x'], params['y'])
                self.pyautogui.moveTo(x, y)
                if normalized in ('up', 'down'):
                    # pyautogui.scroll: positive scrolls up, negative down.
                    amount = self._VERTICAL_SCROLL
                    self.pyautogui.scroll(amount if normalized == 'up' else -amount)
                else:
                    # Horizontal scrolling needs hscroll; positive scrolls
                    # right, negative left.
                    amount = self._HORIZONTAL_SCROLL
                    self.pyautogui.hscroll(amount if normalized == 'right' else -amount)
                return {'success': True, 'action': 'scroll', 'direction': direction, 'coordinates': (x, y)}

            elif action_type == 'wait':
                import time
                time.sleep(5)
                return {'success': True, 'action': 'wait', 'duration': 5}

            elif action_type == 'finished':
                return {'success': True, 'action': 'finished', 'content': params.get('content', '')}

            else:
                return {'success': False, 'error': f'Unknown action type: {action_type}'}

        except Exception as e:
            # Deliberate catch-all at the execution boundary: callers get a
            # failure result instead of an exception from the GUI layer.
            return {'success': False, 'error': str(e)}


# Example usage
if __name__ == "__main__":
    # Sample model output: a thought line followed by a single click action
    response = (
        "Thought: I need to click the search button to find the product\n"
        "Action: click(start_box='<|box_start|>(500,300)<|box_end|>')"
    )

    # Split the response into its thought and its structured action
    parsed = ActionParser.parse_response(response)
    print("Parsed Response:")
    print(f"  Thought: {parsed['thought']}")
    print(f"  Action: {parsed['action']}")
    print(f"  Action Type: {parsed['action_type']}")
    print(f"  Parameters: {parsed['parameters']}")

    # Map the relative (0-1000) point onto a 1920x1080 screen
    rel_point = (500, 300)
    screen_size = (1920, 1080)
    x_abs, y_abs = ActionParser.convert_coordinates(*rel_point, *screen_size)
    print(f"\nConverted Coordinates: ({x_abs}, {y_abs})")

    # Example: Execute action (requires pyautogui)
    # executor = ActionExecutor(1920, 1080)
    # result = executor.execute(parsed['action'])
    # print(f"Execution Result: {result}")