File size: 8,160 Bytes
399b80c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
from typing import Dict, Any, Union
import os
from openspace.grounding.core.session import BaseSession
from openspace.grounding.core.types import BackendType, SessionStatus, SessionConfig
from openspace.utils.logging import Logger
from .transport.connector import GUIConnector
from .transport.local_connector import LocalGUIConnector
from .tool import GUIAgentTool
from .config import build_llm_config

# Module-level logger, keyed by this module's import name.
logger = Logger.get_logger(__name__)


class GUISession(BaseSession):
    """
    Session for GUI desktop environment.

    Wraps a GUI connector, optionally builds an Anthropic LLM client during
    initialization, and exposes a single ``GUIAgentTool`` via ``self.tools``.
    """

    def __init__(
        self,
        connector: Union[GUIConnector, LocalGUIConnector],
        session_id: str,
        backend_type: BackendType,
        config: SessionConfig,
        auto_connect: bool = True,
        auto_initialize: bool = True,
    ):
        """
        Initialize GUI session.

        Args:
            connector: GUI connector (HTTP or local) used for all desktop I/O
            session_id: Unique session identifier
            backend_type: Backend type (expected to be ``BackendType.GUI``)
            config: Session configuration; its ``connection_params`` mapping is
                read for the optional ``llm_config`` and ``recording_manager``
            auto_connect: Forwarded to the base session
            auto_initialize: Forwarded to the base session
        """
        super().__init__(
            connector=connector,
            session_id=session_id,
            backend_type=backend_type,
            auto_connect=auto_connect,
            auto_initialize=auto_initialize,
        )
        self.config = config
        # Same object as ``self.connector``; kept under a GUI-specific name.
        self.gui_connector = connector

    async def initialize(self) -> Dict[str, Any]:
        """
        Initialize session: connect if needed, build the optional LLM client,
        and register the GUI agent tool.

        Returns:
            Session information dict with the keys ``session_id``,
            ``backend_type``, ``vm_ip``, ``server_port``, ``num_tools``,
            ``tools`` and ``llm_client``.
        """
        logger.info(f"Initializing GUI session: {self.session_id}")

        # Ensure connected
        if not self.connector.is_connected:
            await self.connect()

        # Best-effort: a failure to build the client leaves it as None.
        llm_client = await self._create_llm_client()

        # Get recording_manager from connection_params if available
        recording_manager = self.config.connection_params.get("recording_manager")

        # Create GUI Agent Tool
        self.tools = [
            GUIAgentTool(
                connector=self.gui_connector,
                llm_client=llm_client,
                recording_manager=recording_manager,
            )
        ]

        logger.info(f"Initialized GUI session with {len(self.tools)} tool(s)")

        return {
            "session_id": self.session_id,
            "backend_type": self.backend_type.value,
            "vm_ip": self.gui_connector.vm_ip,
            "server_port": self.gui_connector.server_port,
            "num_tools": len(self.tools),
            "tools": [tool.name for tool in self.tools],
            "llm_client": "anthropic" if llm_client else "none",
        }

    async def _create_llm_client(self) -> Any:
        """
        Build the Anthropic GUI client when an LLM configuration is available.

        A configuration is attempted when the user supplied ``llm_config`` in
        ``connection_params`` or when ``ANTHROPIC_API_KEY`` is set in the
        environment. Any exception raised while constructing the client is
        logged as a warning and swallowed so that session initialization can
        continue without an LLM.

        Returns:
            The client instance, or ``None`` when no client was created.
        """
        user_llm_config = self.config.connection_params.get("llm_config")

        # Nothing configured and no API key in the environment: no client.
        if not (user_llm_config or os.environ.get("ANTHROPIC_API_KEY")):
            return None

        llm_config = build_llm_config(user_llm_config)

        # Only the "anthropic" client type is handled here.
        if llm_config.get("type") != "anthropic":
            return None

        if not llm_config.get("api_key"):
            logger.warning(
                "Anthropic API key not found. Skipping LLM client initialization. "
                "Set ANTHROPIC_API_KEY environment variable or provide api_key in llm_config."
            )
            return None

        llm_client = None
        try:
            # Imported lazily so the session works without the client module
            # (an ImportError is handled by the except below).
            from .anthropic_client import AnthropicGUIClient

            screen_size = await self._detect_screen_size(llm_config)

            # Detect PyAutoGUI working size (logical pixels)
            pyautogui_size = await self.gui_connector.get_screen_size()
            if pyautogui_size:
                logger.info(f"PyAutoGUI working size (logical): {pyautogui_size}")
            else:
                # If we can't detect PyAutoGUI size, assume it's the same as screen size
                pyautogui_size = screen_size
                logger.warning(f"Could not detect PyAutoGUI size, assuming same as screen: {pyautogui_size}")

            llm_client = AnthropicGUIClient(
                model=llm_config["model"],
                platform=llm_config["platform"],
                api_key=llm_config["api_key"],
                provider=llm_config["provider"],
                screen_size=screen_size,
                pyautogui_size=pyautogui_size,
                max_tokens=llm_config["max_tokens"],
                only_n_most_recent_images=llm_config["only_n_most_recent_images"],
            )
            logger.info(
                f"Initialized Anthropic LLM client - "
                f"Model: {llm_config['model']}, Platform: {llm_config['platform']}"
            )
        except Exception as e:
            logger.warning(f"Failed to initialize Anthropic client: {e}")

        return llm_client

    async def _detect_screen_size(self, llm_config: Dict[str, Any]) -> Any:
        """
        Determine the screen size to report to the LLM client.

        Tries, in order: the dimensions of a fresh screenshot, the size
        reported by the connector's ``get_screen_size()``, and finally the
        ``screen_size`` entry of ``llm_config`` (default ``(1920, 1080)``).

        Args:
            llm_config: Built LLM configuration; only ``screen_size`` is read.

        Returns:
            The detected or configured screen size.

        Raises:
            Exception: Whatever ``get_screen_size()`` raises on the fallback
                path; the caller is expected to handle it.
        """
        # Detect actual screen size from screenshot (most accurate)
        # PyAutoGUI may report logical resolution, but we need the actual screenshot size
        try:
            screenshot_bytes = await self.gui_connector.get_screenshot()
            if not screenshot_bytes:
                raise RuntimeError("Could not get screenshot")

            from PIL import Image
            import io

            # Context manager releases the image's resources once the size is read.
            with Image.open(io.BytesIO(screenshot_bytes)) as img:
                actual_screen_size = img.size
            logger.info(f"Auto-detected screen size from screenshot: {actual_screen_size}")
            return actual_screen_size
        except Exception as e:
            # Record why the screenshot path failed instead of dropping it silently.
            logger.debug(f"Screenshot-based screen size detection failed: {e}")

        # Fallback to pyautogui detection
        actual_screen_size = await self.gui_connector.get_screen_size()
        if actual_screen_size:
            logger.info(f"Auto-detected screen size from pyautogui: {actual_screen_size}")
            return actual_screen_size

        # Final fallback to configured value
        screen_size = llm_config.get("screen_size", (1920, 1080))
        logger.warning(f"Could not auto-detect screen size, using configured: {screen_size}")
        return screen_size

    async def connect(self) -> None:
        """Connect to GUI desktop environment (no-op when already connected)."""
        if self.connector.is_connected:
            return

        self.status = SessionStatus.CONNECTING
        logger.info(f"Connecting to desktop_env at {self.gui_connector.base_url}")

        await self.connector.connect()

        self.status = SessionStatus.CONNECTED
        logger.info("Connected to desktop environment")

    async def disconnect(self) -> None:
        """Disconnect from GUI desktop environment (no-op when not connected)."""
        if not self.connector.is_connected:
            return

        logger.info("Disconnecting from desktop environment")
        await self.connector.disconnect()

        self.status = SessionStatus.DISCONNECTED
        logger.info("Disconnected from desktop environment")

    @property
    def is_connected(self) -> bool:
        """Whether the underlying connector reports an active connection."""
        return self.connector.is_connected