likhon saheikh committed on
Commit
6ae0024
·
1 Parent(s): ad8ba8a

Add Gemini 3 Computer Use client script

Browse files
Files changed (1) hide show
  1. gemini3_computer_use.py +433 -0
gemini3_computer_use.py ADDED
@@ -0,0 +1,433 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Gemini 3 Computer Use API Client
3
+
4
+ A comprehensive Python client for interacting with the Gemini 3 Computer Use model API
5
+ which excels at browser and mobile task automation.
6
+ """
7
+
8
+ import asyncio
9
+ import base64
10
+ import json
11
+ import logging
12
+ import time
13
+ from datetime import datetime
14
+ from typing import Dict, List, Optional, Union, Any
15
+ from dataclasses import dataclass
16
+ from pathlib import Path
17
+
18
+ import requests
19
+ import cv2
20
+ import numpy as np
21
+ from PIL import Image
22
+ import pyautogui
23
+ from selenium import webdriver
24
+ from selenium.webdriver.chrome.service import Service as ChromeService
25
+ from selenium.webdriver.common.by import By
26
+ from selenium.webdriver.common.keys import Keys
27
+ from selenium.webdriver.support.ui import WebDriverWait
28
+ from selenium.webdriver.support import expected_conditions as EC
29
+ from selenium.common.exceptions import TimeoutException, NoSuchElementException
30
+ from webdriver_manager.chrome import ChromeDriverManager
31
+
32
# Configure logging: install a root handler at INFO level with a
# timestamp/level/message layout, then create this module's own logger.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
)
logger = logging.getLogger(__name__)
35
+
36
+
37
@dataclass
class ComputerAction:
    """Represents a computer action to be executed.

    Attributes:
        action_type: Kind of action: 'click', 'type', 'scroll', 'drag',
            'key_press', etc.
        target: Element selector, coordinates, or other identifier.
        value: Value to input or additional parameters.
        coordinates: (x, y) coordinates if applicable.
        duration: Duration for complex actions.
        wait_time: Wait time after the action; defaults to 1.0.
    """

    action_type: str
    target: Optional[str] = None
    value: Optional[str] = None
    coordinates: Optional[tuple] = None
    duration: Optional[float] = None
    wait_time: Optional[float] = 1.0
46
+
47
+
48
@dataclass
class BrowserSession:
    """Manages browser session state.

    Attributes:
        driver: The Chrome WebDriver instance backing this session.
        current_url: URL recorded for the session.
        screenshot_path: Optional path of a screenshot associated with the
            session; defaults to None.
        last_action_time: Timestamp of the most recent action; defaults to 0.0.
    """

    driver: webdriver.Chrome
    current_url: str
    screenshot_path: Optional[str] = None
    last_action_time: float = 0.0
55
+
56
+
57
class Gemini3ComputerUseAPI:
    """
    Main client for Gemini 3 Computer Use model API

    This client provides advanced browser and mobile automation capabilities
    powered by the Gemini 3 Computer Use model which excels at computer vision
    and multi-modal task execution.

    The client is a context manager: leaving the ``with`` block closes the
    active browser session (if any) and the underlying HTTP session.
    """

    def __init__(self, api_key: str, base_url: str = "https://api.gemini3.com/v1",
                 timeout: float = 30.0):
        """
        Initialize the Gemini 3 Computer Use API client

        Args:
            api_key: Your API key for Gemini 3
            base_url: Base URL for the API endpoint
            timeout: Seconds to wait for each HTTP request before giving up
        """
        self.api_key = api_key
        self.base_url = base_url.rstrip('/')
        # requests never times out on its own; without this a stalled
        # server would block the caller forever.
        self.timeout = timeout
        self.session = requests.Session()
        self.headers = {
            'Authorization': f'Bearer {api_key}',
            'Content-Type': 'application/json',
            'User-Agent': 'Gemini3-ComputerUse-Client/1.0'
        }

        # Browser automation state
        self.browser_session: Optional[BrowserSession] = None
        self.screenshot_dir = Path("screenshots")
        self.screenshot_dir.mkdir(exist_ok=True)

        # Mobile automation state
        self.mobile_session = None

        logger.info("Gemini 3 Computer Use API Client initialized")

    def _make_request(self, endpoint: str, method: str = "POST", data: Dict = None, files: Dict = None) -> Dict:
        """
        Make authenticated request to the API

        Args:
            endpoint: Path appended to the base URL (leading '/' optional)
            method: HTTP verb; GET sends ``data`` as query parameters, other
                verbs send it as a JSON body (or form data when ``files`` is
                given with POST)
            data: Request payload
            files: Files for a multipart POST upload

        Returns:
            The decoded JSON response body

        Raises:
            Exception: If the request fails, returns an error status, or the
                body is not valid JSON. The original error is chained as
                ``__cause__``.
        """
        url = f"{self.base_url}/{endpoint.lstrip('/')}"
        verb = method.upper()

        try:
            if verb == "GET":
                response = self.session.get(
                    url, headers=self.headers, params=data, timeout=self.timeout
                )
            elif verb == "POST" and files:
                # Remove Content-Type header for file uploads so requests can
                # generate the multipart boundary itself.
                headers = {k: v for k, v in self.headers.items() if k != 'Content-Type'}
                response = self.session.post(
                    url, headers=headers, data=data, files=files, timeout=self.timeout
                )
            elif verb == "POST":
                response = self.session.post(
                    url, headers=self.headers, json=data, timeout=self.timeout
                )
            else:
                response = self.session.request(
                    method, url, headers=self.headers, json=data, timeout=self.timeout
                )

            response.raise_for_status()
            return response.json()

        except (requests.exceptions.RequestException, ValueError) as e:
            # ValueError covers a non-JSON body on requests versions whose
            # JSON decode error is not a RequestException.
            logger.error("API request failed: %s", e)
            raise Exception(f"API request failed: {e}") from e

    @staticmethod
    def _encode_png_b64(image: np.ndarray) -> str:
        """PNG-encode an image array and return it as a base64 string."""
        ok, buffer = cv2.imencode('.png', image)
        if not ok:
            raise ValueError("Failed to encode image as PNG")
        return base64.b64encode(buffer).decode()

    @staticmethod
    def _action_payload(action: ComputerAction) -> Dict:
        """Build the 'action' section of an API payload from a ComputerAction."""
        return {
            "type": action.action_type,
            "target": action.target,
            "value": action.value,
            "coordinates": action.coordinates,
            "duration": action.duration
        }

    def _require_browser_session(self) -> BrowserSession:
        """Return the active browser session or raise if none was started."""
        if not self.browser_session:
            raise Exception("No active browser session. Start a browser session first.")
        return self.browser_session

    def analyze_screen(self, image_path: Optional[str] = None, screenshot: Optional[np.ndarray] = None) -> Dict:
        """
        Analyze the current screen state using computer vision

        The image is taken from, in order of preference: ``screenshot``,
        ``image_path``, the active browser session, the desktop.

        Args:
            image_path: Path to image file to analyze
            screenshot: Numpy array of screenshot image

        Returns:
            Dictionary containing analysis results

        Raises:
            ValueError: If the image cannot be PNG-encoded
            Exception: If the API request fails
        """
        if screenshot is not None:
            image_b64 = self._encode_png_b64(screenshot)
        elif image_path:
            with open(image_path, 'rb') as f:
                image_b64 = base64.b64encode(f.read()).decode()
        elif self.browser_session:
            # Same preference as take_screenshot(): an active browser wins
            # over the desktop, which also keeps headless sessions usable.
            png = self.browser_session.driver.get_screenshot_as_png()
            image_b64 = base64.b64encode(png).decode()
        else:
            # Take a screenshot of current screen; PIL yields RGB while
            # OpenCV encodes from BGR.
            rgb = np.array(pyautogui.screenshot())
            image_b64 = self._encode_png_b64(cv2.cvtColor(rgb, cv2.COLOR_RGB2BGR))

        data = {
            "model": "gemini-3-computer-use",
            "image": f"data:image/png;base64,{image_b64}",
            "task": "analyze_screen"
        }

        result = self._make_request("analyze", data=data)
        logger.info("Screen analysis completed")
        return result

    def execute_browser_action(self, action: ComputerAction) -> Dict:
        """
        Execute a browser action using Gemini 3 computer use capabilities

        The action and a pre-action screenshot are sent to the API, then the
        action is performed in the local browser and the session's URL and
        last-action timestamp are refreshed.

        Args:
            action: ComputerAction object describing the action to execute

        Returns:
            Dictionary containing execution results

        Raises:
            Exception: If no browser session is active or the API call fails
        """
        session = self._require_browser_session()
        driver = session.driver

        # Take screenshot before action
        screenshot = driver.get_screenshot_as_png()

        data = {
            "model": "gemini-3-computer-use",
            "image": f"data:image/png;base64,{base64.b64encode(screenshot).decode()}",
            "task": "execute_action",
            "action": self._action_payload(action),
            "context": {
                "current_url": session.current_url,
                "browser_session_id": id(driver)
            }
        }

        result = self._make_request("execute", data=data)

        # Execute the action in the actual browser
        self._execute_action_locally(action)

        # Update browser session; the action may have navigated the page.
        session.current_url = driver.current_url
        session.last_action_time = time.time()

        logger.info("Browser action '%s' executed", action.action_type)
        return result

    def _execute_action_locally(self, action: ComputerAction) -> None:
        """
        Execute action in the local browser session

        All input goes through WebDriver rather than OS-level events, so it
        reaches the browser even when it is headless or not focused.

        Args:
            action: The action to perform. Supported types are 'click',
                'type', 'scroll' and 'key_press'; other types are logged and
                skipped.

        Raises:
            ValueError: If a 'type' or 'key_press' action has no value
        """
        driver = self.browser_session.driver
        kind = action.action_type

        try:
            if kind == "click":
                if action.coordinates:
                    # Move to absolute viewport coordinates. move_by_offset is
                    # relative to the previous pointer position, so repeated
                    # coordinate clicks would drift.
                    chain = webdriver.ActionChains(driver)
                    pointer = chain.w3c_actions.pointer_action
                    pointer.move_to_location(
                        int(action.coordinates[0]), int(action.coordinates[1])
                    )
                    pointer.click()
                    chain.perform()
                elif action.target:
                    element = WebDriverWait(driver, 10).until(
                        EC.element_to_be_clickable((By.CSS_SELECTOR, action.target))
                    )
                    element.click()

            elif kind == "type":
                if action.value is None:
                    raise ValueError("'type' action requires a value")
                if action.target:
                    element = WebDriverWait(driver, 10).until(
                        EC.presence_of_element_located((By.CSS_SELECTOR, action.target))
                    )
                    element.clear()
                    element.send_keys(action.value)
                else:
                    # No target: type into whatever element has focus.
                    webdriver.ActionChains(driver).send_keys(action.value).perform()

            elif kind == "scroll":
                if action.coordinates:
                    # Pass values as script arguments instead of formatting
                    # them into the JavaScript source.
                    driver.execute_script(
                        "window.scrollTo(arguments[0], arguments[1]);",
                        action.coordinates[0], action.coordinates[1]
                    )
                else:
                    driver.execute_script("window.scrollBy(0, 500);")

            elif kind == "key_press":
                if not action.value:
                    raise ValueError("'key_press' action requires a value")
                aliases = {"Enter": Keys.ENTER, "Escape": Keys.ESCAPE, "Tab": Keys.TAB}
                key = aliases.get(action.value)
                if key is None:
                    # Resolve other named keys (e.g. 'backspace', 'page down')
                    # against selenium's Keys; fall back to the raw text.
                    key = getattr(Keys, action.value.upper().replace(" ", "_"), action.value)
                webdriver.ActionChains(driver).send_keys(key).perform()

            else:
                logger.warning("Unsupported action type '%s'; nothing executed locally", kind)

            # Wait after action if specified
            if action.wait_time:
                time.sleep(action.wait_time)

        except Exception as e:
            logger.error("Failed to execute action locally: %s", e)
            raise

    def start_browser_session(self, url: str = "https://www.google.com", headless: bool = False) -> BrowserSession:
        """
        Start a new browser automation session

        Any session that is already active is closed first so its driver
        process is not orphaned.

        Args:
            url: URL to navigate to
            headless: Whether to run browser in headless mode

        Returns:
            BrowserSession object
        """
        if self.browser_session:
            self.close_browser_session()

        options = webdriver.ChromeOptions()
        if headless:
            options.add_argument("--headless")
        for flag in ("--no-sandbox", "--disable-dev-shm-usage",
                     "--disable-gpu", "--window-size=1920,1080"):
            options.add_argument(flag)

        driver = None
        try:
            driver = webdriver.Chrome(
                service=ChromeService(ChromeDriverManager().install()),
                options=options
            )
            driver.get(url)
            if not headless:
                # A headless browser has no window to maximize; its size is
                # already fixed by --window-size above.
                driver.maximize_window()

            self.browser_session = BrowserSession(
                driver=driver,
                # Record where the browser actually ended up (redirects,
                # normalisation) rather than the requested URL.
                current_url=driver.current_url
            )

            logger.info("Browser session started at %s", url)
            return self.browser_session

        except Exception as e:
            logger.error("Failed to start browser session: %s", e)
            if driver is not None:
                # Do not leave a browser process behind when start-up fails
                # after the driver was launched.
                try:
                    driver.quit()
                except Exception as quit_error:
                    logger.warning("Failed to quit browser after start-up error: %s", quit_error)
            raise

    def close_browser_session(self) -> None:
        """Close the current browser session (no-op when none is active)"""
        if not self.browser_session:
            return
        # Drop the reference first so a failing quit() cannot leave a dead
        # session registered on the client.
        session, self.browser_session = self.browser_session, None
        session.driver.quit()
        logger.info("Browser session closed")

    def execute_mobile_action(self, action: ComputerAction, device_config: Dict = None) -> Dict:
        """
        Execute a mobile action using Gemini 3 computer use capabilities

        Only the API request is made; nothing is executed on a device.

        Args:
            action: ComputerAction object describing the action to execute
            device_config: Device configuration for mobile automation

        Returns:
            Dictionary containing execution results
        """
        # This would require Appium setup for actual mobile automation
        data = {
            "model": "gemini-3-computer-use",
            "task": "mobile_action",
            "action": self._action_payload(action),
            "device_config": device_config or {}
        }

        result = self._make_request("mobile-execute", data=data)
        logger.info("Mobile action '%s' executed", action.action_type)
        return result

    def automate_browser_task(self, task_description: str, max_steps: int = 10) -> Dict:
        """
        Automate a complex browser task using Gemini 3 computer use model

        Args:
            task_description: Natural language description of the task
            max_steps: Maximum number of action steps; also enforced locally
                if the API returns a longer plan

        Returns:
            Dictionary containing task execution results

        Raises:
            Exception: If no browser session is active or an API call fails
        """
        session = self._require_browser_session()

        # Take initial screenshot
        screenshot = session.driver.get_screenshot_as_png()

        data = {
            "model": "gemini-3-computer-use",
            "image": f"data:image/png;base64,{base64.b64encode(screenshot).decode()}",
            "task": "automate_browser_task",
            "task_description": task_description,
            "current_url": session.current_url,
            "max_steps": max_steps
        }

        result = self._make_request("automate", data=data)

        # Execute the planned actions
        planned = result.get('actions') or []
        if len(planned) > max_steps:
            logger.warning(
                "Plan has %d actions; executing only the first %d", len(planned), max_steps
            )
            planned = planned[:max_steps]

        for action_data in planned:
            self.execute_browser_action(ComputerAction(
                action_type=action_data['type'],
                target=action_data.get('target'),
                value=action_data.get('value'),
                coordinates=action_data.get('coordinates'),
                duration=action_data.get('duration'),
                wait_time=action_data.get('wait_time', 1.0)
            ))

        logger.info("Browser task '%s' completed", task_description)
        return result

    def get_system_info(self) -> Dict:
        """Get system information about the computer use environment"""
        return self._make_request("system-info", data={"task": "get_system_info"})

    def take_screenshot(self, filename: Optional[str] = None) -> str:
        """
        Take a screenshot and save it

        The active browser is captured when a session exists, otherwise the
        desktop.

        Args:
            filename: File name inside the screenshot directory; defaults to
                ``screenshot_<YYYYmmdd_HHMMSS>.png``

        Returns:
            Path of the saved file as a string
        """
        if self.browser_session:
            # Take browser screenshot (PNG bytes)
            screenshot = self.browser_session.driver.get_screenshot_as_png()
        else:
            # Take desktop screenshot (PIL image)
            screenshot = pyautogui.screenshot()

        if not filename:
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            filename = f"screenshot_{timestamp}.png"

        filepath = self.screenshot_dir / filename

        if isinstance(screenshot, bytes):
            filepath.write_bytes(screenshot)
        else:
            screenshot.save(filepath)

        logger.info("Screenshot saved: %s", filepath)
        return str(filepath)

    def __enter__(self):
        """Context manager entry"""
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        """Context manager exit: close the browser and the HTTP session"""
        try:
            self.close_browser_session()
        finally:
            # Release pooled HTTP connections even if the browser failed
            # to shut down cleanly.
            self.session.close()
405
+
406
+
407
# Example usage and test functions
if __name__ == "__main__":
    # Example usage
    api = Gemini3ComputerUseAPI(api_key="your-api-key-here")

    try:
        # The client is the context manager; BrowserSession is a plain
        # dataclass without __enter__/__exit__, so it cannot be used in a
        # `with` statement. Entering the client guarantees the browser is
        # closed even if one of the steps below raises.
        with api:
            # Start browser session
            api.start_browser_session("https://example.com")

            # Analyze screen
            analysis = api.analyze_screen()
            print(f"Screen analysis: {analysis}")

            # Execute a simple action
            action = ComputerAction(
                action_type="click",
                target="body",
                wait_time=1.0
            )
            result = api.execute_browser_action(action)
            print(f"Action result: {result}")

            # Automate a complex task
            task_result = api.automate_browser_task(
                "Navigate to the search box and search for 'Python programming'",
                max_steps=5
            )
            print(f"Task result: {task_result}")

    except Exception as e:
        logger.error(f"Example failed: {e}")