Spaces:
Running
Running
A-Mahla
committed on
Commit
·
1e72ba8
1
Parent(s):
51747e6
NEW agent (#10)
Browse files- cua2-core/src/cua2_core/models/models.py +6 -2
- cua2-core/src/cua2_core/services/agent_utils/desktop_agent.py +58 -2
- cua2-core/src/cua2_core/services/agent_utils/prompt.py +36 -5
- cua2-core/src/cua2_core/services/instruction_service.py +14 -12
- cua2-core/src/cua2_core/services/sandbox_service.py +29 -0
- cua2-core/src/cua2_core/websocket/websocket_manager.py +1 -1
- cua2-front/src/components/WelcomeScreen.tsx +8 -8
- cua2-front/src/stores/agentStore.ts +2 -2
cua2-core/src/cua2_core/models/models.py
CHANGED
|
@@ -77,7 +77,11 @@ class AgentAction(FunctionCall):
|
|
| 77 |
seconds = args.get("seconds") or args.get("arg_0")
|
| 78 |
return f"Wait for {seconds} seconds"
|
| 79 |
|
| 80 |
-
elif action_type == "
|
|
|
|
|
|
|
|
|
|
|
|
|
| 81 |
url = args.get("url") or args.get("arg_0")
|
| 82 |
return f"Open: {url}"
|
| 83 |
|
|
@@ -100,7 +104,7 @@ class AgentStep(BaseModel):
|
|
| 100 |
step_evaluation: Literal["like", "dislike", "neutral"]
|
| 101 |
error: Optional[str] = None
|
| 102 |
thought: Optional[str] = None
|
| 103 |
-
actions:
|
| 104 |
|
| 105 |
@field_serializer("actions")
|
| 106 |
def serialize_actions(self, actions: list[AgentAction], _info):
|
|
|
|
| 77 |
seconds = args.get("seconds") or args.get("arg_0")
|
| 78 |
return f"Wait for {seconds} seconds"
|
| 79 |
|
| 80 |
+
elif action_type == "open_url":
|
| 81 |
+
url = args.get("url") or args.get("arg_0")
|
| 82 |
+
return f"Open: {url}"
|
| 83 |
+
|
| 84 |
+
elif action_type == "launch":
|
| 85 |
url = args.get("url") or args.get("arg_0")
|
| 86 |
return f"Open: {url}"
|
| 87 |
|
|
|
|
| 104 |
step_evaluation: Literal["like", "dislike", "neutral"]
|
| 105 |
error: Optional[str] = None
|
| 106 |
thought: Optional[str] = None
|
| 107 |
+
actions: list[AgentAction] = []
|
| 108 |
|
| 109 |
@field_serializer("actions")
|
| 110 |
def serialize_actions(self, actions: list[AgentAction], _info):
|
cua2-core/src/cua2_core/services/agent_utils/desktop_agent.py
CHANGED
|
@@ -24,11 +24,13 @@ class E2BVisionAgent(CodeAgent):
|
|
| 24 |
verbosity_level: LogLevel = 2,
|
| 25 |
planning_interval: int | None = None,
|
| 26 |
use_v1_prompt: bool = False,
|
|
|
|
| 27 |
**kwargs,
|
| 28 |
):
|
| 29 |
self.desktop = desktop
|
| 30 |
self.data_dir = data_dir
|
| 31 |
self.planning_interval = planning_interval
|
|
|
|
| 32 |
# Initialize Desktop
|
| 33 |
self.width, self.height = self.desktop.get_screen_size()
|
| 34 |
print(f"Screen size: {self.width}x{self.height}")
|
|
@@ -60,6 +62,27 @@ class E2BVisionAgent(CodeAgent):
|
|
| 60 |
self.logger.log("Setting up agent tools...")
|
| 61 |
self._setup_desktop_tools()
|
| 62 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 63 |
def _setup_desktop_tools(self):
|
| 64 |
"""Register all desktop tools"""
|
| 65 |
|
|
@@ -71,6 +94,9 @@ class E2BVisionAgent(CodeAgent):
|
|
| 71 |
x: The x coordinate (horizontal position)
|
| 72 |
y: The y coordinate (vertical position)
|
| 73 |
"""
|
|
|
|
|
|
|
|
|
|
| 74 |
self.desktop.move_mouse(x, y)
|
| 75 |
self.desktop.left_click()
|
| 76 |
self.click_coordinates = [x, y]
|
|
@@ -85,6 +111,9 @@ class E2BVisionAgent(CodeAgent):
|
|
| 85 |
x: The x coordinate (horizontal position)
|
| 86 |
y: The y coordinate (vertical position)
|
| 87 |
"""
|
|
|
|
|
|
|
|
|
|
| 88 |
self.desktop.move_mouse(x, y)
|
| 89 |
self.desktop.right_click()
|
| 90 |
self.click_coordinates = [x, y]
|
|
@@ -99,6 +128,9 @@ class E2BVisionAgent(CodeAgent):
|
|
| 99 |
x: The x coordinate (horizontal position)
|
| 100 |
y: The y coordinate (vertical position)
|
| 101 |
"""
|
|
|
|
|
|
|
|
|
|
| 102 |
self.desktop.move_mouse(x, y)
|
| 103 |
self.desktop.double_click()
|
| 104 |
self.click_coordinates = [x, y]
|
|
@@ -113,6 +145,9 @@ class E2BVisionAgent(CodeAgent):
|
|
| 113 |
x: The x coordinate (horizontal position)
|
| 114 |
y: The y coordinate (vertical position)
|
| 115 |
"""
|
|
|
|
|
|
|
|
|
|
| 116 |
self.desktop.move_mouse(x, y)
|
| 117 |
self.logger.log(f"Moved mouse to coordinates ({x}, {y})")
|
| 118 |
return f"Moved mouse to coordinates ({x}, {y})"
|
|
@@ -167,6 +202,11 @@ class E2BVisionAgent(CodeAgent):
|
|
| 167 |
x2: end x coordinate
|
| 168 |
y2: end y coordinate
|
| 169 |
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 170 |
self.desktop.drag([x1, y1], [x2, y2])
|
| 171 |
message = f"Dragged and dropped from [{x1}, {y1}] to [{x2}, {y2}]"
|
| 172 |
self.logger.log(message)
|
|
@@ -182,6 +222,9 @@ class E2BVisionAgent(CodeAgent):
|
|
| 182 |
direction: The direction to scroll ("up" or "down"), defaults to "down". For zoom, "up" zooms in, "down" zooms out.
|
| 183 |
amount: The amount to scroll. A good amount is 1 or 2.
|
| 184 |
"""
|
|
|
|
|
|
|
|
|
|
| 185 |
self.desktop.move_mouse(x, y)
|
| 186 |
self.desktop.scroll(direction=direction, amount=amount)
|
| 187 |
message = f"Scrolled {direction} by {amount}"
|
|
@@ -200,18 +243,30 @@ class E2BVisionAgent(CodeAgent):
|
|
| 200 |
return f"Waited for {seconds} seconds"
|
| 201 |
|
| 202 |
@tool
|
| 203 |
-
def
|
| 204 |
"""
|
| 205 |
Directly opens a browser with the specified url: use this at start of web searches rather than trying to click the browser.
|
| 206 |
Args:
|
| 207 |
url: The URL to open
|
| 208 |
"""
|
|
|
|
|
|
|
| 209 |
self.desktop.open(url)
|
| 210 |
|
| 211 |
time.sleep(2)
|
| 212 |
self.logger.log(f"Opening URL: {url}")
|
| 213 |
return f"Opened URL: {url}"
|
| 214 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 215 |
# Register the tools
|
| 216 |
self.tools["click"] = click
|
| 217 |
self.tools["right_click"] = right_click
|
|
@@ -221,7 +276,8 @@ class E2BVisionAgent(CodeAgent):
|
|
| 221 |
self.tools["press"] = press
|
| 222 |
self.tools["scroll"] = scroll
|
| 223 |
self.tools["wait"] = wait
|
| 224 |
-
self.tools["
|
|
|
|
| 225 |
self.tools["go_back"] = go_back
|
| 226 |
self.tools["drag"] = drag
|
| 227 |
self.tools["scroll"] = scroll
|
|
|
|
| 24 |
verbosity_level: LogLevel = 2,
|
| 25 |
planning_interval: int | None = None,
|
| 26 |
use_v1_prompt: bool = False,
|
| 27 |
+
qwen_normalization: bool = True,
|
| 28 |
**kwargs,
|
| 29 |
):
|
| 30 |
self.desktop = desktop
|
| 31 |
self.data_dir = data_dir
|
| 32 |
self.planning_interval = planning_interval
|
| 33 |
+
self.qwen_normalization = qwen_normalization
|
| 34 |
# Initialize Desktop
|
| 35 |
self.width, self.height = self.desktop.get_screen_size()
|
| 36 |
print(f"Screen size: {self.width}x{self.height}")
|
|
|
|
| 62 |
self.logger.log("Setting up agent tools...")
|
| 63 |
self._setup_desktop_tools()
|
| 64 |
|
| 65 |
+
def _qwen_unnormalization(self, arguments: dict[str, int]) -> dict[str, int]:
    """
    Convert normalized coordinates (0-1000 range) to screen pixel coordinates.

    A key is treated as a y coordinate when its lowercase form contains 'y',
    and as an x coordinate when it contains 'x' but not 'y'. Every other
    entry is copied through unchanged.

    Args:
        arguments: Mapping of parameter name to normalized value.

    Returns:
        A new dictionary in which x values are scaled by ``self.width / 1000``
        and y values by ``self.height / 1000``, truncated to integers and
        clamped to the addressable pixel range ``[0, dimension - 1]``.
    """

    def to_pixels(value, size):
        # A normalized value of 1000 scales to `size`, which is one past the
        # last pixel index; clamp so edge (or out-of-range) inputs stay on
        # screen instead of pointing outside it.
        return max(0, min(int((value / 1000) * size), size - 1))

    unnormalized: dict[str, int] = {}
    for key, value in arguments.items():
        name = key.lower()
        if "y" in name:
            unnormalized[key] = to_pixels(value, self.height)
        elif "x" in name:
            unnormalized[key] = to_pixels(value, self.width)
        else:
            unnormalized[key] = value
    return unnormalized
|
| 85 |
+
|
| 86 |
def _setup_desktop_tools(self):
|
| 87 |
"""Register all desktop tools"""
|
| 88 |
|
|
|
|
| 94 |
x: The x coordinate (horizontal position)
|
| 95 |
y: The y coordinate (vertical position)
|
| 96 |
"""
|
| 97 |
+
if self.qwen_normalization:
|
| 98 |
+
coords = self._qwen_unnormalization({"x": x, "y": y})
|
| 99 |
+
x, y = coords["x"], coords["y"]
|
| 100 |
self.desktop.move_mouse(x, y)
|
| 101 |
self.desktop.left_click()
|
| 102 |
self.click_coordinates = [x, y]
|
|
|
|
| 111 |
x: The x coordinate (horizontal position)
|
| 112 |
y: The y coordinate (vertical position)
|
| 113 |
"""
|
| 114 |
+
if self.qwen_normalization:
|
| 115 |
+
coords = self._qwen_unnormalization({"x": x, "y": y})
|
| 116 |
+
x, y = coords["x"], coords["y"]
|
| 117 |
self.desktop.move_mouse(x, y)
|
| 118 |
self.desktop.right_click()
|
| 119 |
self.click_coordinates = [x, y]
|
|
|
|
| 128 |
x: The x coordinate (horizontal position)
|
| 129 |
y: The y coordinate (vertical position)
|
| 130 |
"""
|
| 131 |
+
if self.qwen_normalization:
|
| 132 |
+
coords = self._qwen_unnormalization({"x": x, "y": y})
|
| 133 |
+
x, y = coords["x"], coords["y"]
|
| 134 |
self.desktop.move_mouse(x, y)
|
| 135 |
self.desktop.double_click()
|
| 136 |
self.click_coordinates = [x, y]
|
|
|
|
| 145 |
x: The x coordinate (horizontal position)
|
| 146 |
y: The y coordinate (vertical position)
|
| 147 |
"""
|
| 148 |
+
if self.qwen_normalization:
|
| 149 |
+
coords = self._qwen_unnormalization({"x": x, "y": y})
|
| 150 |
+
x, y = coords["x"], coords["y"]
|
| 151 |
self.desktop.move_mouse(x, y)
|
| 152 |
self.logger.log(f"Moved mouse to coordinates ({x}, {y})")
|
| 153 |
return f"Moved mouse to coordinates ({x}, {y})"
|
|
|
|
| 202 |
x2: end x coordinate
|
| 203 |
y2: end y coordinate
|
| 204 |
"""
|
| 205 |
+
if self.qwen_normalization:
|
| 206 |
+
coords = self._qwen_unnormalization(
|
| 207 |
+
{"x1": x1, "y1": y1, "x2": x2, "y2": y2}
|
| 208 |
+
)
|
| 209 |
+
x1, y1, x2, y2 = coords["x1"], coords["y1"], coords["x2"], coords["y2"]
|
| 210 |
self.desktop.drag([x1, y1], [x2, y2])
|
| 211 |
message = f"Dragged and dropped from [{x1}, {y1}] to [{x2}, {y2}]"
|
| 212 |
self.logger.log(message)
|
|
|
|
| 222 |
direction: The direction to scroll ("up" or "down"), defaults to "down". For zoom, "up" zooms in, "down" zooms out.
|
| 223 |
amount: The amount to scroll. A good amount is 1 or 2.
|
| 224 |
"""
|
| 225 |
+
if self.qwen_normalization:
|
| 226 |
+
coords = self._qwen_unnormalization({"x": x, "y": y})
|
| 227 |
+
x, y = coords["x"], coords["y"]
|
| 228 |
self.desktop.move_mouse(x, y)
|
| 229 |
self.desktop.scroll(direction=direction, amount=amount)
|
| 230 |
message = f"Scrolled {direction} by {amount}"
|
|
|
|
| 243 |
return f"Waited for {seconds} seconds"
|
| 244 |
|
| 245 |
@tool
|
| 246 |
+
def open_url(url: str) -> str:
    """
    Directly opens a browser with the specified url: use this at start of web searches rather than trying to click the browser.
    Args:
        url: The URL to open
    """
    # Add a scheme only when an explicit http:// or https:// prefix is
    # missing. Testing for a bare "http" prefix would wrongly skip hosts
    # such as "httpbin.org", which would then be opened without a scheme.
    if not url.lower().startswith(("http://", "https://")):
        url = f"https://{url}"
    self.desktop.open(url)
    # Fixed two-second pause after the open call (presumably to let the
    # page start loading before the next step).
    time.sleep(2)
    self.logger.log(f"Opening URL: {url}")
    return f"Opened URL: {url}"
|
| 259 |
|
| 260 |
+
@tool
|
| 261 |
+
def launch(app: str) -> str:
    """
    Launches the specified application
    Args:
        app: The application to launch
    """
    # The application string is handed to the sandbox command runner as-is
    # and started in the background, so this call does not wait for it.
    command = f"{app}"
    self.desktop.commands.run(command, background=True)
    result = f"Launched application: {app}"
    return result
|
| 269 |
+
|
| 270 |
# Register the tools
|
| 271 |
self.tools["click"] = click
|
| 272 |
self.tools["right_click"] = right_click
|
|
|
|
| 276 |
self.tools["press"] = press
|
| 277 |
self.tools["scroll"] = scroll
|
| 278 |
self.tools["wait"] = wait
|
| 279 |
+
self.tools["open_url"] = open_url
|
| 280 |
+
self.tools["launch"] = launch
|
| 281 |
self.tools["go_back"] = go_back
|
| 282 |
self.tools["drag"] = drag
|
| 283 |
self.tools["scroll"] = scroll
|
cua2-core/src/cua2_core/services/agent_utils/prompt.py
CHANGED
|
@@ -32,6 +32,32 @@ click(x, y)
|
|
| 32 |
|
| 33 |
<environment>
|
| 34 |
The desktop resolution is <<resolution_x>>x<<resolution_y>> pixels.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 35 |
You can only interact through the following tools:
|
| 36 |
|
| 37 |
{%- for tool in tools.values() %}
|
|
@@ -42,7 +68,11 @@ You can only interact through the following tools:
|
|
| 42 |
|
| 43 |
If a task requires a specific application or website, **use**:
|
| 44 |
```python
|
| 45 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 46 |
```
|
| 47 |
to launch it before interacting.
|
| 48 |
Never manually click the browser icon — use `open_url()` directly for web pages.
|
|
@@ -51,8 +81,9 @@ Never manually click the browser icon — use `open_url()` directly for web page
|
|
| 51 |
---
|
| 52 |
|
| 53 |
<click_guidelines>
|
| 54 |
-
- Always
|
| 55 |
- Click precisely **in the center** of the intended target (button, text, icon).
|
|
|
|
| 56 |
- Avoid random or approximate coordinates.
|
| 57 |
- If nothing changes after a click, check if you misclicked (green crosshair = last click position).
|
| 58 |
- If a menu item shows a ▶ (triangle), it means it expands—click directly on the text, not the icon.
|
|
@@ -64,7 +95,7 @@ Never manually click the browser icon — use `open_url()` directly for web page
|
|
| 64 |
<workflow_guidelines>
|
| 65 |
- **ALWAYS START** by analyzing if the task requires opening an application or URL. If so, your **first action** must be:
|
| 66 |
- For websites: `open_url("https://google.com")`
|
| 67 |
-
- For applications: `
|
| 68 |
- Never manually navigate to apps via clicking icons—use the open tools directly.
|
| 69 |
- Complete one atomic action per step: e.g., **click**, **type**, or **wait**.
|
| 70 |
- Never combine multiple tool calls in one step.
|
|
@@ -89,7 +120,7 @@ What I see: “Text Editor” visible under Accessories.
|
|
| 89 |
Reflection: Clicking directly on “Text Editor”.
|
| 90 |
Action:
|
| 91 |
```python
|
| 92 |
-
|
| 93 |
```<end_code>
|
| 94 |
|
| 95 |
Step 2
|
|
@@ -98,7 +129,7 @@ What I see: Text editor page.
|
|
| 98 |
Reflection: Click on the text editor page to write "Hello World".
|
| 99 |
Action:
|
| 100 |
```python
|
| 101 |
-
click(
|
| 102 |
```<end_code>
|
| 103 |
|
| 104 |
Step 3
|
|
|
|
| 32 |
|
| 33 |
<environment>
|
| 34 |
The desktop resolution is <<resolution_x>>x<<resolution_y>> pixels.
|
| 35 |
+
|
| 36 |
+
**Coordinate System:**
|
| 37 |
+
- **IMPORTANT**: All coordinates must be specified in a **normalized range from 0 to 1000**.
|
| 38 |
+
- The x-axis goes from 0 (left edge) to 1000 (right edge).
|
| 39 |
+
- The y-axis goes from 0 (top edge) to 1000 (bottom edge).
|
| 40 |
+
- The system will automatically convert these normalized coordinates to actual screen pixels.
|
| 41 |
+
- Example: To click the center of the screen, use `click(500, 500)`.
|
| 42 |
+
|
| 43 |
+
**System Information:**
|
| 44 |
+
You are running on **Xubuntu** (Ubuntu with XFCE desktop environment).
|
| 45 |
+
This is a lightweight setup with essential applications.
|
| 46 |
+
|
| 47 |
+
**Available Default Applications:**
|
| 48 |
+
- **File Manager**: Use terminal to browse and manage files (file browsing and management)
|
| 49 |
+
- **Document/Calc Editor**: LibreOffice (document/calculator editor)
|
| 50 |
+
- **Note-taking**: mousepad
|
| 51 |
+
- **Terminal**: xfce4-terminal (command-line interface)
|
| 52 |
+
- **Web Browser**: Firefox (use `open_url()` for websites)
|
| 53 |
+
- **Image Viewer**: ristretto (image viewer)
|
| 54 |
+
- **PDF Viewer**: xpdf (pdf viewer)
|
| 55 |
+
|
| 56 |
+
**Important Notes:**
|
| 57 |
+
- This is a **lightweight desktop environment** — do not assume specialized software is installed.
|
| 58 |
+
- For tasks requiring specific applications not listed above, you may need to adapt or use available alternatives.
|
| 59 |
+
- Always verify what's actually visible on the screen rather than assuming applications exist.
|
| 60 |
+
|
| 61 |
You can only interact through the following tools:
|
| 62 |
|
| 63 |
{%- for tool in tools.values() %}
|
|
|
|
| 68 |
|
| 69 |
If a task requires a specific application or website, **use**:
|
| 70 |
```python
|
| 71 |
+
open_url("https://google.com")
|
| 72 |
+
launch("xfce4-terminal")
|
| 73 |
+
launch("libreoffice --writer")
|
| 74 |
+
launch("libreoffice --calc")
|
| 75 |
+
launch("mousepad")
|
| 76 |
```
|
| 77 |
to launch it before interacting.
|
| 78 |
Never manually click the browser icon — use `open_url()` directly for web pages.
|
|
|
|
| 81 |
---
|
| 82 |
|
| 83 |
<click_guidelines>
|
| 84 |
+
- Always use **normalized coordinates (0-1000 range)** based on the current screenshot.
|
| 85 |
- Click precisely **in the center** of the intended target (button, text, icon).
|
| 86 |
+
- Coordinates must be integers between 0 and 1000 for both x and y axes.
|
| 87 |
- Avoid random or approximate coordinates.
|
| 88 |
- If nothing changes after a click, check if you misclicked (green crosshair = last click position).
|
| 89 |
- If a menu item shows a ▶ (triangle), it means it expands—click directly on the text, not the icon.
|
|
|
|
| 95 |
<workflow_guidelines>
|
| 96 |
- **ALWAYS START** by analyzing if the task requires opening an application or URL. If so, your **first action** must be:
|
| 97 |
- For websites: `open_url("https://google.com")`
|
| 98 |
+
- For applications: `launch("app_name")`
|
| 99 |
- Never manually navigate to apps via clicking icons—use the open tools directly.
|
| 100 |
- Complete one atomic action per step: e.g., **click**, **type**, or **wait**.
|
| 101 |
- Never combine multiple tool calls in one step.
|
|
|
|
| 120 |
Reflection: Clicking directly on “Text Editor”.
|
| 121 |
Action:
|
| 122 |
```python
|
| 123 |
+
launch("text_editor")
|
| 124 |
```<end_code>
|
| 125 |
|
| 126 |
Step 2
|
|
|
|
| 129 |
Reflection: Click on the text editor page to write "Hello World".
|
| 130 |
Action:
|
| 131 |
```python
|
| 132 |
+
click(150, 100)
|
| 133 |
```<end_code>
|
| 134 |
|
| 135 |
Step 3
|
cua2-core/src/cua2_core/services/instruction_service.py
CHANGED
|
@@ -14,12 +14,9 @@ class InstructionService:
|
|
| 14 |
available_models = AVAILABLE_MODELS
|
| 15 |
seed_topics = [
|
| 16 |
"web browsing",
|
| 17 |
-
"
|
| 18 |
-
"calendar scheduling",
|
| 19 |
-
"file management",
|
| 20 |
"note-taking",
|
| 21 |
-
"
|
| 22 |
-
"text editing",
|
| 23 |
"terminal commands",
|
| 24 |
]
|
| 25 |
|
|
@@ -60,32 +57,37 @@ class InstructionService:
|
|
| 60 |
(
|
| 61 |
"Generate a clear and specific web browsing task instruction for a desktop automation agent. "
|
| 62 |
"The task should be goal-centric, focused on retrieving information or performing an action online. "
|
| 63 |
-
"
|
|
|
|
| 64 |
"Return only the task instruction, nothing else. Keep it simple and focused on a single goal."
|
| 65 |
),
|
| 66 |
(
|
| 67 |
"Create a practical web browsing task for desktop automation. "
|
| 68 |
"The task should focus on finding specific information or completing an online action. "
|
| 69 |
-
"Include a specific URL or website name
|
|
|
|
| 70 |
"Provide only the task description without any additional explanation."
|
| 71 |
),
|
| 72 |
(
|
| 73 |
"Generate a specific web browsing task that a desktop automation agent can perform. "
|
| 74 |
"The task should be about retrieving information or performing an action on a website. "
|
| 75 |
-
"
|
| 76 |
-
"
|
|
|
|
| 77 |
),
|
| 78 |
(
|
| 79 |
"Provide a goal-oriented web browsing task instruction for a desktop agent. "
|
| 80 |
"Focus on what information to find or what action to perform online. "
|
| 81 |
-
"Specify a URL or website
|
|
|
|
| 82 |
"Output only the instruction."
|
| 83 |
),
|
| 84 |
(
|
| 85 |
"Think of a realistic web browsing task suitable for desktop automation. "
|
| 86 |
"The task should be about accessing online information or performing a web-based action. "
|
| 87 |
-
"Include specific URLs or websites
|
| 88 |
-
"
|
|
|
|
| 89 |
),
|
| 90 |
]
|
| 91 |
|
|
|
|
| 14 |
available_models = AVAILABLE_MODELS
|
| 15 |
seed_topics = [
|
| 16 |
"web browsing",
|
| 17 |
+
"file management (linux)",
|
|
|
|
|
|
|
| 18 |
"note-taking",
|
| 19 |
+
"text/document editing (no existing document required, create a new one if needed, use libreoffice)",
|
|
|
|
| 20 |
"terminal commands",
|
| 21 |
]
|
| 22 |
|
|
|
|
| 57 |
(
|
| 58 |
"Generate a clear and specific web browsing task instruction for a desktop automation agent. "
|
| 59 |
"The task should be goal-centric, focused on retrieving information or performing an action online. "
|
| 60 |
+
"Directly specify a URL or website to visit (e.g., 'Go to google.com and search for...'). "
|
| 61 |
+
"Do NOT instruct the agent to open a browser application first - just specify the URL or web task directly. "
|
| 62 |
"Return only the task instruction, nothing else. Keep it simple and focused on a single goal."
|
| 63 |
),
|
| 64 |
(
|
| 65 |
"Create a practical web browsing task for desktop automation. "
|
| 66 |
"The task should focus on finding specific information or completing an online action. "
|
| 67 |
+
"Include a specific URL or website name and what to do there (e.g., 'Visit github.com and...'). "
|
| 68 |
+
"Do NOT include steps about opening a browser - just specify the web task directly. "
|
| 69 |
"Provide only the task description without any additional explanation."
|
| 70 |
),
|
| 71 |
(
|
| 72 |
"Generate a specific web browsing task that a desktop automation agent can perform. "
|
| 73 |
"The task should be about retrieving information or performing an action on a website. "
|
| 74 |
+
"Specify URLs or web addresses directly (e.g., 'Navigate to wikipedia.org and...'). "
|
| 75 |
+
"Do NOT mention opening a browser application - assume the agent will handle that automatically. "
|
| 76 |
+
"Keep it concrete and single-purpose. Return just the task instruction."
|
| 77 |
),
|
| 78 |
(
|
| 79 |
"Provide a goal-oriented web browsing task instruction for a desktop agent. "
|
| 80 |
"Focus on what information to find or what action to perform online. "
|
| 81 |
+
"Specify a URL or website directly as part of the task (e.g., 'Go to amazon.com and...'). "
|
| 82 |
+
"Do NOT instruct to open a browser first - just state the URL and the web task. "
|
| 83 |
"Output only the instruction."
|
| 84 |
),
|
| 85 |
(
|
| 86 |
"Think of a realistic web browsing task suitable for desktop automation. "
|
| 87 |
"The task should be about accessing online information or performing a web-based action. "
|
| 88 |
+
"Include specific URLs or websites with the action to perform (e.g., 'Visit youtube.com and...'). "
|
| 89 |
+
"Do NOT include opening a browser as a separate step - just specify the web task directly. "
|
| 90 |
+
"Keep it simple and goal-focused. Return only the task."
|
| 91 |
),
|
| 92 |
]
|
| 93 |
|
cua2-core/src/cua2_core/services/sandbox_service.py
CHANGED
|
@@ -88,3 +88,32 @@ class SandboxService:
|
|
| 88 |
await asyncio.to_thread(self.sandboxes[session_hash].kill)
|
| 89 |
del self.sandboxes[session_hash]
|
| 90 |
del self.sandbox_metadata[session_hash]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 88 |
await asyncio.to_thread(self.sandboxes[session_hash].kill)
|
| 89 |
del self.sandboxes[session_hash]
|
| 90 |
del self.sandbox_metadata[session_hash]
|
| 91 |
+
|
| 92 |
+
|
| 93 |
+
if __name__ == "__main__":
    # Manual debugging entry point: create a desktop sandbox, print its
    # stream URL, then launch whatever applications are typed on stdin.
    desktop: Sandbox = Sandbox.create(
        api_key=os.getenv("E2B_API_KEY"),
        resolution=(WIDTH, HEIGHT),
        dpi=96,
        timeout=SANDBOX_TIMEOUT,
        template="k0wmnzir0zuzye6dndlw",
    )
    # Everything after creation runs under try/finally so the sandbox is
    # killed even when stream start-up or the setup command fails.
    try:
        desktop.stream.start(require_auth=True)
        setup_cmd = """sudo mkdir -p /usr/lib/firefox-esr/distribution && echo '{"policies":{"OverrideFirstRunPage":"","OverridePostUpdatePage":"","DisableProfileImport":true,"DontCheckDefaultBrowser":true}}' | sudo tee /usr/lib/firefox-esr/distribution/policies.json > /dev/null"""
        desktop.commands.run(setup_cmd)
        print(
            desktop.stream.get_url(
                auto_connect=True,
                view_only=False,
                resize="scale",
                auth_key=desktop.stream.get_auth_key(),
            )
        )
        while True:
            application = input("Enter application to launch: ")
            # Ignore blank input rather than sending a bare "&" to the shell.
            if not application.strip():
                continue
            desktop.commands.run(f"{application} &")
    except (KeyboardInterrupt, EOFError):
        # Ctrl-C / end of input is the normal way to end the session.
        pass
    except Exception as exc:
        # Still exit without a traceback, but report the failure instead of
        # hiding it.
        print(f"Stopping sandbox session after error: {exc!r}")
    finally:
        desktop.kill()
|
cua2-core/src/cua2_core/websocket/websocket_manager.py
CHANGED
|
@@ -52,7 +52,7 @@ class WebSocketManager:
|
|
| 52 |
try:
|
| 53 |
await websocket.send_text(
|
| 54 |
json.dumps(
|
| 55 |
-
message.model_dump(mode="json", context={"actions_as_json":
|
| 56 |
)
|
| 57 |
)
|
| 58 |
except Exception as e:
|
|
|
|
| 52 |
try:
|
| 53 |
await websocket.send_text(
|
| 54 |
json.dumps(
|
| 55 |
+
message.model_dump(mode="json", context={"actions_as_json": True})
|
| 56 |
)
|
| 57 |
)
|
| 58 |
except Exception as e:
|
cua2-front/src/components/WelcomeScreen.tsx
CHANGED
|
@@ -1,12 +1,12 @@
|
|
| 1 |
-
import
|
| 2 |
-
import {
|
| 3 |
-
import ShuffleIcon from '@mui/icons-material/Shuffle';
|
| 4 |
-
import SendIcon from '@mui/icons-material/Send';
|
| 5 |
-
import LightModeOutlined from '@mui/icons-material/LightModeOutlined';
|
| 6 |
import DarkModeOutlined from '@mui/icons-material/DarkModeOutlined';
|
|
|
|
|
|
|
|
|
|
| 7 |
import SmartToyIcon from '@mui/icons-material/SmartToy';
|
| 8 |
-
import {
|
| 9 |
-
import {
|
| 10 |
|
| 11 |
interface WelcomeScreenProps {
|
| 12 |
onStartTask: (instruction: string, modelId: string) => void;
|
|
@@ -147,7 +147,7 @@ export const WelcomeScreen: React.FC<WelcomeScreenProps> = ({ onStartTask, isCon
|
|
| 147 |
color: 'text.primary',
|
| 148 |
}}
|
| 149 |
>
|
| 150 |
-
|
| 151 |
</Typography>
|
| 152 |
|
| 153 |
{/* Powered by smolagents */}
|
|
|
|
| 1 |
+
import { fetchAvailableModels, generateRandomQuestion } from '@/services/api';
|
| 2 |
+
import { selectAvailableModels, selectIsDarkMode, selectIsLoadingModels, selectSelectedModelId, useAgentStore } from '@/stores/agentStore';
|
|
|
|
|
|
|
|
|
|
| 3 |
import DarkModeOutlined from '@mui/icons-material/DarkModeOutlined';
|
| 4 |
+
import LightModeOutlined from '@mui/icons-material/LightModeOutlined';
|
| 5 |
+
import SendIcon from '@mui/icons-material/Send';
|
| 6 |
+
import ShuffleIcon from '@mui/icons-material/Shuffle';
|
| 7 |
import SmartToyIcon from '@mui/icons-material/SmartToy';
|
| 8 |
+
import { Box, Button, CircularProgress, Container, FormControl, IconButton, InputLabel, MenuItem, Paper, Select, TextField, Typography } from '@mui/material';
|
| 9 |
+
import React, { useEffect, useRef, useState } from 'react';
|
| 10 |
|
| 11 |
interface WelcomeScreenProps {
|
| 12 |
onStartTask: (instruction: string, modelId: string) => void;
|
|
|
|
| 147 |
color: 'text.primary',
|
| 148 |
}}
|
| 149 |
>
|
| 150 |
+
Computer Use Agent
|
| 151 |
</Typography>
|
| 152 |
|
| 153 |
{/* Powered by smolagents */}
|
cua2-front/src/stores/agentStore.ts
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
|
|
| 1 |
import { create } from 'zustand';
|
| 2 |
import { devtools } from 'zustand/middleware';
|
| 3 |
-
import { AgentTrace, AgentStep, AgentTraceMetadata, FinalStep } from '@/types/agent';
|
| 4 |
|
| 5 |
interface AgentState {
|
| 6 |
// State
|
|
@@ -39,7 +39,7 @@ const initialState = {
|
|
| 39 |
isAgentProcessing: false,
|
| 40 |
isConnectingToE2B: false,
|
| 41 |
vncUrl: '',
|
| 42 |
-
selectedModelId: 'Qwen/Qwen3-VL-
|
| 43 |
availableModels: [],
|
| 44 |
isLoadingModels: false,
|
| 45 |
isConnected: false,
|
|
|
|
| 1 |
+
import { AgentStep, AgentTrace, AgentTraceMetadata, FinalStep } from '@/types/agent';
|
| 2 |
import { create } from 'zustand';
|
| 3 |
import { devtools } from 'zustand/middleware';
|
|
|
|
| 4 |
|
| 5 |
interface AgentState {
|
| 6 |
// State
|
|
|
|
| 39 |
isAgentProcessing: false,
|
| 40 |
isConnectingToE2B: false,
|
| 41 |
vncUrl: '',
|
| 42 |
+
selectedModelId: 'Qwen/Qwen3-VL-30B-A3B-Instruct',
|
| 43 |
availableModels: [],
|
| 44 |
isLoadingModels: false,
|
| 45 |
isConnected: false,
|