Spaces:
Running
Running
Amir Mahla
commited on
Commit
·
ebff663
1
Parent(s):
c5e2e1f
FIX pre-commit
Browse files
cua2-core/src/cua2_core/services/agent_service.py
CHANGED
|
@@ -186,8 +186,10 @@ class AgentService:
|
|
| 186 |
|
| 187 |
# Wait for sandbox to be ready (it was already acquired in create_id_and_sandbox)
|
| 188 |
max_attempts = 60 # Increased timeout to 2 minutes (60 * 2s)
|
|
|
|
| 189 |
for attempt in range(max_attempts):
|
| 190 |
response = await self.sandbox_service.acquire_sandbox(message_id)
|
|
|
|
| 191 |
if response.sandbox is not None and response.state == "ready":
|
| 192 |
sandbox = response.sandbox
|
| 193 |
break
|
|
@@ -196,10 +198,18 @@ class AgentService:
|
|
| 196 |
logger.warning(
|
| 197 |
f"Sandbox pool limit reached for {message_id}, attempting cleanup of stuck/expired sandboxes"
|
| 198 |
)
|
| 199 |
-
|
| 200 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 201 |
# Try one more time after cleanup
|
| 202 |
response = await self.sandbox_service.acquire_sandbox(message_id)
|
|
|
|
| 203 |
if response.sandbox is not None and response.state == "ready":
|
| 204 |
sandbox = response.sandbox
|
| 205 |
break
|
|
@@ -217,17 +227,40 @@ class AgentService:
|
|
| 217 |
f"Waiting for sandbox for {message_id}, attempt {attempt}/{max_attempts}, state: {response.state}"
|
| 218 |
)
|
| 219 |
await asyncio.sleep(2)
|
|
|
|
|
|
|
| 220 |
if sandbox is None:
|
| 221 |
-
|
| 222 |
-
|
| 223 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 224 |
(
|
| 225 |
available_count,
|
| 226 |
non_available_count,
|
| 227 |
) = await self.sandbox_service.get_sandbox_counts()
|
| 228 |
-
|
| 229 |
-
|
|
|
|
|
|
|
|
|
|
| 230 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 231 |
|
| 232 |
data_dir = self.active_tasks[message_id].trace_path
|
| 233 |
user_content = self.active_tasks[message_id].instruction
|
|
|
|
| 186 |
|
| 187 |
# Wait for sandbox to be ready (it was already acquired in create_id_and_sandbox)
|
| 188 |
max_attempts = 60 # Increased timeout to 2 minutes (60 * 2s)
|
| 189 |
+
last_state = None
|
| 190 |
for attempt in range(max_attempts):
|
| 191 |
response = await self.sandbox_service.acquire_sandbox(message_id)
|
| 192 |
+
last_state = response.state
|
| 193 |
if response.sandbox is not None and response.state == "ready":
|
| 194 |
sandbox = response.sandbox
|
| 195 |
break
|
|
|
|
| 198 |
logger.warning(
|
| 199 |
f"Sandbox pool limit reached for {message_id}, attempting cleanup of stuck/expired sandboxes"
|
| 200 |
)
|
| 201 |
+
cleaned_creating = (
|
| 202 |
+
await self.sandbox_service.cleanup_stuck_creating_sandboxes()
|
| 203 |
+
)
|
| 204 |
+
cleaned_expired = (
|
| 205 |
+
await self.sandbox_service.cleanup_expired_ready_sandboxes()
|
| 206 |
+
)
|
| 207 |
+
logger.info(
|
| 208 |
+
f"Cleanup completed: removed {cleaned_creating} stuck creating + {cleaned_expired} expired ready sandboxes"
|
| 209 |
+
)
|
| 210 |
# Try one more time after cleanup
|
| 211 |
response = await self.sandbox_service.acquire_sandbox(message_id)
|
| 212 |
+
last_state = response.state
|
| 213 |
if response.sandbox is not None and response.state == "ready":
|
| 214 |
sandbox = response.sandbox
|
| 215 |
break
|
|
|
|
| 227 |
f"Waiting for sandbox for {message_id}, attempt {attempt}/{max_attempts}, state: {response.state}"
|
| 228 |
)
|
| 229 |
await asyncio.sleep(2)
|
| 230 |
+
|
| 231 |
+
# If sandbox is still None after all attempts, do final cleanup and check
|
| 232 |
if sandbox is None:
|
| 233 |
+
logger.warning(
|
| 234 |
+
f"Sandbox for {message_id} still not ready after {max_attempts} attempts (last state: {last_state}), performing final cleanup"
|
| 235 |
+
)
|
| 236 |
+
# Final cleanup attempt before raising error - be more aggressive
|
| 237 |
+
cleaned_creating = (
|
| 238 |
+
await self.sandbox_service.cleanup_stuck_creating_sandboxes()
|
| 239 |
+
)
|
| 240 |
+
cleaned_expired = (
|
| 241 |
+
await self.sandbox_service.cleanup_expired_ready_sandboxes()
|
| 242 |
+
)
|
| 243 |
+
logger.info(
|
| 244 |
+
f"Final cleanup: removed {cleaned_creating} stuck creating + {cleaned_expired} expired ready sandboxes"
|
| 245 |
+
)
|
| 246 |
+
|
| 247 |
+
# Try one last time after cleanup
|
| 248 |
(
|
| 249 |
available_count,
|
| 250 |
non_available_count,
|
| 251 |
) = await self.sandbox_service.get_sandbox_counts()
|
| 252 |
+
# Provide more detailed error message
|
| 253 |
+
error_msg = (
|
| 254 |
+
f"No sandbox available for {message_id}: "
|
| 255 |
+
f"available: {available_count}, non-available: {non_available_count}, "
|
| 256 |
+
f"max: {self.max_sandboxes}, last_state: {last_state}"
|
| 257 |
)
|
| 258 |
+
if non_available_count > 0:
|
| 259 |
+
error_msg += (
|
| 260 |
+
f". There are {non_available_count} sandbox(s) stuck in 'creating' state "
|
| 261 |
+
f"that may need manual cleanup or the cleanup threshold may be too high."
|
| 262 |
+
)
|
| 263 |
+
raise Exception(error_msg)
|
| 264 |
|
| 265 |
data_dir = self.active_tasks[message_id].trace_path
|
| 266 |
user_content = self.active_tasks[message_id].instruction
|
cua2-core/src/cua2_core/services/sandbox_service.py
CHANGED
|
@@ -9,9 +9,11 @@ from pydantic import BaseModel
|
|
| 9 |
|
| 10 |
SANDBOX_METADATA: dict[str, dict[str, Any]] = {}
|
| 11 |
SANDBOX_TIMEOUT = 500
|
| 12 |
-
|
| 13 |
SANDBOX_CREATION_MAX_TIME = (
|
| 14 |
-
|
|
|
|
|
|
|
| 15 |
)
|
| 16 |
WIDTH = 1280
|
| 17 |
HEIGHT = 960
|
|
@@ -167,7 +169,7 @@ class SandboxService:
|
|
| 167 |
and (
|
| 168 |
current_time - self.sandbox_metadata[session_hash]["created_at"]
|
| 169 |
).total_seconds()
|
| 170 |
-
<
|
| 171 |
):
|
| 172 |
print(f"Reusing Sandbox for session {session_hash}")
|
| 173 |
self.sandbox_metadata[session_hash]["last_accessed"] = current_time
|
|
@@ -180,8 +182,58 @@ class SandboxService:
|
|
| 180 |
session_hash in self.sandbox_metadata
|
| 181 |
and self.sandbox_metadata[session_hash].get("state") == "creating"
|
| 182 |
):
|
| 183 |
-
|
| 184 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 185 |
|
| 186 |
# Mark expired sandbox for cleanup (remove from dict within lock)
|
| 187 |
if session_hash in self.sandboxes:
|
|
@@ -298,14 +350,28 @@ class SandboxService:
|
|
| 298 |
for session_hash, metadata in list(self.sandbox_metadata.items()):
|
| 299 |
if metadata.get("state") == "creating":
|
| 300 |
created_at = metadata.get("created_at")
|
| 301 |
-
if
|
| 302 |
-
|
| 303 |
-
|
| 304 |
-
|
| 305 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 306 |
print(
|
| 307 |
f"Cleaning up stuck 'creating' sandbox for session {session_hash} "
|
| 308 |
-
f"(stuck for {
|
| 309 |
)
|
| 310 |
# Collect sandbox to kill if it exists
|
| 311 |
if session_hash in self.sandboxes:
|
|
@@ -339,7 +405,7 @@ class SandboxService:
|
|
| 339 |
if (
|
| 340 |
created_at
|
| 341 |
and (current_time - created_at).total_seconds()
|
| 342 |
-
>=
|
| 343 |
):
|
| 344 |
print(
|
| 345 |
f"Cleaning up expired ready sandbox for session {session_hash} "
|
|
|
|
| 9 |
|
| 10 |
SANDBOX_METADATA: dict[str, dict[str, Any]] = {}
|
| 11 |
SANDBOX_TIMEOUT = 500
|
| 12 |
+
SANDBOX_READY_TIMEOUT = 200
|
| 13 |
SANDBOX_CREATION_MAX_TIME = (
|
| 14 |
+
90 # Maximum time a sandbox can be in "creating" state (90 seconds)
|
| 15 |
+
# Reduced from 120 to be more aggressive and clean up stuck sandboxes
|
| 16 |
+
# before the agent_service loop times out (which waits 60 attempts * 2s = 120s)
|
| 17 |
)
|
| 18 |
WIDTH = 1280
|
| 19 |
HEIGHT = 960
|
|
|
|
| 169 |
and (
|
| 170 |
current_time - self.sandbox_metadata[session_hash]["created_at"]
|
| 171 |
).total_seconds()
|
| 172 |
+
< SANDBOX_READY_TIMEOUT
|
| 173 |
):
|
| 174 |
print(f"Reusing Sandbox for session {session_hash}")
|
| 175 |
self.sandbox_metadata[session_hash]["last_accessed"] = current_time
|
|
|
|
| 182 |
session_hash in self.sandbox_metadata
|
| 183 |
and self.sandbox_metadata[session_hash].get("state") == "creating"
|
| 184 |
):
|
| 185 |
+
# Check if this sandbox has been stuck in "creating" state for too long
|
| 186 |
+
created_at = self.sandbox_metadata[session_hash].get("created_at")
|
| 187 |
+
if created_at:
|
| 188 |
+
stuck_duration = (current_time - created_at).total_seconds()
|
| 189 |
+
if stuck_duration > SANDBOX_CREATION_MAX_TIME:
|
| 190 |
+
# This sandbox is stuck - clean it up immediately
|
| 191 |
+
print(
|
| 192 |
+
f"Sandbox for session {session_hash} has been stuck in 'creating' state "
|
| 193 |
+
f"for {stuck_duration:.1f}s (threshold: {SANDBOX_CREATION_MAX_TIME}s) - cleaning up"
|
| 194 |
+
)
|
| 195 |
+
# Remove from metadata and sandboxes dict
|
| 196 |
+
if session_hash in self.sandboxes:
|
| 197 |
+
# Schedule kill outside of lock with error handling
|
| 198 |
+
stuck_sandbox = self.sandboxes[session_hash]
|
| 199 |
+
del self.sandboxes[session_hash]
|
| 200 |
+
|
| 201 |
+
async def kill_stuck():
|
| 202 |
+
try:
|
| 203 |
+
await asyncio.to_thread(stuck_sandbox.kill)
|
| 204 |
+
except Exception as e:
|
| 205 |
+
print(
|
| 206 |
+
f"Error killing stuck sandbox for {session_hash}: {str(e)}"
|
| 207 |
+
)
|
| 208 |
+
|
| 209 |
+
asyncio.create_task(kill_stuck())
|
| 210 |
+
del self.sandbox_metadata[session_hash]
|
| 211 |
+
# Fall through to create a new sandbox
|
| 212 |
+
else:
|
| 213 |
+
print(
|
| 214 |
+
f"Sandbox for session {session_hash} is already being created"
|
| 215 |
+
)
|
| 216 |
+
return SandboxResponse(sandbox=None, state="creating")
|
| 217 |
+
else:
|
| 218 |
+
# Missing created_at - corrupted metadata, clean it up
|
| 219 |
+
print(
|
| 220 |
+
f"WARNING: Sandbox {session_hash} in 'creating' state has no 'created_at' - cleaning up"
|
| 221 |
+
)
|
| 222 |
+
if session_hash in self.sandboxes:
|
| 223 |
+
stuck_sandbox = self.sandboxes[session_hash]
|
| 224 |
+
del self.sandboxes[session_hash]
|
| 225 |
+
|
| 226 |
+
async def kill_stuck():
|
| 227 |
+
try:
|
| 228 |
+
await asyncio.to_thread(stuck_sandbox.kill)
|
| 229 |
+
except Exception as e:
|
| 230 |
+
print(
|
| 231 |
+
f"Error killing stuck sandbox for {session_hash}: {str(e)}"
|
| 232 |
+
)
|
| 233 |
+
|
| 234 |
+
asyncio.create_task(kill_stuck())
|
| 235 |
+
del self.sandbox_metadata[session_hash]
|
| 236 |
+
# Fall through to create a new sandbox
|
| 237 |
|
| 238 |
# Mark expired sandbox for cleanup (remove from dict within lock)
|
| 239 |
if session_hash in self.sandboxes:
|
|
|
|
| 350 |
for session_hash, metadata in list(self.sandbox_metadata.items()):
|
| 351 |
if metadata.get("state") == "creating":
|
| 352 |
created_at = metadata.get("created_at")
|
| 353 |
+
# Clean up if:
|
| 354 |
+
# 1. created_at exists and is older than threshold, OR
|
| 355 |
+
# 2. created_at is missing (corrupted metadata - should never happen but handle it)
|
| 356 |
+
should_cleanup = False
|
| 357 |
+
stuck_duration = 0.0
|
| 358 |
+
|
| 359 |
+
if created_at:
|
| 360 |
+
stuck_duration = (current_time - created_at).total_seconds()
|
| 361 |
+
if stuck_duration > SANDBOX_CREATION_MAX_TIME:
|
| 362 |
+
should_cleanup = True
|
| 363 |
+
else:
|
| 364 |
+
# Missing created_at is a bug, but clean it up anyway
|
| 365 |
+
print(
|
| 366 |
+
f"WARNING: Sandbox {session_hash} in 'creating' state has no 'created_at' timestamp - cleaning up"
|
| 367 |
+
)
|
| 368 |
+
should_cleanup = True
|
| 369 |
+
stuck_duration = float("inf")
|
| 370 |
+
|
| 371 |
+
if should_cleanup:
|
| 372 |
print(
|
| 373 |
f"Cleaning up stuck 'creating' sandbox for session {session_hash} "
|
| 374 |
+
f"(stuck for {stuck_duration:.1f}s)"
|
| 375 |
)
|
| 376 |
# Collect sandbox to kill if it exists
|
| 377 |
if session_hash in self.sandboxes:
|
|
|
|
| 405 |
if (
|
| 406 |
created_at
|
| 407 |
and (current_time - created_at).total_seconds()
|
| 408 |
+
>= SANDBOX_READY_TIMEOUT
|
| 409 |
):
|
| 410 |
print(
|
| 411 |
f"Cleaning up expired ready sandbox for session {session_hash} "
|