Amir Mahla commited on
Commit
ebff663
·
1 Parent(s): c5e2e1f

FIX pre-commit

Browse files
cua2-core/src/cua2_core/services/agent_service.py CHANGED
@@ -186,8 +186,10 @@ class AgentService:
186
 
187
  # Wait for sandbox to be ready (it was already acquired in create_id_and_sandbox)
188
  max_attempts = 60 # Increased timeout to 2 minutes (60 * 2s)
 
189
  for attempt in range(max_attempts):
190
  response = await self.sandbox_service.acquire_sandbox(message_id)
 
191
  if response.sandbox is not None and response.state == "ready":
192
  sandbox = response.sandbox
193
  break
@@ -196,10 +198,18 @@ class AgentService:
196
  logger.warning(
197
  f"Sandbox pool limit reached for {message_id}, attempting cleanup of stuck/expired sandboxes"
198
  )
199
- await self.sandbox_service.cleanup_stuck_creating_sandboxes()
200
- await self.sandbox_service.cleanup_expired_ready_sandboxes()
 
 
 
 
 
 
 
201
  # Try one more time after cleanup
202
  response = await self.sandbox_service.acquire_sandbox(message_id)
 
203
  if response.sandbox is not None and response.state == "ready":
204
  sandbox = response.sandbox
205
  break
@@ -217,17 +227,40 @@ class AgentService:
217
  f"Waiting for sandbox for {message_id}, attempt {attempt}/{max_attempts}, state: {response.state}"
218
  )
219
  await asyncio.sleep(2)
 
 
220
  if sandbox is None:
221
- # Final cleanup attempt before raising error
222
- await self.sandbox_service.cleanup_stuck_creating_sandboxes()
223
- await self.sandbox_service.cleanup_expired_ready_sandboxes()
 
 
 
 
 
 
 
 
 
 
 
 
224
  (
225
  available_count,
226
  non_available_count,
227
  ) = await self.sandbox_service.get_sandbox_counts()
228
- raise Exception(
229
- f"No sandbox available: pool limit reached (available: {available_count}, non-available: {non_available_count}, max: {self.max_sandboxes})"
 
 
 
230
  )
 
 
 
 
 
 
231
 
232
  data_dir = self.active_tasks[message_id].trace_path
233
  user_content = self.active_tasks[message_id].instruction
 
186
 
187
  # Wait for sandbox to be ready (it was already acquired in create_id_and_sandbox)
188
  max_attempts = 60 # Increased timeout to 2 minutes (60 * 2s)
189
+ last_state = None
190
  for attempt in range(max_attempts):
191
  response = await self.sandbox_service.acquire_sandbox(message_id)
192
+ last_state = response.state
193
  if response.sandbox is not None and response.state == "ready":
194
  sandbox = response.sandbox
195
  break
 
198
  logger.warning(
199
  f"Sandbox pool limit reached for {message_id}, attempting cleanup of stuck/expired sandboxes"
200
  )
201
+ cleaned_creating = (
202
+ await self.sandbox_service.cleanup_stuck_creating_sandboxes()
203
+ )
204
+ cleaned_expired = (
205
+ await self.sandbox_service.cleanup_expired_ready_sandboxes()
206
+ )
207
+ logger.info(
208
+ f"Cleanup completed: removed {cleaned_creating} stuck creating + {cleaned_expired} expired ready sandboxes"
209
+ )
210
  # Try one more time after cleanup
211
  response = await self.sandbox_service.acquire_sandbox(message_id)
212
+ last_state = response.state
213
  if response.sandbox is not None and response.state == "ready":
214
  sandbox = response.sandbox
215
  break
 
227
  f"Waiting for sandbox for {message_id}, attempt {attempt}/{max_attempts}, state: {response.state}"
228
  )
229
  await asyncio.sleep(2)
230
+
231
+ # If sandbox is still None after all attempts, do final cleanup and check
232
  if sandbox is None:
233
+ logger.warning(
234
+ f"Sandbox for {message_id} still not ready after {max_attempts} attempts (last state: {last_state}), performing final cleanup"
235
+ )
236
+ # Final cleanup attempt before raising error - be more aggressive
237
+ cleaned_creating = (
238
+ await self.sandbox_service.cleanup_stuck_creating_sandboxes()
239
+ )
240
+ cleaned_expired = (
241
+ await self.sandbox_service.cleanup_expired_ready_sandboxes()
242
+ )
243
+ logger.info(
244
+ f"Final cleanup: removed {cleaned_creating} stuck creating + {cleaned_expired} expired ready sandboxes"
245
+ )
246
+
247
+ # Try one last time after cleanup
248
  (
249
  available_count,
250
  non_available_count,
251
  ) = await self.sandbox_service.get_sandbox_counts()
252
+ # Provide more detailed error message
253
+ error_msg = (
254
+ f"No sandbox available for {message_id}: "
255
+ f"available: {available_count}, non-available: {non_available_count}, "
256
+ f"max: {self.max_sandboxes}, last_state: {last_state}"
257
  )
258
+ if non_available_count > 0:
259
+ error_msg += (
260
+ f". There are {non_available_count} sandbox(s) stuck in 'creating' state "
261
+ f"that may need manual cleanup or the cleanup threshold may be too high."
262
+ )
263
+ raise Exception(error_msg)
264
 
265
  data_dir = self.active_tasks[message_id].trace_path
266
  user_content = self.active_tasks[message_id].instruction
cua2-core/src/cua2_core/services/sandbox_service.py CHANGED
@@ -9,9 +9,11 @@ from pydantic import BaseModel
9
 
10
  SANDBOX_METADATA: dict[str, dict[str, Any]] = {}
11
  SANDBOX_TIMEOUT = 500
12
- SANDBOX_CREATION_TIMEOUT = 200
13
  SANDBOX_CREATION_MAX_TIME = (
14
- 120 # Maximum time a sandbox can be in "creating" state (2 minutes, reduced from 5)
 
 
15
  )
16
  WIDTH = 1280
17
  HEIGHT = 960
@@ -167,7 +169,7 @@ class SandboxService:
167
  and (
168
  current_time - self.sandbox_metadata[session_hash]["created_at"]
169
  ).total_seconds()
170
- < SANDBOX_CREATION_TIMEOUT
171
  ):
172
  print(f"Reusing Sandbox for session {session_hash}")
173
  self.sandbox_metadata[session_hash]["last_accessed"] = current_time
@@ -180,8 +182,58 @@ class SandboxService:
180
  session_hash in self.sandbox_metadata
181
  and self.sandbox_metadata[session_hash].get("state") == "creating"
182
  ):
183
- print(f"Sandbox for session {session_hash} is already being created")
184
- return SandboxResponse(sandbox=None, state="creating")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
185
 
186
  # Mark expired sandbox for cleanup (remove from dict within lock)
187
  if session_hash in self.sandboxes:
@@ -298,14 +350,28 @@ class SandboxService:
298
  for session_hash, metadata in list(self.sandbox_metadata.items()):
299
  if metadata.get("state") == "creating":
300
  created_at = metadata.get("created_at")
301
- if (
302
- created_at
303
- and (current_time - created_at).total_seconds()
304
- > SANDBOX_CREATION_MAX_TIME
305
- ):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
306
  print(
307
  f"Cleaning up stuck 'creating' sandbox for session {session_hash} "
308
- f"(stuck for {(current_time - created_at).total_seconds():.1f}s)"
309
  )
310
  # Collect sandbox to kill if it exists
311
  if session_hash in self.sandboxes:
@@ -339,7 +405,7 @@ class SandboxService:
339
  if (
340
  created_at
341
  and (current_time - created_at).total_seconds()
342
- >= SANDBOX_CREATION_TIMEOUT
343
  ):
344
  print(
345
  f"Cleaning up expired ready sandbox for session {session_hash} "
 
9
 
10
  SANDBOX_METADATA: dict[str, dict[str, Any]] = {}
11
  SANDBOX_TIMEOUT = 500
12
+ SANDBOX_READY_TIMEOUT = 200
13
  SANDBOX_CREATION_MAX_TIME = (
14
+ 90 # Maximum time a sandbox can be in "creating" state (90 seconds)
15
+ # Reduced from 120 to be more aggressive and clean up stuck sandboxes
16
+ # before the agent_service loop times out (which waits 60 attempts * 2s = 120s)
17
  )
18
  WIDTH = 1280
19
  HEIGHT = 960
 
169
  and (
170
  current_time - self.sandbox_metadata[session_hash]["created_at"]
171
  ).total_seconds()
172
+ < SANDBOX_READY_TIMEOUT
173
  ):
174
  print(f"Reusing Sandbox for session {session_hash}")
175
  self.sandbox_metadata[session_hash]["last_accessed"] = current_time
 
182
  session_hash in self.sandbox_metadata
183
  and self.sandbox_metadata[session_hash].get("state") == "creating"
184
  ):
185
+ # Check if this sandbox has been stuck in "creating" state for too long
186
+ created_at = self.sandbox_metadata[session_hash].get("created_at")
187
+ if created_at:
188
+ stuck_duration = (current_time - created_at).total_seconds()
189
+ if stuck_duration > SANDBOX_CREATION_MAX_TIME:
190
+ # This sandbox is stuck - clean it up immediately
191
+ print(
192
+ f"Sandbox for session {session_hash} has been stuck in 'creating' state "
193
+ f"for {stuck_duration:.1f}s (threshold: {SANDBOX_CREATION_MAX_TIME}s) - cleaning up"
194
+ )
195
+ # Remove from metadata and sandboxes dict
196
+ if session_hash in self.sandboxes:
197
+ # Schedule kill outside of lock with error handling
198
+ stuck_sandbox = self.sandboxes[session_hash]
199
+ del self.sandboxes[session_hash]
200
+
201
+ async def kill_stuck():
202
+ try:
203
+ await asyncio.to_thread(stuck_sandbox.kill)
204
+ except Exception as e:
205
+ print(
206
+ f"Error killing stuck sandbox for {session_hash}: {str(e)}"
207
+ )
208
+
209
+ asyncio.create_task(kill_stuck())
210
+ del self.sandbox_metadata[session_hash]
211
+ # Fall through to create a new sandbox
212
+ else:
213
+ print(
214
+ f"Sandbox for session {session_hash} is already being created"
215
+ )
216
+ return SandboxResponse(sandbox=None, state="creating")
217
+ else:
218
+ # Missing created_at - corrupted metadata, clean it up
219
+ print(
220
+ f"WARNING: Sandbox {session_hash} in 'creating' state has no 'created_at' - cleaning up"
221
+ )
222
+ if session_hash in self.sandboxes:
223
+ stuck_sandbox = self.sandboxes[session_hash]
224
+ del self.sandboxes[session_hash]
225
+
226
+ async def kill_stuck():
227
+ try:
228
+ await asyncio.to_thread(stuck_sandbox.kill)
229
+ except Exception as e:
230
+ print(
231
+ f"Error killing stuck sandbox for {session_hash}: {str(e)}"
232
+ )
233
+
234
+ asyncio.create_task(kill_stuck())
235
+ del self.sandbox_metadata[session_hash]
236
+ # Fall through to create a new sandbox
237
 
238
  # Mark expired sandbox for cleanup (remove from dict within lock)
239
  if session_hash in self.sandboxes:
 
350
  for session_hash, metadata in list(self.sandbox_metadata.items()):
351
  if metadata.get("state") == "creating":
352
  created_at = metadata.get("created_at")
353
+ # Clean up if:
354
+ # 1. created_at exists and is older than threshold, OR
355
+ # 2. created_at is missing (corrupted metadata - should never happen but handle it)
356
+ should_cleanup = False
357
+ stuck_duration = 0.0
358
+
359
+ if created_at:
360
+ stuck_duration = (current_time - created_at).total_seconds()
361
+ if stuck_duration > SANDBOX_CREATION_MAX_TIME:
362
+ should_cleanup = True
363
+ else:
364
+ # Missing created_at is a bug, but clean it up anyway
365
+ print(
366
+ f"WARNING: Sandbox {session_hash} in 'creating' state has no 'created_at' timestamp - cleaning up"
367
+ )
368
+ should_cleanup = True
369
+ stuck_duration = float("inf")
370
+
371
+ if should_cleanup:
372
  print(
373
  f"Cleaning up stuck 'creating' sandbox for session {session_hash} "
374
+ f"(stuck for {stuck_duration:.1f}s)"
375
  )
376
  # Collect sandbox to kill if it exists
377
  if session_hash in self.sandboxes:
 
405
  if (
406
  created_at
407
  and (current_time - created_at).total_seconds()
408
+ >= SANDBOX_READY_TIMEOUT
409
  ):
410
  print(
411
  f"Cleaning up expired ready sandbox for session {session_hash} "