Amogh1221 committed on
Commit
aa43f9e
·
verified ·
1 Parent(s): 3e17b24

Update main.py

Browse files
Files changed (1) hide show
  1. main.py +12 -49
main.py CHANGED
@@ -271,9 +271,7 @@ async def _engine_call(engine, coro, timeout_sec: float):
271
 
272
  # ─── Background Memory Cleanup Task ───────────────────────────────────────────
273
  _RAM_CLEANUP_THRESHOLD_MB = float(os.environ.get("RAM_CLEANUP_THRESHOLD_MB", "300"))
274
- _RAM_RESTART_THRESHOLD_MB = float(os.environ.get("RAM_RESTART_THRESHOLD_MB", "1400"))
275
- _RAM_CLEANUP_INTERVAL_SEC = int(os.environ.get("RAM_CLEANUP_INTERVAL_SEC", "30"))
276
- _ENGINE_NODES_LIMIT = int(os.environ.get("ENGINE_NODES_LIMIT", "1500000"))
277
  _CLEAR_HASH_AFTER_MOVE = os.environ.get("CLEAR_HASH_AFTER_MOVE", "1").strip().lower() not in {"0", "false", "no", "off"}
278
  _RESTART_ENGINE_AFTER_MOVE = os.environ.get("RESTART_ENGINE_AFTER_MOVE", "1").strip().lower() not in {"0", "false", "no", "off"}
279
 
@@ -288,61 +286,27 @@ async def memory_cleanup_task():
288
  try:
289
  process = psutil.Process(os.getpid())
290
  mem_mb = process.memory_info().rss / 1024 / 1024
291
-
292
- # Sum up all children (chess engines) memory too
293
- total_mb = mem_mb
294
- child_count = 0
295
- for child in process.children(recursive=True):
296
- try:
297
- total_mb += child.memory_info().rss / 1024 / 1024
298
- child_count += 1
299
- except (psutil.NoSuchProcess, psutil.AccessDenied):
300
- pass
301
-
302
- # ─── RESTART CHECK (The Last Resort) ─────────────────────────────
303
- if total_mb > _RAM_RESTART_THRESHOLD_MB:
304
- active_count = len(manager.active_connections)
305
- print(f"[CRITICAL] RAM at {total_mb:.1f}MB — RESTARTING SERVER (Limit: {_RAM_RESTART_THRESHOLD_MB}MB, Active: {active_count})")
306
- # Immediate exit: Hugging Face or Docker will automatically restart the process/container
307
- # This clears all leaky memory and hung engine processes at once.
308
- os._exit(0)
309
-
310
- # ─── REGULAR CLEANUP ─────────────────────────────────────────────
311
- if total_mb > _RAM_CLEANUP_THRESHOLD_MB:
312
- print(f"[CLEANUP] total_tree_ram={total_mb:.1f}MB (threshold {_RAM_CLEANUP_THRESHOLD_MB}MB) — purging engines & GC")
313
-
314
- # 1. Purge global engine
315
  engine = _GLOBAL_DEEPCASTLE_ENGINE
316
  if engine is not None:
317
  try:
318
- async with _ENGINE_IO_LOCK:
319
- async with _ENGINE_LOCK:
320
- _GLOBAL_DEEPCASTLE_ENGINE = None
321
- await asyncio.wait_for(engine.quit(), timeout=3.0)
322
- except Exception:
323
- pass
324
-
325
- # 2. Hard purge any orphaned engine processes
326
- for child in process.children(recursive=True):
327
- try:
328
- if "deepcastle" in child.name().lower() or "stockfish" in child.name().lower():
329
- print(f"[CLEANUP] Killing orphaned engine: {child.pid}")
330
- child.kill()
331
  except Exception:
332
  pass
333
-
334
- # 3. Release memory
335
  force_memory_release()
336
  after_mb = process.memory_info().rss / 1024 / 1024
337
- print(f"[CLEANUP] Finished. RAM: {total_mb:.1f}MB → {after_mb:.1f}MB")
338
  else:
339
- # Basic maintenance
340
  force_memory_release()
341
- if total_mb > _RAM_CLEANUP_THRESHOLD_MB * 0.7:
342
- print(f"[CLEANUP] Maintenance. RAM {total_mb:.1f}MB (OK)")
343
 
344
  except Exception as e:
345
- print(f"[CLEANUP] Task error: {e}")
346
 
347
 
348
  @asynccontextmanager
@@ -454,7 +418,6 @@ def ram_usage():
454
  "child_process_count": child_count,
455
  "vms_mb": round(mem.vms / 1024 / 1024, 2),
456
  "threshold_mb": _RAM_CLEANUP_THRESHOLD_MB,
457
- "restart_threshold_mb": _RAM_RESTART_THRESHOLD_MB,
458
  "cleanup_interval_sec": _RAM_CLEANUP_INTERVAL_SEC,
459
  "status": "high" if total_mb > _RAM_CLEANUP_THRESHOLD_MB else "ok",
460
  "active_rooms": len(manager.active_connections),
@@ -812,7 +775,7 @@ async def analyze_game(request: AnalyzeRequest):
812
  try:
813
  engine = await get_stockfish_engine(hash_mb=2048)
814
  board = chess.Board(request.start_fen) if request.start_fen else chess.Board()
815
- limit = chess.engine.Limit(time=request.time_per_move, nodes=_ENGINE_NODES_LIMIT)
816
 
817
  analysis_results = []
818
  ply_timeout = _analyze_ply_timeout(request.time_per_move)
 
271
 
272
  # ─── Background Memory Cleanup Task ───────────────────────────────────────────
273
  _RAM_CLEANUP_THRESHOLD_MB = float(os.environ.get("RAM_CLEANUP_THRESHOLD_MB", "300"))
274
+ _RAM_CLEANUP_INTERVAL_SEC = int(os.environ.get("RAM_CLEANUP_INTERVAL_SEC", "60"))
 
 
275
  _CLEAR_HASH_AFTER_MOVE = os.environ.get("CLEAR_HASH_AFTER_MOVE", "1").strip().lower() not in {"0", "false", "no", "off"}
276
  _RESTART_ENGINE_AFTER_MOVE = os.environ.get("RESTART_ENGINE_AFTER_MOVE", "1").strip().lower() not in {"0", "false", "no", "off"}
277
 
 
286
  try:
287
  process = psutil.Process(os.getpid())
288
  mem_mb = process.memory_info().rss / 1024 / 1024
289
+
290
+ if mem_mb > _RAM_CLEANUP_THRESHOLD_MB:
291
+ print(f"[CLEANUP] RAM at {mem_mb:.1f}MB (threshold {_RAM_CLEANUP_THRESHOLD_MB}MB) — clearing engine hash")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
292
  engine = _GLOBAL_DEEPCASTLE_ENGINE
293
  if engine is not None:
294
  try:
295
+ if not engine.is_terminated():
296
+ async with _ENGINE_IO_LOCK:
297
+ await _clear_engine_hash(engine)
 
 
 
 
 
 
 
 
 
 
298
  except Exception:
299
  pass
 
 
300
  force_memory_release()
301
  after_mb = process.memory_info().rss / 1024 / 1024
302
+ print(f"[CLEANUP] Done. RAM: {mem_mb:.1f}MB → {after_mb:.1f}MB")
303
  else:
304
+ # Always nudge GC + malloc_trim even when RAM is fine
305
  force_memory_release()
306
+ print(f"[CLEANUP] RAM at {mem_mb:.1f}MB — OK")
 
307
 
308
  except Exception as e:
309
+ print(f"[CLEANUP] Error during cleanup: {e}")
310
 
311
 
312
  @asynccontextmanager
 
418
  "child_process_count": child_count,
419
  "vms_mb": round(mem.vms / 1024 / 1024, 2),
420
  "threshold_mb": _RAM_CLEANUP_THRESHOLD_MB,
 
421
  "cleanup_interval_sec": _RAM_CLEANUP_INTERVAL_SEC,
422
  "status": "high" if total_mb > _RAM_CLEANUP_THRESHOLD_MB else "ok",
423
  "active_rooms": len(manager.active_connections),
 
775
  try:
776
  engine = await get_stockfish_engine(hash_mb=2048)
777
  board = chess.Board(request.start_fen) if request.start_fen else chess.Board()
778
+ limit = chess.engine.Limit(time=request.time_per_move)
779
 
780
  analysis_results = []
781
  ply_timeout = _analyze_ply_timeout(request.time_per_move)