gary-boon Claude Opus 4.5 committed on
Commit
a94eb19
·
1 Parent(s): 959074d

Add per-step memory cleanup for large model support

Browse files

- Delete outputs, logits, probs tensors after each generation step
- Run garbage collection every 8 steps for large models
- Clear MPS cache on Apple Silicon to release GPU memory

This prevents memory accumulation during generation with models
like Devstral that have 40 layers × 32 heads, which previously
caused RAM exhaustion on longer token sequences (32+).

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>

Files changed (1) hide show
  1. backend/model_service.py +32 -0
backend/model_service.py CHANGED
@@ -1990,6 +1990,22 @@ async def analyze_research_attention(request: Dict[str, Any], authenticated: boo
1990
  if next_token_id == manager.tokenizer.eos_token_id:
1991
  break
1992
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1993
  # Clean up hooks after generation
1994
  for hook in hooks:
1995
  hook.remove()
@@ -2523,6 +2539,22 @@ async def analyze_research_attention_stream(request: Dict[str, Any], authenticat
2523
  if next_token_id == manager.tokenizer.eos_token_id:
2524
  break
2525
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2526
  # Clean up hooks
2527
  for hook in hooks:
2528
  hook.remove()
 
1990
  if next_token_id == manager.tokenizer.eos_token_id:
1991
  break
1992
 
1993
+ # Free memory from this step's outputs to prevent accumulation
1994
+ # This is critical for large models like Devstral (40 layers, 32 heads)
1995
+ del outputs
1996
+ del logits
1997
+ del probs
1998
+ if 'layer_attn' in dir():
1999
+ del layer_attn
2000
+ if 'current_hidden' in dir():
2001
+ del current_hidden
2002
+
2003
+ # Periodic garbage collection for large models (every 8 steps)
2004
+ if (step + 1) % 8 == 0:
2005
+ gc.collect()
2006
+ if hasattr(torch.backends, 'mps') and torch.backends.mps.is_available():
2007
+ torch.mps.empty_cache() if hasattr(torch.mps, 'empty_cache') else None
2008
+
2009
  # Clean up hooks after generation
2010
  for hook in hooks:
2011
  hook.remove()
 
2539
  if next_token_id == manager.tokenizer.eos_token_id:
2540
  break
2541
 
2542
+ # Free memory from this step's outputs to prevent accumulation
2543
+ # This is critical for large models like Devstral (40 layers, 32 heads)
2544
+ del outputs
2545
+ del logits
2546
+ del probs
2547
+ if 'layer_attn' in dir():
2548
+ del layer_attn
2549
+ if 'current_hidden' in dir():
2550
+ del current_hidden
2551
+
2552
+ # Periodic garbage collection for large models (every 8 steps)
2553
+ if (step + 1) % 8 == 0:
2554
+ gc.collect()
2555
+ if hasattr(torch.backends, 'mps') and torch.backends.mps.is_available():
2556
+ torch.mps.empty_cache() if hasattr(torch.mps, 'empty_cache') else None
2557
+
2558
  # Clean up hooks
2559
  for hook in hooks:
2560
  hook.remove()