Spaces:
Sleeping
Sleeping
gary-boon
Claude Opus 4.5
committed on
Commit
·
a94eb19
1
Parent(s):
959074d
Add per-step memory cleanup for large model support
Browse files
- Delete outputs, logits, probs tensors after each generation step
- Run garbage collection every 8 steps for large models
- Clear MPS cache on Apple Silicon to release GPU memory
This prevents memory accumulation during generation with models
like Devstral that have 40 layers × 32 heads, which previously
caused RAM exhaustion on longer token sequences (32+).
🤖 Generated with [Claude Code](https://claude.com/claude-code)
Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
- backend/model_service.py +32 -0
backend/model_service.py
CHANGED
|
@@ -1990,6 +1990,22 @@ async def analyze_research_attention(request: Dict[str, Any], authenticated: boo
|
|
| 1990 |
if next_token_id == manager.tokenizer.eos_token_id:
|
| 1991 |
break
|
| 1992 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1993 |
# Clean up hooks after generation
|
| 1994 |
for hook in hooks:
|
| 1995 |
hook.remove()
|
|
@@ -2523,6 +2539,22 @@ async def analyze_research_attention_stream(request: Dict[str, Any], authenticat
|
|
| 2523 |
if next_token_id == manager.tokenizer.eos_token_id:
|
| 2524 |
break
|
| 2525 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2526 |
# Clean up hooks
|
| 2527 |
for hook in hooks:
|
| 2528 |
hook.remove()
|
|
|
|
| 1990 |
if next_token_id == manager.tokenizer.eos_token_id:
|
| 1991 |
break
|
| 1992 |
|
| 1993 |
+
# Free memory from this step's outputs to prevent accumulation
|
| 1994 |
+
# This is critical for large models like Devstral (40 layers, 32 heads)
|
| 1995 |
+
del outputs
|
| 1996 |
+
del logits
|
| 1997 |
+
del probs
|
| 1998 |
+
if 'layer_attn' in dir():
|
| 1999 |
+
del layer_attn
|
| 2000 |
+
if 'current_hidden' in dir():
|
| 2001 |
+
del current_hidden
|
| 2002 |
+
|
| 2003 |
+
# Periodic garbage collection for large models (every 8 steps)
|
| 2004 |
+
if (step + 1) % 8 == 0:
|
| 2005 |
+
gc.collect()
|
| 2006 |
+
if hasattr(torch.backends, 'mps') and torch.backends.mps.is_available():
|
| 2007 |
+
torch.mps.empty_cache() if hasattr(torch.mps, 'empty_cache') else None
|
| 2008 |
+
|
| 2009 |
# Clean up hooks after generation
|
| 2010 |
for hook in hooks:
|
| 2011 |
hook.remove()
|
|
|
|
| 2539 |
if next_token_id == manager.tokenizer.eos_token_id:
|
| 2540 |
break
|
| 2541 |
|
| 2542 |
+
# Free memory from this step's outputs to prevent accumulation
|
| 2543 |
+
# This is critical for large models like Devstral (40 layers, 32 heads)
|
| 2544 |
+
del outputs
|
| 2545 |
+
del logits
|
| 2546 |
+
del probs
|
| 2547 |
+
if 'layer_attn' in dir():
|
| 2548 |
+
del layer_attn
|
| 2549 |
+
if 'current_hidden' in dir():
|
| 2550 |
+
del current_hidden
|
| 2551 |
+
|
| 2552 |
+
# Periodic garbage collection for large models (every 8 steps)
|
| 2553 |
+
if (step + 1) % 8 == 0:
|
| 2554 |
+
gc.collect()
|
| 2555 |
+
if hasattr(torch.backends, 'mps') and torch.backends.mps.is_available():
|
| 2556 |
+
torch.mps.empty_cache() if hasattr(torch.mps, 'empty_cache') else None
|
| 2557 |
+
|
| 2558 |
# Clean up hooks
|
| 2559 |
for hook in hooks:
|
| 2560 |
hook.remove()
|