gary-boon Claude Opus 4.5 committed on
Commit
959074d
·
1 Parent(s): bb689ce

Fix RAM exhaustion for large token generation

Browse files

Add memory management to MatrixCache:
- Track request IDs for each cache entry
- Clear old request cache entries before starting new analysis
- Force garbage collection after clearing cache
- Clear GPU/MPS cache on Apple Silicon

This prevents memory accumulation when running multiple
analyses, particularly for large models like Devstral
with 40 layers × 32 heads per step.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>

Files changed (1) hide show
  1. backend/model_service.py +45 -0
backend/model_service.py CHANGED
@@ -8,6 +8,7 @@ from fastapi.responses import StreamingResponse
8
  from fastapi.middleware.cors import CORSMiddleware
9
  from pydantic import BaseModel
10
  import asyncio
 
11
  import json
12
  import os
13
  import time
@@ -69,15 +70,53 @@ class MatrixCache:
69
  def __init__(self, ttl_seconds: int = 3600):
70
  self._cache: Dict[str, Dict] = {}
71
  self._timestamps: Dict[str, float] = {}
 
72
  self._lock = Lock()
73
  self._ttl = ttl_seconds
74
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
75
  def store(self, request_id: str, step: int, layer: int, head: int, data: dict):
76
  """Store matrix data for a specific head."""
77
  key = f"{request_id}:{step}:{layer}:{head}"
78
  with self._lock:
79
  self._cache[key] = data
80
  self._timestamps[key] = time_now()
 
81
 
82
  def get(self, request_id: str, step: int, layer: int, head: int) -> Optional[dict]:
83
  """Retrieve matrix data, returning None if expired or not found."""
@@ -1570,6 +1609,9 @@ async def analyze_research_attention(request: Dict[str, Any], authenticated: boo
1570
  # Generate unique request ID for matrix cache lookup
1571
  request_id = str(uuid.uuid4())
1572
 
 
 
 
1573
  # Get parameters
1574
  prompt = request.get("prompt", "def quicksort(arr):")
1575
  max_tokens = request.get("max_tokens", 8)
@@ -2094,6 +2136,9 @@ async def analyze_research_attention_stream(request: Dict[str, Any], authenticat
2094
  # Generate unique request ID for matrix cache lookup
2095
  request_id = str(uuid.uuid4())
2096
 
 
 
 
2097
  # Get parameters
2098
  prompt = request.get("prompt", "def quicksort(arr):")
2099
  max_tokens = request.get("max_tokens", 8)
 
8
  from fastapi.middleware.cors import CORSMiddleware
9
  from pydantic import BaseModel
10
  import asyncio
11
+ import gc
12
  import json
13
  import os
14
  import time
 
70
    def __init__(self, ttl_seconds: int = 3600):
        """Initialize an empty matrix cache.

        Args:
            ttl_seconds: Seconds before a stored entry is treated as expired
                (default 3600 = one hour).
        """
        # Entries keyed by "request_id:step:layer:head" (see store()).
        self._cache: Dict[str, Dict] = {}
        # Per-key insertion time, used for TTL expiry decisions.
        self._timestamps: Dict[str, float] = {}
        self._request_ids: set = set()  # Track active request IDs
        # Guards all reads/writes of the three structures above.
        self._lock = Lock()
        self._ttl = ttl_seconds
76
 
77
+ def clear_request(self, request_id: str):
78
+ """Clear all cache entries for a specific request."""
79
+ with self._lock:
80
+ keys_to_delete = [k for k in self._cache.keys() if k.startswith(f"{request_id}:")]
81
+ for k in keys_to_delete:
82
+ del self._cache[k]
83
+ if k in self._timestamps:
84
+ del self._timestamps[k]
85
+ self._request_ids.discard(request_id)
86
+ if keys_to_delete:
87
+ logger.info(f"MatrixCache: cleared {len(keys_to_delete)} entries for request {request_id[:8]}")
88
+
89
+ def clear_old_requests(self, keep_request_id: str = None):
90
+ """Clear all requests except the specified one to free memory."""
91
+ with self._lock:
92
+ request_ids_to_clear = self._request_ids - {keep_request_id} if keep_request_id else self._request_ids.copy()
93
+ total_cleared = 0
94
+ for rid in request_ids_to_clear:
95
+ keys_to_delete = [k for k in self._cache.keys() if k.startswith(f"{rid}:")]
96
+ for k in keys_to_delete:
97
+ del self._cache[k]
98
+ if k in self._timestamps:
99
+ del self._timestamps[k]
100
+ total_cleared += len(keys_to_delete)
101
+ self._request_ids = {keep_request_id} if keep_request_id else set()
102
+ if total_cleared:
103
+ logger.info(f"MatrixCache: cleared {total_cleared} entries from old requests")
104
+ # Force garbage collection to release memory back to system
105
+ gc.collect()
106
+ # Also clear any GPU cache if using CUDA
107
+ if torch.cuda.is_available():
108
+ torch.cuda.empty_cache()
109
+ elif hasattr(torch.backends, 'mps') and torch.backends.mps.is_available():
110
+ # For Apple Silicon MPS, trigger garbage collection
111
+ torch.mps.empty_cache() if hasattr(torch.mps, 'empty_cache') else None
112
+
113
  def store(self, request_id: str, step: int, layer: int, head: int, data: dict):
114
  """Store matrix data for a specific head."""
115
  key = f"{request_id}:{step}:{layer}:{head}"
116
  with self._lock:
117
  self._cache[key] = data
118
  self._timestamps[key] = time_now()
119
+ self._request_ids.add(request_id)
120
 
121
  def get(self, request_id: str, step: int, layer: int, head: int) -> Optional[dict]:
122
  """Retrieve matrix data, returning None if expired or not found."""
 
1609
  # Generate unique request ID for matrix cache lookup
1610
  request_id = str(uuid.uuid4())
1611
 
1612
+ # Clear old cached matrices to free memory before starting new analysis
1613
+ matrix_cache.clear_old_requests(request_id)
1614
+
1615
  # Get parameters
1616
  prompt = request.get("prompt", "def quicksort(arr):")
1617
  max_tokens = request.get("max_tokens", 8)
 
2136
  # Generate unique request ID for matrix cache lookup
2137
  request_id = str(uuid.uuid4())
2138
 
2139
+ # Clear old cached matrices to free memory before starting new analysis
2140
+ matrix_cache.clear_old_requests(request_id)
2141
+
2142
  # Get parameters
2143
  prompt = request.get("prompt", "def quicksort(arr):")
2144
  max_tokens = request.get("max_tokens", 8)