gary-boon Claude Opus 4.5 committed on
Commit
f2e89c2
·
1 Parent(s): a94eb19

Store matrices as numpy arrays instead of Python lists

Browse files

Memory optimization for large models like Devstral:
- Store attention/QKV matrices as numpy arrays (~4 bytes/float)
- Previously used Python lists (~28 bytes/float = 7x overhead)
- Convert to lists only when frontend requests via /matrix endpoint

This reduces cache memory from ~25GB to ~3.5GB for:
40 layers × 32 heads × 32 steps × ~90KB per head

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>

Files changed (1) hide show
  1. backend/model_service.py +23 -11
backend/model_service.py CHANGED
@@ -1920,8 +1920,9 @@ async def analyze_research_attention(request: Dict[str, Any], authenticated: boo
1920
  confidence = 0.0 if math.isnan(confidence) or math.isinf(confidence) else confidence
1921
 
1922
  # Get full attention weights for this head [seq_len, seq_len]
1923
- # Convert to float32 for numpy (bfloat16 not supported)
1924
- attention_matrix = layer_attn[head_idx].cpu().float().numpy().tolist()
 
1925
 
1926
  # Get Q/K/V for this head if available
1927
  q_matrix = None
@@ -1929,10 +1930,10 @@ async def analyze_research_attention(request: Dict[str, Any], authenticated: boo
1929
  v_matrix = None
1930
  if layer_idx in qkv_captures:
1931
  # Q/K/V shape: [seq_len, n_heads, head_dim]
1932
- # Convert to float32 for numpy (bfloat16 not supported)
1933
- q_matrix = qkv_captures[layer_idx]['q'][:, head_idx, :].float().numpy().tolist()
1934
- k_matrix = qkv_captures[layer_idx]['k'][:, head_idx, :].float().numpy().tolist()
1935
- v_matrix = qkv_captures[layer_idx]['v'][:, head_idx, :].float().numpy().tolist()
1936
 
1937
  # Store matrices in cache for lazy loading (reduces response size)
1938
  matrix_cache.store(request_id, step, layer_idx, head_idx, {
@@ -2478,15 +2479,17 @@ async def analyze_research_attention_stream(request: Dict[str, Any], authenticat
2478
 
2479
  confidence = 0.0 if math.isnan(confidence) or math.isinf(confidence) else confidence
2480
 
2481
- attention_matrix = layer_attn[head_idx].cpu().float().numpy().tolist()
 
 
2482
 
2483
  q_matrix = None
2484
  k_matrix = None
2485
  v_matrix = None
2486
  if layer_idx in qkv_captures:
2487
- q_matrix = qkv_captures[layer_idx]['q'][:, head_idx, :].float().numpy().tolist()
2488
- k_matrix = qkv_captures[layer_idx]['k'][:, head_idx, :].float().numpy().tolist()
2489
- v_matrix = qkv_captures[layer_idx]['v'][:, head_idx, :].float().numpy().tolist()
2490
 
2491
  # Store matrices in cache for lazy loading (reduces response size)
2492
  matrix_cache.store(request_id, step, layer_idx, head_idx, {
@@ -2731,7 +2734,16 @@ async def get_attention_matrix(
2731
  )
2732
 
2733
  logger.info(f"Matrix cache hit: request_id={request_id}, step={step}, layer={layer}, head={head}")
2734
- return data
 
 
 
 
 
 
 
 
 
2735
 
2736
 
2737
  @app.get("/analyze/research/attention/matrix/stats")
 
1920
  confidence = 0.0 if math.isnan(confidence) or math.isinf(confidence) else confidence
1921
 
1922
  # Get full attention weights for this head [seq_len, seq_len]
1923
+ # Store as numpy arrays (not Python lists) to save memory
1924
+ # ~7x more memory efficient: 4 bytes/float vs 28 bytes/float
1925
+ attention_matrix = layer_attn[head_idx].cpu().float().numpy()
1926
 
1927
  # Get Q/K/V for this head if available
1928
  q_matrix = None
 
1930
  v_matrix = None
1931
  if layer_idx in qkv_captures:
1932
  # Q/K/V shape: [seq_len, n_heads, head_dim]
1933
+ # Store as numpy arrays for memory efficiency
1934
+ q_matrix = qkv_captures[layer_idx]['q'][:, head_idx, :].float().numpy()
1935
+ k_matrix = qkv_captures[layer_idx]['k'][:, head_idx, :].float().numpy()
1936
+ v_matrix = qkv_captures[layer_idx]['v'][:, head_idx, :].float().numpy()
1937
 
1938
  # Store matrices in cache for lazy loading (reduces response size)
1939
  matrix_cache.store(request_id, step, layer_idx, head_idx, {
 
2479
 
2480
  confidence = 0.0 if math.isnan(confidence) or math.isinf(confidence) else confidence
2481
 
2482
+ # Store as numpy arrays (not Python lists) to save memory
2483
+ # ~7x more memory efficient: 4 bytes/float vs 28 bytes/float
2484
+ attention_matrix = layer_attn[head_idx].cpu().float().numpy()
2485
 
2486
  q_matrix = None
2487
  k_matrix = None
2488
  v_matrix = None
2489
  if layer_idx in qkv_captures:
2490
+ q_matrix = qkv_captures[layer_idx]['q'][:, head_idx, :].float().numpy()
2491
+ k_matrix = qkv_captures[layer_idx]['k'][:, head_idx, :].float().numpy()
2492
+ v_matrix = qkv_captures[layer_idx]['v'][:, head_idx, :].float().numpy()
2493
 
2494
  # Store matrices in cache for lazy loading (reduces response size)
2495
  matrix_cache.store(request_id, step, layer_idx, head_idx, {
 
2734
  )
2735
 
2736
  logger.info(f"Matrix cache hit: request_id={request_id}, step={step}, layer={layer}, head={head}")
2737
+
2738
+ # Convert numpy arrays to lists for JSON serialization
2739
+ # Arrays are stored as numpy for memory efficiency, converted on-demand here
2740
+ response = {}
2741
+ for key, value in data.items():
2742
+ if value is not None and hasattr(value, 'tolist'):
2743
+ response[key] = value.tolist()
2744
+ else:
2745
+ response[key] = value
2746
+ return response
2747
 
2748
 
2749
  @app.get("/analyze/research/attention/matrix/stats")