gary-boon Claude Opus 4.5 committed on
Commit
f2e89c2
·
1 Parent(s): a94eb19

Store matrices as numpy arrays instead of Python lists

Browse files

Memory optimization for large models like Devstral:
- Store attention/QKV matrices as numpy arrays (~4 bytes/float)
- Previously used Python lists (~28 bytes/float = 7x overhead)
- Convert to lists only when frontend requests via /matrix endpoint

This reduces cache memory from ~25GB to ~3.5GB for:
40 layers × 32 heads × 32 steps × ~90KB per head

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>

Files changed (1) hide show
  1. backend/model_service.py +23 -11
backend/model_service.py CHANGED
@@ -1920,8 +1920,9 @@ async def analyze_research_attention(request: Dict[str, Any], authenticated: boo
1920
  confidence = 0.0 if math.isnan(confidence) or math.isinf(confidence) else confidence
1921
 
1922
  # Get full attention weights for this head [seq_len, seq_len]
1923
- # Convert to float32 for numpy (bfloat16 not supported)
1924
- attention_matrix = layer_attn[head_idx].cpu().float().numpy().tolist()
 
1925
 
1926
  # Get Q/K/V for this head if available
1927
  q_matrix = None
@@ -1929,10 +1930,10 @@ async def analyze_research_attention(request: Dict[str, Any], authenticated: boo
1929
  v_matrix = None
1930
  if layer_idx in qkv_captures:
1931
  # Q/K/V shape: [seq_len, n_heads, head_dim]
1932
- # Convert to float32 for numpy (bfloat16 not supported)
1933
- q_matrix = qkv_captures[layer_idx]['q'][:, head_idx, :].float().numpy().tolist()
1934
- k_matrix = qkv_captures[layer_idx]['k'][:, head_idx, :].float().numpy().tolist()
1935
- v_matrix = qkv_captures[layer_idx]['v'][:, head_idx, :].float().numpy().tolist()
1936
 
1937
  # Store matrices in cache for lazy loading (reduces response size)
1938
  matrix_cache.store(request_id, step, layer_idx, head_idx, {
@@ -2478,15 +2479,17 @@ async def analyze_research_attention_stream(request: Dict[str, Any], authenticat
2478
 
2479
  confidence = 0.0 if math.isnan(confidence) or math.isinf(confidence) else confidence
2480
 
2481
- attention_matrix = layer_attn[head_idx].cpu().float().numpy().tolist()
 
 
2482
 
2483
  q_matrix = None
2484
  k_matrix = None
2485
  v_matrix = None
2486
  if layer_idx in qkv_captures:
2487
- q_matrix = qkv_captures[layer_idx]['q'][:, head_idx, :].float().numpy().tolist()
2488
- k_matrix = qkv_captures[layer_idx]['k'][:, head_idx, :].float().numpy().tolist()
2489
- v_matrix = qkv_captures[layer_idx]['v'][:, head_idx, :].float().numpy().tolist()
2490
 
2491
  # Store matrices in cache for lazy loading (reduces response size)
2492
  matrix_cache.store(request_id, step, layer_idx, head_idx, {
@@ -2731,7 +2734,16 @@ async def get_attention_matrix(
2731
  )
2732
 
2733
  logger.info(f"Matrix cache hit: request_id={request_id}, step={step}, layer={layer}, head={head}")
2734
- return data
 
 
 
 
 
 
 
 
 
2735
 
2736
 
2737
  @app.get("/analyze/research/attention/matrix/stats")
 
1920
  confidence = 0.0 if math.isnan(confidence) or math.isinf(confidence) else confidence
1921
 
1922
  # Get full attention weights for this head [seq_len, seq_len]
1923
+ # Store as numpy arrays (not Python lists) to save memory
1924
+ # ~7x more memory efficient: 4 bytes/float vs 28 bytes/float
1925
+ attention_matrix = layer_attn[head_idx].cpu().float().numpy()
1926
 
1927
  # Get Q/K/V for this head if available
1928
  q_matrix = None
 
1930
  v_matrix = None
1931
  if layer_idx in qkv_captures:
1932
  # Q/K/V shape: [seq_len, n_heads, head_dim]
1933
+ # Store as numpy arrays for memory efficiency
1934
+ q_matrix = qkv_captures[layer_idx]['q'][:, head_idx, :].float().numpy()
1935
+ k_matrix = qkv_captures[layer_idx]['k'][:, head_idx, :].float().numpy()
1936
+ v_matrix = qkv_captures[layer_idx]['v'][:, head_idx, :].float().numpy()
1937
 
1938
  # Store matrices in cache for lazy loading (reduces response size)
1939
  matrix_cache.store(request_id, step, layer_idx, head_idx, {
 
2479
 
2480
  confidence = 0.0 if math.isnan(confidence) or math.isinf(confidence) else confidence
2481
 
2482
+ # Store as numpy arrays (not Python lists) to save memory
2483
+ # ~7x more memory efficient: 4 bytes/float vs 28 bytes/float
2484
+ attention_matrix = layer_attn[head_idx].cpu().float().numpy()
2485
 
2486
  q_matrix = None
2487
  k_matrix = None
2488
  v_matrix = None
2489
  if layer_idx in qkv_captures:
2490
+ q_matrix = qkv_captures[layer_idx]['q'][:, head_idx, :].float().numpy()
2491
+ k_matrix = qkv_captures[layer_idx]['k'][:, head_idx, :].float().numpy()
2492
+ v_matrix = qkv_captures[layer_idx]['v'][:, head_idx, :].float().numpy()
2493
 
2494
  # Store matrices in cache for lazy loading (reduces response size)
2495
  matrix_cache.store(request_id, step, layer_idx, head_idx, {
 
2734
  )
2735
 
2736
  logger.info(f"Matrix cache hit: request_id={request_id}, step={step}, layer={layer}, head={head}")
2737
+
2738
+ # Convert numpy arrays to lists for JSON serialization
2739
+ # Arrays are stored as numpy for memory efficiency, converted on-demand here
2740
+ response = {}
2741
+ for key, value in data.items():
2742
+ if value is not None and hasattr(value, 'tolist'):
2743
+ response[key] = value.tolist()
2744
+ else:
2745
+ response[key] = value
2746
+ return response
2747
 
2748
 
2749
  @app.get("/analyze/research/attention/matrix/stats")