Spaces:
Sleeping
Sleeping
gary-boon
Claude Opus 4.5
committed on
Commit
·
f2e89c2
1
Parent(s):
a94eb19
Store matrices as numpy arrays instead of Python lists
Browse files
Memory optimization for large models like Devstral:
- Store attention/QKV matrices as numpy arrays (~4 bytes/float)
- Previously used Python lists (~28 bytes/float = 7x overhead)
- Convert to lists only when frontend requests via /matrix endpoint
This reduces cache memory from ~25GB to ~3.5GB for:
40 layers × 32 heads × 32 steps × ~90KB per head
🤖 Generated with [Claude Code](https://claude.com/claude-code)
Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
- backend/model_service.py +23 -11
backend/model_service.py
CHANGED
|
@@ -1920,8 +1920,9 @@ async def analyze_research_attention(request: Dict[str, Any], authenticated: boo
|
|
| 1920 |
confidence = 0.0 if math.isnan(confidence) or math.isinf(confidence) else confidence
|
| 1921 |
|
| 1922 |
# Get full attention weights for this head [seq_len, seq_len]
|
| 1923 |
-
#
|
| 1924 |
-
|
|
|
|
| 1925 |
|
| 1926 |
# Get Q/K/V for this head if available
|
| 1927 |
q_matrix = None
|
|
@@ -1929,10 +1930,10 @@ async def analyze_research_attention(request: Dict[str, Any], authenticated: boo
|
|
| 1929 |
v_matrix = None
|
| 1930 |
if layer_idx in qkv_captures:
|
| 1931 |
# Q/K/V shape: [seq_len, n_heads, head_dim]
|
| 1932 |
-
#
|
| 1933 |
-
q_matrix = qkv_captures[layer_idx]['q'][:, head_idx, :].float().numpy()
|
| 1934 |
-
k_matrix = qkv_captures[layer_idx]['k'][:, head_idx, :].float().numpy()
|
| 1935 |
-
v_matrix = qkv_captures[layer_idx]['v'][:, head_idx, :].float().numpy()
|
| 1936 |
|
| 1937 |
# Store matrices in cache for lazy loading (reduces response size)
|
| 1938 |
matrix_cache.store(request_id, step, layer_idx, head_idx, {
|
|
@@ -2478,15 +2479,17 @@ async def analyze_research_attention_stream(request: Dict[str, Any], authenticat
|
|
| 2478 |
|
| 2479 |
confidence = 0.0 if math.isnan(confidence) or math.isinf(confidence) else confidence
|
| 2480 |
|
| 2481 |
-
|
|
|
|
|
|
|
| 2482 |
|
| 2483 |
q_matrix = None
|
| 2484 |
k_matrix = None
|
| 2485 |
v_matrix = None
|
| 2486 |
if layer_idx in qkv_captures:
|
| 2487 |
-
q_matrix = qkv_captures[layer_idx]['q'][:, head_idx, :].float().numpy()
|
| 2488 |
-
k_matrix = qkv_captures[layer_idx]['k'][:, head_idx, :].float().numpy()
|
| 2489 |
-
v_matrix = qkv_captures[layer_idx]['v'][:, head_idx, :].float().numpy()
|
| 2490 |
|
| 2491 |
# Store matrices in cache for lazy loading (reduces response size)
|
| 2492 |
matrix_cache.store(request_id, step, layer_idx, head_idx, {
|
|
@@ -2731,7 +2734,16 @@ async def get_attention_matrix(
|
|
| 2731 |
)
|
| 2732 |
|
| 2733 |
logger.info(f"Matrix cache hit: request_id={request_id}, step={step}, layer={layer}, head={head}")
|
| 2734 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2735 |
|
| 2736 |
|
| 2737 |
@app.get("/analyze/research/attention/matrix/stats")
|
|
|
|
| 1920 |
confidence = 0.0 if math.isnan(confidence) or math.isinf(confidence) else confidence
|
| 1921 |
|
| 1922 |
# Get full attention weights for this head [seq_len, seq_len]
|
| 1923 |
+
# Store as numpy arrays (not Python lists) to save memory
|
| 1924 |
+
# ~7x more memory efficient: 4 bytes/float vs 28 bytes/float
|
| 1925 |
+
attention_matrix = layer_attn[head_idx].cpu().float().numpy()
|
| 1926 |
|
| 1927 |
# Get Q/K/V for this head if available
|
| 1928 |
q_matrix = None
|
|
|
|
| 1930 |
v_matrix = None
|
| 1931 |
if layer_idx in qkv_captures:
|
| 1932 |
# Q/K/V shape: [seq_len, n_heads, head_dim]
|
| 1933 |
+
# Store as numpy arrays for memory efficiency
|
| 1934 |
+
q_matrix = qkv_captures[layer_idx]['q'][:, head_idx, :].float().numpy()
|
| 1935 |
+
k_matrix = qkv_captures[layer_idx]['k'][:, head_idx, :].float().numpy()
|
| 1936 |
+
v_matrix = qkv_captures[layer_idx]['v'][:, head_idx, :].float().numpy()
|
| 1937 |
|
| 1938 |
# Store matrices in cache for lazy loading (reduces response size)
|
| 1939 |
matrix_cache.store(request_id, step, layer_idx, head_idx, {
|
|
|
|
| 2479 |
|
| 2480 |
confidence = 0.0 if math.isnan(confidence) or math.isinf(confidence) else confidence
|
| 2481 |
|
| 2482 |
+
# Store as numpy arrays (not Python lists) to save memory
|
| 2483 |
+
# ~7x more memory efficient: 4 bytes/float vs 28 bytes/float
|
| 2484 |
+
attention_matrix = layer_attn[head_idx].cpu().float().numpy()
|
| 2485 |
|
| 2486 |
q_matrix = None
|
| 2487 |
k_matrix = None
|
| 2488 |
v_matrix = None
|
| 2489 |
if layer_idx in qkv_captures:
|
| 2490 |
+
q_matrix = qkv_captures[layer_idx]['q'][:, head_idx, :].float().numpy()
|
| 2491 |
+
k_matrix = qkv_captures[layer_idx]['k'][:, head_idx, :].float().numpy()
|
| 2492 |
+
v_matrix = qkv_captures[layer_idx]['v'][:, head_idx, :].float().numpy()
|
| 2493 |
|
| 2494 |
# Store matrices in cache for lazy loading (reduces response size)
|
| 2495 |
matrix_cache.store(request_id, step, layer_idx, head_idx, {
|
|
|
|
| 2734 |
)
|
| 2735 |
|
| 2736 |
logger.info(f"Matrix cache hit: request_id={request_id}, step={step}, layer={layer}, head={head}")
|
| 2737 |
+
|
| 2738 |
+
# Convert numpy arrays to lists for JSON serialization
|
| 2739 |
+
# Arrays are stored as numpy for memory efficiency, converted on-demand here
|
| 2740 |
+
response = {}
|
| 2741 |
+
for key, value in data.items():
|
| 2742 |
+
if value is not None and hasattr(value, 'tolist'):
|
| 2743 |
+
response[key] = value.tolist()
|
| 2744 |
+
else:
|
| 2745 |
+
response[key] = value
|
| 2746 |
+
return response
|
| 2747 |
|
| 2748 |
|
| 2749 |
@app.get("/analyze/research/attention/matrix/stats")
|