"""
ENGRAM Protocol — Test Fixtures
Shared pytest fixtures for all test modules.
Provides synthetic KV cache tensors at correct shapes,
temp directories, and model specs.
"""
from __future__ import annotations
from pathlib import Path
import pytest
import torch
from kvcos.core.cache_spec import GEMMA_4_26B_A4B, LLAMA_3_1_8B, PHI_3_MINI
from kvcos.core.types import AttentionType, CacheSection, ModelCacheSpec
@pytest.fixture
def llama_spec() -> ModelCacheSpec:
    """Expose the ``LLAMA_3_1_8B`` cache spec constant as a fixture."""
    spec = LLAMA_3_1_8B
    return spec
@pytest.fixture
def phi3_spec() -> ModelCacheSpec:
    """Expose the ``PHI_3_MINI`` cache spec constant as a fixture."""
    spec = PHI_3_MINI
    return spec
@pytest.fixture
def gemma4_spec() -> ModelCacheSpec:
    """Expose the ``GEMMA_4_26B_A4B`` (ISWA) cache spec constant as a fixture."""
    spec = GEMMA_4_26B_A4B
    return spec
@pytest.fixture
def tmp_data_dir(tmp_path: Path) -> Path:
    """Create and return an ``engram_data`` directory under ``tmp_path``.

    Intended for storage tests; the directory is created fresh inside the
    per-test temporary path supplied by pytest.
    """
    storage_root = tmp_path.joinpath("engram_data")
    storage_root.mkdir()
    return storage_root
@pytest.fixture
def tmp_index_dir(tmp_path: Path) -> Path:
    """Create and return an ``engram_index`` directory under ``tmp_path``.

    Intended for FAISS index persistence tests; the directory is created
    fresh inside the per-test temporary path supplied by pytest.
    """
    index_root = tmp_path.joinpath("engram_index")
    index_root.mkdir()
    return index_root
def make_synthetic_kv(
    spec: ModelCacheSpec,
    ctx_len: int = 256,
    seed: int = 42,
) -> tuple[torch.Tensor, torch.Tensor]:
    """Create synthetic KV cache tensors with correct shapes.

    Values are random but reproducible via ``seed``. Sampling uses a private
    ``torch.Generator`` instead of ``torch.manual_seed`` so that calling this
    helper does not reseed the process-wide RNG and silently change the
    random numbers seen by whatever test code runs afterwards.

    Args:
        spec: Model cache spec; its ``"n_layers"``, ``"n_kv_heads"`` and
            ``"head_dim"`` entries determine the tensor shape.
        ctx_len: Number of token positions (third tensor dimension).
        seed: Seed for the private generator.

    Returns:
        ``(keys, values)``, each a float16 tensor of shape
        ``[n_layers, n_kv_heads, ctx_len, head_dim]``.
    """
    # A dedicated generator keeps the global RNG state untouched.
    # NOTE(review): torch.Generator() is a CPU generator; this assumes the
    # default tensor device is CPU, as is usual for test fixtures.
    generator = torch.Generator()
    generator.manual_seed(seed)

    shape = (spec["n_layers"], spec["n_kv_heads"], ctx_len, spec["head_dim"])
    # Keys are drawn before values so both depend on the same seeded stream.
    keys = torch.randn(shape, generator=generator, dtype=torch.float16)
    values = torch.randn(shape, generator=generator, dtype=torch.float16)
    return keys, values
@pytest.fixture
def llama_kv_256(llama_spec: ModelCacheSpec) -> tuple[torch.Tensor, torch.Tensor]:
    """Synthetic KV cache built from the Llama 3.1 8B spec, 256 tokens.

    Shape (as documented for this spec): [32, 8, 256, 128] for both keys
    and values. Uses the default seed of ``make_synthetic_kv``.
    """
    keys, values = make_synthetic_kv(llama_spec, ctx_len=256)
    return keys, values
@pytest.fixture
def llama_kv_1024(llama_spec: ModelCacheSpec) -> tuple[torch.Tensor, torch.Tensor]:
    """Synthetic KV cache built from the Llama 3.1 8B spec, 1024 tokens (seed 123)."""
    keys, values = make_synthetic_kv(llama_spec, ctx_len=1024, seed=123)
    return keys, values
@pytest.fixture
def phi3_kv_256(phi3_spec: ModelCacheSpec) -> tuple[torch.Tensor, torch.Tensor]:
    """Synthetic KV cache built from the Phi-3-Mini spec, 256 tokens (seed 99).

    Shape (as documented for this spec): [32, 32, 256, 96] for both keys
    and values.
    """
    keys, values = make_synthetic_kv(phi3_spec, ctx_len=256, seed=99)
    return keys, values
# ── ISWA Fixtures ────────────────────────────────────────────────────────────
def make_synthetic_iswa_blob(
    sections: tuple[CacheSection, ...],
    n_cells: int = 4,
    arch: str = "gemma4",
    v_trans: bool = True,
    seed: int = 42,
) -> bytes:
    """Assemble a synthetic ISWA state blob containing several KV cache sections.

    The layout follows the llama.cpp state blob format for ISWA models:
      1. Architecture string header (uint32 length + ASCII bytes)
      2. n_stream = len(sections)
      3. Per stream: cell metadata, then K and V data for every layer

    All integers are packed little-endian. Tensor payloads are random fp16
    values drawn from a single ``numpy`` RandomState seeded with ``seed``.

    Args:
        sections: Cache sections (e.g., global + SWA for Gemma 4).
        n_cells: Number of KV cells per section.
        arch: Architecture string in blob header.
        v_trans: Whether V tensors are stored transposed.
        seed: Random seed for reproducible data.

    Returns:
        The concatenated blob bytes.
    """
    import struct

    import numpy as np

    from kvcos.core.blob_parser import GGML_TYPE_F16

    rng = np.random.RandomState(seed)

    def random_fp16_bytes(count: int) -> bytes:
        # Every payload comes from the same RNG, so draw order matters.
        return rng.randn(count).astype(np.float16).tobytes()

    # Header: length-prefixed architecture string, then the stream count.
    chunks: list[bytes] = [
        struct.pack("<I", len(arch)),
        arch.encode("ascii"),
        struct.pack("<I", len(sections)),
    ]

    for section in sections:
        kv_width = section.n_kv_heads * section.head_dim
        fp16_row_bytes = kv_width * 2  # two bytes per fp16 element

        # Cell metadata: cell count, then (pos, n_seq_id=1, seq_id=0) per cell.
        chunks.append(struct.pack("<I", n_cells))
        chunks.extend(struct.pack("<iIi", pos, 1, 0) for pos in range(n_cells))

        # Data section header: V-transposed flag followed by the layer count.
        chunks.append(struct.pack("<II", 1 if v_trans else 0, section.n_layers))

        # K layers: type tag, row size, then the payload.
        for _ in range(section.n_layers):
            chunks.append(struct.pack("<iQ", GGML_TYPE_F16, fp16_row_bytes))
            chunks.append(random_fp16_bytes(n_cells * kv_width))

        # V layers: transposed storage records el_size (fp16 = 2) and
        # n_embd_v_gqa instead of a row size.
        for _ in range(section.n_layers):
            if v_trans:
                v_header = struct.pack("<iII", GGML_TYPE_F16, 2, kv_width)
            else:
                v_header = struct.pack("<iQ", GGML_TYPE_F16, fp16_row_bytes)
            chunks.append(v_header)
            chunks.append(random_fp16_bytes(n_cells * kv_width))

    return b"".join(chunks)
# Gemma 4 ISWA section constants (reverse-engineered).

# Full-attention section: 5 layers, 2 KV heads, head_dim 512.
GEMMA4_GLOBAL_SECTION = CacheSection(
    n_layers=5,
    n_kv_heads=2,
    head_dim=512,
    attention_type=AttentionType.FULL,
)

# Sliding-window section: 25 layers, 8 KV heads, head_dim 256, window 1024.
GEMMA4_SWA_SECTION = CacheSection(
    n_layers=25,
    n_kv_heads=8,
    head_dim=256,
    window_size=1024,
    attention_type=AttentionType.SLIDING,
)

# Section order used by the blob fixtures: global first, then SWA.
GEMMA4_SECTIONS = (
    GEMMA4_GLOBAL_SECTION,
    GEMMA4_SWA_SECTION,
)
@pytest.fixture
def gemma4_iswa_blob() -> bytes:
    """Synthetic ISWA blob built from ``GEMMA4_SECTIONS`` with 4 cells per section."""
    blob = make_synthetic_iswa_blob(GEMMA4_SECTIONS, n_cells=4)
    return blob
@pytest.fixture
def gemma4_iswa_blob_8cells() -> bytes:
    """Synthetic ISWA blob built from ``GEMMA4_SECTIONS`` with 8 cells per section (seed 99)."""
    blob = make_synthetic_iswa_blob(GEMMA4_SECTIONS, n_cells=8, seed=99)
    return blob
|