eigengram committed on
Commit
57f4e4d
·
verified ·
1 Parent(s): 954cf8a

feat: upload integrations (llama.cpp bridge)

Browse files
integrations/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+ """ENGRAM Protocol — Integration bridges for external LLM runtimes."""
integrations/llama_cpp_bridge.py ADDED
@@ -0,0 +1,329 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ ENGRAM Protocol — llama-cpp-python Bridge
3
+
4
+
5
+ D1: llama-cpp-python direct. No Ollama. n_gpu_layers=0 for Phase 1.
6
+
7
+ Provides:
8
+ - KV cache extraction via llama_state_seq_get_data() → blob_parser
9
+ - KV cache injection via llama_state_seq_set_data() for session restore
10
+ - TTFT measurement for benchmarking (D6: >10x at 16K)
11
+ - Model loading with architecture spec auto-detection
12
+
13
+ WARNING: State blob format is llama.cpp version-dependent.
14
+ Pin llama-cpp-python version in pyproject.toml.
15
+ """
16
+
17
+ from __future__ import annotations
18
+
19
+ import logging
20
+ import time
21
+ from dataclasses import dataclass
22
+ from pathlib import Path
23
+
24
+ import torch
25
+
26
+ logger = logging.getLogger(__name__)
27
+
28
+ from kvcos.core.blob_parser import (
29
+ GGML_TYPE_F16,
30
+ GGML_TYPE_Q8_0,
31
+ ParsedKVCache,
32
+ ParsedMultiSectionCache,
33
+ parse_multi_section_blob,
34
+ parse_state_blob,
35
+ )
36
+ from kvcos.core.cache_spec import (
37
+ ModelCacheSpec,
38
+ get_model_spec,
39
+ is_iswa_spec,
40
+ make_spec_from_metadata,
41
+ )
42
+
43
+
44
+ # Metadata key prefixes in order of preference per architecture.
45
+ # llama.cpp uses architecture-specific keys (e.g., gemma4.block_count).
46
+ _METADATA_PREFIXES = ("llama", "gemma4", "gemma", "phi", "qwen", "mistral", "deepseek")
47
+
48
+
49
+ def _meta_get(metadata: dict, key_suffix: str, default: str = "0") -> str:
50
+ """Get a metadata value trying architecture-specific prefixes.
51
+
52
+ Searches: llama.{suffix}, gemma4.{suffix}, gemma.{suffix}, etc.
53
+ Falls back to general.{suffix}, then default.
54
+
55
+ Args:
56
+ metadata: llama.cpp model metadata dict.
57
+ key_suffix: Key without prefix, e.g. "block_count" or "attention.head_count".
58
+ default: Default if no key found.
59
+ """
60
+ for prefix in _METADATA_PREFIXES:
61
+ val = metadata.get(f"{prefix}.{key_suffix}")
62
+ if val is not None:
63
+ return val
64
+ # Fall back to general.*
65
+ val = metadata.get(f"general.{key_suffix}")
66
+ return val if val is not None else default
67
+
68
+
69
@dataclass
class TTFTMeasurement:
    """A single time-to-first-token (TTFT) benchmark sample."""

    ttft_ms: float  # wall-clock time to the first generated token, in milliseconds
    context_len: int  # prompt tokens prefilled (0 when restored from a cached state)
    method: str  # which path produced this sample: "cold_prefill" or "cached_restore"
    model_id: str  # identifier of the model under measurement
77
+
78
+
79
class LlamaCppBridge:
    """Bridge between llama-cpp-python and ENGRAM's KV cache system.

    Handles model loading, KV cache extraction, and injection, plus TTFT
    benchmarking helpers. Supports the context-manager protocol so model
    resources are released even if benchmarking raises.

    Usage:
        bridge = LlamaCppBridge("/path/to/model.gguf")
        bridge.load_model()

        # Generate and extract KV state
        bridge.generate(prompt)
        parsed = bridge.extract_kv_cache()

        # Later: inject cached state (raw blob only — the spec is not needed)
        bridge.inject_kv_cache(cached_blob)
        bridge.generate("Continue from cached state:")
    """

    def __init__(
        self,
        model_path: str,
        n_ctx: int = 16384,
        n_gpu_layers: int = 0,  # D1: CPU-only Phase 1
        kv_cache_type: str = "f16",  # "f16" or "q8_0"
        verbose: bool = False,
    ):
        """Configure the bridge; the model is not loaded until load_model().

        Args:
            model_path: Path to the GGUF model file.
            n_ctx: Context window size in tokens.
            n_gpu_layers: Layers offloaded to GPU (0 = CPU-only per D1).
            kv_cache_type: Desired KV cache dtype, "f16" or "q8_0".
                NOTE(review): stored but not currently forwarded to Llama()
                (no type_k/type_v argument in load_model) — confirm wiring.
            verbose: If True, pass verbosity through to llama.cpp and log a
                model summary after loading.
        """
        self.model_path = model_path
        self.n_ctx = n_ctx
        self.n_gpu_layers = n_gpu_layers
        self.kv_cache_type = kv_cache_type
        self.verbose = verbose
        self._llm = None  # lazily created llama_cpp.Llama instance
        self._spec: ModelCacheSpec | None = None  # set by load_model()

    def __enter__(self) -> "LlamaCppBridge":
        """Context-manager entry. Does NOT load the model automatically."""
        return self

    def __exit__(self, exc_type, exc, tb) -> None:
        """Release model resources on context exit (exceptions propagate)."""
        self.close()

    def load_model(self) -> ModelCacheSpec:
        """Load the GGUF model and auto-detect its architecture spec.

        Checks the ENGRAM spec registry first (which knows about ISWA
        models with cache_sections); otherwise derives a spec from the
        llama.cpp metadata via the prefix fallback chain in _meta_get().

        Returns:
            The ModelCacheSpec for this model.
        """
        # Imported lazily so the bridge can be constructed without
        # llama-cpp-python installed (e.g. for unit tests).
        from llama_cpp import Llama

        self._llm = Llama(
            model_path=self.model_path,
            n_ctx=self.n_ctx,
            n_gpu_layers=self.n_gpu_layers,
            verbose=self.verbose,
        )

        # Auto-detect model architecture from llama.cpp metadata.
        # Uses fallback chain across architecture prefixes (llama.*, gemma4.*, etc.)
        metadata = self._llm.metadata
        model_name = metadata.get("general.name", Path(self.model_path).stem)

        # Check registry first (handles ISWA specs with cache_sections).
        registry_spec = get_model_spec(model_name)
        if registry_spec is not None:
            self._spec = registry_spec
        else:
            n_layers = int(_meta_get(metadata, "block_count", "32"))
            n_heads = int(_meta_get(metadata, "attention.head_count", "32"))
            n_kv_heads = int(_meta_get(metadata, "attention.head_count_kv", str(n_heads)))
            embed_dim = int(_meta_get(metadata, "embedding_length", "4096"))
            # Guard against a zero/missing head count; 128 is a common head_dim.
            head_dim = embed_dim // n_heads if n_heads > 0 else 128

            self._spec = make_spec_from_metadata(
                model_id=model_name,
                n_layers=n_layers,
                n_heads=n_heads,
                n_kv_heads=n_kv_heads,
                head_dim=head_dim,
                rope_enabled=True,
            )

        if self.verbose:
            logger.info("Loaded model: %s", model_name)
            logger.info(
                " Layers: %d, KV Heads: %d, Head Dim: %d",
                self._spec["n_layers"], self._spec["n_kv_heads"], self._spec["head_dim"],
            )
            logger.info(" Context: %d, GPU Layers: %d", self.n_ctx, self.n_gpu_layers)
            if is_iswa_spec(self._spec):
                sections = self._spec["cache_sections"]
                logger.info(" ISWA: %d cache sections", len(sections))
                for i, s in enumerate(sections):
                    logger.info(
                        " Section %d: %s — %d layers, %d KV heads, head_dim=%d",
                        i, s.attention_type, s.n_layers, s.n_kv_heads, s.head_dim,
                    )

        return self._spec

    @property
    def spec(self) -> ModelCacheSpec:
        """The detected model spec.

        Raises:
            RuntimeError: If load_model() has not been called.
        """
        if self._spec is None:
            raise RuntimeError("Model not loaded. Call load_model() first.")
        return self._spec

    @property
    def llm(self):
        """The underlying llama_cpp.Llama instance.

        Raises:
            RuntimeError: If load_model() has not been called.
        """
        if self._llm is None:
            raise RuntimeError("Model not loaded. Call load_model() first.")
        return self._llm

    def generate(
        self,
        prompt: str,
        max_tokens: int = 1,
        temperature: float = 0.0,
    ) -> tuple[str, float]:
        """Generate tokens and return (output_text, ttft_ms).

        With max_tokens=1, this effectively does a prefill + one decode step,
        which is what we need for TTFT measurement.
        """
        t0 = time.perf_counter()
        output = self.llm(
            prompt,
            max_tokens=max_tokens,
            temperature=temperature,
        )
        t1 = time.perf_counter()

        ttft_ms = (t1 - t0) * 1000
        text = output["choices"][0]["text"]
        return text, ttft_ms

    def extract_kv_cache(self, seq_id: int = 0) -> ParsedKVCache:
        """Extract the current KV cache as structured tensors.

        For standard models: returns ParsedKVCache.
        For ISWA models: parses only the first (global) section.
        Use extract_kv_cache_iswa() for full multi-section extraction.

        Args:
            seq_id: Sequence ID to extract. NOTE(review): currently unused —
                save_state() snapshots the whole context, not one sequence.
                Retained for forward compatibility with per-sequence extraction.

        Returns:
            ParsedKVCache with [n_layers, n_kv_heads, seq_len, head_dim] tensors
        """
        state_data = self.llm.save_state()
        blob = bytes(state_data.llama_state)

        if is_iswa_spec(self.spec):
            # For backward compat, parse just the first section.
            sections = self.spec["cache_sections"]
            first = sections[0]
            return parse_state_blob(
                blob,
                n_kv_heads=first.n_kv_heads,
                head_dim=first.head_dim,
            )

        return parse_state_blob(
            blob,
            n_kv_heads=self.spec["n_kv_heads"],
            head_dim=self.spec["head_dim"],
        )

    def extract_kv_cache_iswa(self) -> ParsedMultiSectionCache:
        """Extract all ISWA cache sections as structured tensors.

        Only valid for ISWA models (those with cache_sections in spec).

        Returns:
            ParsedMultiSectionCache with one ParsedKVCache per section.

        Raises:
            RuntimeError: If model is not ISWA.
        """
        if not is_iswa_spec(self.spec):
            raise RuntimeError(
                f"extract_kv_cache_iswa() requires an ISWA model, "
                f"but {self.spec['model_id']} has no cache_sections"
            )

        state_data = self.llm.save_state()
        blob = bytes(state_data.llama_state)

        return parse_multi_section_blob(blob, self.spec["cache_sections"])

    def inject_kv_cache(self, state_data: bytes) -> float:
        """Inject a previously saved KV cache state, returning restore time in ms.

        Args:
            state_data: Raw state blob (as returned by save_state / extracted earlier)

        Returns:
            Restore time in milliseconds
        """
        from llama_cpp import LlamaState

        t0 = time.perf_counter()

        state = LlamaState(
            input_ids=[],  # Will be overridden by the state
            scores=[],
            llama_state=list(state_data),
            llama_state_size=len(state_data),
        )
        self.llm.load_state(state)

        t1 = time.perf_counter()
        return (t1 - t0) * 1000

    def measure_cold_ttft(self, prompt: str) -> TTFTMeasurement:
        """Measure cold TTFT (full prefill from scratch).

        Resets the KV cache before generation; context_len is the
        tokenized prompt length.
        """
        self.llm.reset()

        tokens = self.llm.tokenize(prompt.encode())
        _, ttft_ms = self.generate(prompt, max_tokens=1)

        return TTFTMeasurement(
            ttft_ms=ttft_ms,
            context_len=len(tokens),
            method="cold_prefill",
            model_id=self.spec["model_id"],
        )

    def measure_cached_ttft(self, state_data: bytes, continuation: str = " ") -> TTFTMeasurement:
        """Measure cached TTFT (restore from saved state + generate).

        Args:
            state_data: Saved state blob to restore from
            continuation: Text to generate after restore

        Returns:
            TTFTMeasurement with restore + first token time
        """
        self.llm.reset()

        t0 = time.perf_counter()
        self.inject_kv_cache(state_data)
        # Output text is irrelevant here — only the elapsed time matters.
        self.llm(continuation, max_tokens=1, temperature=0.0)
        t1 = time.perf_counter()

        ttft_ms = (t1 - t0) * 1000

        return TTFTMeasurement(
            ttft_ms=ttft_ms,
            context_len=0,  # Not re-prefilling
            method="cached_restore",
            model_id=self.spec["model_id"],
        )

    def close(self) -> None:
        """Release model resources.

        Calls the underlying Llama.close() when available (newer
        llama-cpp-python releases expose it) before dropping references,
        rather than relying solely on garbage collection. Idempotent.
        """
        if self._llm is not None and hasattr(self._llm, "close"):
            self._llm.close()
        self._llm = None
        self._spec = None
+ self._spec = None