| """ |
| ENGRAM Protocol — llama-cpp-python Bridge |
| |
| |
| D1: llama-cpp-python direct. No Ollama. n_gpu_layers=0 for Phase 1. |
| |
| Provides: |
| - KV cache extraction via llama_state_seq_get_data() → blob_parser |
| - KV cache injection via llama_state_seq_set_data() for session restore |
| - TTFT measurement for benchmarking (D6: >10x at 16K) |
| - Model loading with architecture spec auto-detection |
| |
| WARNING: State blob format is llama.cpp version-dependent. |
| Pin llama-cpp-python version in pyproject.toml. |
| """ |
|
|
| from __future__ import annotations |
|
|
| import logging |
| import time |
| from dataclasses import dataclass |
| from pathlib import Path |
|
|
| import torch |
|
|
| logger = logging.getLogger(__name__) |
|
|
| from kvcos.core.blob_parser import ( |
| GGML_TYPE_F16, |
| GGML_TYPE_Q8_0, |
| ParsedKVCache, |
| ParsedMultiSectionCache, |
| parse_multi_section_blob, |
| parse_state_blob, |
| ) |
| from kvcos.core.cache_spec import ( |
| ModelCacheSpec, |
| get_model_spec, |
| is_iswa_spec, |
| make_spec_from_metadata, |
| ) |
|
|
|
|
| |
| |
| _METADATA_PREFIXES = ("llama", "gemma4", "gemma", "phi", "qwen", "mistral", "deepseek") |
|
|
|
|
| def _meta_get(metadata: dict, key_suffix: str, default: str = "0") -> str: |
| """Get a metadata value trying architecture-specific prefixes. |
| |
| Searches: llama.{suffix}, gemma4.{suffix}, gemma.{suffix}, etc. |
| Falls back to general.{suffix}, then default. |
| |
| Args: |
| metadata: llama.cpp model metadata dict. |
| key_suffix: Key without prefix, e.g. "block_count" or "attention.head_count". |
| default: Default if no key found. |
| """ |
| for prefix in _METADATA_PREFIXES: |
| val = metadata.get(f"{prefix}.{key_suffix}") |
| if val is not None: |
| return val |
| |
| val = metadata.get(f"general.{key_suffix}") |
| return val if val is not None else default |
|
|
|
|
@dataclass
class TTFTMeasurement:
    """Time-to-first-token measurement for benchmarking."""

    # Wall-clock time from generation start to completion of the first
    # token, in milliseconds.
    ttft_ms: float
    # Prompt length in tokens; 0 when not measured (e.g. cached restore).
    context_len: int
    # How the measurement was taken, e.g. "cold_prefill" or "cached_restore".
    method: str
    # Identifier of the model the measurement was taken on.
    model_id: str
|
|
|
|
class LlamaCppBridge:
    """Bridge between llama-cpp-python and ENGRAM's KV cache system.

    Handles model loading, KV cache extraction, and injection. The heavy
    llama_cpp import is deferred to load_model() so this module can be
    imported without the native library installed.

    Usage:
        bridge = LlamaCppBridge("/path/to/model.gguf")
        bridge.load_model()

        # Generate and extract KV state
        bridge.generate(prompt)
        parsed = bridge.extract_kv_cache()

        # Later: inject cached state
        bridge.inject_kv_cache(cached_blob, spec)
        bridge.generate("Continue from cached state:")
    """

    def __init__(
        self,
        model_path: str,
        n_ctx: int = 16384,
        n_gpu_layers: int = 0,
        kv_cache_type: str = "f16",
        verbose: bool = False,
    ):
        """Store configuration; the model is not loaded until load_model().

        Args:
            model_path: Path to the GGUF model file.
            n_ctx: Context window size in tokens.
            n_gpu_layers: Layers to offload to GPU (0 = CPU-only for Phase 1).
            kv_cache_type: KV cache element type label (e.g. "f16").
            verbose: Forwarded to llama_cpp; also enables spec logging.
        """
        self.model_path = model_path
        self.n_ctx = n_ctx
        self.n_gpu_layers = n_gpu_layers
        self.kv_cache_type = kv_cache_type
        self.verbose = verbose
        self._llm = None
        self._spec: ModelCacheSpec | None = None

    def load_model(self) -> ModelCacheSpec:
        """Load the GGUF model and auto-detect architecture spec.

        Prefers a hand-curated spec from the registry; otherwise derives
        one from GGUF metadata with conservative fallbacks.

        Returns the ModelCacheSpec for this model.
        """
        from llama_cpp import Llama

        self._llm = Llama(
            model_path=self.model_path,
            n_ctx=self.n_ctx,
            n_gpu_layers=self.n_gpu_layers,
            verbose=self.verbose,
        )

        metadata = self._llm.metadata
        model_name = metadata.get("general.name", Path(self.model_path).stem)

        # Registry specs are authoritative; metadata-derived is the fallback.
        registry_spec = get_model_spec(model_name)
        if registry_spec is not None:
            self._spec = registry_spec
        else:
            self._spec = self._spec_from_metadata(metadata, model_name)

        if self.verbose:
            self._log_spec(model_name)

        return self._spec

    @staticmethod
    def _spec_from_metadata(metadata: dict, model_name: str) -> ModelCacheSpec:
        """Build a ModelCacheSpec from GGUF metadata with safe defaults."""
        n_layers = int(_meta_get(metadata, "block_count", "32"))
        n_heads = int(_meta_get(metadata, "attention.head_count", "32"))
        # GQA models carry a separate KV head count; default to MHA.
        n_kv_heads = int(_meta_get(metadata, "attention.head_count_kv", str(n_heads)))
        embed_dim = int(_meta_get(metadata, "embedding_length", "4096"))
        # Guard against degenerate metadata (head count of 0).
        head_dim = embed_dim // n_heads if n_heads > 0 else 128

        return make_spec_from_metadata(
            model_id=model_name,
            n_layers=n_layers,
            n_heads=n_heads,
            n_kv_heads=n_kv_heads,
            head_dim=head_dim,
            rope_enabled=True,
        )

    def _log_spec(self, model_name: str) -> None:
        """Log the loaded model's architecture spec (verbose mode only)."""
        logger.info("Loaded model: %s", model_name)
        logger.info(
            " Layers: %d, KV Heads: %d, Head Dim: %d",
            self._spec["n_layers"], self._spec["n_kv_heads"], self._spec["head_dim"],
        )
        logger.info(" Context: %d, GPU Layers: %d", self.n_ctx, self.n_gpu_layers)
        if is_iswa_spec(self._spec):
            sections = self._spec["cache_sections"]
            logger.info(" ISWA: %d cache sections", len(sections))
            for i, s in enumerate(sections):
                logger.info(
                    " Section %d: %s — %d layers, %d KV heads, head_dim=%d",
                    i, s.attention_type, s.n_layers, s.n_kv_heads, s.head_dim,
                )

    @property
    def spec(self) -> ModelCacheSpec:
        """The detected ModelCacheSpec.

        Raises:
            RuntimeError: If load_model() has not been called.
        """
        if self._spec is None:
            raise RuntimeError("Model not loaded. Call load_model() first.")
        return self._spec

    @property
    def llm(self):
        """The underlying llama_cpp.Llama instance.

        Raises:
            RuntimeError: If load_model() has not been called.
        """
        if self._llm is None:
            raise RuntimeError("Model not loaded. Call load_model() first.")
        return self._llm

    def generate(
        self,
        prompt: str,
        max_tokens: int = 1,
        temperature: float = 0.0,
    ) -> tuple[str, float]:
        """Generate tokens and return (output_text, ttft_ms).

        With max_tokens=1, this effectively does a prefill + one decode step,
        which is what we need for TTFT measurement. NOTE: with max_tokens > 1
        the returned time covers the whole generation, not just the first
        token.

        Args:
            prompt: Text to prefill before generation.
            max_tokens: Number of tokens to generate (default 1 for TTFT).
            temperature: Sampling temperature (0.0 = greedy).
        """
        t0 = time.perf_counter()
        output = self.llm(
            prompt,
            max_tokens=max_tokens,
            temperature=temperature,
        )
        t1 = time.perf_counter()

        ttft_ms = (t1 - t0) * 1000
        text = output["choices"][0]["text"]
        return text, ttft_ms

    def _save_state_blob(self) -> bytes:
        """Snapshot the current llama.cpp state as a raw byte blob.

        WARNING: blob layout is llama.cpp version-dependent (see module
        docstring); parse only with the matching blob_parser version.
        """
        state_data = self.llm.save_state()
        return bytes(state_data.llama_state)

    def extract_kv_cache(self, seq_id: int = 0) -> ParsedKVCache:
        """Extract the current KV cache as structured tensors.

        For standard models: returns ParsedKVCache.
        For ISWA models: parses only the first (global) section.
        Use extract_kv_cache_iswa() for full multi-section extraction.

        Args:
            seq_id: Sequence ID to extract (default 0 for single-sequence use)

        Returns:
            ParsedKVCache with [n_layers, n_kv_heads, seq_len, head_dim] tensors
        """
        # NOTE(review): seq_id is currently unused — save_state() snapshots
        # the whole context. Kept for forward API compatibility; wire it up
        # if per-sequence extraction is ever needed.
        blob = self._save_state_blob()

        if is_iswa_spec(self.spec):
            # ISWA: the first cache section describes the global-attention
            # layers; parse only that portion of the blob.
            first = self.spec["cache_sections"][0]
            return parse_state_blob(
                blob,
                n_kv_heads=first.n_kv_heads,
                head_dim=first.head_dim,
            )

        return parse_state_blob(
            blob,
            n_kv_heads=self.spec["n_kv_heads"],
            head_dim=self.spec["head_dim"],
        )

    def extract_kv_cache_iswa(self) -> ParsedMultiSectionCache:
        """Extract all ISWA cache sections as structured tensors.

        Only valid for ISWA models (those with cache_sections in spec).

        Returns:
            ParsedMultiSectionCache with one ParsedKVCache per section.

        Raises:
            RuntimeError: If model is not ISWA.
        """
        if not is_iswa_spec(self.spec):
            raise RuntimeError(
                f"extract_kv_cache_iswa() requires an ISWA model, "
                f"but {self.spec['model_id']} has no cache_sections"
            )

        blob = self._save_state_blob()
        return parse_multi_section_blob(blob, self.spec["cache_sections"])

    def inject_kv_cache(self, state_data: bytes) -> float:
        """Inject a previously saved KV cache state, returning restore time in ms.

        Args:
            state_data: Raw state blob (as returned by save_state / extracted earlier)

        Returns:
            Restore time in milliseconds
        """
        from llama_cpp import LlamaState

        t0 = time.perf_counter()

        # NOTE(review): LlamaState's constructor signature is
        # llama-cpp-python version-dependent (pin the version, per module
        # docstring) — verify these fields against the pinned release.
        state = LlamaState(
            input_ids=[],
            scores=[],
            llama_state=list(state_data),
            llama_state_size=len(state_data),
        )
        self.llm.load_state(state)

        t1 = time.perf_counter()
        return (t1 - t0) * 1000

    def measure_cold_ttft(self, prompt: str) -> TTFTMeasurement:
        """Measure cold TTFT (full prefill from scratch).

        Resets the KV cache before generation.

        Args:
            prompt: Text to prefill; its token count becomes context_len.
        """
        self.llm.reset()

        tokens = self.llm.tokenize(prompt.encode())
        _, ttft_ms = self.generate(prompt, max_tokens=1)

        return TTFTMeasurement(
            ttft_ms=ttft_ms,
            context_len=len(tokens),
            method="cold_prefill",
            model_id=self.spec["model_id"],
        )

    def measure_cached_ttft(self, state_data: bytes, continuation: str = " ") -> TTFTMeasurement:
        """Measure cached TTFT (restore from saved state + generate).

        The reported time includes both the state restore and the first
        generated token. context_len is 0 because the restored context
        length is not known from the blob alone.

        Args:
            state_data: Saved state blob to restore from
            continuation: Text to generate after restore

        Returns:
            TTFTMeasurement with restore + first token time
        """
        self.llm.reset()

        t0 = time.perf_counter()
        self.inject_kv_cache(state_data)
        output = self.llm(continuation, max_tokens=1, temperature=0.0)
        t1 = time.perf_counter()

        ttft_ms = (t1 - t0) * 1000

        return TTFTMeasurement(
            ttft_ms=ttft_ms,
            context_len=0,
            method="cached_restore",
            model_id=self.spec["model_id"],
        )

    def close(self) -> None:
        """Release model resources.

        Calls the underlying Llama.close() when the installed
        llama-cpp-python version provides it (instead of relying on GC),
        then drops all references so the bridge can be reloaded.
        """
        if self._llm is not None:
            close_fn = getattr(self._llm, "close", None)
            if callable(close_fn):
                close_fn()
        self._llm = None
        self._spec = None
|
|