eigengram committed on
Commit
0769ff3
·
verified ·
1 Parent(s): 2a3efd4

feat: upload core kvcos library

Browse files
kvcos/.DS_Store ADDED
Binary file (6.15 kB). View file
 
kvcos/__init__.py ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ """ENGRAM Protocol — KV cache fingerprinting for persistent semantic retrieval."""
2
+
3
+ from kvcos.core.types import ENGRAM_VERSION
4
+
5
+ __version__ = ENGRAM_VERSION
kvcos/api/__init__.py ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ """ENGRAM Protocol — REST API package."""
2
+
3
+ from kvcos.api.server import create_app
4
+
5
+ __all__ = ["create_app"]
kvcos/api/routes.py ADDED
@@ -0,0 +1,211 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ ENGRAM Protocol — API Routes
3
+
4
+
5
+ FastAPI route handlers for the ENGRAM REST API.
6
+ All endpoints under /v1/ prefix.
7
+ """
8
+
9
+ from __future__ import annotations
10
+
11
+ from fastapi import APIRouter, HTTPException, UploadFile, File
12
+
13
+ from kvcos.api.schemas import (
14
+ DeleteResponse,
15
+ HealthResponse,
16
+ SearchRequest,
17
+ SearchResponse,
18
+ SearchResultItem,
19
+ StatsResponse,
20
+ StoreResponse,
21
+ )
22
+ from kvcos.core.types import ENGRAM_VERSION
23
+
24
# All ENGRAM endpoints live under the /v1 prefix.
router = APIRouter(prefix="/v1")


# ── Dependency stubs ──────────────────────────────────────────────────────────
# These are replaced by real instances in server.py lifespan.
# Using module-level state that the server sets during startup.

# Set by server.lifespan to an EGRRetriever instance; None until startup completes.
_retriever = None
# Set by server.lifespan to a LocalStorageBackend instance.
_storage = None
# Set by server.lifespan to a ManifoldIndex instance.
_index = None
34
+
35
+
36
def _get_retriever():
    """Return the retriever wired in by server.py, or fail 503 during startup."""
    if _retriever is not None:
        return _retriever
    raise HTTPException(503, "ENGRAM not initialized. Server starting up.")
40
+
41
+
42
def _get_storage():
    """Return the storage backend wired in by server.py, or fail 503 during startup."""
    if _storage is not None:
        return _storage
    raise HTTPException(503, "ENGRAM not initialized. Server starting up.")
46
+
47
+
48
def _get_index():
    """Return the manifold index wired in by server.py, or fail 503 during startup."""
    if _index is not None:
        return _index
    raise HTTPException(503, "ENGRAM not initialized. Server starting up.")
52
+
53
+
54
+ # ── Health ────────────────────────────────────────────────────────────────────
55
+
56
+
57
+ @router.get("/health", response_model=HealthResponse)
58
+ async def health():
59
+ """Health check endpoint."""
60
+ index = _get_index()
61
+ storage = _get_storage()
62
+ return HealthResponse(
63
+ status="ok",
64
+ version=ENGRAM_VERSION,
65
+ index_entries=index.n_entries,
66
+ storage_backend="local",
67
+ )
68
+
69
+
70
+ # ── Stats (must come before /cache/{cache_id} to avoid route shadowing) ──────
71
+
72
+
73
+ @router.get("/cache/stats", response_model=StatsResponse)
74
+ async def cache_stats():
75
+ """Get aggregate statistics for the engram store."""
76
+ storage = _get_storage()
77
+ stats = storage.stats()
78
+ return StatsResponse(
79
+ total_entries=stats["total_entries"],
80
+ total_size_bytes=stats["total_size_bytes"],
81
+ total_size_mb=round(stats["total_size_bytes"] / (1024 * 1024), 2),
82
+ avg_compression_ratio=stats["avg_compression_ratio"],
83
+ model_breakdown=stats["model_breakdown"],
84
+ )
85
+
86
+
87
+ # ── Store ─────────────────────────────────────────────────────────────────────
88
+
89
+
90
+ @router.post("/cache", response_model=StoreResponse)
91
+ async def store_cache(
92
+ agent_id: str,
93
+ task_description: str,
94
+ model_id: str,
95
+ file: UploadFile = File(...),
96
+ compression: str = "q8_0",
97
+ ):
98
+ """Store a .eng file in the engram store.
99
+
100
+ Accepts a pre-serialized .eng file upload.
101
+ The file is stored and its metadata indexed for EGR retrieval.
102
+ """
103
+ storage = _get_storage()
104
+
105
+ data = await file.read()
106
+ if len(data) == 0:
107
+ raise HTTPException(400, "Empty file upload")
108
+
109
+ import uuid
110
+ cache_id = str(uuid.uuid4())
111
+
112
+ from kvcos.core.types import EngramMetadata
113
+ from datetime import datetime, timezone
114
+
115
+ metadata: EngramMetadata = {
116
+ "engram_version": ENGRAM_VERSION,
117
+ "cache_id": cache_id,
118
+ "compression": compression,
119
+ "model_id": model_id,
120
+ "model_family": "",
121
+ "n_layers": "0",
122
+ "n_heads": "0",
123
+ "n_kv_heads": "0",
124
+ "head_dim": "0",
125
+ "context_len": "0",
126
+ "agent_id": agent_id,
127
+ "task_description": task_description,
128
+ "created_at": datetime.now(timezone.utc).isoformat(),
129
+ }
130
+
131
+ path = storage.store(cache_id, data, metadata)
132
+
133
+ return StoreResponse(
134
+ cache_id=cache_id,
135
+ size_bytes=len(data),
136
+ compression_ratio=1.0,
137
+ path=path,
138
+ )
139
+
140
+
141
+ # ── Retrieve by ID ────────────────────────────────────────────────────────────
142
+
143
+
144
+ @router.get("/cache/{cache_id}")
145
+ async def get_cache(cache_id: str):
146
+ """Retrieve a .eng file by cache ID.
147
+
148
+ Returns the raw .eng file bytes (application/octet-stream).
149
+ """
150
+ storage = _get_storage()
151
+
152
+ data = storage.get(cache_id)
153
+ if data is None:
154
+ raise HTTPException(404, f"Cache entry not found: {cache_id}")
155
+
156
+ from fastapi.responses import Response
157
+ return Response(
158
+ content=data,
159
+ media_type="application/octet-stream",
160
+ headers={"Content-Disposition": f'attachment; filename="{cache_id}.eng"'},
161
+ )
162
+
163
+
164
+ # ── Search ────────────────────────────────────────────────────────────────────
165
+
166
+
167
+ @router.post("/cache/search", response_model=SearchResponse)
168
+ async def search_cache(req: SearchRequest):
169
+ """Search for similar engram states via EGR manifold search.
170
+
171
+ Uses inner product similarity (MIPS) in the model's pre-RoPE
172
+ key manifold. D2: K→K retrieval only.
173
+ """
174
+ index = _get_index()
175
+
176
+ # For text-only search without a KV query vector, we need the
177
+ # retriever to extract a state vector first. This endpoint
178
+ # currently returns index entries matching by metadata filter.
179
+ # Full EGR vector search requires a query KV cache (via /egr/retrieve).
180
+
181
+ # Metadata-based listing with optional filters
182
+ storage = _get_storage()
183
+ entries = storage.list_entries(model_family=None, limit=req.top_k)
184
+
185
+ results = [
186
+ SearchResultItem(
187
+ cache_id=e.get("cache_id", ""),
188
+ similarity=0.0,
189
+ task_description=e.get("task_description", ""),
190
+ model_id=e.get("model_id", ""),
191
+ created_at=e.get("created_at", ""),
192
+ context_len=int(e.get("context_len", "0")),
193
+ )
194
+ for e in entries
195
+ if (req.model_id is None or e.get("model_id") == req.model_id)
196
+ ]
197
+
198
+ return SearchResponse(results=results[:req.top_k], n_searched=index.n_entries)
199
+
200
+
201
+ # ── Delete ────────────────────────────────────────────────────────────────────
202
+
203
+
204
+ @router.delete("/cache/{cache_id}", response_model=DeleteResponse)
205
+ async def delete_cache(cache_id: str):
206
+ """Delete an engram from storage and index."""
207
+ retriever = _get_retriever()
208
+ deleted = retriever.delete_engram(cache_id)
209
+ return DeleteResponse(deleted=deleted, cache_id=cache_id)
210
+
211
+
kvcos/api/schemas.py ADDED
@@ -0,0 +1,88 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ ENGRAM Protocol — API Schemas
3
+
4
+
5
+ Pydantic models for all REST API request/response payloads.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ from pydantic import BaseModel, Field
11
+
12
+
13
+ # ── Store ─────────────────────────────────────────────────────────────────────
14
+
15
+
16
class StoreRequest(BaseModel):
    """Metadata describing a .eng upload.

    NOTE(review): routes.py's store endpoint currently takes these fields as
    query parameters rather than this body model — confirm intended usage.
    """

    agent_id: str  # owning agent identifier
    task_description: str  # human-readable task summary
    model_id: str  # model that produced the KV cache
    compression: str = "q8_0"  # compression method label
21
+
22
+
23
class StoreResponse(BaseModel):
    """Result of storing a .eng file (see routes.store_cache)."""

    cache_id: str  # server-generated UUID for the stored engram
    size_bytes: int  # size of the uploaded payload
    compression_ratio: float  # currently always 1.0 for raw uploads
    path: str  # storage path returned by the backend
28
+
29
+
30
+ # ── Retrieve ──────────────────────────────────────────────────────────────────
31
+
32
+
33
class SearchRequest(BaseModel):
    """Query body for /cache/search."""

    task_description: str  # free-text description of the task to match
    model_id: str | None = None  # optional exact model_id filter
    top_k: int = Field(default=5, ge=1, le=100)  # max results to return
    # NOTE(review): not yet applied by routes.search_cache in this commit.
    min_similarity: float | None = None
38
+
39
+
40
class SearchResultItem(BaseModel):
    """One match returned by /cache/search."""

    cache_id: str  # engram identifier usable with GET /cache/{cache_id}
    similarity: float  # currently hardcoded to 0.0 by routes.search_cache
    task_description: str
    model_id: str
    created_at: str  # ISO-8601 timestamp string
    context_len: int
47
+
48
+
49
class SearchResponse(BaseModel):
    """Envelope for /cache/search results."""

    results: list[SearchResultItem]
    n_searched: int  # total entries in the manifold index at query time
52
+
53
+
54
+ # ── Extend ────────────────────────────────────────────────────────────────────
55
+
56
+
57
class ExtendResponse(BaseModel):
    """Result of extending an engram's context.

    NOTE(review): no /extend route exists in routes.py in this commit —
    presumably reserved for a future endpoint.
    """

    cache_id: str
    new_context_len: int
60
+
61
+
62
+ # ── Delete ────────────────────────────────────────────────────────────────────
63
+
64
+
65
class DeleteResponse(BaseModel):
    """Result of DELETE /cache/{cache_id}."""

    deleted: bool  # whether the retriever actually removed the engram
    cache_id: str  # echo of the requested ID
68
+
69
+
70
+ # ── Stats ─────────────────────────────────────────────────────────────────────
71
+
72
+
73
class StatsResponse(BaseModel):
    """Aggregate store statistics from GET /cache/stats."""

    total_entries: int
    total_size_bytes: int
    total_size_mb: float  # total_size_bytes / 1 MiB, rounded to 2 decimals
    avg_compression_ratio: float
    model_breakdown: dict[str, int]  # entry count keyed by model
79
+
80
+
81
+ # ── Health ────────────────────────────────────────────────────────────────────
82
+
83
+
84
class HealthResponse(BaseModel):
    """Payload for GET /health."""

    status: str = "ok"
    version: str  # ENGRAM_VERSION of the running server
    index_entries: int  # vectors currently in the manifold index
    storage_backend: str  # e.g. "local"
kvcos/api/server.py ADDED
@@ -0,0 +1,126 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ ENGRAM Protocol — ENGRAM Server
3
+
4
+
5
+ FastAPI application factory with lifespan management.
6
+ Initializes storage, index, extractor, and retriever on startup.
7
+ """
8
+
9
+ from __future__ import annotations
10
+
11
+ import logging
12
+ from contextlib import asynccontextmanager
13
+
14
+ from fastapi import FastAPI
15
+
16
+ from kvcos.api import routes
17
+ from kvcos.core.config import get_config
18
+ from kvcos.core.serializer import EngramSerializer
19
+ from kvcos.core.types import ENGRAM_VERSION, StateExtractionMode
20
+ from kvcos.core.manifold_index import ManifoldIndex
21
+ from kvcos.core.retriever import EGRRetriever
22
+ from kvcos.core.state_extractor import MARStateExtractor
23
+ from kvcos.storage.local import LocalStorageBackend
24
+
25
+ logger = logging.getLogger(__name__)
26
+
27
+
28
@asynccontextmanager
async def lifespan(app: FastAPI):
    """Initialize ENGRAM components on startup, clean up on shutdown.

    Construction order matters: storage and index are built first because
    the retriever composes them; route handlers are wired last so a request
    can never observe a partially-constructed retriever.
    """
    config = get_config()

    # Initialize storage backend
    storage = LocalStorageBackend(data_dir=config.data_dir)

    # Initialize EGR manifold index
    index_path = config.index_dir / "egr.faiss"
    index = ManifoldIndex(dim=config.state_vec_dim, index_path=index_path)

    # Initialize state extractor
    extractor = MARStateExtractor(
        mode=StateExtractionMode.SVD_PROJECT,
        rank=config.state_vec_dim,
    )

    # Initialize retriever
    serializer = EngramSerializer()
    retriever = EGRRetriever(
        extractor=extractor,
        index=index,
        storage=storage,
        serializer=serializer,
    )

    # Wire into route handlers — routes.py reads these module globals
    # through its _get_* helpers and 503s until they are set.
    routes._storage = storage
    routes._index = index
    routes._retriever = retriever

    logger.info("ENGRAM v%s started", ENGRAM_VERSION)
    logger.info(" Storage: %s (%d entries)", config.data_dir, storage.stats()["total_entries"])
    logger.info(" Index: %s (%d vectors, dim=%d)", config.index_dir, index.n_entries, config.state_vec_dim)
    logger.info(" Backend: %s", config.backend.value)

    yield

    # Shutdown: persist index. Best-effort — a save failure must not block
    # the rest of teardown, so it is logged and swallowed.
    try:
        index.save(index_path)
        logger.info("Index saved to %s", index_path)
    except Exception as e:
        logger.warning("Failed to save index: %s", e)

    # Clear route references so post-shutdown requests fail fast with 503.
    routes._storage = None
    routes._index = None
    routes._retriever = None

    logger.info("ENGRAM shutdown complete")
80
+
81
+
82
def create_app() -> FastAPI:
    """Create the ENGRAM FastAPI application.

    Returns a fresh app with the /v1 router mounted and the lifespan
    handler attached; component wiring happens inside `lifespan`.
    """
    application = FastAPI(
        title="ENGRAM Protocol API",
        description="ENGRAM Protocol: Cognitive state, persisted.",
        version=ENGRAM_VERSION,
        lifespan=lifespan,
        docs_url="/docs",
        redoc_url="/redoc",
    )
    application.include_router(routes.router)
    return application
94
+
95
+
96
def main() -> None:
    """Entry point for `engram-server` console script."""
    import uvicorn

    cfg = get_config()
    uvicorn.run(
        create_app(),
        host=cfg.host,
        port=cfg.port,
        log_level="info",
    )
108
+
109
+
110
# Cached application instance; populated on first access of `server.app`.
_app_instance: FastAPI | None = None


def _get_app() -> FastAPI:
    """Lazy app factory for `uvicorn kvcos.api.server:app`.

    Defers create_app() until the attribute is actually accessed,
    avoiding side effects on module import.

    Fix: the previous version built a brand-new FastAPI application on
    every attribute access; the instance is now cached so repeated
    `server.app` lookups share one application (and one lifespan).

    Returns:
        The singleton FastAPI application.
    """
    global _app_instance
    if _app_instance is None:
        _app_instance = create_app()
    return _app_instance
117
+
118
+
119
def __getattr__(name: str) -> FastAPI:
    """PEP 562 hook exposing a lazily-created module attribute `app`."""
    if name != "app":
        raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
    return _get_app()
123
+
124
+
125
+ if __name__ == "__main__":
126
+ main()
kvcos/client/__init__.py ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ """ENGRAM Protocol — Client library."""
2
+
3
+ from kvcos.client.python_client import EngramClient
4
+
5
+ __all__ = ["EngramClient"]
kvcos/client/python_client.py ADDED
@@ -0,0 +1,158 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ ENGRAM Protocol — ENGRAM Python Client
3
+
4
+
5
+ Async HTTP client wrapping all ENGRAM API endpoints.
6
+ This is what agents import to interact with the engram store.
7
+ """
8
+
9
+ from __future__ import annotations
10
+
11
+ from pathlib import Path
12
+ from typing import Any
13
+
14
+ import httpx
15
+
16
+
17
class EngramClient:
    """Python client for the ENGRAM REST API.

    Wraps the server's /v1 endpoints with a synchronous httpx.Client.

    Usage:
        client = EngramClient("http://localhost:8080")
        result = client.store_file(path, agent_id="worker", task="analyze code", model_id="llama-3.1-8b")
        matches = client.search(task_description="debug auth error", top_k=3)
        data = client.get(matches[0]["cache_id"])
    """

    def __init__(self, base_url: str = "http://localhost:8080", timeout: float = 30.0):
        # All requests are rooted at <base_url>/v1 to match the server's prefix.
        self.base_url = base_url.rstrip("/")
        self._client = httpx.Client(base_url=f"{self.base_url}/v1", timeout=timeout)

    def close(self) -> None:
        """Close the underlying HTTP connection pool."""
        self._client.close()

    def __enter__(self):
        """Support `with EngramClient(...) as client:` usage."""
        return self

    def __exit__(self, *args):
        """Close the client when the `with` block exits."""
        self.close()

    # ── Health ────────────────────────────────────────────────

    def health(self) -> dict[str, Any]:
        """Check ENGRAM server health."""
        resp = self._client.get("/health")
        resp.raise_for_status()
        return resp.json()

    # ── Store ─────────────────────────────────────────────────

    def store_file(
        self,
        file_path: Path,
        agent_id: str,
        task_description: str,
        model_id: str,
        compression: str = "q8_0",
    ) -> dict[str, Any]:
        """Upload a .eng file to the engram store.

        Args:
            file_path: Path to the .eng file
            agent_id: Agent identifier
            task_description: Human-readable description
            model_id: Model identifier
            compression: Compression method used

        Returns:
            Dict with cache_id, size_bytes, compression_ratio, path
        """
        # Metadata travels as query params (matching the server's signature);
        # only the file body is multipart.
        with open(file_path, "rb") as f:
            resp = self._client.post(
                "/cache",
                params={
                    "agent_id": agent_id,
                    "task_description": task_description,
                    "model_id": model_id,
                    "compression": compression,
                },
                files={"file": (file_path.name, f, "application/octet-stream")},
            )
        resp.raise_for_status()
        return resp.json()

    def store_bytes(
        self,
        data: bytes,
        agent_id: str,
        task_description: str,
        model_id: str,
        compression: str = "q8_0",
        filename: str = "cache.eng",
    ) -> dict[str, Any]:
        """Upload raw .eng bytes to the engram store."""
        resp = self._client.post(
            "/cache",
            params={
                "agent_id": agent_id,
                "task_description": task_description,
                "model_id": model_id,
                "compression": compression,
            },
            files={"file": (filename, data, "application/octet-stream")},
        )
        resp.raise_for_status()
        return resp.json()

    # ── Retrieve ──────────────────────────────────────────────

    def get(self, cache_id: str) -> bytes:
        """Retrieve a .eng file by cache ID.

        Returns raw bytes of the .eng file.
        """
        resp = self._client.get(f"/cache/{cache_id}")
        resp.raise_for_status()
        return resp.content

    # ── Search ────────────────────────────────────────────────

    def search(
        self,
        task_description: str,
        model_id: str | None = None,
        top_k: int = 5,
        min_similarity: float | None = None,
    ) -> list[dict[str, Any]]:
        """Search for similar engram states.

        Returns list of search result dicts with cache_id, similarity, etc.
        """
        body: dict[str, Any] = {
            "task_description": task_description,
            "top_k": top_k,
        }
        # Truthiness check: an empty model_id string is treated as "no filter".
        if model_id:
            body["model_id"] = model_id
        if min_similarity is not None:
            body["min_similarity"] = min_similarity

        resp = self._client.post("/cache/search", json=body)
        resp.raise_for_status()
        return resp.json()["results"]

    # ── Delete ────────────────────────────────────────────────

    def delete(self, cache_id: str) -> bool:
        """Delete an engram from storage and index."""
        resp = self._client.delete(f"/cache/{cache_id}")
        resp.raise_for_status()
        return resp.json()["deleted"]

    # ── Stats ─────────────────────────────────────────────────

    def stats(self) -> dict[str, Any]:
        """Get aggregate engram store statistics."""
        resp = self._client.get("/cache/stats")
        resp.raise_for_status()
        return resp.json()
kvcos/core/__init__.py ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """ENGRAM Protocol — Core library: types, parsing, compression, serialization, retrieval."""
2
+
3
+ from kvcos.core.types import (
4
+ ENGRAM_VERSION,
5
+ AttentionType,
6
+ CacheSection,
7
+ CacheSearchResult,
8
+ CacheStats,
9
+ CompressionMethod,
10
+ EngramMetadata,
11
+ ModelCacheSpec,
12
+ StateExtractionMode,
13
+ )
14
+ from kvcos.core.manifold_index import IndexEntry, ManifoldIndex
15
+ from kvcos.core.retriever import EGRRetriever, RetrievalResponse, RetrievalResult
16
+ from kvcos.core.state_extractor import ExtractionResult, MARStateExtractor, SVDProjection
17
+
18
+ __all__ = [
19
+ # Types
20
+ "ENGRAM_VERSION",
21
+ "AttentionType",
22
+ "CacheSection",
23
+ "CacheSearchResult",
24
+ "CacheStats",
25
+ "CompressionMethod",
26
+ "EngramMetadata",
27
+ "ModelCacheSpec",
28
+ "StateExtractionMode",
29
+ # Manifold index
30
+ "IndexEntry",
31
+ "ManifoldIndex",
32
+ # Retriever
33
+ "EGRRetriever",
34
+ "RetrievalResponse",
35
+ "RetrievalResult",
36
+ # State extraction (MAR)
37
+ "ExtractionResult",
38
+ "MARStateExtractor",
39
+ "SVDProjection",
40
+ ]
kvcos/core/blob_parser.py ADDED
@@ -0,0 +1,482 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ ENGRAM Protocol — llama.cpp State Blob Parser
3
+
4
+
5
+ Parses the binary state blob from llama_state_get_data() (via save_state())
6
+ into structured PyTorch tensors of shape [n_layers, n_kv_heads, n_cells, head_dim].
7
+
8
+ D1: This is the critical extraction path. The blob format is defined by
9
+ llama.cpp's llama_kv_cache::state_write() and is version-dependent.
10
+
11
+ Validated against llama-cpp-python 0.3.19 (llama.cpp b5000+).
12
+
13
+ Binary format of llama_state_get_data() output:
14
+ 1. Architecture string: uint32 str_len + str_len bytes (e.g. "llama")
15
+ 2. KV cache section (from memory->state_write()):
16
+ a. uint32 n_stream (always 1 for single-context)
17
+ b. Per stream:
18
+ - uint32 cell_count (= n_used_cells, NOT n_ctx)
19
+ - Per cell: int32 pos, uint32 n_seq_id, int32[] seq_ids
20
+ - uint32 v_trans (1 = values stored transposed)
21
+ - uint32 n_layer
22
+ - Per layer K: int32 type_k, uint64 row_size_k, bytes data[row_size_k * cell_count]
23
+ - Per layer V (non-transposed): int32 type_v, uint64 row_size_v, bytes data[row_size_v * cell_count]
24
+ - Per layer V (transposed): int32 type_v, uint32 el_size, uint32 n_embd_v_gqa,
25
+ bytes data[el_size * n_embd_v_gqa * cell_count]
26
+
27
+ WARNING: This format is not stable across llama.cpp versions.
28
+ Pin llama-cpp-python version in pyproject.toml.
29
+ """
30
+
31
+ from __future__ import annotations
32
+
33
+ import struct
34
+ from dataclasses import dataclass
35
+
36
+ import numpy as np
37
+ import torch
38
+
39
+ from kvcos.core.types import CacheSection
40
+
41
+
42
# ── GGML dtype constants ──────────────────────────────────────────────────────

# ggml tensor type IDs; only F16 is accepted by the parsers in this module.
GGML_TYPE_F32 = 0
GGML_TYPE_F16 = 1
GGML_TYPE_Q8_0 = 8
GGML_TYPE_Q4_0 = 2

# Bytes per element; fractional for block-quantized types
# (bytes per block / elements per block).
GGML_TYPE_SIZE: dict[int, float] = {
    GGML_TYPE_F32: 4.0,
    GGML_TYPE_F16: 2.0,
    GGML_TYPE_Q8_0: 34.0 / 32.0,  # 34-byte block holding 32 elements
    GGML_TYPE_Q4_0: 18.0 / 32.0,  # 18-byte block holding 32 elements
}

# Elements per quantization block.
GGML_BLOCK_SIZE: dict[int, int] = {
    GGML_TYPE_F32: 1,
    GGML_TYPE_F16: 1,
    GGML_TYPE_Q8_0: 32,
    GGML_TYPE_Q4_0: 32,
}
+ }
62
+
63
+
64
@dataclass
class CellMeta:
    """Metadata for a single KV cache cell (one pos/seq record from the blob)."""

    pos: int  # token position stored in this cell
    seq_ids: list[int]  # sequence IDs that reference this cell
70
+
71
+
72
@dataclass
class ParsedKVCache:
    """Result of parsing a llama.cpp state blob into structured engram tensors."""

    keys: torch.Tensor  # [n_layers, n_kv_heads, n_cells, head_dim] float16
    values: torch.Tensor  # [n_layers, n_kv_heads, n_cells, head_dim] float16
    cells: list[CellMeta]  # per-cell position / sequence metadata
    n_cells: int  # number of populated cells (n_used_cells, NOT n_ctx)
    n_layers: int  # layer count auto-detected from the blob
    v_trans: bool  # True if the blob stored V transposed ([n_embd_v, cells])
    arch: str  # architecture string from the blob header (e.g. "llama")
83
+
84
+
85
@dataclass
class ParsedMultiSectionCache:
    """Result of parsing an ISWA state blob with multiple KV cache sections.

    Each section is a ParsedKVCache with its own tensor shapes.
    For Gemma 4: section[0] is Global (5 layers), section[1] is SWA (25 layers).
    """

    sections: list[ParsedKVCache]
    arch: str

    @property
    def n_sections(self) -> int:
        """Number of KV cache sections parsed from the blob."""
        return len(self.sections)

    @property
    def total_layers(self) -> int:
        """Layer count summed across every section."""
        return sum(section.n_layers for section in self.sections)
103
+
104
+
105
class BlobParseError(Exception):
    """Raised when the state blob cannot be parsed.

    Covers truncated data, unsupported tensor dtypes, and header values
    inconsistent with the expected llama.cpp state format.
    """
107
+
108
+
109
def _read_u32(data: bytes, offset: int) -> tuple[int, int]:
    """Decode a little-endian uint32 at *offset*; return (value, offset + 4)."""
    (value,) = struct.unpack_from("<I", data, offset)
    return value, offset + 4
111
+
112
+
113
def _read_i32(data: bytes, offset: int) -> tuple[int, int]:
    """Decode a little-endian int32 at *offset*; return (value, offset + 4)."""
    (value,) = struct.unpack_from("<i", data, offset)
    return value, offset + 4
115
+
116
+
117
def _read_u64(data: bytes, offset: int) -> tuple[int, int]:
    """Decode a little-endian uint64 at *offset*; return (value, offset + 8)."""
    (value,) = struct.unpack_from("<Q", data, offset)
    return value, offset + 8
119
+
120
+
121
def _read_f16_block(
    data: bytes, offset: int, n_elements: int,
) -> tuple[torch.Tensor, int]:
    """Read n_elements of float16 data from bytes.

    Returns (tensor, next_offset); raises BlobParseError if the read
    would run past the end of the buffer.
    """
    n_bytes = n_elements * 2
    end = offset + n_bytes
    if end > len(data):
        raise BlobParseError(
            f"F16 read overflow: need {n_bytes}B at offset {offset}, blob is {len(data)}B"
        )
    # Copy out of the frombuffer view so the tensor owns its memory.
    raw = np.frombuffer(data, dtype=np.float16, count=n_elements, offset=offset)
    return torch.from_numpy(raw.copy()).to(torch.float16), end
132
+
133
+
134
def parse_state_blob(
    blob: bytes,
    n_kv_heads: int,
    head_dim: int,
) -> ParsedKVCache:
    """Parse a llama.cpp full-context state blob into structured KV tensors.

    Parses output of llama_state_get_data() (via save_state()):
      1. Architecture string header
      2. KV stream: cell metadata + per-layer K and V tensor data

    The parser auto-detects n_layers, cell_count, and v_trans from the blob.

    Args:
        blob: Raw bytes from save_state().llama_state
        n_kv_heads: Number of KV heads (from model spec)
        head_dim: Head dimension (from model spec)

    Returns:
        ParsedKVCache with [n_layers, n_kv_heads, n_cells, head_dim] tensors.

    Raises:
        BlobParseError: on truncated data, non-F16 tensor types, or header
            values inconsistent with the expected format.

    NOTE(review): this function duplicates the stream-parsing logic of
    _parse_single_stream; consider delegating once the stream framing for
    single-section blobs is confirmed identical.
    """
    if len(blob) < 20:
        raise BlobParseError(f"Blob too small: {len(blob)} bytes")

    offset = 0
    n_embd_kv = n_kv_heads * head_dim

    # ── 1. Architecture string ────────────────────────────────
    str_len, offset = _read_u32(blob, offset)
    # A huge "length" means we are not looking at the expected layout.
    if str_len > 100:
        raise BlobParseError(f"Arch string length {str_len} too large — format mismatch")
    arch = blob[offset : offset + str_len].decode("ascii", errors="replace")
    offset += str_len

    # ── 2. KV stream header ───────────────────────────────────
    n_stream, offset = _read_u32(blob, offset)
    if n_stream != 1:
        raise BlobParseError(f"Expected 1 KV stream, got {n_stream}")

    cell_count, offset = _read_u32(blob, offset)
    if cell_count == 0:
        raise BlobParseError("State blob has 0 cells")
    if cell_count > 200_000:
        raise BlobParseError(f"Suspiciously large cell_count: {cell_count}")

    # ── 3. Cell metadata ──────────────────────────────────────
    # Per cell: int32 pos, uint32 n_seq_id, then n_seq_id int32 seq ids.
    cells: list[CellMeta] = []
    for _ in range(cell_count):
        pos, offset = _read_i32(blob, offset)
        n_seq, offset = _read_u32(blob, offset)
        seq_ids: list[int] = []
        for _ in range(n_seq):
            sid, offset = _read_i32(blob, offset)
            seq_ids.append(sid)
        cells.append(CellMeta(pos=pos, seq_ids=seq_ids))

    # ── 4. Data section header ────────────────────────────────
    v_trans_u32, offset = _read_u32(blob, offset)
    v_trans = v_trans_u32 != 0

    n_layers, offset = _read_u32(blob, offset)
    if n_layers == 0 or n_layers > 200:
        raise BlobParseError(f"Invalid n_layers: {n_layers}")

    # ── 5. K tensor data (per layer) ──────────────────────────
    k_layers: list[torch.Tensor] = []
    for layer_idx in range(n_layers):
        type_k, offset = _read_i32(blob, offset)
        row_size_k, offset = _read_u64(blob, offset)

        if type_k != GGML_TYPE_F16:
            raise BlobParseError(
                f"Layer {layer_idx} K: unsupported type {type_k} (expected F16={GGML_TYPE_F16})"
            )

        data_bytes = row_size_k * cell_count
        n_elements = data_bytes // 2  # fp16

        # Cross-check the blob's row size against the model geometry.
        if n_elements != n_embd_kv * cell_count:
            raise BlobParseError(
                f"Layer {layer_idx} K: expected {n_embd_kv * cell_count} elements, "
                f"got {n_elements} (row_size={row_size_k}, cells={cell_count})"
            )

        tensor, offset = _read_f16_block(blob, offset, n_elements)
        # Shape: [cell_count, n_kv_heads * head_dim] → [n_kv_heads, cell_count, head_dim]
        tensor = tensor.reshape(cell_count, n_kv_heads, head_dim)
        tensor = tensor.permute(1, 0, 2).contiguous()
        k_layers.append(tensor)

    # ── 6. V tensor data (per layer) ──────────────────────────
    v_layers: list[torch.Tensor] = []
    for layer_idx in range(n_layers):
        type_v, offset = _read_i32(blob, offset)

        if type_v != GGML_TYPE_F16:
            raise BlobParseError(
                f"Layer {layer_idx} V: unsupported type {type_v} (expected F16={GGML_TYPE_F16})"
            )

        if v_trans:
            # Transposed layout carries its own element size and width header.
            el_size, offset = _read_u32(blob, offset)
            n_embd_v, offset = _read_u32(blob, offset)
            data_bytes = el_size * n_embd_v * cell_count
            n_elements = data_bytes // 2

            tensor, offset = _read_f16_block(blob, offset, n_elements)
            # V transposed: stored as [n_embd_v, cell_count] per layer
            # n_embd_v = n_kv_heads * head_dim
            tensor = tensor.reshape(n_embd_v // head_dim, head_dim, cell_count)
            # → [n_kv_heads, head_dim, cell_count] → [n_kv_heads, cell_count, head_dim]
            tensor = tensor.permute(0, 2, 1).contiguous()
        else:
            # Non-transposed V uses the same row layout as K.
            row_size_v, offset = _read_u64(blob, offset)
            data_bytes = row_size_v * cell_count
            n_elements = data_bytes // 2

            tensor, offset = _read_f16_block(blob, offset, n_elements)
            tensor = tensor.reshape(cell_count, n_kv_heads, head_dim)
            tensor = tensor.permute(1, 0, 2).contiguous()

        v_layers.append(tensor)

    # ── 7. Stack into [n_layers, n_kv_heads, n_cells, head_dim] ─
    keys = torch.stack(k_layers, dim=0)
    values = torch.stack(v_layers, dim=0)

    # Final sanity check before handing tensors to callers.
    expected_shape = (n_layers, n_kv_heads, cell_count, head_dim)
    if keys.shape != expected_shape:
        raise BlobParseError(f"K shape {keys.shape} != expected {expected_shape}")
    if values.shape != expected_shape:
        raise BlobParseError(f"V shape {values.shape} != expected {expected_shape}")

    return ParsedKVCache(
        keys=keys,
        values=values,
        cells=cells,
        n_cells=cell_count,
        n_layers=n_layers,
        v_trans=v_trans,
        arch=arch,
    )
276
+
277
+
278
def _parse_single_stream(
    blob: bytes,
    offset: int,
    n_kv_heads: int,
    head_dim: int,
    arch: str,
) -> tuple[ParsedKVCache, int]:
    """Parse one KV cache stream from blob at given offset.

    Stream layout (llama.cpp state format, as consumed here):
        u32 cell_count
        cell_count x { i32 pos, u32 n_seq, n_seq x i32 seq_id }
        u32 v_trans flag, u32 n_layers
        n_layers x K record { i32 type, u64 row_size, F16 data }
        n_layers x V record { i32 type, then either
            (v_trans)  u32 el_size, u32 n_embd_v, F16 data (stored transposed)
            (else)     u64 row_size, F16 data }

    Only GGML F16 tensors are accepted; any other type raises.

    Returns (ParsedKVCache, new_offset) so caller can continue
    parsing subsequent streams for ISWA blobs.

    Raises:
        BlobParseError: on implausible counts, unsupported tensor types,
            or element-count / shape mismatches.
    """
    n_embd_kv = n_kv_heads * head_dim

    # Cell count — bounded to catch corrupt blobs before allocating tensors
    cell_count, offset = _read_u32(blob, offset)
    if cell_count == 0:
        raise BlobParseError("Stream has 0 cells")
    if cell_count > 200_000:
        raise BlobParseError(f"Suspiciously large cell_count: {cell_count}")

    # Cell metadata: position + owning sequence ids, one record per cell
    cells: list[CellMeta] = []
    for _ in range(cell_count):
        pos, offset = _read_i32(blob, offset)
        n_seq, offset = _read_u32(blob, offset)
        seq_ids: list[int] = []
        for _ in range(n_seq):
            sid, offset = _read_i32(blob, offset)
            seq_ids.append(sid)
        cells.append(CellMeta(pos=pos, seq_ids=seq_ids))

    # Data section header: V-transposed flag, then layer count (bounded)
    v_trans_u32, offset = _read_u32(blob, offset)
    v_trans = v_trans_u32 != 0

    n_layers, offset = _read_u32(blob, offset)
    if n_layers == 0 or n_layers > 200:
        raise BlobParseError(f"Invalid n_layers: {n_layers}")

    # K layers — each row is one cell's [n_kv_heads * head_dim] F16 vector
    k_layers: list[torch.Tensor] = []
    for layer_idx in range(n_layers):
        type_k, offset = _read_i32(blob, offset)
        row_size_k, offset = _read_u64(blob, offset)

        if type_k != GGML_TYPE_F16:
            raise BlobParseError(
                f"Layer {layer_idx} K: unsupported type {type_k} (expected F16={GGML_TYPE_F16})"
            )

        data_bytes = row_size_k * cell_count
        n_elements = data_bytes // 2  # 2 bytes per F16 element

        if n_elements != n_embd_kv * cell_count:
            raise BlobParseError(
                f"Layer {layer_idx} K: expected {n_embd_kv * cell_count} elements, "
                f"got {n_elements} (row_size={row_size_k}, cells={cell_count})"
            )

        tensor, offset = _read_f16_block(blob, offset, n_elements)
        # [cells, heads, dim] -> [heads, cells, dim]
        tensor = tensor.reshape(cell_count, n_kv_heads, head_dim)
        tensor = tensor.permute(1, 0, 2).contiguous()
        k_layers.append(tensor)

    # V layers — layout depends on the v_trans flag read above
    v_layers: list[torch.Tensor] = []
    for layer_idx in range(n_layers):
        type_v, offset = _read_i32(blob, offset)

        if type_v != GGML_TYPE_F16:
            raise BlobParseError(
                f"Layer {layer_idx} V: unsupported type {type_v} (expected F16={GGML_TYPE_F16})"
            )

        if v_trans:
            # Transposed V: element size + embedding width precede the data,
            # which is stored [n_embd_v, cell_count] (dim-major)
            el_size, offset = _read_u32(blob, offset)
            n_embd_v, offset = _read_u32(blob, offset)
            data_bytes = el_size * n_embd_v * cell_count
            n_elements = data_bytes // 2

            tensor, offset = _read_f16_block(blob, offset, n_elements)
            tensor = tensor.reshape(n_embd_v // head_dim, head_dim, cell_count)
            tensor = tensor.permute(0, 2, 1).contiguous()
        else:
            # Row-major V: same layout as K
            row_size_v, offset = _read_u64(blob, offset)
            data_bytes = row_size_v * cell_count
            n_elements = data_bytes // 2

            tensor, offset = _read_f16_block(blob, offset, n_elements)
            tensor = tensor.reshape(cell_count, n_kv_heads, head_dim)
            tensor = tensor.permute(1, 0, 2).contiguous()

        v_layers.append(tensor)

    # Stack into [n_layers, n_kv_heads, n_cells, head_dim] and verify
    keys = torch.stack(k_layers, dim=0)
    values = torch.stack(v_layers, dim=0)

    expected_shape = (n_layers, n_kv_heads, cell_count, head_dim)
    if keys.shape != expected_shape:
        raise BlobParseError(f"K shape {keys.shape} != expected {expected_shape}")
    if values.shape != expected_shape:
        raise BlobParseError(f"V shape {values.shape} != expected {expected_shape}")

    parsed = ParsedKVCache(
        keys=keys,
        values=values,
        cells=cells,
        n_cells=cell_count,
        n_layers=n_layers,
        v_trans=v_trans,
        arch=arch,
    )
    return parsed, offset
392
+
393
+
394
def parse_multi_section_blob(
    blob: bytes,
    sections: tuple[CacheSection, ...],
) -> ParsedMultiSectionCache:
    """Parse an ISWA state blob with multiple sequential KV cache sections.

    ISWA models (e.g., Gemma 4) serialize several cache sections back to
    back in a single blob; each section carries its own cell metadata,
    layer count, and KV dimensions. The header's n_stream field must
    equal the number of sections the caller expects.

    Args:
        blob: Raw bytes from save_state().llama_state
        sections: Cache section specifications (order must match blob layout)

    Returns:
        ParsedMultiSectionCache with one ParsedKVCache per section.

    Raises:
        BlobParseError: if the blob is truncated, the arch string is
            implausibly long, or the stream count disagrees with sections.
    """
    if len(blob) < 20:
        raise BlobParseError(f"Blob too small: {len(blob)} bytes")

    cursor = 0

    # Header: length-prefixed architecture string.
    arch_len, cursor = _read_u32(blob, cursor)
    if arch_len > 100:
        raise BlobParseError(f"Arch string length {arch_len} too large")
    arch = blob[cursor : cursor + arch_len].decode("ascii", errors="replace")
    cursor += arch_len

    # Header: number of streams must match the expected section layout.
    n_stream, cursor = _read_u32(blob, cursor)
    if n_stream != len(sections):
        raise BlobParseError(
            f"Expected {len(sections)} streams, got {n_stream}"
        )

    # Body: consume one stream per declared section, in order.
    parsed_sections: list[ParsedKVCache] = []
    for section in sections:
        parsed, cursor = _parse_single_stream(
            blob,
            cursor,
            n_kv_heads=section.n_kv_heads,
            head_dim=section.head_dim,
            arch=arch,
        )
        parsed_sections.append(parsed)

    return ParsedMultiSectionCache(sections=parsed_sections, arch=arch)
442
+
443
+
444
+ # ── Legacy compat wrapper ────────────────────────────────────────────────────
445
+
446
+
447
def parse_seq_state_blob(
    blob: bytes,
    spec: dict,
    kv_dtype: int = GGML_TYPE_F16,
) -> ParsedKVCache:
    """Legacy wrapper — delegates to parse_state_blob.

    Kept for backward compatibility with existing tests. The kv_dtype
    argument is accepted for signature compatibility but is not
    forwarded to the delegate.
    """
    heads = spec["n_kv_heads"]
    dim = spec["head_dim"]
    return parse_state_blob(blob=blob, n_kv_heads=heads, head_dim=dim)
461
+
462
+
463
def estimate_blob_size(
    n_kv_heads: int,
    head_dim: int,
    n_layers: int,
    n_cells: int,
    v_trans: bool = True,
) -> int:
    """Estimate expected blob size in bytes for validation.

    Mirrors the serialized layout: arch-string header, per-cell metadata
    (assuming one seq_id per cell), a data-section header, then per-layer
    K and V tensor records in F16.
    """
    header_bytes = 4 + 5 + 4 + 4       # str_len + "llama" + n_stream + cell_count
    cell_meta_bytes = 12 * n_cells     # pos(4) + n_seq(4) + one seq_id(4) typical
    data_header_bytes = 4 + 4          # v_trans flag + n_layer

    payload = n_kv_heads * head_dim * 2 * n_cells  # F16 tensor data per layer
    k_layer_bytes = 4 + 8 + payload                # type(i32) + row_size(u64) + data
    # Transposed V records carry el_size + n_embd (u32 each) instead of a
    # u64 row_size; either way the per-layer metadata totals 12 bytes.
    v_layer_bytes = (4 + 4 + 4 + payload) if v_trans else (4 + 8 + payload)

    per_layer = k_layer_bytes + v_layer_bytes
    return header_bytes + cell_meta_bytes + data_header_bytes + n_layers * per_layer
kvcos/core/block_pool.py ADDED
@@ -0,0 +1,167 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ ENGRAM Protocol — 256-Token Block Pool Manager
3
+
4
+
5
+ Segments a full KV cache into fixed-size blocks (256 tokens each) that can be:
6
+ - Stored independently (one .eng file per block — D7)
7
+ - Retrieved individually via EGR (fine-grained cache hits)
8
+ - Composed (assemble a context from multiple blocks)
9
+ - Evicted independently (LRU per block, not per session)
10
+
11
+ Design from arXiv:2603.04428 (Persistent Q4 KV Cache, agent-memory paper).
12
+ """
13
+
14
+ from __future__ import annotations
15
+
16
+ from dataclasses import dataclass, field
17
+
18
+ import torch
19
+
20
+ from kvcos.core.types import BLOCK_SIZE_TOKENS
21
+
22
+
23
@dataclass
class KVBlock:
    """One fixed-size (up to 256-token) slice of a KV cache.

    Blocks are the unit of storage, retrieval, and eviction: each holds
    the K and V activations for a contiguous token span.
    """

    block_index: int
    token_start: int
    token_end: int  # exclusive

    keys: torch.Tensor  # [n_layers, n_kv_heads, block_len, head_dim]
    values: torch.Tensor  # [n_layers, n_kv_heads, block_len, head_dim]

    @property
    def block_len(self) -> int:
        """Token count covered by this block."""
        return self.token_end - self.token_start

    @property
    def is_full(self) -> bool:
        """Whether the block spans a complete BLOCK_SIZE_TOKENS window."""
        return self.token_end - self.token_start == BLOCK_SIZE_TOKENS

    @property
    def n_layers(self) -> int:
        """Layer count, read off the keys tensor."""
        return self.keys.size(0)

    @property
    def n_kv_heads(self) -> int:
        """KV-head count, read off the keys tensor."""
        return self.keys.size(1)

    @property
    def head_dim(self) -> int:
        """Per-head embedding width, read off the keys tensor."""
        return self.keys.size(3)
53
+
54
+
55
@dataclass
class BlockPool:
    """Manages a collection of KV blocks for an agent session.

    Blocks are kept in token order; block_index always equals the
    block's position in `blocks`.
    """

    agent_id: str
    model_id: str
    blocks: list[KVBlock] = field(default_factory=list)

    @property
    def total_tokens(self) -> int:
        """Total token count across all blocks (last block may be partial)."""
        return sum(b.block_len for b in self.blocks)

    @property
    def n_blocks(self) -> int:
        """Number of blocks currently in the pool."""
        return len(self.blocks)

    def segment(
        self, keys: torch.Tensor, values: torch.Tensor,
    ) -> list[KVBlock]:
        """Segment a full KV cache into 256-token blocks.

        Replaces any existing blocks in the pool. Token positions are
        local (start at 0); only the final block may be shorter than
        BLOCK_SIZE_TOKENS.

        Args:
            keys: [n_layers, n_kv_heads, ctx_len, head_dim]
            values: [n_layers, n_kv_heads, ctx_len, head_dim]

        Raises:
            ValueError: if keys and values shapes differ.
        """
        if keys.shape != values.shape:
            raise ValueError(f"Shape mismatch: keys {keys.shape} vs values {values.shape}")

        ctx_len = keys.shape[2]
        blocks: list[KVBlock] = []

        for i in range(0, ctx_len, BLOCK_SIZE_TOKENS):
            end = min(i + BLOCK_SIZE_TOKENS, ctx_len)
            # .contiguous() so each block owns its memory independently
            # of the source tensor's layout.
            block = KVBlock(
                block_index=len(blocks),
                token_start=i,
                token_end=end,
                keys=keys[:, :, i:end, :].contiguous(),
                values=values[:, :, i:end, :].contiguous(),
            )
            blocks.append(block)

        self.blocks = blocks
        return blocks

    def assemble(
        self, block_indices: list[int] | None = None,
    ) -> tuple[torch.Tensor, torch.Tensor]:
        """Assemble KV cache from blocks (concatenate along ctx_len dim).

        Args:
            block_indices: Optional subset (in the given order); None means
                all blocks in pool order.

        Raises:
            ValueError: if the pool is empty or the selection is empty.
        """
        if not self.blocks:
            raise ValueError("No blocks to assemble")

        selected = self.blocks if block_indices is None else [self.blocks[i] for i in block_indices]
        if not selected:
            raise ValueError("No blocks selected for assembly")

        keys = torch.cat([b.keys for b in selected], dim=2)
        values = torch.cat([b.values for b in selected], dim=2)
        return keys, values

    def append_block(self, block: KVBlock) -> None:
        """Append a block, reassigning its index to keep indices contiguous."""
        block.block_index = len(self.blocks)
        self.blocks.append(block)

    def get_block(self, index: int) -> KVBlock:
        """Return the block at `index`; raises IndexError on out-of-range."""
        if index < 0 or index >= len(self.blocks):
            raise IndexError(f"Block index {index} out of range [0, {len(self.blocks)})")
        return self.blocks[index]

    def extend(
        self, new_keys: torch.Tensor, new_values: torch.Tensor,
    ) -> list[KVBlock]:
        """Extend the pool with additional tokens, filling last block first.

        A partially-filled final block is replaced by a merged block
        (KVBlock is treated as immutable here); any overflow is segmented
        into fresh blocks whose token positions continue from the pool's
        current end.

        Returns:
            The blocks that were created or replaced by this call.
        """
        new_ctx_len = new_keys.shape[2]
        modified_blocks: list[KVBlock] = []
        offset = 0

        # Step 1: top up the trailing partial block, if any.
        if self.blocks and not self.blocks[-1].is_full:
            last = self.blocks[-1]
            space = BLOCK_SIZE_TOKENS - last.block_len
            fill = min(space, new_ctx_len)

            merged_k = torch.cat([last.keys, new_keys[:, :, :fill, :]], dim=2).contiguous()
            merged_v = torch.cat([last.values, new_values[:, :, :fill, :]], dim=2).contiguous()

            self.blocks[-1] = KVBlock(
                block_index=last.block_index,
                token_start=last.token_start,
                token_end=last.token_start + merged_k.shape[2],
                keys=merged_k,
                values=merged_v,
            )
            modified_blocks.append(self.blocks[-1])
            offset = fill

        # Step 2: segment whatever is left into new blocks, shifting their
        # (local, 0-based) token spans to continue after the current end.
        remaining = new_ctx_len - offset
        if remaining > 0:
            token_base = self.blocks[-1].token_end if self.blocks else 0
            # Throwaway pool reuses segment() without touching self.blocks.
            sub_pool = BlockPool(agent_id=self.agent_id, model_id=self.model_id)
            new_blocks = sub_pool.segment(
                new_keys[:, :, offset:, :], new_values[:, :, offset:, :],
            )
            for b in new_blocks:
                b.block_index = len(self.blocks)
                b.token_start += token_base
                b.token_end += token_base
                self.blocks.append(b)
                modified_blocks.append(b)

        return modified_blocks

    def clear(self) -> None:
        """Drop all blocks from the pool."""
        self.blocks.clear()
kvcos/core/cache_spec.py ADDED
@@ -0,0 +1,215 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ ENGRAM Protocol — Model Architecture Registry
3
+
4
+
5
+ Contains ModelCacheSpec definitions for known models and utilities
6
+ to look up specs by model_id or infer model family from string.
7
+
8
+ D3: extraction_layers set to middle-to-deep (8-31 for 32-layer models)
9
+ per ShadowKV validation. Early layers (0-7) and final layer preserved.
10
+ """
11
+
12
+ from __future__ import annotations
13
+
14
+ from kvcos.core.types import AttentionType, CacheSection, ModelCacheSpec
15
+
16
# ── Pre-registered Model Specs ────────────────────────────────────────────────
# Each spec records the KV-cache-relevant architecture parameters of a model.
# extraction_layers skips roughly the first quarter of layers (D3).

# Llama 3.1 8B — Primary Phase 1 target (D1, D6)
# GQA: 32 query heads, 8 KV heads, head_dim 128
LLAMA_3_1_8B = ModelCacheSpec(
    model_id="meta-llama/Llama-3.1-8B-Instruct",
    model_family="llama",
    n_layers=32,
    n_heads=32,
    n_kv_heads=8,
    head_dim=128,
    rope_enabled=True,
    extraction_layers=tuple(range(8, 32)),  # layers 8-31 (D3)
)

# Llama 3.1 8B base (non-instruct) — same architecture as the instruct variant
LLAMA_3_1_8B_BASE = ModelCacheSpec(
    model_id="meta-llama/Llama-3.1-8B",
    model_family="llama",
    n_layers=32,
    n_heads=32,
    n_kv_heads=8,
    head_dim=128,
    rope_enabled=True,
    extraction_layers=tuple(range(8, 32)),
)

# Phi-3-Mini-128K — Secondary Phase 1 target
# ShadowKV validated SVD on this model (D3)
# MHA: 32 query heads, 32 KV heads (no GQA), head_dim 96
PHI_3_MINI = ModelCacheSpec(
    model_id="microsoft/Phi-3-mini-128k-instruct",
    model_family="phi",
    n_layers=32,
    n_heads=32,
    n_kv_heads=32,  # Phi-3-Mini uses MHA, not GQA
    head_dim=96,
    rope_enabled=True,
    extraction_layers=tuple(range(8, 32)),
)

# Gemma 2 2B — NOTE: QK-Norm model, SVD behavior may differ (T3 caveat)
GEMMA_2_2B = ModelCacheSpec(
    model_id="google/gemma-2-2b-it",
    model_family="gemma",
    n_layers=26,
    n_heads=8,
    n_kv_heads=4,
    head_dim=256,
    rope_enabled=True,
    extraction_layers=tuple(range(6, 26)),
)

# Qwen 2.5 7B
QWEN_2_5_7B = ModelCacheSpec(
    model_id="Qwen/Qwen2.5-7B-Instruct",
    model_family="qwen",
    n_layers=28,
    n_heads=28,
    n_kv_heads=4,
    head_dim=128,
    rope_enabled=True,
    extraction_layers=tuple(range(7, 28)),
)

# Mistral 7B v0.3
MISTRAL_7B = ModelCacheSpec(
    model_id="mistralai/Mistral-7B-Instruct-v0.3",
    model_family="mistral",
    n_layers=32,
    n_heads=32,
    n_kv_heads=8,
    head_dim=128,
    rope_enabled=True,
    extraction_layers=tuple(range(8, 32)),
)


# Gemma 4 26B-A4B — ISWA model (Interleaved Sliding Window Attention)
# Dual KV cache: Global (full context) + SWA (sliding window 1024 tokens)
# MoE: 128 experts, 8 active — does NOT affect KV cache (FFN-only)
# Reverse-engineered from llama.cpp b5200+ state blob format.
# Top-level n_kv_heads/head_dim mirror the dominant (SWA) section; the
# true per-section layout lives in cache_sections.
GEMMA_4_26B_A4B = ModelCacheSpec(
    model_id="google/gemma-4-26b-a4b-it",
    model_family="gemma",
    n_layers=30,  # total: 5 global + 25 SWA
    n_heads=32,
    n_kv_heads=8,  # dominant section (SWA)
    head_dim=256,  # dominant section (SWA)
    rope_enabled=True,
    extraction_layers=tuple(range(8, 30)),
    cache_sections=(
        CacheSection(
            attention_type=AttentionType.FULL,
            n_layers=5,
            n_kv_heads=2,
            head_dim=512,
        ),
        CacheSection(
            attention_type=AttentionType.SLIDING,
            n_layers=25,
            n_kv_heads=8,
            head_dim=256,
            window_size=1024,
        ),
    ),
)
123
+
124
+
125
# ── Registry ──────────────────────────────────────────────────────────────────

# Exact model_id → spec. Extended at runtime via register_model_spec().
_REGISTRY: dict[str, ModelCacheSpec] = {
    spec["model_id"]: spec
    for spec in [
        LLAMA_3_1_8B,
        LLAMA_3_1_8B_BASE,
        PHI_3_MINI,
        GEMMA_2_2B,
        GEMMA_4_26B_A4B,
        QWEN_2_5_7B,
        MISTRAL_7B,
    ]
}

# Substring → family name, probed in insertion order by infer_model_family().
# NOTE(review): the "meta-llama", "microsoft/phi", and "google/gemma" entries
# are unreachable — the shorter substrings listed before them always match
# first for any id that would match them. Harmless, but dead entries.
_FAMILY_MAP: dict[str, str] = {
    "llama": "llama",
    "meta-llama": "llama",
    "phi": "phi",
    "microsoft/phi": "phi",
    "gemma": "gemma",
    "google/gemma": "gemma",
    "qwen": "qwen",
    "mistral": "mistral",
    "deepseek": "deepseek",
}
151
+
152
+
153
def get_model_spec(model_id: str) -> ModelCacheSpec | None:
    """Return the registered ModelCacheSpec for *model_id*, or None if unknown."""
    try:
        return _REGISTRY[model_id]
    except KeyError:
        return None
156
+
157
+
158
def register_model_spec(spec: ModelCacheSpec) -> None:
    """Add *spec* to the runtime registry, replacing any entry with the same id."""
    key = spec["model_id"]
    _REGISTRY[key] = spec
161
+
162
+
163
def infer_model_family(model_id: str) -> str:
    """Best-effort family name ("llama", "phi", ...) inferred from *model_id*.

    Probes _FAMILY_MAP substrings in insertion order against the lowercased
    id; falls back to "unknown" when nothing matches.
    """
    needle = model_id.lower()
    return next(
        (family for fragment, family in _FAMILY_MAP.items() if fragment in needle),
        "unknown",
    )
170
+
171
+
172
def make_spec_from_metadata(
    model_id: str,
    n_layers: int,
    n_heads: int,
    n_kv_heads: int,
    head_dim: int,
    rope_enabled: bool = True,
) -> ModelCacheSpec:
    """Create a ModelCacheSpec from raw parameters.

    Automatically sets extraction_layers to middle-to-deep range (D3):
    the first quarter of layers (at least one) is skipped.
    """
    first_kept = max(1, n_layers // 4)

    return ModelCacheSpec(
        model_id=model_id,
        model_family=infer_model_family(model_id),
        n_layers=n_layers,
        n_heads=n_heads,
        n_kv_heads=n_kv_heads,
        head_dim=head_dim,
        rope_enabled=rope_enabled,
        extraction_layers=tuple(range(first_kept, n_layers)),
    )
197
+
198
+
199
def is_iswa_spec(spec: ModelCacheSpec) -> bool:
    """Report whether *spec* describes a multi-section (ISWA) KV cache layout."""
    return "cache_sections" in spec
202
+
203
+
204
def validate_kv_shape(
    spec: ModelCacheSpec,
    n_layers: int,
    n_kv_heads: int,
    head_dim: int,
) -> bool:
    """Validate that KV tensor dimensions match the model spec."""
    expected = (spec["n_layers"], spec["n_kv_heads"], spec["head_dim"])
    return expected == (n_layers, n_kv_heads, head_dim)
kvcos/core/compression.py ADDED
@@ -0,0 +1,395 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ ENGRAM Protocol — KV Cache Compression Layer
3
+
4
+
5
+ Implements:
6
+ - FP16 passthrough (no compression)
7
+ - Q8_0: group quantization matching llama.cpp GGML_TYPE_Q8_0
8
+ Phase 1 production fallback. ~2x compression, <5% speed hit (D5).
9
+ - PolarQuant: MSE-optimal random rotation + Lloyd-Max codebook at 3 bits.
10
+ QJL REMOVED — confirmed harmful by 6+ independent implementations (D5).
11
+ Softmax amplifies QJL variance, making two-stage worse than MSE-only.
12
+
13
+ Reference: TheTom/turboquant_plus (511+ tests, most mature impl)
14
+ """
15
+
16
+ from __future__ import annotations
17
+
18
+ from dataclasses import dataclass
19
+
20
+ import numpy as np
21
+ import torch
22
+
23
+ from kvcos.core.types import CompressionMethod
24
+
25
# ── Q8_0 Constants ────────────────────────────────────────────────────────────
# Matches llama.cpp's GGML_TYPE_Q8_0 block size: one scale per 32 values.
Q8_GROUP_SIZE = 32
27
+
28
+
29
@dataclass(frozen=True)
class CompressionResult:
    """Result of compressing a KV cache tensor."""

    # Compressed (or round-tripped/dequantized) tensor ready for storage.
    data: torch.Tensor
    # Which compression method produced this result.
    method: CompressionMethod
    # dtype of the input tensor, so callers can restore it on decompress.
    original_dtype: torch.dtype
    # original_bytes / stored_bytes for the on-disk representation.
    compression_ratio: float
    # Method-specific string key/value details (group size, seed, ...).
    metadata: dict[str, str]
38
+
39
+
40
+ # ── FP16 Passthrough ──────────────────────────────────────────────────────────
41
+
42
+
43
def compress_fp16(kv: torch.Tensor) -> CompressionResult:
    """No-op compression: ensure tensor is FP16.

    Fix: compression_ratio is now computed from actual byte counts
    instead of being hardcoded to 1.0, consistent with the other
    compressors (which all report original_bytes / stored_bytes).
    A float32 input therefore correctly reports ~2.0; an fp16 input
    still reports 1.0.
    """
    original_bytes = kv.numel() * kv.element_size()
    data = kv.to(torch.float16).contiguous()
    compressed_bytes = data.numel() * data.element_size()

    return CompressionResult(
        data=data,
        method=CompressionMethod.FP16,
        original_dtype=kv.dtype,
        # Guard against empty tensors, matching the other compressors.
        compression_ratio=original_bytes / compressed_bytes if compressed_bytes > 0 else 1.0,
        metadata={},
    )
53
+
54
+
55
def decompress_fp16(data: torch.Tensor) -> torch.Tensor:
    """Identity decompression — cast stored data back to float16."""
    return data.to(dtype=torch.float16)
57
+
58
+
59
+ # ── Q8_0 Quantization ────────────────────────────────────────────────────────
60
+ # Matches llama.cpp GGML_TYPE_Q8_0 layout:
61
+ # 32-element groups, 1 float16 scale per group, 32 int8 values
62
+ # Storage: (32*1 + 2) / (32*2) = 34/64 ≈ 1.88x compression
63
+
64
+
65
def compress_q8_0(kv: torch.Tensor) -> CompressionResult:
    """Quantize KV cache to Q8_0 (int8 with per-group scale).

    Stores dequantized bfloat16 for safetensors compatibility —
    safetensors doesn't support int8+scale pairs natively.

    Each group of Q8_GROUP_SIZE values along the last dim shares one
    scale (max|group| / 127), matching llama.cpp's GGML_TYPE_Q8_0.
    The reported compression_ratio reflects the actual bfloat16 bytes
    stored, not the theoretical int8 layout.
    """
    original_dtype = kv.dtype
    original_bytes = kv.numel() * kv.element_size()

    kv_flat = kv.float().contiguous()
    orig_shape = kv_flat.shape

    # Pad the last dim up to a multiple of the group size so it can be
    # reshaped into whole groups; padding is stripped after round-trip.
    last_dim = orig_shape[-1]
    pad_amount = (Q8_GROUP_SIZE - last_dim % Q8_GROUP_SIZE) % Q8_GROUP_SIZE
    if pad_amount > 0:
        kv_flat = torch.nn.functional.pad(kv_flat, (0, pad_amount))

    # View as [..., n_groups, Q8_GROUP_SIZE]
    new_shape = kv_flat.shape[:-1] + (-1, Q8_GROUP_SIZE)
    grouped = kv_flat.reshape(new_shape)

    # Symmetric per-group scale; clamp avoids divide-by-zero on all-zero groups.
    scales = grouped.abs().amax(dim=-1, keepdim=True) / 127.0
    scales = scales.clamp(min=1e-10)

    quantized = torch.clamp(torch.round(grouped / scales), -127, 127)
    dequantized = (quantized * scales).reshape(kv_flat.shape)

    # Strip padding introduced above.
    if pad_amount > 0:
        dequantized = dequantized[..., :last_dim]

    dequantized = dequantized.reshape(orig_shape).to(torch.bfloat16)
    compressed_bytes = dequantized.numel() * 2  # bfloat16 = 2 bytes/element

    return CompressionResult(
        data=dequantized,
        method=CompressionMethod.Q8_0,
        original_dtype=original_dtype,
        compression_ratio=original_bytes / compressed_bytes if compressed_bytes > 0 else 1.0,
        metadata={"q8_group_size": str(Q8_GROUP_SIZE)},
    )
104
+
105
+
106
def decompress_q8_0(data: torch.Tensor) -> torch.Tensor:
    """Cast stored (already-dequantized bfloat16) Q8_0 data to float16."""
    return data.to(dtype=torch.float16)
108
+
109
+
110
+ # ── PolarQuant (Phase 2 — TurboQuant without QJL) ────────────────────────────
111
+ # QJL is INTENTIONALLY ABSENT per D5.
112
+
113
+
114
class PolarQuantConfig:
    """Configuration for PolarQuant compression.

    Holds the quantizer parameters plus per-dimension caches for the
    rotation matrix and codebook, so repeated compress calls reuse them.
    """

    def __init__(self, bits: int = 3, seed: int = 42):
        # bits → 2**bits codebook levels; seed fixes the rotation so
        # compression is reproducible across runs.
        self.bits = bits
        self.n_centroids = 2**bits
        self.seed = seed
        self._rotation_cache: dict[int, torch.Tensor] = {}
        self._codebook_cache: dict[int, torch.Tensor] = {}

    def get_rotation_matrix(self, dim: int, device: torch.device) -> torch.Tensor:
        """Get fixed random orthogonal rotation matrix R ∈ R^(d×d).

        Built via QR of a seeded Gaussian, with the sign correction
        (multiplying columns by sign(diag(R))) that makes the QR-based
        orthogonal sample unique/deterministic.

        NOTE(review): the cache stores the CPU tensor; `.to(device)` on a
        non-CPU device re-copies on every call — consider caching per device.
        """
        if dim not in self._rotation_cache:
            rng = np.random.RandomState(self.seed)
            gaussian = rng.randn(dim, dim).astype(np.float32)
            q, r = np.linalg.qr(gaussian)
            d = np.diag(r)
            ph = np.sign(d)
            q *= ph[np.newaxis, :]
            self._rotation_cache[dim] = torch.from_numpy(q)
        return self._rotation_cache[dim].to(device)

    def get_lloyd_max_codebook(self, dim: int) -> torch.Tensor:
        """Lloyd-Max optimal centroids for N(0,1), 3-bit (8 levels).

        NOTE(review): the table contains -0.000 and 0.000 — two identical
        levels, i.e. only 7 distinct values out of 8. Published 8-level
        Lloyd-Max tables for a unit Gaussian have distinct inner levels
        (≈ ±0.245); confirm whether the duplicate zero is intentional.
        NOTE(review): the codebook does not depend on `dim`, so keying the
        cache by dim only duplicates the same tensor per head_dim.
        """
        if dim not in self._codebook_cache:
            codebook = torch.tensor(
                [-1.748, -1.050, -0.501, -0.000, 0.000, 0.501, 1.050, 1.748],
                dtype=torch.float32,
            )
            self._codebook_cache[dim] = codebook
        return self._codebook_cache[dim]
145
+
146
+
147
# Module-level shared instance: rotation/codebook caches persist across calls.
_POLAR_CONFIG = PolarQuantConfig()
148
+
149
+
150
def compress_polarquant(kv: torch.Tensor) -> CompressionResult:
    """Compress using PolarQuant (3-bit Lloyd-Max after random rotation).

    Phase 2 implementation. Currently stores dequantized bfloat16.
    True 3-bit packed storage is Phase 2+.

    Pipeline: rotate rows by a fixed orthogonal R, normalize each rotated
    dimension by its std, snap to the nearest codebook level, then invert
    (denormalize and rotate back with R.T, which equals R^-1 since R is
    orthogonal).
    """
    original_dtype = kv.dtype
    original_bytes = kv.numel() * kv.element_size()
    device = kv.device

    kv_float = kv.float().contiguous()
    orig_shape = kv_float.shape

    # Treat every head vector as a row: [N, head_dim]
    head_dim = orig_shape[-1]
    flat = kv_float.reshape(-1, head_dim)

    R = _POLAR_CONFIG.get_rotation_matrix(head_dim, device)
    rotated = flat @ R

    # Per-dimension std over all rows; clamp avoids divide-by-zero.
    dim_std = rotated.std(dim=0, keepdim=True).clamp(min=1e-10)
    normalized = rotated / dim_std

    # Nearest-centroid assignment. NOTE: materializes an [N, head_dim, 8]
    # distance tensor — memory scales with 8x the input.
    codebook = _POLAR_CONFIG.get_lloyd_max_codebook(head_dim).to(device)
    distances = (normalized.unsqueeze(-1) - codebook.unsqueeze(0).unsqueeze(0)) ** 2
    indices = distances.argmin(dim=-1)

    # Invert the pipeline: centroid lookup → denormalize → un-rotate.
    dequantized = codebook[indices]
    dequantized = dequantized * dim_std
    R_inv = R.T
    dequantized = dequantized @ R_inv

    dequantized = dequantized.reshape(orig_shape).to(torch.bfloat16)
    compressed_bytes = dequantized.numel() * 2  # stored as bfloat16 for now

    return CompressionResult(
        data=dequantized,
        method=CompressionMethod.POLARQUANT,
        original_dtype=original_dtype,
        compression_ratio=original_bytes / compressed_bytes if compressed_bytes > 0 else 1.0,
        metadata={
            "polarquant_bits": "3",
            "polarquant_seed": str(_POLAR_CONFIG.seed),
            "qjl_enabled": "false",  # D5: QJL permanently disabled
        },
    )
195
+
196
+
197
def decompress_polarquant(data: torch.Tensor) -> torch.Tensor:
    """Cast stored (already-dequantized) PolarQuant data to float16."""
    return data.to(dtype=torch.float16)
199
+
200
+
201
+ # ── INT8 Quantization (Phase 2 — true on-disk compression) ───────────────────
202
+ # Stores actual int8 tensors in safetensors (1 byte/element vs 2 for fp16).
203
+ # Per-row symmetric quantization: scale = max(abs(row)) / 127.
204
+ # Separate scale tensor stored alongside quantized data.
205
+ # 2x on-disk compression with cos_sim > 0.999.
206
+
207
+
208
@dataclass(frozen=True)
class Int8CompressedPair:
    """INT8 quantized tensor + per-row scales.

    Produced by compress_int8_tensor(); decompress_int8_tensor() rebuilds
    the float16 tensor as quantized * scales.
    """

    quantized: torch.Tensor  # int8 [same shape as input]
    scales: torch.Tensor  # float16 [shape[:-1]] — one scale per row
214
+
215
+
216
def compress_int8_tensor(kv: torch.Tensor) -> Int8CompressedPair:
    """Quantize a KV tensor to int8 with per-row symmetric scales.

    Each row (last-dim vector) gets one scale so that its max magnitude
    maps to 127; all-zero rows are protected by the 1e-8 clamp.

    Args:
        kv: [..., head_dim] tensor (any dtype)

    Returns:
        Int8CompressedPair with int8 data and float16 scales
    """
    shape = kv.shape
    rows = kv.float().reshape(-1, shape[-1])

    # Symmetric per-row scale: max|row| / 127, floored to avoid div-by-zero.
    per_row_scale = rows.abs().amax(dim=1).clamp(min=1e-8) / 127.0
    q = (rows / per_row_scale.unsqueeze(1)).round().clamp(-127, 127).to(torch.int8)

    return Int8CompressedPair(
        quantized=q.reshape(shape),
        scales=per_row_scale.to(torch.float16).reshape(shape[:-1]),
    )
238
+
239
+
240
def decompress_int8_tensor(quantized: torch.Tensor, scales: torch.Tensor) -> torch.Tensor:
    """Dequantize an int8 tensor using per-row scales.

    Returns a float16 tensor of the original shape.
    """
    widened = quantized.float()
    scale_col = scales.float().unsqueeze(-1)
    return (widened * scale_col).to(torch.float16)
246
+
247
+
248
def compress_int8(kv: torch.Tensor) -> CompressionResult:
    """INT8 compression — returns dequantized float16 for CompressionResult compat.

    True int8 on-disk storage is handled by the serializer, which calls
    compress_int8_tensor() directly; this wrapper exists only so the
    dispatcher API can report a round-tripped tensor plus size accounting.
    """
    pair = compress_int8_tensor(kv)
    roundtrip = decompress_int8_tensor(pair.quantized, pair.scales)

    source_bytes = kv.numel() * kv.element_size()
    # On-disk accounting: 1 byte per int8 element + 2 bytes per fp16 scale.
    stored_bytes = pair.quantized.numel() + pair.scales.numel() * 2

    ratio = source_bytes / stored_bytes if stored_bytes > 0 else 1.0
    return CompressionResult(
        data=roundtrip,
        method=CompressionMethod.INT8,
        original_dtype=kv.dtype,
        compression_ratio=ratio,
        metadata={"int8_scale_dtype": "float16"},
    )
269
+
270
+
271
+ # ── LAYER_DELTA Compression ──────────────────────────────────────────────────
272
+ # Stores layer 0 as fp16 baseline, layers 1..N as int8 deltas from previous.
273
+ # Inter-layer residuals are typically small (adjacent layers are correlated),
274
+ # so int8 quantization of deltas achieves better fidelity than direct int8.
275
+ # On-disk: ~(1/N) fp16 + ((N-1)/N) int8 ≈ slightly better than straight INT8.
276
+
277
+
278
@dataclass(frozen=True)
class LayerDeltaCompressed:
    """Layer-delta compressed: fp16 baseline + int8 deltas.

    Layer 0 is stored whole; layers 1..n_layers-1 are stored as int8
    quantized residuals with per-row fp16 scales, to be accumulated by
    decompress_layer_delta().
    """

    baseline: torch.Tensor  # [n_kv_heads, n_cells, head_dim] fp16
    delta_quantized: list[torch.Tensor]  # each int8 [n_kv_heads, n_cells, head_dim]
    delta_scales: list[torch.Tensor]  # each fp16 [n_kv_heads, n_cells]
    n_layers: int
286
+
287
+
288
def compress_layer_delta(kv: torch.Tensor) -> LayerDeltaCompressed:
    """Compress KV tensor using inter-layer delta encoding.

    Closed-loop (DPCM-style): each layer's residual is taken against the
    RECONSTRUCTED previous layer rather than the original one. The
    decoder accumulates from the fp16 baseline, so encoding against the
    reconstruction prevents int8 quantization error from compounding
    across layers (the previous open-loop version drifted with depth).

    Args:
        kv: [n_layers, n_kv_heads, n_cells, head_dim]

    Returns:
        LayerDeltaCompressed with fp16 baseline + int8 deltas
    """
    n_layers = kv.shape[0]
    baseline = kv[0].to(torch.float16)

    deltas: list[torch.Tensor] = []
    scales: list[torch.Tensor] = []

    # Track exactly what the decoder will have reconstructed so far.
    prev_recon = baseline.float()

    for i in range(1, n_layers):
        delta = kv[i].float() - prev_recon
        flat = delta.reshape(-1, delta.shape[-1])
        # Per-row symmetric scale: max|row| maps to 127; clamp guards zeros.
        row_scale = flat.abs().amax(dim=1).clamp(min=1e-8) / 127.0
        q = (flat / row_scale.unsqueeze(1)).round().clamp(-127, 127).to(torch.int8)
        deltas.append(q.reshape(delta.shape))
        fp16_scale = row_scale.to(torch.float16)
        scales.append(fp16_scale.reshape(delta.shape[:-1]))

        # Mirror the decoder's arithmetic (fp16 scales widened to float)
        # so encoder and decoder reconstructions stay bit-identical.
        dequant = (q.float() * fp16_scale.float().unsqueeze(1)).reshape(delta.shape)
        prev_recon = prev_recon + dequant

    return LayerDeltaCompressed(
        baseline=baseline, delta_quantized=deltas,
        delta_scales=scales, n_layers=n_layers,
    )
315
+
316
+
317
def decompress_layer_delta(data: LayerDeltaCompressed) -> torch.Tensor:
    """Rebuild the stacked fp16 tensor from baseline + accumulated int8 deltas."""
    rebuilt = [data.baseline.float()]
    for q, s in zip(data.delta_quantized, data.delta_scales):
        rows = q.float().reshape(-1, q.shape[-1])
        step = (rows * s.float().reshape(-1).unsqueeze(1)).reshape(q.shape)
        rebuilt.append(rebuilt[-1] + step)
    return torch.stack(rebuilt).to(torch.float16)
325
+
326
+
327
def compress_layer_delta_result(kv: torch.Tensor) -> CompressionResult:
    """Layer-delta wrapper for CompressionResult API.

    Compresses then immediately decompresses so the result carries a
    round-tripped float16 tensor; compression_ratio is computed from the
    would-be on-disk layout (fp16 baseline + int8 deltas + fp16 scales),
    not from the returned tensor's actual size.
    """
    compressed = compress_layer_delta(kv)
    decompressed = decompress_layer_delta(compressed)

    original_bytes = kv.numel() * kv.element_size()
    # On-disk: baseline fp16 + (N-1) int8 deltas + (N-1) fp16 scale rows
    n = compressed.n_layers
    per_layer_elements = kv[0].numel()
    scale_elements = kv.shape[1] * kv.shape[2]  # n_kv_heads * n_cells
    compressed_bytes = (
        per_layer_elements * 2  # baseline fp16
        + (n - 1) * per_layer_elements * 1  # int8 deltas
        + (n - 1) * scale_elements * 2  # fp16 scales
    )

    return CompressionResult(
        data=decompressed,
        method=CompressionMethod.LAYER_DELTA,
        original_dtype=kv.dtype,
        compression_ratio=original_bytes / compressed_bytes if compressed_bytes > 0 else 1.0,
        metadata={"delta_n_layers": str(n)},
    )
350
+
351
+
352
+ # ── Dispatcher ────────────────────────────────────────────────────────────────
353
+
354
+
355
def compress(kv: torch.Tensor, method: CompressionMethod) -> CompressionResult:
    """Compress a KV cache tensor using the specified method.

    Q4_0 is deliberately redirected to Q8_0 (with a UserWarning) because
    of its dequantization slowdown at long context (D5). Unrecognized
    methods raise ValueError.
    """
    if method == CompressionMethod.FP16:
        return compress_fp16(kv)
    if method == CompressionMethod.Q8_0:
        return compress_q8_0(kv)
    if method == CompressionMethod.POLARQUANT:
        return compress_polarquant(kv)
    if method == CompressionMethod.INT8:
        return compress_int8(kv)
    if method == CompressionMethod.LAYER_DELTA:
        return compress_layer_delta_result(kv)
    if method == CompressionMethod.Q4_0:
        import warnings

        warnings.warn(
            "Q4_0 has 92% dequantization slowdown at 64K+ context. "
            "Using Q8_0 instead. See D5.",
            UserWarning,
            stacklevel=2,
        )
        return compress_q8_0(kv)
    raise ValueError(f"Unknown compression method: {method}")
380
+
381
+
382
def decompress(data: torch.Tensor, method: CompressionMethod) -> torch.Tensor:
    """Decompress a KV cache tensor for the given method."""
    if method == CompressionMethod.FP16:
        return decompress_fp16(data)
    if method in (CompressionMethod.Q8_0, CompressionMethod.Q4_0):
        return decompress_q8_0(data)
    if method == CompressionMethod.POLARQUANT:
        return decompress_polarquant(data)
    if method in (CompressionMethod.INT8, CompressionMethod.LAYER_DELTA):
        # CompressionResult already carries dequantized float16 data.
        return data.to(torch.float16)
    raise ValueError(f"Unknown compression method: {method}")
kvcos/core/config.py ADDED
@@ -0,0 +1,82 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ ENGRAM Protocol — Centralized Configuration
3
+
4
+
5
+ Single source of truth for all runtime configuration.
6
+ Uses pydantic-settings for validation and type coercion.
7
+ """
8
+
9
+ from __future__ import annotations
10
+
11
+ from functools import lru_cache
12
+ from pathlib import Path
13
+
14
+ from pydantic_settings import BaseSettings, SettingsConfigDict
15
+
16
+ from kvcos.core.types import CompressionMethod, IndexBackend, StorageBackend
17
+
18
+
19
class EngramConfig(BaseSettings):
    """ENGRAM runtime configuration.

    Values are read from environment variables carrying the ENGRAM_ prefix,
    or from a .env file in the project root, and validated / type-coerced by
    pydantic-settings. Unknown environment keys are ignored.
    """

    model_config = SettingsConfigDict(
        env_prefix="ENGRAM_",
        env_file=".env",
        env_file_encoding="utf-8",
        extra="ignore",
    )

    # Server binding
    port: int = 8080
    host: str = "0.0.0.0"

    # Local storage layout and default compression
    data_dir: Path = Path.home() / ".engram" / "data"
    backend: StorageBackend = StorageBackend.LOCAL
    default_compression: CompressionMethod = CompressionMethod.Q8_0

    # FAISS index (decision D2)
    index_backend: IndexBackend = IndexBackend.FAISS_FLAT_IP
    index_dir: Path = Path.home() / ".engram" / "index"
    # State vector dimension — must match the extractor's output:
    # 128 for mean_pool (head_dim), 160 for svd_project (rank-160).
    state_vec_dim: int = 160

    # LLM runtime (decision D1)
    model_path: str = ""  # path to a GGUF model file
    n_gpu_layers: int = 0  # D1: CPU-only Phase 1 (avoids Issue #743)
    n_ctx: int = 16384  # D6: 16K context for the Phase 1 demo target

    # Phase 2: remote storage backends
    redis_url: str = "redis://localhost:6379"
    redis_max_memory_gb: float = 2.0
    s3_bucket: str = "engram-cache"
    s3_region: str = "eu-central-1"
    cloudflare_r2_endpoint: str = ""

    # Phase 2: semantic index services
    qdrant_url: str = "http://localhost:6333"
    qdrant_api_key: str = ""
    qdrant_collection: str = "engram_states"
    cohere_api_key: str = ""

    # Phase 4: cross-model transfer adapters
    adapter_enabled: bool = False
    adapter_checkpoint_dir: Path = Path.home() / ".engram" / "adapters"

    def ensure_dirs(self) -> None:
        """Create the data and index directories if they don't exist."""
        for directory in (self.data_dir, self.index_dir):
            directory.mkdir(parents=True, exist_ok=True)
75
+
76
+
77
@lru_cache(maxsize=1)
def get_config() -> EngramConfig:
    """Return the process-wide config singleton.

    The first call constructs the config (reading env vars / .env) and
    creates its directories; later calls are served from the lru_cache.
    """
    cfg = EngramConfig()
    cfg.ensure_dirs()
    return cfg
kvcos/core/fingerprint.py ADDED
@@ -0,0 +1,167 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ ENGRAM Protocol — Standalone State Extraction Functions
3
+
4
+ Contains the Engram Absolute fingerprint: compute_fourier_fingerprint().
5
+ This is the primary cross-model retrieval fingerprint, validated at
6
+ 98% recall@1 at N=1000 with power-law decay N^-0.207.
7
+ """
8
+
9
+ from __future__ import annotations
10
+
11
+ from typing import TYPE_CHECKING
12
+
13
+ import torch
14
+ import torch.nn.functional as F
15
+
16
+ if TYPE_CHECKING:
17
+ from kvcos.core.blob_parser import ParsedMultiSectionCache
18
+
19
+
20
def compute_fourier_fingerprint(
    layer_keys: torch.Tensor,
    freqs: list[int] | None = None,
) -> torch.Tensor:
    """Engram Absolute fingerprint: DFT amplitudes over the layer axis.

    The real DFT is taken along the layer dimension, the amplitude row for
    each requested frequency is L2-normalized, and the rows are concatenated.
    Default freqs=[0, 1] gives the f0 (DC) + f1 (first harmonic) form.

    Args:
        layer_keys: [n_layers, dim] per-layer MEAN keys across token
            positions, where dim = n_kv_heads * head_dim. A 3D
            [n_layers, n_kv, hd] input is flattened automatically.
        freqs: DFT frequency indices to concatenate (default [0, 1]).

    Returns:
        Float32 tensor of shape [dim * len(freqs)]; each frequency slice is
        individually L2-normalized.

    Raises:
        ValueError: If a requested frequency exceeds the rfft bin count.

    Properties (per project validation notes):
        - Cross-model invariant within Llama-3.x family (cos ~0.89)
        - Zero corpus dependency: no centroid, no basis, no training data
        - Recall@1 N=200: 98% N=1000: 98% decay: N^-0.207
    """
    selected = [0, 1] if freqs is None else freqs

    if layer_keys.dim() == 3:
        layer_keys = layer_keys.reshape(layer_keys.shape[0], -1)

    flat = layer_keys.float()

    amplitudes = torch.fft.rfft(flat, dim=0).abs()
    n_bins = amplitudes.shape[0]

    parts = []
    for freq in selected:
        if freq >= n_bins:
            raise ValueError(
                f"Requested freq={freq} but rfft produced only "
                f"{n_bins} components for {flat.shape[0]} layers."
            )
        parts.append(F.normalize(amplitudes[freq], dim=-1))

    return torch.cat(parts, dim=-1)
66
+
67
+
68
def compute_eigenform_score(
    layer_keys: torch.Tensor,
    noise_sigma: float = 0.001,
    n_trials: int = 3,
    freqs: list | None = None,
) -> float:
    """Stability score of the Fourier fingerprint under small perturbations.

    Recomputes the fingerprint on noise-perturbed copies of the input and
    averages pairwise cosine similarity. A value near 1.0 means the
    fingerprint is stable; below 0.95 it is considered fragile.

    Args:
        layer_keys: [n_layers, dim] per-layer mean key vectors.
        noise_sigma: Standard deviation of the added Gaussian noise.
        n_trials: Number of copies compared (the first is unperturbed).
        freqs: DFT frequencies. Default [0, 1].

    Returns:
        float in [0, 1]: mean pairwise cosine across trials (1.0 when
        fewer than two trials are available).
    """
    if freqs is None:
        freqs = [0, 1]

    fingerprints = []
    for trial in range(n_trials):
        if trial == 0:
            # Keep one unperturbed reference copy.
            perturbed = layer_keys
        else:
            perturbed = layer_keys + torch.randn_like(layer_keys) * noise_sigma
        fingerprints.append(compute_fourier_fingerprint(perturbed.float(), freqs=freqs))

    total = 0.0
    n_pairs = 0
    for a in range(n_trials):
        for b in range(a + 1, n_trials):
            total += F.cosine_similarity(
                fingerprints[a].unsqueeze(0), fingerprints[b].unsqueeze(0)
            ).item()
            n_pairs += 1

    if n_pairs == 0:
        return 1.0
    return float(total / n_pairs)
98
+
99
+
100
def compute_iswa_fingerprint(
    parsed: "ParsedMultiSectionCache",
    freqs: list | None = None,
    normalize_layers: bool = True,
) -> torch.Tensor:
    """Concatenated Fourier fingerprint for ISWA multi-section caches.

    Strategy A (per-section concatenation): every cache section is reduced
    to per-layer mean keys (mean over the token axis), fingerprinted with
    compute_fourier_fingerprint_v2, and the per-section fingerprints are
    concatenated. Each section's sub-fingerprint is independently
    L2-normalized per frequency, preserving the relative geometry within
    each attention type.

    For Gemma 4 with freqs=[0, 1]:
        Global (5 layers, 2 heads, 512 dim) → 1024 * 2 = 2048
        SWA (25 layers, 8 heads, 256 dim) → 2048 * 2 = 4096
        Total: 6144-dim fingerprint

    Args:
        parsed: ParsedMultiSectionCache from parse_multi_section_blob().
        freqs: DFT frequency indices. Default [0, 1].
        normalize_layers: L2-normalize each layer before DFT (v2 behavior).

    Returns:
        Concatenated fingerprint tensor, float32.
    """
    active_freqs = [0, 1] if freqs is None else freqs

    # Mean over tokens: [n_layers, n_kv_heads, n_cells, head_dim]
    # → [n_layers, n_kv_heads, head_dim] (flattened inside v2).
    per_section = [
        compute_fourier_fingerprint_v2(
            section.keys.float().mean(dim=2),
            freqs=active_freqs,
            normalize_layers=normalize_layers,
        )
        for section in parsed.sections
    ]

    return torch.cat(per_section, dim=-1)
138
+
139
+
140
def compute_fourier_fingerprint_v2(
    layer_keys: torch.Tensor,
    freqs: list | None = None,
    normalize_layers: bool = True,
) -> torch.Tensor:
    """Fourier fingerprint v2: each layer is L2-normalized before the DFT.

    Normalizing per layer removes absolute magnitude scale (which differs
    by KV head count across model families) while preserving the
    layer-progression shape.

    Within-family: same recall as v1 (98%).
    Cross-family: f0+f1 cross-sim expected >>0.26 (v1 baseline).

    Args:
        layer_keys: [n_layers, dim] (a 3D input is flattened automatically).
        freqs: DFT frequency indices. Default [0, 1].
        normalize_layers: When True, L2-normalize each layer row first.

    Returns:
        Float32 tensor [dim * len(freqs)], one normalized slice per freq.

    Raises:
        ValueError: If a requested frequency exceeds the rfft bin count.
    """
    selected = [0, 1] if freqs is None else freqs

    if layer_keys.dim() == 3:
        layer_keys = layer_keys.reshape(layer_keys.shape[0], -1)
    layer_keys = layer_keys.float()

    if normalize_layers:
        layer_keys = F.normalize(layer_keys, dim=-1)

    amplitudes = torch.fft.rfft(layer_keys, dim=0).abs()

    parts = []
    for freq in selected:
        if freq >= amplitudes.shape[0]:
            raise ValueError(f"freq={freq} out of range for {layer_keys.shape[0]} layers")
        parts.append(F.normalize(amplitudes[freq], dim=-1))

    return torch.cat(parts, dim=-1)
kvcos/core/manifold_index.py ADDED
@@ -0,0 +1,294 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Engrammatic Geometry Retrieval — Manifold Index
3
+
4
+
5
+ FAISS-backed MIPS (Maximum Inner Product Search) index for EGR retrieval.
6
+ Indexes state vectors extracted from .eng files by MARStateExtractor.
7
+
8
+ D2: FAISS IndexFlatIP for K→K retrieval only. Never Q→K.
9
+ faiss.serialize_index() for persistence (not write_index — avoids
10
+ platform incompatibility Issue #3888). Atomic write via temp + rename.
11
+ MKL build enforced at import time.
12
+
13
+ D4: No L2 normalization. True MIPS. Raw inner product scores.
14
+ """
15
+
16
+ from __future__ import annotations
17
+
18
+ from dataclasses import dataclass, field
19
+ from pathlib import Path
20
+
21
+ import faiss
22
+ import numpy as np
23
+ import torch
24
+
25
+ from kvcos.core.types import CacheSearchResult
26
+
27
+
28
@dataclass
class IndexEntry:
    """Metadata associated with an indexed state vector.

    One entry is kept per FAISS row (in a list parallel to the index) and
    is persisted in the JSON sidecar written by ManifoldIndex.save().
    """

    cache_id: str  # engram identifier; used for position tracking and removal
    task_description: str  # human-readable task text, surfaced in search results
    model_id: str  # identifier of the model that produced the state vector
    created_at: str  # creation timestamp (stored as a string)
    context_len: int  # context length (token count) of the cached state
    l2_norm: float  # D4: stored for optional downstream use
38
+
39
+
40
class ManifoldIndex:
    """FAISS-backed inner product index for EGR state vectors.

    Stores state vectors and associated metadata for MIPS retrieval.
    Persistence via faiss.serialize_index() with atomic file writes
    (D2 — avoids the write_index platform incompatibility, Issue #3888).
    D4: vectors are indexed WITHOUT L2 normalization — true MIPS with raw
    inner product scores.

    Usage:
        index = ManifoldIndex(dim=160)
        index.add(state_vec, entry)
        results = index.search(query_vec, top_k=5)
        index.save(Path("~/.engram/index/egr.faiss"))
    """

    def __init__(self, dim: int, index_path: Path | None = None):
        """Initialize the manifold index.

        Args:
            dim: Dimension of state vectors (must match MARStateExtractor output).
            index_path: Optional path to load an existing index from disk.
        """
        self.dim = dim
        self._entries: list[IndexEntry] = []
        self._id_to_position: dict[str, int] = {}  # cache_id → FAISS row position

        if index_path and index_path.exists():
            self._index = self._load_index(index_path)
        else:
            # D2: IndexFlatIP — exact MIPS, correct for Phase 1 corpus sizes (<100K)
            self._index = faiss.IndexFlatIP(dim)

    @property
    def n_entries(self) -> int:
        """Number of rows held in FAISS (includes shadowed/removed rows
        that have not yet been compacted by rebuild())."""
        return self._index.ntotal

    def add(
        self,
        state_vec: torch.Tensor | np.ndarray,
        entry: IndexEntry,
    ) -> None:
        """Add a state vector and its metadata to the index.

        Re-adding an existing cache_id appends a fresh row and repoints the
        id→position mapping at it. FAISS IndexFlat cannot update in place,
        so the old row stays in storage but is filtered out of search()
        ("shadowed") until rebuild() compacts the index.

        Args:
            state_vec: [dim] state vector (D4: NOT normalized)
            entry: Associated metadata for this engram

        Raises:
            ValueError: If the vector shape does not match the index dim.
        """
        vec = self._to_numpy(state_vec)

        if vec.shape != (self.dim,):
            raise ValueError(
                f"State vector dim {vec.shape} != index dim ({self.dim},)"
            )

        position = self._index.ntotal
        self._index.add(vec.reshape(1, -1).astype(np.float32))
        self._entries.append(entry)
        self._id_to_position[entry.cache_id] = position

    def search(
        self,
        query_vec: torch.Tensor | np.ndarray,
        top_k: int = 5,
        min_similarity: float | None = None,
        model_id: str | None = None,
    ) -> list[CacheSearchResult]:
        """Search for the most similar engram states via MIPS.

        Args:
            query_vec: [dim] query state vector
            top_k: Number of results to return
            min_similarity: Minimum inner product score threshold
            model_id: Optional filter by model ID

        Returns:
            List of CacheSearchResult sorted by similarity (descending)

        Raises:
            ValueError: If the query vector shape does not match the index dim.
        """
        if self._index.ntotal == 0:
            return []

        vec = self._to_numpy(query_vec)
        if vec.shape != (self.dim,):
            raise ValueError(
                f"Query vector dim {vec.shape} != index dim ({self.dim},)"
            )

        # Always over-fetch: shadowed duplicates, remove()d ids, model_id
        # and min_similarity filters can all discard hits. (Previously only
        # the model_id case over-fetched, so an index containing superseded
        # rows could return fewer than top_k valid results.)
        search_k = min(top_k * 3, self._index.ntotal)
        scores, indices = self._index.search(
            vec.reshape(1, -1).astype(np.float32), search_k
        )

        results: list[CacheSearchResult] = []
        for score, idx in zip(scores[0], indices[0]):
            # FAISS pads with -1 when fewer than search_k rows exist.
            if idx < 0 or idx >= len(self._entries):
                continue

            entry = self._entries[idx]

            # Skip rows superseded by a later add() for the same cache_id,
            # or removed via remove().
            if self._id_to_position.get(entry.cache_id) != idx:
                continue

            # Apply filters
            if model_id and entry.model_id != model_id:
                continue
            if min_similarity is not None and score < min_similarity:
                continue

            results.append(CacheSearchResult(
                cache_id=entry.cache_id,
                similarity=float(score),
                task_description=entry.task_description,
                model_id=entry.model_id,
                created_at=entry.created_at,
                context_len=entry.context_len,
            ))

            if len(results) >= top_k:
                break

        return results

    def remove(self, cache_id: str) -> bool:
        """Mark a cache entry as removed from the index.

        FAISS IndexFlat doesn't support deletion. We remove the entry from
        the position tracking so it is filtered out of search results; the
        raw vector remains in FAISS until the next rebuild().

        Args:
            cache_id: ID to remove

        Returns:
            True if the entry was found and removed from tracking
        """
        if cache_id in self._id_to_position:
            del self._id_to_position[cache_id]
            return True
        return False

    def rebuild(self) -> int:
        """Rebuild the index from only active entries.

        Removes gaps left by remove() calls and rows shadowed by duplicate
        add() calls. Returns the count of active entries.
        """
        active_positions = set(self._id_to_position.values())
        if len(active_positions) == len(self._entries):
            return len(active_positions)  # No gaps

        # Snapshot the raw FAISS storage ONCE as a [ntotal, dim] view.
        # (Previously this wrapper was recreated inside the loop for every
        # active entry — O(n) redundant work per row.)
        flat = faiss.rev_swig_ptr(
            self._index.get_xb(), self._index.ntotal * self.dim
        ).reshape(-1, self.dim)

        # Collect active vectors and entries in their original order.
        new_entries: list[IndexEntry] = []
        vectors: list[np.ndarray] = []
        for pos, entry in enumerate(self._entries):
            if (
                pos in active_positions
                and self._id_to_position.get(entry.cache_id) == pos
            ):
                vectors.append(flat[pos].copy())
                new_entries.append(entry)

        # Rebuild from scratch via add() so tracking stays consistent.
        self._index = faiss.IndexFlatIP(self.dim)
        self._entries = []
        self._id_to_position = {}

        for vec, entry in zip(vectors, new_entries):
            self.add(torch.from_numpy(vec), entry)

        return self.n_entries

    def save(self, path: Path) -> None:
        """Persist the index to disk.

        D2: Uses faiss.serialize_index() (not write_index) to avoid
        platform incompatibility. Atomic write via temp file + rename.
        Metadata is saved as a sidecar .meta.json file next to the index.
        """
        import json

        path.parent.mkdir(parents=True, exist_ok=True)

        # D2: serialize_index returns a numpy uint8 array — write raw bytes
        index_bytes: np.ndarray = faiss.serialize_index(self._index)

        # Atomic write for the FAISS index: write to a temp file, then rename.
        tmp_path = path.with_suffix(".faiss.tmp")
        try:
            tmp_path.write_bytes(index_bytes.tobytes())
            tmp_path.rename(path)
        except Exception:
            tmp_path.unlink(missing_ok=True)
            raise

        # Save the metadata sidecar with the same atomic pattern.
        meta_path = path.with_suffix(".meta.json")
        meta_tmp = meta_path.with_suffix(".json.tmp")
        try:
            sidecar = {
                "dim": self.dim,
                "entries": [
                    {
                        "cache_id": e.cache_id,
                        "task_description": e.task_description,
                        "model_id": e.model_id,
                        "created_at": e.created_at,
                        "context_len": e.context_len,
                        "l2_norm": e.l2_norm,
                    }
                    for e in self._entries
                ],
                "id_to_position": self._id_to_position,
            }
            meta_tmp.write_text(json.dumps(sidecar, indent=2))
            meta_tmp.rename(meta_path)
        except Exception:
            meta_tmp.unlink(missing_ok=True)
            raise

    def _load_index(self, path: Path) -> faiss.IndexFlatIP:
        """Load a FAISS index and its metadata sidecar from disk.

        D2: Uses faiss.deserialize_index() from raw bytes (not read_index).
        Restores self._entries and self._id_to_position when the sidecar
        exists; a missing sidecar leaves them empty.
        """
        import json

        raw = np.frombuffer(path.read_bytes(), dtype=np.uint8)
        index = faiss.deserialize_index(raw)

        meta_path = path.with_suffix(".meta.json")
        if meta_path.exists():
            sidecar = json.loads(meta_path.read_text())
            self._entries = [
                IndexEntry(**e) for e in sidecar.get("entries", [])
            ]
            # JSON object keys are strings; positions must come back as ints.
            self._id_to_position = {
                k: int(v) for k, v in sidecar.get("id_to_position", {}).items()
            }

        return index

    @staticmethod
    def _to_numpy(vec: torch.Tensor | np.ndarray) -> np.ndarray:
        """Convert a vector to a numpy float32 array (CPU, detached)."""
        if isinstance(vec, torch.Tensor):
            return vec.detach().cpu().float().numpy()
        return vec.astype(np.float32)
kvcos/core/retriever.py ADDED
@@ -0,0 +1,263 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Engrammatic Geometry Retrieval — Retriever
3
+
4
+
5
+ Orchestrates the full EGR retrieval pipeline:
6
+ 1. Extract state vector from query KV cache (MARStateExtractor)
7
+ 2. Search manifold index for similar engram states (ManifoldIndex)
8
+ 3. Load matched .eng files from storage (StorageBackend)
9
+ 4. Return ranked results with KV tensors ready for injection
10
+
11
+ This is the primary interface agents use for retrieval.
12
+ """
13
+
14
+ from __future__ import annotations
15
+
16
+ from dataclasses import dataclass
17
+ from pathlib import Path
18
+
19
+ import torch
20
+
21
+ from kvcos.core.serializer import EngramSerializer
22
+ from kvcos.core.types import (
23
+ CacheSearchResult,
24
+ CompressionMethod,
25
+ EngramMetadata,
26
+ ModelCacheSpec,
27
+ StateExtractionMode,
28
+ )
29
+ from kvcos.core.manifold_index import IndexEntry, ManifoldIndex
30
+ from kvcos.core.state_extractor import ExtractionResult, MARStateExtractor
31
+ from kvcos.storage.backends import StorageBackend
32
+
33
+
34
@dataclass
class RetrievalResult:
    """A single retrieval result with loaded KV tensors.

    In metadata-only retrieval (load_tensors=False) the keys/values fields
    hold empty tensors and only metadata is populated.
    """

    cache_id: str  # identifier of the matched engram
    similarity: float  # raw inner-product (MIPS) score from the manifold index
    task_description: str  # task text stored alongside the engram
    model_id: str  # model that produced the matched cache
    keys: torch.Tensor  # [n_layers, n_kv_heads, ctx_len, head_dim]
    values: torch.Tensor  # [n_layers, n_kv_heads, ctx_len, head_dim]
    metadata: EngramMetadata  # metadata read from the stored .eng file
45
+
46
+
47
@dataclass
class RetrievalResponse:
    """Full response from a retrieval query."""

    query_extraction: ExtractionResult  # state-vector extraction computed for the query
    results: list[RetrievalResult]  # ranked matches, highest similarity first
    n_searched: int  # total entries in the index
54
+
55
+
56
class EGRRetriever:
    """Engrammatic Geometry Retrieval — full pipeline.

    Connects MARStateExtractor → ManifoldIndex → StorageBackend
    into a single retrieval call.

    Usage:
        retriever = EGRRetriever(extractor, index, storage)

        # Store an engram
        retriever.index_engram(keys, values, spec, agent_id, task_desc, model_id)

        # Retrieve similar engrams
        response = retriever.retrieve(query_keys, spec, top_k=3)
        for result in response.results:
            print(result.similarity, result.task_description)
            # result.keys / result.values ready for injection
    """

    def __init__(
        self,
        extractor: MARStateExtractor,
        index: ManifoldIndex,
        storage: StorageBackend,
        serializer: EngramSerializer | None = None,
    ):
        """Wire the pipeline components together.

        Args:
            extractor: Produces state vectors from KV caches.
            index: MIPS index over stored state vectors.
            storage: Backend holding the serialized .eng files.
            serializer: Optional serializer override (a fresh EngramSerializer
                is created when omitted).
        """
        self.extractor = extractor
        self.index = index
        self.storage = storage
        self._serializer = serializer or EngramSerializer()

    def index_engram(
        self,
        keys: torch.Tensor,
        values: torch.Tensor,
        spec: ModelCacheSpec,
        agent_id: str,
        task_description: str,
        model_id: str,
        cache_id: str | None = None,
        compression: CompressionMethod = CompressionMethod.Q8_0,
        output_dir: Path | None = None,
        extra_metadata: dict[str, str] | None = None,
    ) -> str:
        """Extract state vector, store .eng file, and add to index.

        This is the "write" path: compute once → store → index → reuse forever.

        Args:
            keys: [n_layers, n_kv_heads, ctx_len, head_dim]
            values: same shape as keys
            spec: Model architecture spec
            agent_id: Agent identifier
            task_description: Human-readable task description (searchable)
            model_id: Full model identifier
            cache_id: Explicit ID (auto-generated if None)
            compression: Compression method for storage
            output_dir: Directory for .eng file (uses a temp dir if None)
            extra_metadata: Additional metadata key-value pairs

        Returns:
            cache_id of the stored engram
        """
        import uuid
        from datetime import datetime, timezone

        from kvcos.core.types import ENG_FILE_EXTENSION

        cid = cache_id or str(uuid.uuid4())

        # 1. Extract state vector
        extraction = self.extractor.extract(keys, spec)

        # 2. Serialize to .eng file
        if output_dir:
            output_path = output_dir / f"{cid}{ENG_FILE_EXTENSION}"
        else:
            # Use a temp path; storage backend will move it.
            # NOTE(review): the mkdtemp directory is never cleaned up here —
            # confirm the backend moves (not copies) the file.
            import tempfile
            output_path = Path(tempfile.mkdtemp()) / f"{cid}{ENG_FILE_EXTENSION}"

        merge_meta = {
            "state_vec_norm": str(extraction.l2_norm),
            "extraction_mode": extraction.mode.value,
        }
        if extra_metadata:
            merge_meta.update(extra_metadata)

        # The serializer's summary return value is not needed here; metadata
        # is re-read from the written file below so the backend stores
        # exactly what is on disk.
        self._serializer.serialize(
            keys=keys,
            values=values,
            agent_id=agent_id,
            task_description=task_description,
            model_id=model_id,
            output_path=output_path,
            compression=compression,
            cache_id=cid,
            extra_metadata=merge_meta,
        )

        # 3. Store in backend
        metadata = self._serializer.read_metadata_only(output_path)
        self.storage.store_file(cid, output_path, metadata)

        # 4. Add to manifold index
        now = datetime.now(timezone.utc).isoformat()
        entry = IndexEntry(
            cache_id=cid,
            task_description=task_description,
            model_id=model_id,
            created_at=now,
            context_len=keys.shape[2],  # ctx_len axis of the canonical 4D layout
            l2_norm=extraction.l2_norm,
        )
        self.index.add(extraction.state_vec, entry)

        return cid

    def retrieve(
        self,
        query_keys: torch.Tensor,
        spec: ModelCacheSpec,
        top_k: int = 5,
        min_similarity: float | None = None,
        model_id: str | None = None,
        load_tensors: bool = True,
    ) -> RetrievalResponse:
        """Retrieve similar engram states for a query KV cache.

        This is the "read" path: extract query vector → search index →
        load matching .eng files. Entries that can no longer be loaded
        (missing path, deserialization failure) are silently skipped.

        Args:
            query_keys: [n_layers, n_kv_heads, ctx_len, head_dim] query K cache
            spec: Model architecture spec
            top_k: Number of results to return
            min_similarity: Minimum MIPS score threshold
            model_id: Filter by model ID
            load_tensors: If True, load full KV tensors from storage.
                If False, return metadata only (faster for previewing).

        Returns:
            RetrievalResponse with ranked results
        """
        # 1. Extract query state vector
        query_extraction = self.extractor.extract(query_keys, spec)

        # 2. Search manifold index
        # NOTE(review): results are accessed by subscript below but are
        # constructed with kwargs in ManifoldIndex — consistent only if
        # CacheSearchResult is a TypedDict; confirm in kvcos.core.types.
        search_results = self.index.search(
            query_vec=query_extraction.state_vec,
            top_k=top_k,
            min_similarity=min_similarity,
            model_id=model_id,
        )

        # 3. Load matching engrams from storage
        results: list[RetrievalResult] = []
        for sr in search_results:
            if load_tensors:
                path = self.storage.get_path(sr["cache_id"])
                if path is None:
                    continue

                try:
                    keys, values, metadata = self._serializer.deserialize(path)
                except Exception:
                    # Best-effort: a corrupt/unreadable file drops this hit.
                    continue

                results.append(RetrievalResult(
                    cache_id=sr["cache_id"],
                    similarity=sr["similarity"],
                    task_description=sr["task_description"],
                    model_id=sr["model_id"],
                    keys=keys,
                    values=values,
                    metadata=metadata,
                ))
            else:
                # Metadata-only mode: empty tensors stand in for KV data.
                metadata = self.storage.get_metadata(sr["cache_id"])
                if metadata is None:
                    continue

                results.append(RetrievalResult(
                    cache_id=sr["cache_id"],
                    similarity=sr["similarity"],
                    task_description=sr["task_description"],
                    model_id=sr["model_id"],
                    keys=torch.empty(0),
                    values=torch.empty(0),
                    metadata=metadata,
                ))

        return RetrievalResponse(
            query_extraction=query_extraction,
            results=results,
            n_searched=self.index.n_entries,
        )

    def delete_engram(self, cache_id: str) -> bool:
        """Remove an engram from both index and storage.

        Returns True if either the index or the storage backend had it.
        """
        idx_removed = self.index.remove(cache_id)
        store_removed = self.storage.delete(cache_id)
        return idx_removed or store_removed

    def save_index(self, path: Path) -> None:
        """Persist the manifold index to disk."""
        self.index.save(path)
kvcos/core/serializer.py ADDED
@@ -0,0 +1,274 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ ENGRAM Protocol — .eng File Serializer
3
+
4
+
5
+ .eng = safetensors container with:
6
+ - __metadata__: JSON-stringified EngramMetadata (all string values per D7)
7
+ - Tensor keys: layer_{i}_keys, layer_{i}_values
8
+ - Each tensor: [n_kv_heads, ctx_len, head_dim] at compressed dtype
9
+
10
+ D7: safetensors confirmed. GGUF rejected. String-only metadata values.
11
+ Reference: arXiv:2603.04428 uses identical safetensors approach.
12
+ """
13
+
14
+ from __future__ import annotations
15
+
16
+ import hashlib
17
+ import uuid
18
+ from datetime import datetime, timezone
19
+ from pathlib import Path
20
+ from typing import Any
21
+
22
+ import torch
23
+ from safetensors.torch import load_file, save_file
24
+
25
+ from kvcos.core.cache_spec import infer_model_family
26
+ from kvcos.core.compression import CompressionResult, compress, decompress
27
+ from kvcos.core.types import (
28
+ ENGRAM_VERSION,
29
+ ENG_FILE_EXTENSION,
30
+ CompressionMethod,
31
+ EngramMetadata,
32
+ )
33
+
34
+
35
class SerializationError(Exception):
    """Raised when serialization or deserialization fails.

    EngramSerializer raises this for malformed inputs, e.g. a keys/values
    shape mismatch or tensors that are not 4D.
    """
37
+
38
+
39
+ class EngramSerializer:
40
+ """Serializes/deserializes KV cache tensors to/from .eng files.
41
+
42
+ Canonical shape for KV tensors in ENGRAM:
43
+ keys: [n_layers, n_kv_heads, ctx_len, head_dim]
44
+ values: [n_layers, n_kv_heads, ctx_len, head_dim]
45
+ """
46
+
47
    def serialize(
        self,
        keys: torch.Tensor,
        values: torch.Tensor,
        agent_id: str,
        task_description: str,
        model_id: str,
        output_path: Path,
        compression: CompressionMethod = CompressionMethod.Q8_0,
        cache_id: str | None = None,
        parent_cache_id: str | None = None,
        input_tokens: list[int] | None = None,
        extra_metadata: dict[str, str] | None = None,
    ) -> dict[str, Any]:
        """Serialize KV cache tensors to a .eng file.

        Args:
            keys: [n_layers, n_kv_heads, ctx_len, head_dim]
            values: [n_layers, n_kv_heads, ctx_len, head_dim]
            agent_id: Identifier for the agent that produced this state
            task_description: Human-readable description (used for EGR search)
            model_id: Full model identifier
            output_path: Path to write .eng file
            compression: Compression method to apply
            cache_id: Explicit cache ID (auto-generated if None)
            parent_cache_id: ID of parent for delta chains
            input_tokens: Token IDs that generated this state (for hash)
            extra_metadata: Additional string key-value pairs

        Returns:
            Dict with cache_id, size_bytes, compression_ratio, path

        Raises:
            SerializationError: if keys/values shapes differ or are not 4D.
        """
        # Validate shape contract before any tensor work.
        if keys.shape != values.shape:
            raise SerializationError(
                f"Keys/values shape mismatch: {keys.shape} vs {values.shape}"
            )
        if keys.dim() != 4:
            raise SerializationError(
                f"Expected 4D [n_layers, n_kv_heads, ctx_len, head_dim], "
                f"got {keys.dim()}D: {keys.shape}"
            )

        n_layers, n_kv_heads, ctx_len, head_dim = keys.shape

        # Per-layer tensor dict fed to safetensors save_file below.
        tensors: dict[str, torch.Tensor] = {}

        if compression == CompressionMethod.INT8:
            from kvcos.core.compression import compress_int8_tensor

            # True int8 quantization with per-row scales; one quantized
            # tensor and one scale tensor per layer, for keys and values.
            k_pair = compress_int8_tensor(keys)
            v_pair = compress_int8_tensor(values)
            for i in range(n_layers):
                tensors[f"layer_{i}_keys"] = k_pair.quantized[i].contiguous()
                tensors[f"layer_{i}_keys_scale"] = k_pair.scales[i].contiguous()
                tensors[f"layer_{i}_values"] = v_pair.quantized[i].contiguous()
                tensors[f"layer_{i}_values_scale"] = v_pair.scales[i].contiguous()
            # Reuse k_compressed for metadata only — actual INT8 data is
            # already written per-layer above via k_pair/v_pair.
            # NOTE(review): compress() here runs a second full compression
            # pass just to obtain .metadata — confirm whether the
            # compression module offers a cheaper metadata-only path.
            k_compressed = compress(keys, compression)
            v_compressed = k_compressed
        elif compression == CompressionMethod.LAYER_DELTA:
            from kvcos.core.compression import compress_layer_delta

            k_ld = compress_layer_delta(keys)
            v_ld = compress_layer_delta(values)
            # Layer 0: fp16 baseline
            tensors["layer_0_keys"] = k_ld.baseline.contiguous()
            tensors["layer_0_values"] = v_ld.baseline.contiguous()
            # Layers 1..N: int8 deltas + fp16 scales
            for i in range(n_layers - 1):
                tensors[f"layer_{i+1}_keys"] = k_ld.delta_quantized[i].contiguous()
                tensors[f"layer_{i+1}_keys_scale"] = k_ld.delta_scales[i].contiguous()
                tensors[f"layer_{i+1}_values"] = v_ld.delta_quantized[i].contiguous()
                tensors[f"layer_{i+1}_values_scale"] = v_ld.delta_scales[i].contiguous()
            # Reuse k_compressed for metadata only — actual layer-delta data
            # is already written above via k_ld/v_ld.
            # NOTE(review): same double-compression concern as the INT8
            # branch above — confirm cost is acceptable.
            k_compressed = compress(keys, compression)
            v_compressed = k_compressed
        else:
            # Generic path (FP16/Q8_0/...): compress returns per-layer data.
            k_compressed = compress(keys, compression)
            v_compressed = compress(values, compression)
            for i in range(n_layers):
                tensors[f"layer_{i}_keys"] = k_compressed.data[i].contiguous()
                tensors[f"layer_{i}_values"] = v_compressed.data[i].contiguous()

        cid = cache_id or str(uuid.uuid4())
        now = datetime.now(timezone.utc).isoformat()

        # Optional provenance hash over the input token ids (little-endian
        # 4-byte encoding per token, then SHA-256).
        token_hash = ""
        if input_tokens:
            token_bytes = b"".join(t.to_bytes(4, "little") for t in input_tokens)
            token_hash = f"sha256:{hashlib.sha256(token_bytes).hexdigest()}"

        # All values must be strings per safetensors metadata spec (D7);
        # ints are stringified here and again defensively below.
        metadata: EngramMetadata = {
            "engram_version": ENGRAM_VERSION,
            "cache_id": cid,
            "compression": compression.value,
            "model_id": model_id,
            "model_family": infer_model_family(model_id),
            "n_layers": str(n_layers),
            "n_heads": str(n_kv_heads),
            "n_kv_heads": str(n_kv_heads),
            "head_dim": str(head_dim),
            "context_len": str(ctx_len),
            "agent_id": agent_id,
            "task_description": task_description,
            "created_at": now,
        }

        if parent_cache_id:
            metadata["parent_cache_id"] = parent_cache_id
        if token_hash:
            metadata["token_hash"] = token_hash
        # Compression-specific parameters are namespaced with a prefix.
        for key, val in k_compressed.metadata.items():
            metadata[f"compression_{key}"] = val  # type: ignore[literal-required]
        if extra_metadata:
            for key, val in extra_metadata.items():
                metadata[key] = val  # type: ignore[literal-required]

        output_path.parent.mkdir(parents=True, exist_ok=True)

        str_metadata: dict[str, str] = {k: str(v) for k, v in metadata.items()}
        save_file(tensors, str(output_path), metadata=str_metadata)

        # Ratio is measured against the uncompressed input dtype size.
        original_bytes = (keys.numel() + values.numel()) * keys.element_size()
        compressed_bytes = output_path.stat().st_size

        return {
            "cache_id": cid,
            "size_bytes": compressed_bytes,
            "compression_ratio": original_bytes / compressed_bytes if compressed_bytes > 0 else 1.0,
            "path": str(output_path),
            "n_layers": n_layers,
            "context_len": ctx_len,
        }
182
+
183
    def deserialize(
        self,
        path: Path,
        target_compression: CompressionMethod | None = None,
    ) -> tuple[torch.Tensor, torch.Tensor, EngramMetadata]:
        """Deserialize a .eng file into KV cache tensors.

        Args:
            path: Path to an existing .eng (safetensors) file.
            target_compression: When not None, triggers a decompress pass
                using the *stored* compression method after layer assembly.

        Returns:
            (keys, values, metadata) where tensors are
            [n_layers, n_kv_heads, ctx_len, head_dim].

        Raises:
            SerializationError: if the file is missing or a per-layer
                tensor (or INT8 scale) is absent.
        """
        if not path.exists():
            raise SerializationError(f"Engram file not found: {path}")

        tensors = load_file(str(path))
        metadata = self._read_metadata(path)

        # Prefer the recorded layer count; fall back to scanning the
        # "layer_<i>_..." tensor names when the header lacks it.
        n_layers = int(metadata.get("n_layers", "0"))
        if n_layers == 0:
            n_layers = (
                max(int(k.split("_")[1]) for k in tensors if k.startswith("layer_")) + 1
            )

        stored_compression = metadata.get("compression", "fp16")
        is_int8 = stored_compression == CompressionMethod.INT8.value
        is_layer_delta = stored_compression == CompressionMethod.LAYER_DELTA.value

        k_layers: list[torch.Tensor] = []
        v_layers: list[torch.Tensor] = []

        if is_layer_delta:
            from kvcos.core.compression import decompress_int8_tensor

            # Layer 0: fp16 baseline
            k_layers.append(tensors["layer_0_keys"].float())
            v_layers.append(tensors["layer_0_values"].float())
            # Layers 1..N: accumulate int8 deltas onto the previous layer.
            for i in range(1, n_layers):
                k_delta = decompress_int8_tensor(
                    tensors[f"layer_{i}_keys"], tensors[f"layer_{i}_keys_scale"]
                )
                v_delta = decompress_int8_tensor(
                    tensors[f"layer_{i}_values"], tensors[f"layer_{i}_values_scale"]
                )
                k_layers.append(k_layers[-1] + k_delta.float())
                v_layers.append(v_layers[-1] + v_delta.float())
            # Accumulation is done in float32; cast back to fp16 for output.
            keys = torch.stack([l.to(torch.float16) for l in k_layers], dim=0)
            values = torch.stack([l.to(torch.float16) for l in v_layers], dim=0)
        else:
            for i in range(n_layers):
                k_key = f"layer_{i}_keys"
                v_key = f"layer_{i}_values"
                if k_key not in tensors or v_key not in tensors:
                    raise SerializationError(f"Missing tensor for layer {i}")

                if is_int8:
                    from kvcos.core.compression import decompress_int8_tensor

                    k_scale_key = f"layer_{i}_keys_scale"
                    v_scale_key = f"layer_{i}_values_scale"
                    if k_scale_key not in tensors or v_scale_key not in tensors:
                        raise SerializationError(f"Missing INT8 scale for layer {i}")
                    k_layers.append(decompress_int8_tensor(tensors[k_key], tensors[k_scale_key]))
                    v_layers.append(decompress_int8_tensor(tensors[v_key], tensors[v_scale_key]))
                else:
                    k_layers.append(tensors[k_key])
                    v_layers.append(tensors[v_key])

            keys = torch.stack(k_layers, dim=0)
            values = torch.stack(v_layers, dim=0)

        # NOTE(review): for INT8/LAYER_DELTA the tensors above are already
        # dequantized; running decompress() again with the stored method
        # may double-apply. Also target_compression itself is only used as
        # an on/off switch here — confirm intended semantics.
        if target_compression is not None:
            stored = CompressionMethod(metadata.get("compression", "fp16"))
            keys = decompress(keys, stored)
            values = decompress(values, stored)

        return keys, values, metadata  # type: ignore[return-value]
259
+
260
+ def _read_metadata(self, path: Path) -> dict[str, str]:
261
+ """Read only the metadata header (no tensor data loaded)."""
262
+ from safetensors import safe_open
263
+
264
+ metadata: dict[str, str] = {}
265
+ with safe_open(str(path), framework="pt") as f:
266
+ raw_meta = f.metadata()
267
+ if raw_meta:
268
+ metadata = dict(raw_meta)
269
+ return metadata
270
+
271
+ def read_metadata_only(self, path: Path) -> EngramMetadata:
272
+ """Read just the metadata from a .eng file. Efficient for indexing."""
273
+ raw = self._read_metadata(path)
274
+ return raw # type: ignore[return-value]
kvcos/core/state_extractor.py ADDED
@@ -0,0 +1,489 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Engrammatic Geometry Retrieval — State Extraction Layer
3
+
4
+
5
+ Extracts a retrieval state vector from a KV cache tensor for MIPS-based
6
+ retrieval in EGR (Engrammatic Geometry Retrieval). The state vector is
7
+ a compact geometric fingerprint of a cognitive state — positioned in the
8
+ model's own pre-RoPE key manifold for geometrically consistent retrieval.
9
+
10
+ Three extraction modes:
11
+
12
+ mean_pool: Fast baseline. Mean over heads + context of key matrices
13
+ across extraction layers. Output: [head_dim]. No learned
14
+ parameters. Use for bootstrapping and smoke tests.
15
+
16
+ svd_project: Truncated SVD on pre-RoPE keys, extraction layers (D3: 8-31),
17
+ rank-160 for 8B models. Validated by ShadowKV (ICML 2025,
18
+ ByteDance) on Llama-3.1-8B and Phi-3-Mini-128K.
19
+ Output: [rank]. Projection is prompt-dependent — W computed
20
+ per cache via online SVD, not precomputed globally.
21
+ Reference: github.com/ByteDance-Seed/ShadowKV
22
+
23
+ xkv_project: Grouped cross-layer SVD. Groups 4 adjacent extraction layers,
24
+ extracts shared basis vectors across the group. Achieves
25
+ 6.8x compression vs 2.5x single-layer SVD. K:V rank ratio
26
+ 1:1.5 is optimal per xKV paper.
27
+ Reference: github.com/abdelfattah-lab/xKV
28
+ arXiv:2503.18893
29
+
30
+ REMOVED: sals_project — last-layer-only extraction invalidated by
31
+ Layer-Condensed KV Cache (ACL 2024). See D3.
32
+
33
+ D4: No L2 normalization. True MIPS. L2 norm stored as metadata for
34
+ optional downstream use.
35
+ """
36
+
37
+ from __future__ import annotations
38
+
39
+ from dataclasses import dataclass, field
40
+
41
+ import torch
42
+ from einops import rearrange
43
+
44
+ from kvcos.core.types import (
45
+ DEFAULT_SVD_RANK,
46
+ ModelCacheSpec,
47
+ StateExtractionMode,
48
+ )
49
+
50
+
51
@dataclass
class ExtractionResult:
    """Result of state vector extraction from a KV cache.

    Produced by MARStateExtractor.extract / extract_with_basis.
    """

    state_vec: torch.Tensor  # [d_out] — the retrieval vector
    l2_norm: float  # stored as metadata per D4 (vector itself is NOT normalized)
    mode: StateExtractionMode  # extraction mode that produced the vector
    n_layers_used: int  # number of layers actually selected
    n_tokens: int  # context length (ctx_len) of the source cache
60
+
61
+
62
@dataclass
class SVDProjection:
    """Learned SVD projection matrix for a specific cache.

    ShadowKV finding: pre-RoPE keys share low-rank subspaces WITHIN
    sequences but differ ACROSS sequences. Projection must be computed
    online per cache, not precomputed globally.
    """

    W: torch.Tensor  # [head_dim, rank] — right singular vectors
    singular_values: torch.Tensor  # [rank] — for diagnostics
    explained_variance_ratio: float  # fraction of variance captured
    source_shape: tuple[int, ...]  # shape of the keys used to compute this
75
+
76
+
77
class MARStateExtractor:
    """Extracts retrieval state vectors from KV cache tensors for EGR.

    Usage:
        extractor = MARStateExtractor(mode="svd_project", rank=160)
        result = extractor.extract(keys, spec)
        # result.state_vec is the retrieval vector for FAISS IndexFlatIP
        # result.l2_norm goes into .eng metadata (D4)
    """

    # Max rows fed to SVD. 8192 rows on a 128-dim matrix runs in ~15ms
    # vs ~2000ms for the full 786K-row matrix. Subspace quality is
    # preserved because SVD only needs O(head_dim²) samples to recover
    # the top singular vectors of a low-rank matrix.
    MAX_SVD_ROWS: int = 8192

    def __init__(
        self,
        mode: StateExtractionMode = StateExtractionMode.SVD_PROJECT,
        rank: int = DEFAULT_SVD_RANK,
        xkv_group_size: int = 4,
        xkv_kv_rank_ratio: float = 1.5,
        max_svd_rows: int | None = None,
        layer_range: tuple[int, int] | None = None,
        gate_start: int = 0,
    ):
        self.mode = mode
        self.rank = rank
        self.xkv_group_size = xkv_group_size
        # NOTE(review): xkv_kv_rank_ratio is stored but never read in this
        # class (K→K retrieval per D2 uses the K rank only) — confirm it is
        # consumed elsewhere or reserved for future V-side ranking.
        self.xkv_kv_rank_ratio = xkv_kv_rank_ratio
        self.max_svd_rows = max_svd_rows or self.MAX_SVD_ROWS
        # Override spec extraction_layers when set. (8, 24) uses middle
        # layers which encode semantic content (Tenney 2019, Huh 2024).
        self.layer_range = layer_range
        # Skip top gate_start singular values in SVD projection.
        # Top SVs encode shared positional/syntactic structure;
        # skipping them isolates semantic content (gate_start=6 optimal).
        self.gate_start = gate_start

        # Cached projection from last extract call (for inspection/reuse)
        self._last_projection: SVDProjection | None = None

    def extract(
        self,
        keys: torch.Tensor,
        spec: ModelCacheSpec,
    ) -> ExtractionResult:
        """Extract a state vector from KV cache key tensors.

        Args:
            keys: [n_layers, n_kv_heads, ctx_len, head_dim] — the K cache.
                Must be pre-RoPE if available. Post-RoPE works but with
                reduced retrieval quality due to position-dependent distortion.
            spec: Model architecture spec (provides extraction_layers).

        Returns:
            ExtractionResult with state vector and metadata.

        Raises:
            ValueError: if self.mode is not a known StateExtractionMode.
        """
        n_layers, n_kv_heads, ctx_len, head_dim = keys.shape

        # Layer selection: layer_range overrides spec extraction_layers
        if self.layer_range is not None:
            start, end = self.layer_range
            # Clamp the range into [0, n_layers] so slicing never fails.
            start = max(0, min(start, n_layers))
            end = max(start, min(end, n_layers))
            layer_indices = list(range(start, end))
        else:
            extraction_layers = spec["extraction_layers"]
            layer_indices = [l for l in extraction_layers if l < n_layers]

        # Fallback: if clamping emptied the selection, use all layers.
        if not layer_indices:
            layer_indices = list(range(n_layers))

        selected_keys = keys[layer_indices]  # [n_selected, n_kv_heads, ctx_len, head_dim]

        match self.mode:
            case StateExtractionMode.MEAN_POOL:
                state_vec = self._mean_pool(selected_keys)
            case StateExtractionMode.SVD_PROJECT:
                state_vec = self._svd_project(selected_keys)
            case StateExtractionMode.XKV_PROJECT:
                state_vec = self._xkv_project(selected_keys)
            case _:
                raise ValueError(f"Unknown extraction mode: {self.mode}")

        # D4: No normalization. True MIPS. Store norm as metadata.
        l2_norm = float(torch.linalg.vector_norm(state_vec).item())

        return ExtractionResult(
            state_vec=state_vec,
            l2_norm=l2_norm,
            mode=self.mode,
            n_layers_used=len(layer_indices),
            n_tokens=ctx_len,
        )

    def _mean_pool(self, keys: torch.Tensor) -> torch.Tensor:
        """Fast baseline: mean over layers, heads, and context positions.

        Input: [n_layers, n_kv_heads, ctx_len, head_dim]
        Output: [head_dim]
        """
        return keys.float().mean(dim=(0, 1, 2))

    def _svd_project(self, keys: torch.Tensor) -> torch.Tensor:
        """Truncated SVD projection on pre-RoPE keys.

        ShadowKV approach: flatten all extraction layers' keys into a 2D matrix
        [N, head_dim], compute truncated SVD, project onto top-rank singular vectors,
        then mean-pool the projected vectors.

        For large contexts (N > max_svd_rows), we subsample rows before SVD.
        SVD only needs O(head_dim²) samples to recover the top singular vectors
        of a low-rank matrix, so subsampling to 8K rows preserves subspace quality
        while reducing SVD from ~2000ms to ~15ms at 4K context.

        Input: [n_layers, n_kv_heads, ctx_len, head_dim]
        Output: [rank]
        """
        n_layers, n_kv_heads, ctx_len, head_dim = keys.shape

        # Total rows in the flattened matrix
        n_rows = n_layers * n_kv_heads * ctx_len

        if n_rows > self.max_svd_rows:
            # Subsample BEFORE flatten+cast to avoid allocating the full
            # float32 matrix (saves ~30ms rearrange + 100MB at 4K context).
            # Fixed seed keeps extraction deterministic across calls.
            gen = torch.Generator()
            gen.manual_seed(42)
            indices = torch.randperm(n_rows, generator=gen)[:self.max_svd_rows]
            flat_keys = keys.reshape(n_rows, head_dim)[indices].float()
            svd_input = flat_keys
        else:
            flat_keys = rearrange(keys.float(), 'l h t d -> (l h t) d')
            svd_input = flat_keys

        # Clamp rank to not exceed matrix dimensions
        max_rank = min(head_dim, svd_input.shape[0])
        effective_rank = min(self.gate_start + self.rank, max_rank)

        # Truncated SVD on (subsampled) matrix
        U, S, Vh = torch.linalg.svd(svd_input, full_matrices=False)

        # W = right singular vectors with gating: skip top gate_start SVs
        # to remove shared positional/syntactic structure
        W = Vh[self.gate_start:effective_rank, :].T

        # Store projection for inspection
        # NOTE(review): explained_var sums SVs 0..effective_rank, which
        # includes the gate_start components that W deliberately skips —
        # diagnostic only, confirm this is the intended definition.
        total_var = (S ** 2).sum()
        explained_var = (S[:effective_rank] ** 2).sum()
        self._last_projection = SVDProjection(
            W=W,
            singular_values=S[:effective_rank],
            explained_variance_ratio=float((explained_var / total_var).item()) if total_var > 0 else 0.0,
            source_shape=tuple(keys.shape),
        )

        # Project subsampled rows and mean-pool → [rank]
        # Using the subsample for projection too avoids the expensive
        # 786K × 128 matmul + mean that dominates at large contexts.
        projected = svd_input @ W
        state_vec = projected.mean(dim=0)

        return state_vec

    def _xkv_project(self, keys: torch.Tensor) -> torch.Tensor:
        """Grouped cross-layer SVD (xKV approach).

        Groups adjacent layers (default 4), computes shared SVD basis
        per group, projects keys onto that basis, then concatenates
        group state vectors.

        This captures cross-layer structure that single-layer SVD misses.
        Achieves 6.8x vs 2.5x for single-layer SVD on Llama-3.1-8B.

        K:V rank ratio 1:1.5 is optimal per xKV paper, but since we
        only index keys (D2: K→K retrieval), we use the K rank only.

        Input: [n_layers, n_kv_heads, ctx_len, head_dim]
        Output: [n_groups * rank_per_group]
        """
        n_layers, n_kv_heads, ctx_len, head_dim = keys.shape

        # Compute rank per group
        # xKV finding: K rank is lower than V rank by factor 1:1.5
        # For 160 total rank budget across groups, allocate per group
        n_groups = max(1, n_layers // self.xkv_group_size)
        rank_per_group = max(1, self.rank // n_groups)
        rank_per_group = min(rank_per_group, head_dim)

        group_vecs: list[torch.Tensor] = []

        for g in range(n_groups):
            start = g * self.xkv_group_size
            end = min(start + self.xkv_group_size, n_layers)
            group_keys = keys[start:end]  # [group_size, n_kv_heads, ctx_len, head_dim]

            # Flatten group
            n_group_rows = group_keys.shape[0] * n_kv_heads * ctx_len

            if n_group_rows > self.max_svd_rows:
                # Per-group seed offset keeps subsamples independent
                # but deterministic.
                gen = torch.Generator()
                gen.manual_seed(42 + g)
                indices = torch.randperm(n_group_rows, generator=gen)[:self.max_svd_rows]
                svd_input = group_keys.reshape(n_group_rows, head_dim)[indices].float()
            else:
                svd_input = rearrange(group_keys.float(), 'l h t d -> (l h t) d')

            effective_rank = min(rank_per_group, svd_input.shape[0], head_dim)

            # Truncated SVD for this group (on subsampled data)
            U, S, Vh = torch.linalg.svd(svd_input, full_matrices=False)
            W_group = Vh[:effective_rank, :].T  # [head_dim, rank_per_group]

            # Project subsampled rows and mean-pool → [rank_per_group]
            projected = svd_input @ W_group
            group_vec = projected.mean(dim=0)
            group_vecs.append(group_vec)

        # Handle remainder layers (if n_layers not divisible by group_size)
        remainder_start = n_groups * self.xkv_group_size
        if remainder_start < n_layers:
            remainder_keys = keys[remainder_start:]
            n_rem_rows = remainder_keys.shape[0] * n_kv_heads * ctx_len

            if n_rem_rows > self.max_svd_rows:
                gen = torch.Generator()
                gen.manual_seed(42 + n_groups)
                indices = torch.randperm(n_rem_rows, generator=gen)[:self.max_svd_rows]
                svd_input = remainder_keys.reshape(n_rem_rows, head_dim)[indices].float()
            else:
                svd_input = rearrange(remainder_keys.float(), 'l h t d -> (l h t) d')

            effective_rank = min(rank_per_group, svd_input.shape[0], head_dim)
            U, S, Vh = torch.linalg.svd(svd_input, full_matrices=False)
            W_rem = Vh[:effective_rank, :].T
            projected = svd_input @ W_rem
            group_vecs.append(projected.mean(dim=0))

        # Concatenate all group vectors → [n_groups * rank_per_group + remainder]
        state_vec = torch.cat(group_vecs, dim=0)

        return state_vec

    # ── Fixed Corpus Basis (FCB) ────────────────────────────────────────────

    @classmethod
    def compute_corpus_basis(
        cls,
        key_tensors: list[torch.Tensor],
        layer_range: tuple[int, int],
        gate_start: int,
        rank: int,
        max_rows: int = 32768,
        seed: int = 42,
    ) -> torch.Tensor:
        """Compute a fixed projection matrix from a corpus of key tensors.

        Returns P: [rank, head_dim] — the global semantic basis.
        Unlike per-document SVD, this basis is document-independent.
        All documents projected with P exist in the same coordinate system,
        enabling stable cross-document and cross-model comparison.

        Args:
            key_tensors: list of [n_layers, n_kv_heads, ctx_len, head_dim]
                key caches (one per corpus document).
            layer_range: (start, end) layer slice taken from every tensor.
            gate_start: number of top singular vectors to skip.
            rank: number of singular vectors kept after gating.
            max_rows: cap on total rows fed to the final SVD.
            seed: RNG seed for deterministic subsampling.
        """
        l_start, l_end = layer_range
        gen = torch.Generator()
        gen.manual_seed(seed)

        all_rows: list[torch.Tensor] = []
        # Budget rows evenly across documents so no single document
        # dominates the corpus SVD.
        per_doc_max = max(1, max_rows // len(key_tensors))

        for keys in key_tensors:
            k = keys[l_start:l_end].float()
            n_rows = k.shape[0] * k.shape[1] * k.shape[2]
            flat = k.reshape(n_rows, k.shape[3])
            if flat.shape[0] > per_doc_max:
                idx = torch.randperm(flat.shape[0], generator=gen)[:per_doc_max]
                flat = flat[idx]
            all_rows.append(flat)

        corpus = torch.cat(all_rows, dim=0)
        if corpus.shape[0] > max_rows:
            idx = torch.randperm(corpus.shape[0], generator=gen)[:max_rows]
            corpus = corpus[idx]

        _, S, Vh = torch.linalg.svd(corpus, full_matrices=False)
        P = Vh[gate_start : gate_start + rank]  # [rank, head_dim]
        return P

    def extract_with_basis(
        self,
        keys: torch.Tensor,
        spec: ModelCacheSpec,
        basis: torch.Tensor,
    ) -> ExtractionResult:
        """Extract state vector using a pre-computed fixed corpus basis.

        All vectors computed with the same basis share a coordinate system,
        which is required for cross-model transfer via adapter.

        Args:
            keys: [n_layers, n_kv_heads, n_cells, head_dim]
            spec: Model spec (used for layer_range fallback)
            basis: [rank, head_dim] from compute_corpus_basis()

        Returns:
            ExtractionResult with L2-normalized state vector
            (unlike extract(), which leaves the vector unnormalized per D4).
        """
        if self.layer_range is not None:
            l_start, l_end = self.layer_range
        else:
            l_start, l_end = 0, keys.shape[0]
        l_start = max(0, min(l_start, keys.shape[0]))
        l_end = max(l_start, min(l_end, keys.shape[0]))

        k = keys[l_start:l_end].float()
        n_rows = k.shape[0] * k.shape[1] * k.shape[2]
        flat = k.reshape(n_rows, k.shape[3])

        proj = flat @ basis.T  # [N_rows, rank]
        vec = proj.mean(dim=0)  # [rank]

        # Epsilon guards against division by a zero-norm vector.
        norm = float(torch.linalg.vector_norm(vec).item())
        vec_normed = vec / (norm + 1e-8)

        return ExtractionResult(
            state_vec=vec_normed.to(torch.float32),
            l2_norm=norm,
            mode=self.mode,
            n_layers_used=l_end - l_start,
            n_tokens=k.shape[2],
        )

    # ── Fourier Fingerprint (Engram Absolute) ────────────────────────

    @staticmethod
    def compute_fourier_fingerprint(
        keys: torch.Tensor,
        freqs: tuple[int, ...] = (0, 1),
    ) -> torch.Tensor:
        """Compute the Fourier Absolute fingerprint from KV cache keys.

        Takes the real DFT over the layer dimension, extracts the
        amplitude at the specified frequencies, normalizes each, and
        concatenates them into a single fingerprint vector.

        This fingerprint is:
        - Cross-model invariant (cos ~0.90 between 3B and 8B)
        - Corpus-independent (no basis, no center, no training)
        - Scale-stable (98% recall@1 at N=1000, decay N^-0.207)

        Args:
            keys: [n_layers, n_kv_heads, n_cells, head_dim] — full KV keys.
                All layers are used (not sliced by layer_range).
            freqs: Frequency indices to extract. Default (0, 1) = DC + 1st harmonic.
                f=0 captures overall key magnitude profile.
                f=1 captures dominant oscillation across depth.

        Returns:
            Fingerprint vector [dim * len(freqs)], L2-normalized.
        """
        # Mean over cells (tokens) per layer: [n_layers, n_kv_heads * head_dim]
        n_layers = keys.shape[0]
        layer_means = keys.float().mean(dim=2).reshape(n_layers, -1)

        # DFT over layer dimension
        F_complex = torch.fft.rfft(layer_means, dim=0)  # [n_freq, dim]
        F_amp = F_complex.abs()  # amplitude spectrum

        # Extract and normalize each frequency component
        parts = []
        for f in freqs:
            if f >= F_amp.shape[0]:
                # Frequency out of range — use zeros
                # NOTE(review): torch.zeros allocates on the default
                # device; may mismatch CUDA inputs in torch.cat — confirm.
                parts.append(torch.zeros(F_amp.shape[1]))
            else:
                v = F_amp[f]
                parts.append(v / (v.norm() + 1e-8))

        fingerprint = torch.cat(parts, dim=0)
        return fingerprint / (fingerprint.norm() + 1e-8)

    @property
    def last_projection(self) -> SVDProjection | None:
        """Access the SVD projection from the last svd_project call.

        Useful for diagnostics: check explained_variance_ratio to validate
        that the rank is sufficient for this particular cache.
        """
        return self._last_projection

    def output_dim(self, spec: ModelCacheSpec) -> int:
        """Compute the output dimension of the state vector for a given spec.

        This is needed to initialize the FAISS index with the correct dimension.

        NOTE(review): for XKV_PROJECT this derives n_layers from
        spec["extraction_layers"], while extract() may instead use
        self.layer_range — confirm dimensions agree when layer_range is set.
        """
        match self.mode:
            case StateExtractionMode.MEAN_POOL:
                return spec["head_dim"]
            case StateExtractionMode.SVD_PROJECT:
                max_rank = min(self.gate_start + self.rank, spec["head_dim"])
                return max_rank - self.gate_start
            case StateExtractionMode.XKV_PROJECT:
                extraction_layers = spec["extraction_layers"]
                n_layers = len(extraction_layers)
                n_groups = max(1, n_layers // self.xkv_group_size)
                rank_per_group = max(1, self.rank // n_groups)
                rank_per_group = min(rank_per_group, spec["head_dim"])
                # Groups + possible remainder group
                has_remainder = (n_layers % self.xkv_group_size) != 0
                total_groups = n_groups + (1 if has_remainder else 0)
                return total_groups * rank_per_group
            case _:
                raise ValueError(f"Unknown mode: {self.mode}")
kvcos/core/types.py ADDED
@@ -0,0 +1,201 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ ENGRAM Protocol — Core Type Definitions
3
+
4
+
5
+ All enums, TypedDicts, constants, and type aliases live here.
6
+ Every downstream module imports from this file. No circular dependencies.
7
+ """
8
+
9
+ from __future__ import annotations
10
+
11
+ from dataclasses import dataclass
12
+ from enum import StrEnum
13
+ from typing import TypedDict
14
+
15
# ── Constants ─────────────────────────────────────────────────────────────────
# Protocol-wide constants; every downstream kvcos module imports from here.

ENGRAM_VERSION = "0.1.0"  # protocol/file-format version written into .eng headers
ENG_FILE_EXTENSION = ".eng"  # ENGRAM file format extension
BLOCK_SIZE_TOKENS = 256  # 256-token blocks per arXiv:2603.04428
DEFAULT_SVD_RANK = 160  # ShadowKV default for 8B models
DEFAULT_LATENT_DIM = 512  # MLA: full KV info recoverable from 512-dim
MAX_CONTEXT_TOKENS = 131072  # 128K max supported context
23
+
24
+
25
+ # ── Enums ─────────────────────────────────────────────────────────────────────
26
+
27
+
28
class CompressionMethod(StrEnum):
    """Supported KV cache compression methods.

    Values are the strings stored in .eng metadata under "compression".

    Phase 1: Q8_0, FP16
    Phase 2: POLARQUANT (TurboQuant without QJL — QJL removed per D5)
    """

    FP16 = "fp16"
    Q8_0 = "q8_0"  # llama.cpp GGML_TYPE_Q8_0: ~2x compression, <5% speed hit
    Q4_0 = "q4_0"  # NOT recommended at 64K+ (92% dequant slowdown)
    POLARQUANT = "polarquant_3bit"  # PolarQuant only, no QJL
    INT8 = "int8"  # Phase 2: true int8 + per-row scale, 2x on-disk compression
    LAYER_DELTA = "layer_delta"  # Phase 2: fp16 baseline + int8 inter-layer deltas
41
+
42
+
43
class StorageBackend(StrEnum):
    """Supported storage backends for persisted engram files."""

    LOCAL = "local"  # Phase 1: local filesystem
    REDIS = "redis"  # Phase 2
    S3 = "s3"  # Phase 2
49
+
50
+
51
class StateExtractionMode(StrEnum):
    """EGR (Engrammatic Geometry Retrieval) state vector extraction modes.

    mean_pool:   Fast baseline. Mean over heads + context of key matrices.
    svd_project: Truncated SVD on pre-RoPE keys, layers 8-31, rank-160.
                 Validated by ShadowKV (ICML 2025) on Llama-3.1-8B.
    xkv_project: Grouped cross-layer SVD, 4-layer groups, K:V rank 1:1.5.
                 From xKV (arXiv:2503.18893). 6.8x compression.

    REMOVED: sals_project — last-layer-only extraction invalidated by
    Layer-Condensed KV Cache (ACL 2024). See D3.
    """

    MEAN_POOL = "mean_pool"
    SVD_PROJECT = "svd_project"
    XKV_PROJECT = "xkv_project"
67
+
68
+
69
class IndexBackend(StrEnum):
    """EGR manifold index backends (vector index over state vectors)."""

    FAISS_FLAT_IP = "faiss_flat_ip"  # Phase 1: exact MIPS
    FAISS_IVF_IP = "faiss_ivf_ip"  # Phase 2: approximate MIPS for >100K vectors
    QDRANT_DOT = "qdrant_dot"  # Phase 2: production persistent index
75
+
76
+
77
class AttentionType(StrEnum):
    """KV cache attention mechanism per layer group.

    Standard models use FULL for all layers.
    ISWA models (Gemma 4) interleave FULL (global) and SLIDING (SWA) sections.
    """

    FULL = "full"  # Full-context attention (standard)
    SLIDING = "sliding"  # Sliding window attention (SWA)
86
+
87
+
88
+ # ── Data Classes ─────────────────────────────────────────────────────────────
89
+
90
+
91
@dataclass(frozen=True)
class CacheSection:
    """One section of a multi-section KV cache.

    Standard models have a single implicit section covering all layers.
    ISWA models serialize multiple sections sequentially in the state blob,
    each with its own n_layers, n_kv_heads, and head_dim.

    Reverse-engineered from Gemma 4 26B-A4B (llama.cpp b5200+):
        Section 0 (Global): 5 layers, 2 KV heads, head_dim=512
        Section 1 (SWA):    25 layers, 8 KV heads, head_dim=256
    """

    attention_type: AttentionType
    n_layers: int
    n_kv_heads: int
    head_dim: int
    window_size: int | None = None  # SWA window size in tokens (None for full)

    @property
    def n_embd_kv(self) -> int:
        """Total KV embedding dimension for this section (heads × head_dim)."""
        return self.n_kv_heads * self.head_dim
114
+
115
+
116
+ # ── TypedDicts ────────────────────────────────────────────────────────────────
117
+
118
+
119
class _ModelCacheSpecRequired(TypedDict):
    """Required fields for ModelCacheSpec (internal base).

    Split out so ModelCacheSpec can add optional keys with total=False
    while keeping these mandatory.
    """

    model_id: str  # e.g. "meta-llama/Llama-3.1-8B-Instruct"
    model_family: str  # e.g. "llama"
    n_layers: int  # total transformer layers
    n_heads: int  # query heads (may differ from KV heads in GQA)
    n_kv_heads: int  # key/value heads (GQA-aware)
    head_dim: int  # dimension per head
    rope_enabled: bool  # whether model uses RoPE
    extraction_layers: tuple[int, ...]  # layers for EGR state extraction (D3)
130
+
131
+
132
class ModelCacheSpec(_ModelCacheSpecRequired, total=False):
    """Architecture-agnostic specification of a model's KV cache layout.

    Used to validate .eng files and ensure correct tensor shapes.

    For standard models (Llama, Phi, Qwen, Mistral):
        n_kv_heads and head_dim describe the single uniform KV cache.
        cache_sections is absent.

    For ISWA models (Gemma 4):
        cache_sections lists per-section dimensions. Each section has its
        own n_layers, n_kv_heads, and head_dim. The top-level n_kv_heads
        and head_dim reflect the dominant (largest) section.
        The state blob contains multiple sequential KV streams.
    """

    cache_sections: tuple[CacheSection, ...]  # optional; ISWA models only
149
+
150
+
151
class EngramMetadata(TypedDict, total=False):
    """Metadata stored in .eng file header (safetensors __metadata__).

    All values are strings per safetensors spec (D7).
    Optional fields use total=False.

    NOTE(review): total=False applies to ALL keys here, so the "Required"
    section below is documentation only and is not enforced by the type
    checker — confirm whether a required/optional split (as done for
    ModelCacheSpec) is intended.
    """

    # Required
    engram_version: str
    cache_id: str
    compression: str  # CompressionMethod value
    model_id: str
    model_family: str
    n_layers: str  # stringified int
    n_heads: str
    n_kv_heads: str
    head_dim: str
    context_len: str
    agent_id: str
    task_description: str
    created_at: str  # ISO 8601

    # Optional
    expires_at: str
    parent_cache_id: str
    delta_from: str
    token_hash: str  # SHA-256 of input tokens
    state_vec_norm: str  # L2 norm of state vector (D4: stored as metadata)
    extraction_mode: str  # StateExtractionMode value
    block_index: str  # block position within a multi-block cache
    total_blocks: str
182
+
183
+
184
class CacheSearchResult(TypedDict):
    """Result from EGR manifold search over cached engram states."""

    cache_id: str
    similarity: float  # raw inner product score (not normalized — D4)
    task_description: str
    model_id: str
    created_at: str
    context_len: int
193
+
194
+
195
class CacheStats(TypedDict):
    """Aggregate statistics for the engram store."""

    total_entries: int
    total_size_bytes: int
    avg_compression_ratio: float
    model_breakdown: dict[str, int]  # model_family → count
kvcos/engram/__init__.py ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ # EIGENGRAM format package
2
+ from .format import EigramEncoder, EigramDecoder, EIGENGRAM_MAGIC, EIGENGRAM_VERSION
3
+ from .writer import write_eigengram
4
+ from .reader import read_eigengram, load_eigengram_index
kvcos/engram/__main__.py ADDED
@@ -0,0 +1,265 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ EIGENGRAM command-line interface.
3
+
4
+ Usage:
5
+ python -m kvcos.engram encode --model <gguf> --text "..." --out doc.eng
6
+ python -m kvcos.engram search --model <gguf> --query "..." index/*.eng
7
+ python -m kvcos.engram inspect doc.eng
8
+ python -m kvcos.engram list index/*.eng
9
+
10
+ Commands:
11
+ encode Run a document through a GGUF model and write a .eng file.
12
+ search Query .eng files using a text query and a model.
13
+ inspect Print all fields from .eng files (no model needed).
14
+ list Print a summary table of .eng files (no model needed).
15
+ """
16
+
17
+ from __future__ import annotations
18
+
19
+ import argparse
20
+ import gc
21
+ import glob
22
+ import os
23
+ import sys
24
+
25
+ import torch
26
+
27
+
28
+ def _resolve_paths(patterns: list[str]) -> list[str]:
29
+ """Expand glob patterns, return sorted list of .eng paths."""
30
+ paths = []
31
+ for p in patterns:
32
+ expanded = glob.glob(p)
33
+ if expanded:
34
+ paths.extend(expanded)
35
+ elif os.path.exists(p):
36
+ paths.append(p)
37
+ else:
38
+ print(f"Warning: no files matched '{p}'", file=sys.stderr)
39
+ return sorted(set(paths))
40
+
41
+
42
def cmd_encode(args: argparse.Namespace) -> None:
    """Encode a document as a .eng EIGENGRAM file.

    Resolves the document text (--text wins over --file), derives default
    output path / description / cache_id when absent, runs the text through
    write_eigengram, and prints a short result summary.

    Exits with status 1 when neither --text nor --file is given, or when
    --file does not exist.
    """
    from kvcos.engram.writer import write_eigengram

    if args.text:
        text = args.text
    elif args.file:
        if not os.path.exists(args.file):
            print(f"Error: input file not found: {args.file}", file=sys.stderr)
            sys.exit(1)
        # Fix: read via a context manager with explicit UTF-8 — the original
        # `open(args.file).read()` leaked the file handle and depended on the
        # platform default encoding.
        with open(args.file, encoding="utf-8") as fh:
            text = fh.read().strip()
    else:
        print("Error: provide --text or --file", file=sys.stderr)
        sys.exit(1)

    # Defaults: output next to the input file; description/id from the text.
    output_path = args.out or (
        os.path.splitext(args.file)[0] + ".eng" if args.file else "output.eng"
    )
    task_desc = args.description or text[:80]
    cache_id_val = args.id or text[:64]

    print("Encoding document...")
    print(f" Model: {args.model}")
    print(f" Text: {text[:60]}{'...' if len(text) > 60 else ''}")
    print(f" Output: {output_path}")
    print()

    result = write_eigengram(
        model_path=args.model,
        text=text,
        output_path=output_path,
        cache_id=cache_id_val,
        task_description=task_desc,
        basis_path=args.basis,
    )

    print("Done.")
    print(f" File size : {result['file_size_bytes']} bytes")
    print(f" Model ID : {result['model_id']}")
    print(f" SCS : {result['scs']:.4f}")
    print(f" Basis rank: {result['basis_rank']}")
83
+
84
+
85
def cmd_search(args: argparse.Namespace) -> None:
    """Search .eng files using a text query.

    Pipeline: load the FCDB basis, encode the query through the GGUF
    model to obtain its KV cache, reduce it to a fingerprint vector
    (fourier / perdoc / fcdb), then rank the .eng index entries by
    similarity in a ManifoldIndex and print the top-k table.
    """
    from llama_cpp import Llama

    from kvcos.core.blob_parser import parse_state_blob
    from kvcos.engram.reader import load_eigengram_index
    from kvcos.core.manifold_index import ManifoldIndex

    paths = _resolve_paths(args.eng_files)
    if not paths:
        print("No .eng files found.", file=sys.stderr)
        sys.exit(1)

    fingerprint = args.fingerprint
    # NOTE(review): weights_only=False unpickles arbitrary objects — only
    # load basis files from trusted sources.
    saved = torch.load(args.basis, weights_only=False)
    P = saved["basis"]  # FCDB projection matrix, shape (RANK, head_dim)
    center = saved["joint_center"]
    # Hard-coded fingerprint hyperparameters: layer slice, SVD gate offset.
    # Presumably these must match the values used at encode time — confirm
    # against kvcos.engram.writer.
    LR = (8, 24)
    GATE = 6
    RANK = P.shape[0]

    print(f"Query: {args.query}")
    print(f"Index: {len(paths)} files")
    print(f"Fingerprint: {fingerprint}")
    print()

    # Encode the query through the model just to populate its KV cache.
    llm = Llama(model_path=args.model, n_ctx=2048, n_gpu_layers=-1, verbose=False)
    meta = llm.metadata
    # Derive KV geometry from GGUF metadata; defaults look Llama-7B-like.
    n_kv = int(meta.get("llama.attention.head_count_kv", "8"))
    hd = int(meta.get("llama.embedding_length", "4096")) // int(
        meta.get("llama.attention.head_count", "32")
    )
    llm.reset()
    llm(args.query.strip(), max_tokens=1, temperature=0.0)
    p_q = parse_state_blob(
        bytes(llm.save_state().llama_state), n_kv_heads=n_kv, head_dim=hd
    )
    # Free the model before the (potentially large) index search.
    del llm
    gc.collect()

    if fingerprint == "fourier":
        from kvcos.core.fingerprint import compute_fourier_fingerprint

        # Use ALL layers for Fourier (not sliced)
        layer_means = p_q.keys.float().mean(dim=2).reshape(p_q.keys.shape[0], -1)
        query_vec = compute_fourier_fingerprint(layer_means, freqs=[0, 1])
        dim = query_vec.shape[0]
    elif fingerprint == "perdoc":
        # Per-query SVD: project key rows onto right-singular vectors
        # GATE..GATE+RANK, then mean-pool and L2-normalize.
        k_q = p_q.keys[LR[0] : LR[1]].float().reshape(-1, hd)
        _, _, Vh = torch.linalg.svd(k_q, full_matrices=False)
        proj_q = (k_q @ Vh[GATE : GATE + RANK].T).mean(0)
        query_vec = proj_q / (proj_q.norm() + 1e-8)
        dim = RANK
    else:  # fcdb
        # Offset from the corpus joint center, projected through the
        # shared FCDB basis; normalized before and after projection.
        k_q = p_q.keys[LR[0] : LR[1]].float().reshape(-1, hd)
        mean_q = k_q.mean(0)
        delta_q = mean_q - center
        delta_q = delta_q / (delta_q.norm() + 1e-8)
        query_vec = delta_q @ P.T
        query_vec = query_vec / (query_vec.norm() + 1e-8)
        dim = RANK

    # Build the in-memory index from the matching fingerprint stream.
    vecs, entries = load_eigengram_index(paths, fingerprint=fingerprint)
    idx = ManifoldIndex(dim=dim)
    for v, e in zip(vecs, entries):
        idx.add(v, e)

    top_k = min(args.top_k, len(paths))
    results = idx.search(query_vec, top_k=top_k)

    print(f"Results (top {top_k}):")
    print(f" {'#':<3} {'sim':>7} {'cache_id':<20} description")
    print(f" {'---'} {'-------'} {'--------------------'} {'----------------------------------------'}")
    for i, r in enumerate(results):
        desc = r.get("task_description", "")[:40]
        cid = r.get("cache_id", "")[:20]
        print(f" {i + 1:<3} {r['similarity']:>+.4f} {cid:<20} {desc}")
162
+
163
+
164
def cmd_inspect(args: argparse.Namespace) -> None:
    """Print all fields of .eng files in readable format.

    Files that fail to parse are reported inline and skipped; the
    command keeps going over the remaining paths.
    """
    from kvcos.engram.reader import read_eigengram

    paths = _resolve_paths(args.eng_files)
    if not paths:
        print("No .eng files found.", file=sys.stderr)
        sys.exit(1)

    for path in paths:
        try:
            rec = read_eigengram(path)
        except Exception as e:
            # Best-effort: report the broken file and continue.
            print(f" {path}: ERROR - {e}")
            continue

        size = os.path.getsize(path)
        print(f"{'=' * 55}")
        print(f" File: {path} ({size} bytes)")
        print(f" Format: EGR1 v{rec['version']}")
        print(f" Created: {rec['created_at']} UTC")
        print(f" Model: {rec['model_id']}")
        print(f" cache_id: {rec['cache_id']}")
        print(f" Description: {rec['task_description']}")
        print()
        print(f" Basis rank: {rec['basis_rank']}")
        print(f" N corpus: {rec['n_corpus']}")
        print(f" Layer range: {rec['layer_range']}")
        print(f" Context len: {rec['context_len']} KV rows")
        print(f" L2 norm: {rec['l2_norm']:.4f}")
        print(f" SCS: {rec['scs']:.4f}")
        print(f" Margin proof: {rec['margin_proof']:.4f}")
        print(f" Corpus hash: {rec['corpus_hash']}")
        print(f" vec_perdoc: [{rec['vec_perdoc'].shape[0]}] norm={rec['vec_perdoc'].norm():.4f}")
        print(f" vec_fcdb: [{rec['vec_fcdb'].shape[0]}] norm={rec['vec_fcdb'].norm():.4f}")
        print()
200
+
201
+
202
def cmd_list(args: argparse.Namespace) -> None:
    """Print a one-line summary table of .eng files.

    Unreadable files are listed with an ERROR note instead of aborting
    the whole table.
    """
    from kvcos.engram.reader import read_eigengram

    paths = _resolve_paths(args.eng_files)
    if not paths:
        print("No .eng files found.", file=sys.stderr)
        sys.exit(1)

    hdr = f"{'filename':<30} {'model':<14} {'scs':>6} {'bytes':>5} description"
    print(hdr)
    print("-" * len(hdr))

    for path in paths:
        # Truncate to 29 chars so the 30-wide column keeps a separator space.
        fname = os.path.basename(path)[:29]
        try:
            rec = read_eigengram(path)
            size = os.path.getsize(path)
            print(
                f"{fname:<30} {rec['model_id'][:14]:<14} "
                f"{rec['scs']:>6.3f} {size:>5} "
                f"{rec['task_description'][:40]}"
            )
        except Exception as e:
            print(f"{fname:<30} ERROR: {e}")
227
+
228
+
229
def main() -> None:
    """EIGENGRAM CLI entry point.

    Builds the argparse tree (encode / search / inspect / list) and
    dispatches to the matching cmd_* handler.
    """
    parser = argparse.ArgumentParser(
        prog="python -m kvcos.engram",
        description="EIGENGRAM CLI - encode and search KV-cache semantic certificates.",
    )
    subcommands = parser.add_subparsers(dest="command", required=True)

    encode_p = subcommands.add_parser("encode", help="Encode a document as a .eng file.")
    encode_p.add_argument("--model", required=True, help="Path to GGUF model file.")
    encode_p.add_argument("--text", help="Document text.")
    encode_p.add_argument("--file", help="Path to a text file to encode.")
    encode_p.add_argument("--out", help="Output .eng file path.")
    encode_p.add_argument("--id", help="Unique cache_id.")
    encode_p.add_argument("--description", help="Human-readable description.")
    encode_p.add_argument("--basis", default="results/corpus_basis_fcdb_v2.pt", help="FCDB v2 basis path.")

    search_p = subcommands.add_parser("search", help="Search .eng files with a query.")
    search_p.add_argument("--model", required=True, help="GGUF model for query encoding.")
    search_p.add_argument("--query", required=True, help="Query text.")
    search_p.add_argument("--fingerprint", default="fourier", choices=["perdoc", "fcdb", "fourier"])
    search_p.add_argument("--top-k", type=int, default=5, dest="top_k")
    search_p.add_argument("--basis", default="results/corpus_basis_fcdb_v2.pt")
    search_p.add_argument("eng_files", nargs="+", help=".eng file paths or globs.")

    inspect_p = subcommands.add_parser("inspect", help="Print all fields of .eng files.")
    inspect_p.add_argument("eng_files", nargs="+")

    list_p = subcommands.add_parser("list", help="Summary table of .eng files.")
    list_p.add_argument("eng_files", nargs="+")

    args = parser.parse_args()
    # required=True above guarantees args.command is one of the four names.
    if args.command == "encode":
        cmd_encode(args)
    elif args.command == "search":
        cmd_search(args)
    elif args.command == "inspect":
        cmd_inspect(args)
    else:
        cmd_list(args)
262
+
263
+
264
+ if __name__ == "__main__":
265
+ main()
kvcos/engram/chunker.py ADDED
@@ -0,0 +1,327 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ kvcos/engram/chunker.py — Markdown-aware semantic chunker.
3
+
4
+ Splits markdown files into chunks suitable for .eng indexing.
5
+ Each chunk gets its own fingerprint and becomes independently
6
+ retrievable via HNSW.
7
+
8
+ Strategy:
9
+ 1. Split on H1/H2 headers first (natural semantic boundaries)
10
+ 2. If a section exceeds max_chars, split on H3/H4
11
+ 3. If still too large, split on paragraph boundaries
12
+ 4. Never break mid-paragraph (preserve semantic coherence)
13
+
14
+ Each chunk carries context: the file's title + parent headers
15
+ are prepended so the fingerprint captures the full meaning.
16
+ """
17
+
18
+ from __future__ import annotations
19
+
20
+ import re
21
+ from dataclasses import dataclass
22
+ from typing import Sequence
23
+
24
+
25
@dataclass(frozen=True)
class Chunk:
    """One semantic chunk from a markdown file.

    Immutable value object produced by chunk_markdown(); char offsets
    index into the original file content.
    """
    text: str  # Chunk content (with context header prepended)
    raw_text: str  # Original content without context header
    char_start: int  # Start offset in original file
    char_end: int  # End offset in original file
    index: int  # 0-based chunk index
    headers: tuple[str, ...]  # Header hierarchy (e.g., ("# Title", "## Section"))

    @property
    def char_count(self) -> int:
        """Length of the context-prefixed text, not of the raw file span."""
        return len(self.text)
38
+
39
+
40
# Regex for markdown headers (ATX style: # through ######).
# MULTILINE makes ^ anchor at every line start; group(1) is the hash run,
# group(2) the header text.
_HEADER_RE = re.compile(r"^(#{1,6})\s+(.+)$", re.MULTILINE)
42
+
43
+
44
+ def _header_level(line: str) -> int:
45
+ """Return header level (1-6) or 0 if not a header."""
46
+ m = re.match(r"^(#{1,6})\s+", line)
47
+ return len(m.group(1)) if m else 0
48
+
49
+
50
+ def _split_by_headers(
51
+ content: str,
52
+ max_level: int = 2,
53
+ ) -> list[tuple[int, int, list[str]]]:
54
+ """
55
+ Split content into sections by header level.
56
+
57
+ Returns list of (start_offset, end_offset, header_stack) tuples.
58
+ max_level: split on headers of this level or lower (1=H1, 2=H2, etc.)
59
+ """
60
+ sections: list[tuple[int, int, list[str]]] = []
61
+ header_stack: list[str] = []
62
+ current_start = 0
63
+
64
+ for m in _HEADER_RE.finditer(content):
65
+ level = len(m.group(1))
66
+ header_text = m.group(0).strip()
67
+
68
+ if level <= max_level and m.start() > current_start:
69
+ # Close previous section
70
+ section_text = content[current_start:m.start()].strip()
71
+ if section_text:
72
+ sections.append((
73
+ current_start,
74
+ m.start(),
75
+ list(header_stack),
76
+ ))
77
+ current_start = m.start()
78
+
79
+ # Update header stack
80
+ if level <= max_level:
81
+ # Trim stack to parent level and push current
82
+ header_stack = [
83
+ h for h in header_stack
84
+ if _header_level(h) < level
85
+ ]
86
+ header_stack.append(header_text)
87
+
88
+ # Final section
89
+ if current_start < len(content):
90
+ final_text = content[current_start:].strip()
91
+ if final_text:
92
+ sections.append((
93
+ current_start,
94
+ len(content),
95
+ list(header_stack),
96
+ ))
97
+
98
+ return sections
99
+
100
+
101
def _split_paragraphs(
    text: str,
    max_chars: int,
    base_offset: int = 0,
) -> list[tuple[int, int]]:
    """
    Split text into chunks at paragraph boundaries.

    Returns list of (start_offset, end_offset) tuples relative
    to the original file (offset by base_offset).

    NOTE(review): offsets are reconstructed from cumulative paragraph
    lengths assuming every separator is exactly two characters long,
    but the split pattern matches runs of two OR MORE newlines. Runs of
    three-plus newlines therefore make the computed offsets drift from
    the true file positions, and the final end can overshoot len(text)
    by up to 2. Confirm whether downstream consumers need exact offsets
    before tightening this.
    """
    paragraphs = re.split(r"\n\n+", text)
    chunks: list[tuple[int, int]] = []
    current_start = 0
    current_len = 0

    for para in paragraphs:
        para_len = len(para) + 2  # +2 for the \n\n separator

        if current_len + para_len > max_chars and current_len > 0:
            # Close current chunk
            chunks.append((
                base_offset + current_start,
                base_offset + current_start + current_len,
            ))
            current_start = current_start + current_len
            current_len = 0

        current_len += para_len

    # Final chunk
    if current_len > 0:
        chunks.append((
            base_offset + current_start,
            base_offset + current_start + current_len,
        ))

    return chunks
139
+
140
+
141
def chunk_markdown(
    content: str,
    max_chars: int = 2000,
    min_chars: int = 100,
    context_prefix: str = "",
) -> list[Chunk]:
    """
    Split a markdown file into semantic chunks.

    Args:
        content: Full markdown file content.
        max_chars: Target maximum chars per chunk (soft limit).
        min_chars: Minimum chars — smaller sections merge with next.
        context_prefix: Prepended to each chunk for context
        (e.g., "Source: geodesic3.md | Project: engram").

    Returns:
        List of Chunk objects, ordered by position in file.

    NOTE(review): min_chars is accepted but never referenced in the body —
    small-section merging is driven purely by max_chars in Phase 4. Either
    wire min_chars in or drop it from the docs.
    """
    if not content.strip():
        return []

    # If the whole file fits in one chunk, return it directly
    if len(content) <= max_chars:
        full_text = f"{context_prefix}\n\n{content}" if context_prefix else content
        return [Chunk(
            text=full_text,
            raw_text=content,
            char_start=0,
            char_end=len(content),
            index=0,
            headers=(),
        )]

    # Phase 1: Split on H1/H2 boundaries
    sections = _split_by_headers(content, max_level=2)

    # If no headers found, treat as single block
    if not sections:
        sections = [(0, len(content), [])]

    # Phase 2: Sub-split large sections on H3/H4
    refined: list[tuple[int, int, list[str]]] = []
    for start, end, headers in sections:
        section_text = content[start:end]
        if len(section_text) > max_chars:
            # Offsets returned by the sub-split are relative to the
            # section, so shift them back by the section start.
            subsections = _split_by_headers(section_text, max_level=4)
            if len(subsections) > 1:
                for sub_start, sub_end, sub_headers in subsections:
                    refined.append((
                        start + sub_start,
                        start + sub_end,
                        headers + sub_headers,
                    ))
            else:
                refined.append((start, end, headers))
        else:
            refined.append((start, end, headers))

    # Phase 3: Paragraph-split anything still over max_chars
    final_ranges: list[tuple[int, int, list[str]]] = []
    for start, end, headers in refined:
        section_text = content[start:end]
        if len(section_text) > max_chars:
            para_ranges = _split_paragraphs(section_text, max_chars, start)
            for p_start, p_end in para_ranges:
                final_ranges.append((p_start, p_end, headers))
        else:
            final_ranges.append((start, end, headers))

    # Phase 4: Greedily pack sections into chunks up to max_chars.
    # Keep merging consecutive sections while their combined size
    # stays under max_chars. This prevents over-fragmentation of
    # files with many small header sections.
    merged: list[tuple[int, int, list[str]]] = []
    for start, end, headers in final_ranges:
        chunk_text = content[start:end].strip()
        if not chunk_text:
            continue

        if merged:
            prev_start, prev_end, prev_headers = merged[-1]
            prev_len = prev_end - prev_start
            curr_len = end - start

            # Merge if combined chunk stays under max_chars.
            # Note: the merged chunk keeps the FIRST section's headers;
            # the absorbed section's headers are dropped.
            if (prev_len + curr_len) <= max_chars:
                merged[-1] = (prev_start, end, prev_headers)
                continue

        merged.append((start, end, headers))

    # Phase 5: Build Chunk objects with context
    chunks: list[Chunk] = []
    for idx, (start, end, headers) in enumerate(merged):
        raw = content[start:end].strip()
        if not raw:
            continue

        # Build context header: optional file-level prefix plus the
        # breadcrumb of enclosing markdown headers.
        parts = []
        if context_prefix:
            parts.append(context_prefix)
        if headers:
            parts.append(" > ".join(headers))

        prefix = "\n".join(parts)
        text = f"{prefix}\n\n{raw}" if prefix else raw

        chunks.append(Chunk(
            text=text,
            raw_text=raw,
            char_start=start,
            char_end=end,
            index=idx,
            headers=tuple(headers),
        ))

    # Re-index after merging — the `continue` above can leave gaps in
    # `index`, so rebuild with contiguous 0..n-1 indices.
    return [
        Chunk(
            text=c.text,
            raw_text=c.raw_text,
            char_start=c.char_start,
            char_end=c.char_end,
            index=i,
            headers=c.headers,
        )
        for i, c in enumerate(chunks)
    ]
271
+
272
+
273
def slug_from_path(path: str) -> str:
    """
    Generate a kebab-case slug from a file path.

    Examples:
        "geodesic3.md" → "geodesic3"
        "EIGENGRAM_SPEC.md" → "eigengram-spec"
        "coding-style.md" → "coding-style"
    """
    # Keep only the filename, then drop the (last) extension.
    stem = path.rsplit("/", 1)[-1].rsplit(".", 1)[0]
    # Underscores/whitespace → hyphens, lowercase, strip anything that
    # is not [a-z0-9-], then collapse hyphen runs and trim the ends.
    slug = re.sub(r"[_\s]+", "-", stem).lower()
    slug = re.sub(r"[^a-z0-9-]", "", slug)
    return re.sub(r"-+", "-", slug).strip("-")
291
+
292
+
293
+ def eng_filename(
294
+ project: str,
295
+ slug: str,
296
+ date: str,
297
+ chunk_index: int | None = None,
298
+ chunk_total: int | None = None,
299
+ time_str: str = "",
300
+ ) -> str:
301
+ """
302
+ Generate .eng filename following the naming convention.
303
+
304
+ Format: <slug>[_<chunk>]_<date>[_<time>].eng
305
+
306
+ Args:
307
+ project: Project namespace (used for directory, not filename)
308
+ slug: Kebab-case file identifier
309
+ date: ISO date string (YYYY-MM-DD)
310
+ chunk_index: 0-based chunk index (None if single chunk)
311
+ chunk_total: Total chunks (None if single chunk)
312
+ time_str: Optional HHmm time string
313
+
314
+ Returns:
315
+ Filename (not full path) like "geodesic3_001_2026-04-02.eng"
316
+ """
317
+ parts = [slug]
318
+
319
+ if chunk_index is not None and chunk_total is not None and chunk_total > 1:
320
+ parts.append(f"{chunk_index + 1:03d}")
321
+
322
+ parts.append(date)
323
+
324
+ if time_str:
325
+ parts.append(time_str)
326
+
327
+ return "_".join(parts) + ".eng"
kvcos/engram/embedder.py ADDED
@@ -0,0 +1,221 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ kvcos/engram/embedder.py — Unified text-to-fingerprint embedding.
3
+
4
+ Three strategies, tried in priority order:
5
+ 1. llama_cpp: Native ENGRAM KV-cache Fourier pipeline (2048-dim)
6
+ 2. sbert: Sentence-transformers all-MiniLM-L6-v2 (384-dim)
7
+ 3. hash: Deterministic SHA256-seeded pseudo-fingerprint (2048-dim)
8
+
9
+ The chosen strategy is cached after first call. The fingerprint
10
+ source tag travels with every .eng file so retrieval knows what
11
+ comparison is valid.
12
+
13
+ Usage:
14
+ from kvcos.engram.embedder import get_fingerprint
15
+ fp, source = get_fingerprint("some text")
16
+ # fp: torch.Tensor, source: "llama_cpp"|"sbert"|"hash-fallback"
17
+ """
18
+
19
+ from __future__ import annotations
20
+
21
+ import hashlib
22
+ import os
23
+ from pathlib import Path
24
+ from typing import Protocol
25
+
26
+ import numpy as np
27
+ import torch
28
+
29
+
30
class Embedder(Protocol):
    """Protocol for text → fingerprint embedding.

    Members:
        embed(text): return the fingerprint vector for the text.
        source: strategy tag ("llama_cpp" | "sbert" | "hash-fallback").
        dim: dimensionality of vectors returned by embed().
    """
    def embed(self, text: str) -> torch.Tensor: ...
    @property
    def source(self) -> str: ...
    @property
    def dim(self) -> int: ...
37
+
38
+
39
+ # ── Strategy 1: Native ENGRAM (llama_cpp) ────────────────────────────
40
+
41
class LlamaCppEmbedder:
    """KV-cache Fourier fingerprint via local GGUF model.

    Uses the full ENGRAM pipeline:
        text → LlamaCppBridge (generate → KV cache) → Fourier DFT → fingerprint

    Supports both standard and ISWA models:
        Standard (Llama): 2048-dim (8 × 128 × 2)
        ISWA (Gemma 4): 6144-dim (1024×2 + 2048×2)
    """

    def __init__(self, model_path: str) -> None:
        """Load the GGUF model CPU-only and precompute the fingerprint dim.

        Raises whatever LlamaCppBridge raises when the model cannot be
        loaded — _create_embedder() catches that and falls through to the
        next strategy.
        """
        from integrations.llama_cpp_bridge import LlamaCppBridge
        from kvcos.core.cache_spec import is_iswa_spec
        from kvcos.core.fingerprint import compute_fourier_fingerprint_v2, compute_iswa_fingerprint

        self._bridge = LlamaCppBridge(
            model_path,
            n_ctx=2048,
            n_gpu_layers=0,
            verbose=False,
        )
        self._spec = self._bridge.load_model()
        self._is_iswa = is_iswa_spec(self._spec)
        # Bind the fingerprint functions once so embed() needs no imports.
        self._compute_standard = compute_fourier_fingerprint_v2
        self._compute_iswa = compute_iswa_fingerprint

        if self._is_iswa:
            # ISWA: each cache section contributes n_kv_heads*head_dim
            # values at 2 frequencies.
            sections = self._spec["cache_sections"]
            self._dim = sum(s.n_kv_heads * s.head_dim * 2 for s in sections)
        else:
            self._dim = self._spec["n_kv_heads"] * self._spec["head_dim"] * 2

    def embed(self, text: str) -> torch.Tensor:
        """Generate text through model, extract KV keys, compute Fourier fp."""
        # Reset so the cache holds only this text's tokens.
        self._bridge.llm.reset()
        self._bridge.generate(text, max_tokens=1)

        if self._is_iswa:
            parsed = self._bridge.extract_kv_cache_iswa()
            return self._compute_iswa(parsed, freqs=[0, 1])

        parsed = self._bridge.extract_kv_cache()
        # Mean over the token axis — assumes keys are
        # (layers, kv_heads, tokens, head_dim); TODO confirm with bridge.
        layer_keys = parsed.keys.float().mean(dim=2)
        return self._compute_standard(layer_keys, freqs=[0, 1])

    @property
    def source(self) -> str:
        """Strategy tag recorded alongside fingerprints."""
        return "llama_cpp"

    @property
    def dim(self) -> int:
        """Fingerprint dimensionality for the loaded model."""
        return self._dim
94
+
95
+
96
+ # ── Strategy 2: Sentence-transformers ────────────────────────────────
97
+
98
class SBertEmbedder:
    """Semantic fingerprint via sentence-transformers.

    Uses all-MiniLM-L6-v2 (80MB, 384-dim). Downloads on first use.
    Subsequent calls use the cached model (~50ms per text on CPU).
    """

    MODEL_NAME = "all-MiniLM-L6-v2"

    def __init__(self) -> None:
        """Load the SBERT model, silencing Hugging Face logging noise first."""
        import logging
        import warnings
        # Suppress noisy HF/tokenizer/sbert/safetensors warnings on load
        for name in (
            "sentence_transformers",
            "transformers",
            "transformers.modeling_utils",
            "huggingface_hub",
            "huggingface_hub.utils",
            "safetensors",
        ):
            logging.getLogger(name).setLevel(logging.CRITICAL)
        # Suppress the HF_TOKEN and load-report warnings.
        # setdefault: respect values the user already exported.
        os.environ.setdefault("HF_HUB_DISABLE_PROGRESS_BARS", "1")
        os.environ.setdefault("TRANSFORMERS_VERBOSITY", "error")
        os.environ.setdefault("HF_HUB_VERBOSITY", "error")
        warnings.filterwarnings("ignore", category=FutureWarning)
        # Import deliberately AFTER the env vars are set so they take effect.
        from sentence_transformers import SentenceTransformer
        self._model = SentenceTransformer(self.MODEL_NAME)
        self._dim = self._model.get_sentence_embedding_dimension()

    def embed(self, text: str) -> torch.Tensor:
        """Return the L2-normalized sentence embedding as a float32 tensor."""
        # encode returns numpy array
        vec = self._model.encode(text, normalize_embeddings=True)
        return torch.from_numpy(vec.astype(np.float32))

    @property
    def source(self) -> str:
        """Strategy tag recorded alongside fingerprints."""
        return "sbert"

    @property
    def dim(self) -> int:
        """Embedding dimensionality reported by the loaded model."""
        return self._dim
141
+
142
+
143
+ # ── Strategy 3: Hash fallback ────────────────────────────────────────
144
+
145
class HashEmbedder:
    """Deterministic pseudo-fingerprint from SHA256 hash.

    No semantic meaning — same text always maps to same vector,
    but unrelated texts have random cosine similarity (~0).
    """

    def __init__(self, dim: int = 2048) -> None:
        self._dim = dim

    def embed(self, text: str) -> torch.Tensor:
        """Map *text* to a unit-norm float32 vector seeded by its SHA-256."""
        digest = hashlib.sha256(text.encode()).hexdigest()
        # First 8 hex chars → 32-bit seed for a reproducible RNG.
        rng = np.random.RandomState(int(digest[:8], 16))
        vec = rng.randn(self._dim).astype("float32")
        vec /= np.linalg.norm(vec) + 1e-8
        return torch.from_numpy(vec)

    @property
    def source(self) -> str:
        """Strategy tag recorded alongside fingerprints."""
        return "hash-fallback"

    @property
    def dim(self) -> int:
        """Configured fingerprint dimensionality."""
        return self._dim
169
+
170
+
171
+ # ── Singleton factory ────────────────────────────────────────────────
172
+
173
+ _cached_embedder: Embedder | None = None
174
+
175
+
176
def _create_embedder() -> Embedder:
    """Try strategies in priority order, return first that works."""
    # Strategy 1: native ENGRAM via a local GGUF model, if configured.
    gguf_path = os.environ.get("ENGRAM_MODEL_PATH", "")
    if gguf_path and Path(gguf_path).exists():
        try:
            return LlamaCppEmbedder(gguf_path)
        except Exception:
            pass  # fall through to the next strategy

    # Strategy 2: sentence-transformers (downloads the model on first use).
    try:
        return SBertEmbedder()
    except Exception:
        pass  # fall through to the guaranteed fallback

    # Strategy 3: hash fallback — always available.
    return HashEmbedder()
196
+
197
+
198
def get_embedder() -> Embedder:
    """Return the process-wide embedder singleton, creating it on first use."""
    global _cached_embedder
    if _cached_embedder is not None:
        return _cached_embedder
    _cached_embedder = _create_embedder()
    return _cached_embedder
204
+
205
+
206
def get_fingerprint(text: str) -> tuple[torch.Tensor, str]:
    """
    Compute fingerprint for text using best available strategy.

    Returns:
        (fingerprint_tensor, source_tag)
    """
    active = get_embedder()
    return active.embed(text), active.source
216
+
217
+
218
def reset_embedder() -> None:
    """Reset the cached embedder (for testing or strategy switching).

    The next get_embedder()/get_fingerprint() call re-runs strategy
    selection from scratch.
    """
    global _cached_embedder
    _cached_embedder = None
kvcos/engram/format.py ADDED
@@ -0,0 +1,251 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ EIGENGRAM binary format codec (EGR1 v1.0).
3
+
4
+ An EIGENGRAM (.eng) file is a self-contained semantic certificate
5
+ for a KV-cache document. It encodes two fingerprint vectors, the
6
+ shared coordinate system they live in, and enough metadata to
7
+ reproduce the query fold-in without access to the original text
8
+ or model.
9
+
10
+ Design goals:
11
+ - Portable: pure binary, no JSON, no pickle, no protobuf.
12
+ - Versioned: magic bytes + version field.
13
+ - Self-contained: joint_center embedded for query fold-in.
14
+ - Compact: float16 vectors, ~800 bytes total per document.
15
+
16
+ Dual-fingerprint architecture:
17
+ vec_perdoc - per-document SVD projection (same-model, margin ~0.37)
18
+ vec_fcdb - FCDB projection (cross-model, margin ~0.013)
19
+
20
+ Binary layout (little-endian, 99-byte fixed header):
21
+ Offset Size Type Field
22
+ 0 4 bytes magic = "EGR1"
23
+ 4 1 uint8 version (currently 1)
24
+ 5 32 ascii corpus_hash
25
+ 37 20 ascii created_at
26
+ 57 16 ascii model_id (null-padded)
27
+ 73 2 uint16 basis_rank R
28
+ 75 2 uint16 n_corpus
29
+ 77 2 int8x2 layer_range
30
+ 79 4 uint32 context_len
31
+ 83 4 float32 l2_norm
32
+ 87 4 float32 scs
33
+ 91 4 float32 margin_proof
34
+ 95 2 uint16 task_desc_len
35
+ 97 2 uint16 cache_id_len
36
+ Variable:
37
+ 99 R*2 float16 vec_perdoc
38
+ +R*2 R*2 float16 vec_fcdb
39
+ +2R*2 256 float16 joint_center (128 x float16)
40
+ +256 var utf-8 task_description
41
+ +var var utf-8 cache_id
42
+
43
+ Total for R=116: ~800 bytes.
44
+
45
+ Compatibility: readers MUST reject magic != "EGR1" or version mismatch.
46
+ """
47
+
48
+ from __future__ import annotations
49
+
50
+ import struct
51
+
52
+ import numpy as np
53
+ import torch
54
+
55
+ EIGENGRAM_MAGIC = b"EGR1"
56
+ EIGENGRAM_VERSION = 1
57
+
58
+
59
+ class EigramEncoder:
60
+ """Encode and decode EIGENGRAM binary certificates.
61
+
62
+ A single instance handles both directions. EigramDecoder is an alias.
63
+ Float16 storage preserves cosine similarity to > 0.999.
64
+ """
65
+
66
    def encode(
        self,
        vec_perdoc: torch.Tensor,
        vec_fcdb: torch.Tensor,
        joint_center: torch.Tensor,
        corpus_hash: str,
        model_id: str,
        basis_rank: int,
        n_corpus: int,
        layer_range: tuple[int, int],
        context_len: int,
        l2_norm: float,
        scs: float,
        margin_proof: float,
        task_description: str,
        cache_id: str,
        vec_fourier: torch.Tensor | None = None,
        local_density: int = 0,
        eigenform_score: float = 1.0,
        confusion_flag: bool = False,
        vec_fourier_v2: torch.Tensor | None = None,
    ) -> bytes:
        """Serialise all fields into an EIGENGRAM binary blob.

        Layout: fixed little-endian header, then float16 vector payloads
        (vec_perdoc, vec_fcdb, joint_center[:128]), then UTF-8 strings,
        then optional trailing extensions (vec_fourier; v1.2
        confusion_flag + vec_fourier_v2).

        NOTE(review): the module docstring documents a 99-byte header, but
        this writer appends fourier_dim (u16), local_density (u16) and
        eigenform_score (f32) after cache_id_len — 107 header bytes total.
        Confirm the reader and the docstring agree with this layout.
        """
        from datetime import datetime, timezone

        # Strings are hard-truncated to their header-advertised maxima.
        td_b = task_description.encode("utf-8")[:256]
        ci_b = cache_id.encode("utf-8")[:64]
        now = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%S")

        buf = bytearray()
        buf += EIGENGRAM_MAGIC
        buf += struct.pack("<B", EIGENGRAM_VERSION)
        # Fixed-width ASCII fields, null-padded on the right.
        buf += corpus_hash.encode("ascii")[:32].ljust(32, b"\x00")
        buf += now.encode("ascii")[:20].ljust(20, b"\x00")
        buf += model_id.encode("ascii")[:16].ljust(16, b"\x00")
        buf += struct.pack("<H", basis_rank)
        buf += struct.pack("<H", n_corpus)
        buf += struct.pack("<bb", layer_range[0], layer_range[1])
        buf += struct.pack("<I", context_len)
        buf += struct.pack("<f", l2_norm)
        buf += struct.pack("<f", scs)
        buf += struct.pack("<f", margin_proof)
        buf += struct.pack("<H", len(td_b))
        buf += struct.pack("<H", len(ci_b))
        # fourier_dim: 0 if no vec_fourier, else len(vec_fourier)
        fourier_dim = len(vec_fourier) if vec_fourier is not None else 0
        buf += struct.pack("<H", fourier_dim)
        buf += struct.pack("<H", local_density)
        buf += struct.pack("<f", eigenform_score)

        # Vector payloads stored as float16 (module docstring: cosine
        # similarity preserved to > 0.999). joint_center is capped at 128.
        buf += vec_perdoc.to(torch.float16).numpy().tobytes()
        buf += vec_fcdb.to(torch.float16).numpy().tobytes()
        buf += joint_center[:128].to(torch.float16).numpy().tobytes()

        buf += td_b
        buf += ci_b

        # Append vec_fourier if present (backward-compatible extension)
        if vec_fourier is not None:
            buf += vec_fourier.to(torch.float16).numpy().tobytes()

        # v1.2 extension: confusion_flag + vec_fourier_v2
        # Written only when at least one is non-default, preserving
        # backward compat with readers that stop after vec_fourier.
        if confusion_flag or vec_fourier_v2 is not None:
            buf += struct.pack("<B", 1 if confusion_flag else 0)
            v2_dim = len(vec_fourier_v2) if vec_fourier_v2 is not None else 0
            buf += struct.pack("<H", v2_dim)
            if vec_fourier_v2 is not None:
                buf += vec_fourier_v2.to(torch.float16).numpy().tobytes()

        return bytes(buf)
138
+
139
+ def decode(self, data: bytes) -> dict:
140
+ """Deserialise an EIGENGRAM binary blob into a dict.
141
+
142
+ Returns dict with all fields. Vectors upcast to float32.
143
+ Raises ValueError on magic/version mismatch.
144
+ """
145
+ if len(data) < 4 or data[:4] != EIGENGRAM_MAGIC:
146
+ raise ValueError(
147
+ f"Invalid EIGENGRAM magic: {data[:4]!r} (expected {EIGENGRAM_MAGIC!r})"
148
+ )
149
+
150
+ off = 4
151
+ version = struct.unpack_from("<B", data, off)[0]; off += 1
152
+ if version != EIGENGRAM_VERSION:
153
+ raise ValueError(
154
+ f"Unsupported EIGENGRAM version {version} "
155
+ f"(this reader supports v{EIGENGRAM_VERSION})"
156
+ )
157
+
158
+ corpus_hash = data[off : off + 32].rstrip(b"\x00").decode("ascii"); off += 32
159
+ created_at = data[off : off + 20].rstrip(b"\x00").decode("ascii"); off += 20
160
+ model_id = data[off : off + 16].rstrip(b"\x00").decode("ascii"); off += 16
161
+
162
+ basis_rank = struct.unpack_from("<H", data, off)[0]; off += 2
163
+ n_corpus = struct.unpack_from("<H", data, off)[0]; off += 2
164
+ lr0, lr1 = struct.unpack_from("<bb", data, off); off += 2
165
+ context_len = struct.unpack_from("<I", data, off)[0]; off += 4
166
+ l2_norm = struct.unpack_from("<f", data, off)[0]; off += 4
167
+ scs = struct.unpack_from("<f", data, off)[0]; off += 4
168
+ margin_proof = struct.unpack_from("<f", data, off)[0]; off += 4
169
+ td_len = struct.unpack_from("<H", data, off)[0]; off += 2
170
+ ci_len = struct.unpack_from("<H", data, off)[0]; off += 2
171
+
172
+ # v1.1 extension fields: fourier_dim + local_density
173
+ # Detect by checking if file has extra bytes beyond v1.0 layout
174
+ fourier_dim = 0
175
+ local_density = 0
176
+ expected_old_size = off + basis_rank * 4 + 256 + td_len + ci_len
177
+ eigenform_score = 1.0
178
+ if len(data) > expected_old_size + 4:
179
+ fourier_dim = struct.unpack_from("<H", data, off)[0]; off += 2
180
+ local_density = struct.unpack_from("<H", data, off)[0]; off += 2
181
+ eigenform_score = struct.unpack_from("<f", data, off)[0]; off += 4
182
+ # If the file was written with fourier_dim field but is old format,
183
+ # we already consumed 2 bytes. This is safe because old files
184
+ # won't have extra bytes.
185
+
186
+ R = basis_rank
187
+ vec_perdoc = torch.from_numpy(
188
+ np.frombuffer(data, dtype=np.float16, count=R, offset=off).copy()
189
+ ).float(); off += R * 2
190
+
191
+ vec_fcdb = torch.from_numpy(
192
+ np.frombuffer(data, dtype=np.float16, count=R, offset=off).copy()
193
+ ).float(); off += R * 2
194
+
195
+ joint_center = torch.from_numpy(
196
+ np.frombuffer(data, dtype=np.float16, count=128, offset=off).copy()
197
+ ).float(); off += 128 * 2
198
+
199
+ task_description = data[off : off + td_len].decode("utf-8", errors="replace"); off += td_len
200
+ cache_id = data[off : off + ci_len].decode("utf-8", errors="replace"); off += ci_len
201
+
202
+ # Read vec_fourier if present
203
+ vec_fourier = None
204
+ if fourier_dim > 0 and off + fourier_dim * 2 <= len(data):
205
+ vec_fourier = torch.from_numpy(
206
+ np.frombuffer(data, dtype=np.float16, count=fourier_dim, offset=off).copy()
207
+ ).float()
208
+ off += fourier_dim * 2
209
+
210
+ # v1.2 extension: confusion_flag + vec_fourier_v2
211
+ confusion_flag = False
212
+ vec_fourier_v2 = None
213
+ if off + 3 <= len(data): # 1 byte flag + 2 byte dim minimum
214
+ confusion_flag = bool(struct.unpack_from("<B", data, off)[0])
215
+ off += 1
216
+ v2_dim = struct.unpack_from("<H", data, off)[0]
217
+ off += 2
218
+ if v2_dim > 0 and off + v2_dim * 2 <= len(data):
219
+ vec_fourier_v2 = torch.from_numpy(
220
+ np.frombuffer(data, dtype=np.float16, count=v2_dim, offset=off).copy()
221
+ ).float()
222
+
223
+ result = {
224
+ "version": version,
225
+ "corpus_hash": corpus_hash,
226
+ "created_at": created_at,
227
+ "model_id": model_id,
228
+ "basis_rank": basis_rank,
229
+ "n_corpus": n_corpus,
230
+ "layer_range": (lr0, lr1),
231
+ "context_len": context_len,
232
+ "l2_norm": l2_norm,
233
+ "scs": scs,
234
+ "margin_proof": margin_proof,
235
+ "vec_perdoc": vec_perdoc,
236
+ "vec_fcdb": vec_fcdb,
237
+ "joint_center": joint_center,
238
+ "task_description": task_description,
239
+ "cache_id": cache_id,
240
+ }
241
+ if vec_fourier is not None:
242
+ result["vec_fourier"] = vec_fourier
243
+ if vec_fourier_v2 is not None:
244
+ result["vec_fourier_v2"] = vec_fourier_v2
245
+ result["local_density"] = local_density
246
+ result["eigenform_score"] = eigenform_score
247
+ result["confusion_flag"] = confusion_flag
248
+ return result
249
+
250
+
251
+ EigramDecoder = EigramEncoder
kvcos/engram/hnsw_index.py ADDED
@@ -0,0 +1,205 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ ENGRAM HNSW Index — O(log N) approximate nearest neighbor retrieval.
3
+
4
+ Wraps faiss.IndexHNSWFlat for production-scale ENGRAM search.
5
+ Primary fingerprint: v2 layer-normalized Fourier f0+f1.
6
+
7
+ Usage:
8
+ idx = EngramIndex(dim=2048)
9
+ idx.add_batch(doc_ids, vectors)
10
+ results = idx.search(query_fp, top_k=5)
11
+ idx.save('index/hnsw')
12
+ idx2 = EngramIndex.load('index/hnsw')
13
+ """
14
+
15
+ from __future__ import annotations
16
+
17
+ import json
18
+ import logging
19
+ import os
20
+ from dataclasses import dataclass
21
+
22
+ import faiss
23
+ import numpy as np
24
+ import torch
25
+ import torch.nn.functional as F
26
+
27
+ logger = logging.getLogger(__name__)
28
+
29
+
30
@dataclass
class HNSWResult:
    """Single HNSW search result."""

    doc_id: str          # identifier of the matched document
    score: float         # cosine similarity (1 - L2^2/2 on normalized vectors)
    rank: int            # 0-based position in the result list
    margin: float = 0.0  # score gap to the runner-up; set on the top result only
38
+
39
+
40
class EngramIndex:
    """HNSW-backed ENGRAM retrieval index.

    Vectors are L2-normalized before insertion so that faiss' L2 metric is
    monotone in cosine similarity (cosine = 1 - L2^2/2).

    HNSW parameters:
        M=32: graph degree (higher = better recall, more memory)
        efConstruction=200: build-time search width
        efSearch=64: query-time search width
    """

    M = 32
    EF_CONSTRUCTION = 200
    EF_SEARCH = 64

    def __init__(self, dim: int = 2048):
        self._dim = dim
        self._index: faiss.IndexHNSWFlat | None = None
        self._ids: list[str] = []
        self._id_to_pos: dict[str, int] = {}
        self._n_docs: int = 0

    def add_batch(
        self,
        doc_ids: list[str],
        vectors: torch.Tensor,
    ) -> None:
        """Build HNSW index from vectors.

        Args:
            doc_ids: list of document identifiers
            vectors: [N, dim] tensor of fingerprints
        """
        # Normalize so L2 distance is a monotone proxy for cosine similarity.
        matrix = F.normalize(vectors.float(), dim=-1).numpy().astype("float32")
        self._dim = matrix.shape[1]
        self._ids = list(doc_ids)
        self._id_to_pos = {cid: i for i, cid in enumerate(doc_ids)}
        self._n_docs = len(doc_ids)

        self._index = faiss.IndexHNSWFlat(self._dim, self.M)
        self._index.hnsw.efConstruction = self.EF_CONSTRUCTION
        self._index.hnsw.efSearch = self.EF_SEARCH
        self._index.add(matrix)

    def build(
        self,
        eng_files: list[str],
        fp_key: str = "vec_fourier_v2",
        verbose: bool = True,
    ) -> None:
        """Build HNSW index from list of .eng file paths.

        Args:
            eng_files: List of paths to .eng encoded files.
            fp_key: Fingerprint field to index.
                Default 'vec_fourier_v2' (S3 validated, 99.5% recall).
                Falls back to 'vec_fourier' if v2 not present.
            verbose: Log a summary and any fallback count.

        Raises:
            ValueError: if no file yields a usable fingerprint.
        """
        from kvcos.engram.reader import read_eigengram

        doc_ids = []
        vecs = []
        missing_v2 = 0

        for fp in eng_files:
            data = read_eigengram(fp)
            cid = data.get("cache_id")
            if not cid:
                continue

            vec = data.get(fp_key)
            if vec is None:
                vec = data.get("vec_fourier")
                if vec is None:
                    # No fingerprint at all — skip without counting it as a
                    # fallback (previously this doc inflated missing_v2).
                    continue
                missing_v2 += 1

            doc_ids.append(cid)
            vecs.append(vec.float())

        if not vecs:
            raise ValueError(
                f"No valid fingerprints found in {len(eng_files)} files"
            )

        if missing_v2 > 0 and verbose:
            logger.warning(
                "%d docs missing %s, used vec_fourier fallback",
                missing_v2, fp_key,
            )

        self.add_batch(doc_ids, torch.stack(vecs))

        if verbose:
            logger.info("HNSW index built: %d docs, dim=%d", self._n_docs, self._dim)
            logger.info("M=%d, efC=%d, efS=%d", self.M, self.EF_CONSTRUCTION, self.EF_SEARCH)

    def search(
        self,
        query_fp: torch.Tensor,
        top_k: int = 5,
    ) -> list[HNSWResult]:
        """Search the HNSW index.

        Returns list of HNSWResult sorted by score descending.
        HNSW uses L2 on normalized vectors: cosine = 1 - L2^2/2.

        Raises:
            RuntimeError: if the index has not been built or loaded.
        """
        if self._index is None:
            raise RuntimeError("Index not built. Call add_batch() or load() first.")

        qn = F.normalize(query_fp.float().unsqueeze(0), dim=-1).numpy().astype("float32")
        # Fetch one extra neighbor so the top result's margin can be computed.
        D, I = self._index.search(qn, min(top_k + 1, self._n_docs))

        results = []
        for rank, (dist, idx) in enumerate(zip(D[0], I[0])):
            if idx < 0:  # faiss pads with -1 when fewer neighbors exist
                continue
            cosine_sim = float(1.0 - dist / 2.0)
            results.append(HNSWResult(
                doc_id=self._ids[idx], score=cosine_sim, rank=rank,
            ))

        if len(results) >= 2:
            results[0].margin = results[0].score - results[1].score
        return results[:top_k]

    def save(self, path: str) -> None:
        """Save index to disk (faiss + JSON metadata).

        Raises:
            RuntimeError: if there is no built index to persist.
        """
        if self._index is None:
            raise RuntimeError("Index not built. Call add_batch() or load() first.")
        os.makedirs(os.path.dirname(path) or ".", exist_ok=True)
        faiss.write_index(self._index, path + ".faiss")
        meta_path = path + ".meta.json"
        with open(meta_path, "w") as f:
            json.dump({
                "ids": self._ids,
                "id_to_pos": self._id_to_pos,
                "dim": self._dim,
                "n_docs": self._n_docs,
            }, f, indent=2)

    @classmethod
    def load(cls, path: str) -> EngramIndex:
        """Load index from disk (counterpart of save())."""
        obj = cls()
        obj._index = faiss.read_index(path + ".faiss")
        meta_path = path + ".meta.json"
        with open(meta_path, "r") as f:
            meta = json.load(f)
        obj._ids = meta["ids"]
        obj._id_to_pos = meta["id_to_pos"]
        obj._dim = meta["dim"]
        obj._n_docs = meta["n_docs"]
        return obj

    def __len__(self) -> int:
        return self._n_docs

    def get_vector(self, doc_id: str) -> torch.Tensor | None:
        """Return stored (normalized) vector for doc_id, or None if not found."""
        pos = self._id_to_pos.get(doc_id)
        if pos is None:
            return None
        vec_np = np.zeros(self._dim, dtype="float32")
        self._index.reconstruct(pos, vec_np)
        return torch.from_numpy(vec_np)

    def __repr__(self) -> str:
        return f"EngramIndex(n={self._n_docs}, dim={self._dim}, M={self.M})"
kvcos/engram/index_c.py ADDED
@@ -0,0 +1,426 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ kvcos/engram/index_c.py — Confidence history index for ENGRAM.
3
+
4
+ Stores retrieval confidence records across sessions.
5
+ Makes the system self-aware: chronic failures are known before retrieval.
6
+
7
+ Schema:
8
+ retrievals: one row per geodesic_retrieve() call
9
+ confusion_pairs: doc pairs that confuse each other (confidence<threshold)
10
+ doc_stats: per-doc aggregate reliability scores
11
+
12
+ Usage:
13
+ ic = IndexC.open("results/index_c.db")
14
+ ic.record(session_id="s1", query_doc_id="doc_146", result=geodesic_result)
15
+ prior = ic.prior("doc_146")
16
+ pairs = ic.confusion_registry()
17
+ rmap = ic.reliability_map()
18
+ """
19
+
20
+ from __future__ import annotations
21
+
22
+ import os
23
+ import sqlite3
24
+ import time
25
+ from dataclasses import dataclass
26
+ from pathlib import Path
27
+
28
+
29
@dataclass
class ConfidenceRecord:
    # One row of the `retrievals` table (see SCHEMA / session_history()).
    session_id: str        # session this retrieval belongs to
    query_doc_id: str      # doc used as the query
    result_doc_id: str     # doc returned by retrieval
    confidence: str        # tier label, e.g. "high" / "medium" / "low"
    margin: float          # score gap to the runner-up result
    stages_used: int       # number of retrieval stages executed
    constraint_used: bool  # whether a constraint was applied
    correct: bool          # whether the retrieval was judged correct
    ts: float              # Unix timestamp of the retrieval
40
+
41
+
42
@dataclass
class DocPrior:
    """Prior confidence distribution for a doc_id."""

    doc_id: str
    n_high: int
    n_medium: int
    n_low: int
    n_total: int
    reliability: float
    is_chronic_failure: bool

    @property
    def dominant_confidence(self) -> str:
        """Most frequent confidence tier; ties resolve toward higher tiers."""
        if not self.n_total:
            return "unknown"
        # max() returns the first maximal entry, so ordering high → low
        # reproduces the original dict-insertion-order tie-breaking.
        tallies = (
            ("high", self.n_high),
            ("medium", self.n_medium),
            ("low", self.n_low),
        )
        label, _count = max(tallies, key=lambda pair: pair[1])
        return label
64
+
65
+
66
@dataclass
class ConfusionPair:
    # A pair of docs that have confused each other at least once
    # (recorded by IndexC when a retrieval is marked incorrect).
    doc_a: str          # query-side doc of the confusion
    doc_b: str          # result-side doc of the confusion
    n_confusions: int   # how many times this pair has been confused
    first_seen: float   # Unix timestamp of the first confusion
    last_seen: float    # Unix timestamp of the most recent confusion
+
74
+
75
# SQLite DDL executed on every IndexC open; idempotent via IF NOT EXISTS.
# Tables mirror the ConfidenceRecord / ConfusionPair / DocPrior dataclasses.
SCHEMA = """
CREATE TABLE IF NOT EXISTS retrievals (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    session_id TEXT NOT NULL,
    query_doc_id TEXT NOT NULL,
    result_doc_id TEXT NOT NULL,
    confidence TEXT NOT NULL,
    margin REAL NOT NULL,
    stages_used INTEGER NOT NULL,
    constraint_used INTEGER NOT NULL,
    correct INTEGER NOT NULL,
    ts REAL NOT NULL
);

CREATE INDEX IF NOT EXISTS idx_ret_query ON retrievals(query_doc_id);
CREATE INDEX IF NOT EXISTS idx_ret_result ON retrievals(result_doc_id);
CREATE INDEX IF NOT EXISTS idx_ret_conf ON retrievals(confidence);
CREATE INDEX IF NOT EXISTS idx_ret_sess ON retrievals(session_id);

CREATE TABLE IF NOT EXISTS confusion_pairs (
    doc_a TEXT NOT NULL,
    doc_b TEXT NOT NULL,
    n_confusions INTEGER NOT NULL DEFAULT 1,
    first_seen REAL NOT NULL,
    last_seen REAL NOT NULL,
    PRIMARY KEY (doc_a, doc_b)
);

CREATE TABLE IF NOT EXISTS doc_stats (
    doc_id TEXT PRIMARY KEY,
    n_high INTEGER NOT NULL DEFAULT 0,
    n_medium INTEGER NOT NULL DEFAULT 0,
    n_low INTEGER NOT NULL DEFAULT 0,
    reliability REAL NOT NULL DEFAULT 1.0,
    last_updated REAL NOT NULL
);
"""
112
+
113
+
114
class IndexC:
    """
    Confidence history index.

    Backed by SQLite — append-only, persistent across sessions.
    Provides priors for geodesic_retrieve() to pre-apply constraints
    on docs that are known chronic failures.
    """

    # Fraction of low-confidence retrievals above which prior() flags a doc
    # as a chronic failure.
    CHRONIC_FAILURE_THRESHOLD = 0.5

    def __init__(self, db_path: str):
        self._db_path = str(db_path)
        # isolation_level=None → autocommit mode; WAL allows concurrent readers.
        self._conn = sqlite3.connect(
            db_path, check_same_thread=False, isolation_level=None
        )
        self._conn.execute("PRAGMA journal_mode=WAL")
        self._conn.executescript(SCHEMA)

    @classmethod
    def open(cls, db_path: str | Path) -> "IndexC":
        """Open (or create) the Index-C database at db_path."""
        os.makedirs(Path(db_path).parent, exist_ok=True)
        return cls(str(db_path))

    # ── WRITE ────────────────────────────────────────────────────────

    def record(
        self,
        session_id: str,
        query_doc_id: str,
        result_doc_id: str,
        confidence: str,
        margin: float,
        stages_used: int = 1,
        constraint_used: bool = False,
        correct: bool = True,
        ts: float | None = None,
    ) -> None:
        """Log one retrieval result to the index.

        Also updates the confusion registry (when incorrect) and the
        per-doc aggregate stats.
        """
        # Explicit None check: `ts or time.time()` silently discarded a
        # caller-supplied ts of 0.0.
        if ts is None:
            ts = time.time()
        with self._conn:
            self._conn.execute(
                """INSERT INTO retrievals
                   (session_id, query_doc_id, result_doc_id, confidence,
                    margin, stages_used, constraint_used, correct, ts)
                   VALUES (?,?,?,?,?,?,?,?,?)""",
                (session_id, query_doc_id, result_doc_id, confidence,
                 float(margin), int(stages_used), int(constraint_used),
                 int(correct), float(ts)),
            )
        if not correct:
            self._register_confusion(query_doc_id, result_doc_id, ts)
        self._update_doc_stats(query_doc_id, confidence, ts)

    def _register_confusion(
        self, doc_a: str, doc_b: str, ts: float
    ) -> None:
        """Insert or increment confusion pair."""
        existing = self._conn.execute(
            "SELECT n_confusions FROM confusion_pairs "
            "WHERE doc_a=? AND doc_b=?",
            (doc_a, doc_b),
        ).fetchone()
        if existing:
            self._conn.execute(
                "UPDATE confusion_pairs SET n_confusions=n_confusions+1, "
                "last_seen=? WHERE doc_a=? AND doc_b=?",
                (ts, doc_a, doc_b),
            )
        else:
            self._conn.execute(
                "INSERT INTO confusion_pairs "
                "(doc_a, doc_b, n_confusions, first_seen, last_seen) "
                "VALUES (?,?,1,?,?)",
                (doc_a, doc_b, ts, ts),
            )

    def _update_doc_stats(
        self, doc_id: str, confidence: str, ts: float
    ) -> None:
        """Upsert doc_stats row for doc_id.

        Unknown confidence labels are counted as "medium".
        """
        col_map = {"high": "n_high", "medium": "n_medium", "low": "n_low"}
        col = col_map.get(confidence, "n_medium")

        existing = self._conn.execute(
            "SELECT n_high, n_medium, n_low FROM doc_stats WHERE doc_id=?",
            (doc_id,),
        ).fetchone()

        if existing:
            n_high, n_medium, n_low = existing
            if col == "n_high":
                n_high += 1
            elif col == "n_medium":
                n_medium += 1
            else:
                n_low += 1
            n_total = n_high + n_medium + n_low
            # Reliability = share of non-low retrievals.
            reliability = (n_high + n_medium) / n_total if n_total > 0 else 1.0
            self._conn.execute(
                "UPDATE doc_stats SET n_high=?, n_medium=?, n_low=?, "
                "reliability=?, last_updated=? WHERE doc_id=?",
                (n_high, n_medium, n_low, reliability, ts, doc_id),
            )
        else:
            vals = {"n_high": 0, "n_medium": 0, "n_low": 0}
            vals[col] = 1
            reliability = (vals["n_high"] + vals["n_medium"]) / 1
            self._conn.execute(
                "INSERT INTO doc_stats "
                "(doc_id, n_high, n_medium, n_low, reliability, last_updated) "
                "VALUES (?,?,?,?,?,?)",
                (doc_id, vals["n_high"], vals["n_medium"],
                 vals["n_low"], reliability, ts),
            )

    # ── READ ─────────────────────────────────────────────────────────

    def prior(self, doc_id: str) -> DocPrior:
        """Return prior confidence distribution for doc_id.

        A doc with no history gets a neutral prior (reliability 1.0).
        """
        row = self._conn.execute(
            "SELECT n_high, n_medium, n_low, reliability "
            "FROM doc_stats WHERE doc_id=?",
            (doc_id,),
        ).fetchone()
        if not row:
            return DocPrior(
                doc_id=doc_id, n_high=0, n_medium=0, n_low=0,
                n_total=0, reliability=1.0, is_chronic_failure=False,
            )
        n_high, n_medium, n_low, reliability = row
        n_total = n_high + n_medium + n_low
        return DocPrior(
            doc_id=doc_id,
            n_high=n_high,
            n_medium=n_medium,
            n_low=n_low,
            n_total=n_total,
            reliability=reliability,
            is_chronic_failure=(
                n_low / n_total > self.CHRONIC_FAILURE_THRESHOLD
                if n_total > 0
                else False
            ),
        )

    def confusion_registry(
        self, min_confusions: int = 1
    ) -> list[ConfusionPair]:
        """Return known confusion pairs with >= min_confusions."""
        rows = self._conn.execute(
            "SELECT doc_a, doc_b, n_confusions, first_seen, last_seen "
            "FROM confusion_pairs WHERE n_confusions >= ? "
            "ORDER BY n_confusions DESC",
            (min_confusions,),
        ).fetchall()
        return [
            ConfusionPair(
                doc_a=r[0], doc_b=r[1], n_confusions=r[2],
                first_seen=r[3], last_seen=r[4],
            )
            for r in rows
        ]

    def reliability_map(self) -> dict[str, float]:
        """Return {doc_id: reliability_score} for all tracked docs."""
        rows = self._conn.execute(
            "SELECT doc_id, reliability FROM doc_stats ORDER BY reliability"
        ).fetchall()
        return {r[0]: float(r[1]) for r in rows}

    def session_history(self, session_id: str) -> list[ConfidenceRecord]:
        """Return all records for a session_id, oldest first."""
        rows = self._conn.execute(
            "SELECT session_id, query_doc_id, result_doc_id, confidence, "
            "margin, stages_used, constraint_used, correct, ts "
            "FROM retrievals WHERE session_id=? ORDER BY ts",
            (session_id,),
        ).fetchall()
        return [
            ConfidenceRecord(
                session_id=r[0], query_doc_id=r[1], result_doc_id=r[2],
                confidence=r[3], margin=float(r[4]), stages_used=int(r[5]),
                constraint_used=bool(r[6]), correct=bool(r[7]), ts=float(r[8]),
            )
            for r in rows
        ]

    def n_sessions(self) -> int:
        """Number of distinct sessions recorded."""
        row = self._conn.execute(
            "SELECT COUNT(DISTINCT session_id) FROM retrievals"
        ).fetchone()
        return row[0] if row else 0

    # ── RECENCY-WEIGHTED RELIABILITY ────────────────────────────────

    def weighted_reliability(
        self,
        doc_id: str,
        decay: float = 0.85,
    ) -> float:
        """
        Exponentially weighted reliability score.
        Newer retrievals have higher weight.
        decay=0.85: a failure 5 sessions ago counts 0.85^5 = 0.44
        of a failure last session.

        Returns float in [0, 1]. Returns 1.0 if no history.
        """
        rows = self._conn.execute(
            """SELECT correct FROM retrievals
               WHERE query_doc_id=?
               ORDER BY ts ASC""",
            (doc_id,),
        ).fetchall()
        if not rows:
            return 1.0
        history = [bool(r[0]) for r in rows]
        n = len(history)
        # Oldest record gets decay^(n-1), newest gets decay^0 = 1.
        weights = [decay ** (n - 1 - i) for i in range(n)]
        total_w = sum(weights)
        score = sum(w * int(h) for w, h in zip(weights, history))
        return round(score / total_w, 6) if total_w > 0 else 1.0

    # ── INDEX GROWTH REVALIDATION ────────────────────────────────────

    def on_document_added(
        self,
        new_doc_id: str,
        new_vec,  # torch.Tensor [dim]
        hnsw_index,  # EngramIndex instance
        revalidation_radius: float = 0.85,
        density_threshold: int = 3,
    ) -> list[str]:
        """
        Call after adding a document to the HNSW index.
        Recomputes local_density for neighbors of new_doc_id
        whose similarity > revalidation_radius.

        Returns list of doc_ids whose density was updated.

        Why this matters:
          At N=200, doc_042 is in sparse space (density=1).
          At N=500, doc_201 lands near doc_042 (cosine=0.88).
          doc_042 is now in a denser region — its confidence tier
          has degraded. This method detects and records that.
        """
        updated: list[str] = []
        try:
            results = hnsw_index.search(new_vec, top_k=20)
        except Exception:
            # Best-effort: an unsearchable index means nothing to revalidate.
            return updated

        for r in results:
            if r.doc_id == new_doc_id:
                continue
            if r.score < revalidation_radius:
                continue

            # Recompute density for this neighbor
            neighbor_vec = hnsw_index.get_vector(r.doc_id)
            if neighbor_vec is None:
                continue

            all_results = hnsw_index.search(neighbor_vec, top_k=50)
            new_density = sum(
                1 for x in all_results
                if x.doc_id != r.doc_id and x.score > revalidation_radius
            )

            # Use the module-level `time` import (previously __import__("time")).
            ts = time.time()
            existing = self._conn.execute(
                "SELECT n_high FROM doc_stats WHERE doc_id=?",
                (r.doc_id,),
            ).fetchone()

            if existing:
                self._conn.execute(
                    "UPDATE doc_stats SET last_updated=? WHERE doc_id=?",
                    (ts, r.doc_id),
                )
            else:
                self._conn.execute(
                    "INSERT OR IGNORE INTO doc_stats "
                    "(doc_id, n_high, n_medium, n_low, reliability, last_updated) "
                    "VALUES (?,0,0,0,1.0,?)",
                    (r.doc_id, ts),
                )

            # If density crossed threshold, register as needing constraint activation
            if new_density > density_threshold:
                self._register_confusion(r.doc_id, new_doc_id, ts)

            updated.append(r.doc_id)

        # Autocommit mode makes this a no-op; kept once, outside the loop,
        # as an explicit flush point.
        self._conn.commit()
        return updated

    def close(self) -> None:
        self._conn.close()

    def __repr__(self) -> str:
        n = self._conn.execute(
            "SELECT COUNT(*) FROM retrievals"
        ).fetchone()[0]
        return f"IndexC(db={self._db_path!r}, n_records={n})"
+ return f"IndexC(db={self._db_path!r}, n_records={n})"
kvcos/engram/knowledge_index.py ADDED
@@ -0,0 +1,289 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ kvcos/engram/knowledge_index.py — HNSW index over the knowledge store.
3
+
4
+ Builds and maintains a faiss HNSW index over all .eng files in
5
+ ~/.engram/knowledge/. Supports dynamic dimension (384 for sbert,
6
+ 2048 for llama_cpp/hash) — determined at build time from the first
7
+ .eng file.
8
+
9
+ Usage:
10
+ # Build from all knowledge .eng files
11
+ kidx = KnowledgeIndex.build_from_knowledge_dir()
12
+ results = kidx.search("HNSW recall benchmark", k=5)
13
+ kidx.save()
14
+
15
+ # Load pre-built index
16
+ kidx = KnowledgeIndex.load()
17
+ results = kidx.search("testing patterns", k=3)
18
+
19
+ Index files:
20
+ ~/.engram/index/knowledge.faiss
21
+ ~/.engram/index/knowledge.meta
22
+ """
23
+
24
+ from __future__ import annotations
25
+
26
+ import json
27
+ import logging
28
+ import os
29
+ from dataclasses import dataclass
30
+ from pathlib import Path
31
+
32
+ import faiss
33
+ import numpy as np
34
+ import torch
35
+ import torch.nn.functional as F
36
+
37
+ from kvcos.engram.embedder import get_fingerprint
38
+ from kvcos.engram.format import EigramEncoder
39
+
40
+ logger = logging.getLogger(__name__)
41
+
42
+
43
+ INDEX_DIR = Path(
44
+ os.environ.get("ENGRAM_INDEX_DIR", "~/.engram/index")
45
+ ).expanduser()
46
+
47
+ KNOWLEDGE_DIR = Path(
48
+ os.environ.get("ENGRAM_KNOWLEDGE_DIR", "~/.engram/knowledge")
49
+ ).expanduser()
50
+
51
+ INDEX_NAME = "knowledge"
52
+
53
+ _encoder = EigramEncoder()
54
+
55
+
56
@dataclass(frozen=True)
class KnowledgeResult:
    """Single search result from the knowledge index."""
    doc_id: str          # cache_id of the matched .eng entry
    score: float         # cosine similarity (1 - L2^2/2 on normalized vectors)
    rank: int            # 0-based position in the result list
    source_path: str     # original source file path from sidecar metadata
    project: str         # project namespace from sidecar metadata
    content: str         # chunk description text
    chunk_info: str      # "2/5" format
    headers: list[str]   # markdown-style section headers from sidecar metadata
    margin: float = 0.0  # score gap to runner-up; set on the top result only
+
69
+
70
class KnowledgeIndex:
    """HNSW index over the ENGRAM knowledge store.

    Parameters match EngramIndex for consistency:
        M=32, efConstruction=200, efSearch=64
    """

    M = 32
    EF_CONSTRUCTION = 200
    EF_SEARCH = 64

    def __init__(self, dim: int = 384) -> None:
        self._dim = dim
        self._index: faiss.IndexHNSWFlat | None = None
        self._meta: list[dict] = []  # per-vector metadata
        self._n_docs: int = 0

    @classmethod
    def build_from_knowledge_dir(
        cls,
        knowledge_dir: Path | None = None,
        verbose: bool = True,
    ) -> KnowledgeIndex:
        """Build HNSW index from all .eng files in the knowledge directory.

        Raises:
            ValueError: if the directory has no .eng files, or none of them
                yields a usable fingerprint.
        """
        if knowledge_dir is None:
            knowledge_dir = KNOWLEDGE_DIR

        # rglob("*.eng") already yields only .eng files; sort oldest-first
        # so positions are stable across rebuilds of an append-only store.
        eng_files = sorted(knowledge_dir.rglob("*.eng"), key=os.path.getmtime)

        if not eng_files:
            raise ValueError(f"No .eng files found in {knowledge_dir}")

        vectors: list[torch.Tensor] = []
        metas: list[dict] = []
        skipped = 0

        for p in eng_files:
            try:
                data = _encoder.decode(p.read_bytes())

                # Prefer the v2 fingerprint, fall back to v1.
                fp = data.get("vec_fourier_v2")
                if fp is None:
                    fp = data.get("vec_fourier")
                if fp is None:
                    skipped += 1
                    continue

                # Load sidecar metadata
                meta_path = Path(str(p) + ".meta.json")
                meta = {}
                if meta_path.exists():
                    meta = json.loads(meta_path.read_text())

                # Prefer the sidecar description when present; fall back to
                # the (possibly truncated) description embedded in the binary.
                description = meta.get("task_description", "") or \
                    data.get("task_description", "")

                vectors.append(fp.float())
                metas.append({
                    "doc_id": data.get("cache_id", p.stem),
                    "source_path": meta.get("source_path", ""),
                    "project": meta.get("project", ""),
                    "content": description,
                    "chunk_index": meta.get("chunk_index", 0),
                    "chunk_total": meta.get("chunk_total", 1),
                    "headers": meta.get("headers", []),
                    "fp_source": meta.get("fp_source", "unknown"),
                })
            except Exception as exc:
                # Best-effort scan: a corrupt file must not abort the build.
                logger.debug("Skipping %s: %s", p, exc)
                skipped += 1

        if not vectors:
            raise ValueError(
                f"No valid fingerprints in {len(eng_files)} .eng files"
            )

        # Stack and determine dimension from actual data
        matrix = torch.stack(vectors)
        dim = matrix.shape[1]

        # Normalize for cosine similarity via L2
        matrix = F.normalize(matrix, dim=-1).numpy().astype("float32")

        # Build HNSW
        obj = cls(dim=dim)
        obj._index = faiss.IndexHNSWFlat(dim, cls.M)
        obj._index.hnsw.efConstruction = cls.EF_CONSTRUCTION
        obj._index.hnsw.efSearch = cls.EF_SEARCH
        obj._index.add(matrix)
        obj._meta = metas
        obj._n_docs = len(metas)

        if verbose:
            projects = {m["project"] for m in metas}
            logger.info("Knowledge HNSW: %d vectors, dim=%d", obj._n_docs, dim)
            logger.info("Projects: %s", sorted(projects))
            if skipped:
                logger.warning("Skipped: %d files (no fingerprint)", skipped)

        return obj

    def search(
        self,
        query: str | torch.Tensor,
        k: int = 5,
    ) -> list[KnowledgeResult]:
        """
        Search the knowledge index.

        Args:
            query: Search text (will be fingerprinted) or pre-computed tensor.
            k: Number of results to return.

        Returns:
            List of KnowledgeResult sorted by score descending.

        Raises:
            RuntimeError: if the index has not been built or loaded.
        """
        if self._index is None:
            raise RuntimeError("Index not built. Call build_from_knowledge_dir() first.")

        if isinstance(query, str):
            query_fp, _ = get_fingerprint(query)
        else:
            query_fp = query

        qn = F.normalize(
            query_fp.float().unsqueeze(0), dim=-1
        ).numpy().astype("float32")

        # Fetch one extra neighbor so the top result's margin can be computed.
        top = min(k + 1, self._n_docs)
        D, I = self._index.search(qn, top)

        results: list[KnowledgeResult] = []
        for rank, (dist, idx) in enumerate(zip(D[0], I[0])):
            if idx < 0 or idx >= len(self._meta):  # faiss pads with -1
                continue
            meta = self._meta[idx]
            cosine = float(1.0 - dist / 2.0)
            ci = meta.get("chunk_index", 0)
            ct = meta.get("chunk_total", 1)

            results.append(KnowledgeResult(
                doc_id=meta["doc_id"],
                score=cosine,
                rank=rank,
                source_path=meta.get("source_path", ""),
                project=meta.get("project", ""),
                content=meta.get("content", ""),
                chunk_info=f"{ci + 1}/{ct}",
                headers=meta.get("headers", []),
            ))

        # Set margin (gap to runner-up) on the top result. KnowledgeResult is
        # frozen, so build the updated copy via dataclasses.replace instead of
        # re-listing every field by hand.
        if len(results) >= 2:
            from dataclasses import replace
            results[0] = replace(
                results[0], margin=results[0].score - results[1].score
            )

        return results[:k]

    def save(self, index_dir: Path | None = None) -> Path:
        """Save index to disk; returns the path of the .faiss file.

        Raises:
            RuntimeError: if there is no built index to persist.
        """
        if self._index is None:
            raise RuntimeError("Index not built. Call build_from_knowledge_dir() first.")
        if index_dir is None:
            index_dir = INDEX_DIR
        index_dir.mkdir(parents=True, exist_ok=True)

        faiss_path = index_dir / f"{INDEX_NAME}.faiss"
        meta_path = index_dir / f"{INDEX_NAME}.meta.json"

        faiss.write_index(self._index, str(faiss_path))
        with open(meta_path, "w") as f:
            json.dump({
                "meta": self._meta,
                "dim": self._dim,
                "n_docs": self._n_docs,
            }, f, indent=2)

        return faiss_path

    @classmethod
    def load(cls, index_dir: Path | None = None) -> KnowledgeIndex:
        """Load pre-built index from disk (counterpart of save())."""
        if index_dir is None:
            index_dir = INDEX_DIR

        faiss_path = index_dir / f"{INDEX_NAME}.faiss"
        meta_path = index_dir / f"{INDEX_NAME}.meta.json"

        if not faiss_path.exists():
            raise FileNotFoundError(
                f"No knowledge index at {faiss_path}. "
                "Build with KnowledgeIndex.build_from_knowledge_dir()"
            )

        obj = cls()
        obj._index = faiss.read_index(str(faiss_path))
        with open(meta_path, "r") as f:
            data = json.load(f)
        obj._meta = data["meta"]
        obj._dim = data["dim"]
        obj._n_docs = data["n_docs"]
        return obj

    def __len__(self) -> int:
        return self._n_docs

    def __repr__(self) -> str:
        return (
            f"KnowledgeIndex(n={self._n_docs}, dim={self._dim}, "
            f"M={self.M})"
        )
+ )
kvcos/engram/manifest.py ADDED
@@ -0,0 +1,232 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ kvcos/engram/manifest.py — Knowledge index manifest registry.
3
+
4
+ Tracks which source files have been indexed into .eng files,
5
+ their content hashes for incremental re-indexing, and chunk
6
+ metadata for multi-chunk files.
7
+
8
+ Storage: JSON file at ~/.engram/manifest.json (human-readable,
9
+ git-friendly, easily inspectable).
10
+
11
+ Thread safety: reads are lock-free, writes use atomic rename.
12
+ """
13
+
14
+ from __future__ import annotations
15
+
16
+ import hashlib
17
+ import json
18
+ import os
19
+ import tempfile
20
+ import time
21
+ from dataclasses import asdict, dataclass, field
22
+ from pathlib import Path
23
+ from typing import Iterator
24
+
25
+
26
@dataclass(frozen=True)
class ChunkRecord:
    """One indexed chunk from a source file.

    A source file may be split into several chunks (see chunk_index /
    chunk_total); each chunk is serialized to its own .eng file and
    tracked here together with its character span in the source text.
    """
    eng_path: str          # Absolute path to .eng file
    chunk_index: int       # 0-based chunk index within source
    chunk_total: int       # Total chunks for this source
    char_start: int        # Start offset in source content
    char_end: int          # End offset in source content
    indexed_at: float      # Unix timestamp of indexing
+
36
+
37
@dataclass(frozen=True)
class SourceRecord:
    """Registry entry for one indexed source file.

    Immutable snapshot of a source at index time: its content hash,
    project namespace, size, and the chunks it was split into.
    """
    source_path: str       # Absolute path to original .md file
    content_hash: str      # SHA-256 of file content at index time
    project: str           # Project namespace (e.g., "engram", "_global")
    file_size: int         # Bytes at index time
    chunks: tuple[ChunkRecord, ...] = ()
    indexed_at: float = 0.0
    last_verified: float = 0.0

    @property
    def eng_paths(self) -> list[str]:
        """All .eng file paths for this source, in chunk order."""
        return [chunk.eng_path for chunk in self.chunks]
52
+
53
+
54
def _content_hash(content: str) -> str:
    """SHA-256 hex digest of string content (UTF-8 encoded)."""
    digest = hashlib.sha256()
    digest.update(content.encode("utf-8"))
    return digest.hexdigest()
57
+
58
+
59
def _file_hash(path: Path) -> str:
    """SHA-256 hex digest of a file's bytes on disk."""
    with open(path, "rb") as fh:
        payload = fh.read()
    return hashlib.sha256(payload).hexdigest()
62
+
63
+
64
class Manifest:
    """
    Knowledge index manifest — tracks source-to-.eng mappings.

    Immutable-style operations: all mutations return new state
    and write atomically to disk.

    Usage:
        m = Manifest.load()
        m = m.register(source_path, content_hash, project, file_size, chunks)
        # m is now updated and persisted to disk
    """

    def __init__(
        self,
        records: dict[str, SourceRecord],
        manifest_path: Path,
    ) -> None:
        """Hold an in-memory snapshot of records plus the on-disk path.

        Args:
            records: {source_path: SourceRecord} mapping.
            manifest_path: Where _persist() writes the JSON manifest.
        """
        self._records = dict(records)  # defensive copy
        self._path = manifest_path

    @classmethod
    def load(cls, manifest_path: Path | None = None) -> Manifest:
        """Load manifest from disk, or create empty if not found.

        The path defaults to $ENGRAM_MANIFEST_PATH, falling back to
        ~/.engram/manifest.json.
        """
        if manifest_path is None:
            manifest_path = Path(
                os.environ.get("ENGRAM_MANIFEST_PATH",
                               "~/.engram/manifest.json")
            ).expanduser()

        if manifest_path.exists():
            # Explicit utf-8: the manifest is always written as UTF-8 JSON,
            # so don't depend on the locale's default encoding.
            data = json.loads(manifest_path.read_text(encoding="utf-8"))
            records = {}
            for key, rec_data in data.get("sources", {}).items():
                # Chunks are serialized as plain dicts by asdict();
                # rebuild the frozen ChunkRecord instances here.
                chunks = tuple(
                    ChunkRecord(**c) for c in rec_data.pop("chunks", [])
                )
                records[key] = SourceRecord(**rec_data, chunks=chunks)
            return cls(records, manifest_path)

        return cls({}, manifest_path)

    def register(
        self,
        source_path: str,
        content_hash: str,
        project: str,
        file_size: int,
        chunks: list[ChunkRecord],
    ) -> Manifest:
        """
        Register a newly indexed source file. Returns updated Manifest.

        Overwrites any existing record for the same source_path
        (re-index scenario). The updated manifest is persisted to disk
        before this method returns.
        """
        now = time.time()
        record = SourceRecord(
            source_path=source_path,
            content_hash=content_hash,
            project=project,
            file_size=file_size,
            chunks=tuple(chunks),
            indexed_at=now,
            last_verified=now,
        )

        new_records = dict(self._records)
        new_records[source_path] = record

        new_manifest = Manifest(new_records, self._path)
        new_manifest._persist()
        return new_manifest

    def unregister(self, source_path: str) -> Manifest:
        """Remove a source from the manifest. Returns updated Manifest."""
        new_records = {
            k: v for k, v in self._records.items()
            if k != source_path
        }
        new_manifest = Manifest(new_records, self._path)
        new_manifest._persist()
        return new_manifest

    def needs_reindex(self, source_path: str, current_hash: str) -> bool:
        """Check if a source file needs re-indexing (content changed).

        Unknown sources always need indexing.
        """
        record = self._records.get(source_path)
        if record is None:
            return True
        return record.content_hash != current_hash

    def get_record(self, source_path: str) -> SourceRecord | None:
        """Look up a source record by path, or None if unregistered."""
        return self._records.get(source_path)

    def get_project_records(self, project: str) -> list[SourceRecord]:
        """All records for a given project namespace."""
        return [
            r for r in self._records.values()
            if r.project == project
        ]

    def all_records(self) -> Iterator[SourceRecord]:
        """Iterate over all registered source records."""
        yield from self._records.values()

    @property
    def total_sources(self) -> int:
        """Number of registered source files."""
        return len(self._records)

    @property
    def total_chunks(self) -> int:
        """Total number of indexed chunks across all sources."""
        return sum(len(r.chunks) for r in self._records.values())

    @property
    def projects(self) -> set[str]:
        """Distinct project namespaces present in the manifest."""
        return {r.project for r in self._records.values()}

    def summary(self) -> dict:
        """Quick stats for display."""
        return {
            "total_sources": self.total_sources,
            "total_chunks": self.total_chunks,
            "projects": sorted(self.projects),
            "manifest_path": str(self._path),
        }

    def _persist(self) -> None:
        """Atomic write to disk via tempfile + fsync + rename.

        fsync before os.replace makes the write durable: without it,
        a crash shortly after the rename could leave a truncated or
        empty manifest on disk even though the rename itself is atomic.
        """
        self._path.parent.mkdir(parents=True, exist_ok=True)

        serializable = {
            "version": 1,
            "updated_at": time.time(),
            "sources": {
                key: asdict(rec) for key, rec in self._records.items()
            },
        }

        # Atomic write: write to temp in the same directory, then rename.
        fd, tmp_path = tempfile.mkstemp(
            dir=str(self._path.parent),
            suffix=".tmp",
        )
        try:
            with os.fdopen(fd, "w", encoding="utf-8") as f:
                json.dump(serializable, f, indent=2)
                f.flush()
                os.fsync(f.fileno())  # ensure bytes hit disk before rename
            os.replace(tmp_path, str(self._path))
        except Exception:
            # Clean up temp file on failure
            try:
                os.unlink(tmp_path)
            except OSError:
                pass
            raise

    def __len__(self) -> int:
        return self.total_sources

    def __contains__(self, source_path: str) -> bool:
        return source_path in self._records

    def __repr__(self) -> str:
        return (
            f"Manifest({self.total_sources} sources, "
            f"{self.total_chunks} chunks, "
            f"projects={sorted(self.projects)})"
        )
+ )
kvcos/engram/metadata_disambiguate.py ADDED
@@ -0,0 +1,119 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ kvcos/engram/metadata_disambiguate.py
3
+
4
+ Stage 4 retrieval: activates when the fingerprint pipeline returns LOW.
5
+ Uses .eng metadata fields (domain, context_len, l2_norm, task_description)
6
+ to break ties that the Fourier fingerprint cannot resolve.
7
+
8
+ Returns Stage4Result with confidence='low-metadata' and metadata_used=True.
9
+
10
+ Design note (VRCM source):
11
+ When constraint satisfaction fails on the fingerprint axis, switch to
12
+ orthogonal axes — metadata fields that are independent of spectral structure.
13
+ The medicine/biology failure exists because their f0+f1 profiles are
14
+ spectrally identical. Their metadata is NOT identical: context_len differs,
15
+ l2_norm differs, task_description keywords differ. That orthogonal signal
16
+ is what Stage 4 exploits.
17
+ """
18
+
19
+ from __future__ import annotations
20
+ import re
21
+ from dataclasses import dataclass
22
+
23
+
24
@dataclass
class Stage4Result:
    """Outcome of Stage-4 metadata disambiguation for one candidate."""
    doc_id: str                # cache_id of the winning candidate
    meta_score: float          # sum of domain/len/norm/keyword sub-scores
    confidence: str = 'low-metadata'
    metadata_used: bool = True
    domain_matched: bool = False
    # dict | None (not plain dict): None is the sentinel default,
    # replaced with a fresh dict in __post_init__.
    score_breakdown: dict | None = None

    def __post_init__(self):
        # None-sentinel pattern avoids a shared mutable default dict.
        if self.score_breakdown is None:
            self.score_breakdown = {}
36
+
37
+
38
def _keyword_overlap(text_a: str, text_b: str) -> float:
    """Jaccard overlap on lowercase word sets, excluding stopwords.

    Tokens must be longer than 2 characters; non-alphanumerics are
    treated as separators. Returns 0.0 when either side is empty.
    """
    STOPWORDS = {
        'the', 'a', 'an', 'is', 'are', 'was', 'were', 'in', 'of',
        'to', 'and', 'or', 'that', 'this', 'it', 'with', 'for',
        'on', 'at', 'by', 'from', 'be', 'has', 'have', 'had',
    }

    def tokenize(text):
        cleaned = re.sub(r'[^a-z0-9 ]', ' ', (text or '').lower())
        return {tok for tok in cleaned.split()
                if len(tok) > 2 and tok not in STOPWORDS}

    set_a = tokenize(text_a)
    set_b = tokenize(text_b)
    if not set_a or not set_b:
        return 0.0
    return len(set_a & set_b) / len(set_a | set_b)
53
+
54
+
55
def metadata_disambiguate(
    candidates: list[dict],
    query_metadata: dict,
    domain_bonus: float = 0.3,
    max_len: int = 8192,
    max_norm: float = 10.0,
) -> Stage4Result | None:
    """
    Stage 4 disambiguation using .eng metadata fields.

    Args:
        candidates: list of dicts from eng_index values.
            Each must have: cache_id, task_description,
            context_len (optional), l2_norm (optional),
            metadata dict with domain (optional).
        query_metadata: dict with same structure as one candidate.
        domain_bonus: score added for exact domain match (default 0.3).
        max_len: normalisation constant for context_len diff.
        max_norm: normalisation constant for l2_norm diff.

    Returns:
        Stage4Result for the highest meta-scoring candidate, or None
        if candidates list is empty. Ties keep the earliest candidate.
    """
    if not candidates:
        return None

    best: Stage4Result | None = None

    q_domain = (query_metadata.get('metadata') or {}).get('domain', '')
    q_len = float(query_metadata.get('context_len') or 512)
    q_norm = float(query_metadata.get('l2_norm') or 1.0)
    q_desc = (query_metadata.get('task_description') or '')[:80]

    for cand in candidates:
        c_domain = (cand.get('metadata') or {}).get('domain', '')
        c_len = float(cand.get('context_len') or 512)
        c_norm = float(cand.get('l2_norm') or 1.0)
        c_desc = (cand.get('task_description') or '')[:80]
        c_id = cand.get('cache_id', '')

        # BUG FIX: coerce to a real bool. `q_domain and c_domain and ...`
        # short-circuits to '' (a str) when either domain is empty, and
        # that str previously leaked into the bool-annotated
        # domain_matched field of Stage4Result.
        domain_match = bool(q_domain and c_domain and q_domain == c_domain)
        domain_score = domain_bonus if domain_match else 0.0
        # Length/norm scores decay linearly with the absolute difference,
        # clamped to [0, 1]; max(..., 1) guards against zero constants.
        len_score = 1.0 - min(abs(q_len - c_len) / max(max_len, 1), 1.0)
        norm_score = 1.0 - min(abs(q_norm - c_norm) / max(max_norm, 1), 1.0)
        kw_score = _keyword_overlap(q_desc, c_desc)
        meta_score = domain_score + len_score + norm_score + kw_score

        result = Stage4Result(
            doc_id=c_id,
            meta_score=meta_score,
            confidence='low-metadata',
            metadata_used=True,
            domain_matched=domain_match,
            score_breakdown={
                'domain': round(domain_score, 3),
                'len': round(len_score, 3),
                'norm': round(norm_score, 3),
                'kw': round(kw_score, 3),
            },
        )
        # Strict '>' so the earliest candidate wins ties.
        if best is None or meta_score > best.meta_score:
            best = result

    return best
kvcos/engram/reader.py ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ EIGENGRAM reader: .eng file -> IndexEntry + fingerprint vectors
3
+ """
4
+
5
+ from __future__ import annotations
6
+
7
+ from pathlib import Path
8
+
9
+ from .format import EigramDecoder
10
+ from kvcos.core.manifold_index import IndexEntry
11
+
12
+ _decoder = EigramDecoder()
13
+
14
+
15
def read_eigengram(path: str) -> dict:
    """Read a .eng file and return decoded fields."""
    eng_file = Path(path)
    if not eng_file.exists():
        raise FileNotFoundError(f"EIGENGRAM not found: {path}")
    return _decoder.decode(eng_file.read_bytes())
21
+
22
+
23
def load_eigengram_index(
    paths: list[str],
    fingerprint: str = "perdoc",
) -> tuple[list, list]:
    """Load multiple .eng files for ManifoldIndex.

    fingerprint: 'perdoc' (same-model) | 'fcdb' (cross-model)

    Returns (vecs, entries) ready for ManifoldIndex.add().
    """
    if fingerprint not in ("perdoc", "fcdb", "fourier"):
        raise ValueError(f"fingerprint must be 'perdoc', 'fcdb', or 'fourier', got '{fingerprint}'")

    vec_key = f"vec_{fingerprint}"
    vecs: list = []
    entries: list = []

    for eng_path in paths:
        record = read_eigengram(eng_path)
        vecs.append(record[vec_key])
        entry = IndexEntry(
            cache_id=record["cache_id"],
            task_description=record["task_description"],
            model_id=record["model_id"],
            created_at=record["created_at"],
            context_len=record["context_len"],
            l2_norm=record["l2_norm"],
        )
        entries.append(entry)

    return vecs, entries
kvcos/engram/retrieval.py ADDED
@@ -0,0 +1,513 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ ENGRAM constrained retrieval — apophatic negative constraint layer.
3
+
4
+ Implements constrained_retrieve() which penalizes candidates too
5
+ similar to known confusion partners, resolving dense-region failures.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ from dataclasses import dataclass, field
11
+
12
+ import torch
13
+ import torch.nn.functional as F
14
+
15
+
16
@dataclass
class CosineResult:
    """Single retrieval result."""

    doc_id: str               # identifier of the matched document
    score: float              # final ranking score (penalty-adjusted in constrained mode)
    cos_score: float          # raw cosine similarity, before any penalty
    margin: float = 0.0       # gap to the runner-up; set only on the top result
    constrained: bool = False # True when negative constraints were applied
25
+
26
+
27
@dataclass
class EngramQuery:
    """Query with optional negative constraints.

    like: Fingerprint to match (positive constraint).
    unlike: Fingerprints to avoid (negative constraints).
    min_margin: Minimum acceptable score gap.
    domain_hint: Optional domain tag (not consumed by the search
        functions in this module — presumably used elsewhere; verify).
    fingerprint: Which fingerprint field to use ('fourier', 'fcdb', 'perdoc').
    """

    like: torch.Tensor
    unlike: list[torch.Tensor] = field(default_factory=list)
    min_margin: float = 0.001
    domain_hint: str | None = None
    fingerprint: str = "fourier"
42
+
43
+
44
def cosine_search(
    query_fp: torch.Tensor,
    index: dict[str, torch.Tensor],
    top_k: int = 5,
) -> list[CosineResult]:
    """Standard unconstrained cosine similarity search.

    Normalizes the query and every indexed fingerprint, ranks by
    cosine similarity, and annotates the top result with its margin
    over the runner-up.
    """
    if not index:
        return []

    ids = list(index.keys())
    unit_query = F.normalize(query_fp.unsqueeze(0).float(), dim=-1)
    unit_docs = F.normalize(torch.stack([index[d] for d in ids]).float(), dim=-1)
    scores = (unit_query @ unit_docs.T).squeeze(0)

    k = min(top_k, len(ids))
    hits = []
    for idx in scores.topk(k).indices.tolist():
        value = float(scores[idx].item())
        hits.append(CosineResult(doc_id=ids[idx], score=value, cos_score=value))

    if len(hits) >= 2:
        hits[0].margin = hits[0].score - hits[1].score
    return hits
71
+
72
+
73
def constrained_retrieve(
    query: EngramQuery,
    index: dict[str, torch.Tensor],
    top_k: int = 5,
    neg_weight: float = 0.5,
    neg_threshold: float = 0.85,
) -> list[CosineResult]:
    """Retrieval with negative (apophatic) constraint layer.

    Penalizes candidates too similar to `unlike` fingerprints.
    In dense regions, this discriminates between docs that would
    otherwise have identical cosine scores.

    Algorithm:
        1. Compute cosine similarity to query (positive score)
        2. For each unlike fingerprint, compute sim to each candidate
        3. Subtract penalty: neg_weight * max(0, sim_to_unlike - threshold)
        4. Sort by adjusted score
    """
    if not index:
        return []

    ids = list(index.keys())
    unit_docs = F.normalize(torch.stack([index[d] for d in ids]).float(), dim=-1)
    unit_query = F.normalize(query.like.unsqueeze(0).float(), dim=-1)
    base_scores = (unit_query @ unit_docs.T).squeeze(0)

    # Start from the raw cosine scores and subtract one penalty term
    # per negative constraint (looping over an empty list is a no-op).
    penalized = base_scores.clone()
    for neg_fp in query.unlike:
        unit_neg = F.normalize(neg_fp.unsqueeze(0).float(), dim=-1)
        neg_sims = (unit_neg @ unit_docs.T).squeeze(0)
        penalized = penalized - neg_weight * torch.clamp(
            neg_sims - neg_threshold, min=0
        )

    k = min(top_k, len(ids))
    hits = []
    for idx in penalized.topk(k).indices.tolist():
        hits.append(
            CosineResult(
                doc_id=ids[idx],
                score=float(penalized[idx].item()),
                cos_score=float(base_scores[idx].item()),
                constrained=bool(query.unlike),
            )
        )

    if len(hits) >= 2:
        hits[0].margin = hits[0].score - hits[1].score
    return hits
123
+
124
+
125
+ # ── TWO-STAGE GEODESIC RETRIEVAL ──────────────────────────────────────
126
+
127
+ from enum import Enum
128
+
129
+
130
class RetrievalConfidence(Enum):
    """Confidence tier assigned by the geodesic retrieval pipeline."""
    HIGH = "high" # margin > 5x threshold, single pass sufficient
    MEDIUM = "medium" # margin > threshold, or resolved by stage-2
    LOW = "low" # margin < threshold after stage-2 — uncertain
134
+
135
+
136
@dataclass
class GeodesicResult:
    """Result from geodesic_retrieve() and its prior/stage-4 variants."""

    doc_id: str                 # winning document id ("" when no results)
    score: float                # score of the winning result
    margin: float               # gap to the runner-up at the deciding stage
    confidence: RetrievalConfidence
    stages_used: int = 1        # 1 = single pass, 2 = two-stage, 3 = constrained
                                # (0 = constraint preempted via priors)
    constraint_used: bool = False  # True when the negative-constraint layer ran
    stage4_used: bool = False      # True when metadata disambiguation ran
    stage4_doc_id: str = ""        # doc chosen by Stage 4 (if it ran)
148
+
149
+
150
def geodesic_retrieve(
    query_fp: torch.Tensor,
    hnsw_index,  # EngramIndex instance
    eng_index: dict,  # {doc_id: eng_data} for constraint layer
    margin_threshold: float = 0.005,
    correction_weight: float = 0.3,
    top_k: int = 5,
) -> GeodesicResult:
    """
    Two-stage geodesic retrieval with automatic confidence scoring.

    Stage 1: HNSW approximate nearest-neighbor search.
        If margin(top1, top2) >= margin_threshold -> HIGH or MEDIUM.
        Return immediately.

    Stage 2: Activated when margin < margin_threshold.
        Interpolate query fingerprint toward Stage-1 top-1 result.
        The interpolation weight (correction_weight=0.3) bends the
        geodesic toward the probable destination without assuming
        the Stage-1 answer is correct.
        If Stage-2 margin >= threshold -> MEDIUM confidence.
        If Stage-2 margin still < threshold -> LOW confidence.

    Stage 3: If confusion_flag is set on Stage-2 top result AND
        eng_index is provided -> activate negative constraint.
        Uses the confusion partner fingerprint as unlike constraint.

    Args:
        query_fp: [dim] query fingerprint (v2 recommended).
        hnsw_index: Built EngramIndex instance.
        eng_index: Dict {doc_id: eng_data} loaded from .eng files.
            Used for Stage-2 interpolation and Stage-3
            constraint layer. Pass empty dict {} to disable.
        margin_threshold: Minimum margin for MEDIUM confidence.
            Default 0.005 (below S3 mean margin of 0.009).
        correction_weight: Interpolation weight for Stage-2 trajectory
            correction. 0.3 = 30% pull toward top-1.
            Range: 0.1 (gentle) to 0.5 (aggressive).
        top_k: Candidates per search pass.

    Returns:
        GeodesicResult with doc_id, score, margin, confidence, stages_used.

    Usage:
        result = geodesic_retrieve(query_fp, idx, eng_index={})
        if result.confidence == RetrievalConfidence.LOW:
            # Flag for human review or return with uncertainty warning
            pass
    """
    # Stage 1: HNSW search
    s1_results = hnsw_index.search(query_fp, top_k=top_k)
    if len(s1_results) < 2:
        # Zero or one candidate: no margin can be computed.
        return GeodesicResult(
            doc_id=s1_results[0].doc_id if s1_results else "",
            score=s1_results[0].score if s1_results else 0.0,
            margin=0.0,
            confidence=RetrievalConfidence.LOW,
            stages_used=1,
        )

    s1_margin = s1_results[0].margin

    # High confidence: single pass sufficient
    if s1_margin >= margin_threshold * 5:
        return GeodesicResult(
            doc_id=s1_results[0].doc_id,
            score=s1_results[0].score,
            margin=s1_margin,
            confidence=RetrievalConfidence.HIGH,
            stages_used=1,
        )

    # Medium confidence: above threshold but not high
    if s1_margin >= margin_threshold:
        return GeodesicResult(
            doc_id=s1_results[0].doc_id,
            score=s1_results[0].score,
            margin=s1_margin,
            confidence=RetrievalConfidence.MEDIUM,
            stages_used=1,
        )

    # Stage 2: trajectory correction
    # Retrieve top-1 fingerprint from eng_index for interpolation
    top1_id = s1_results[0].doc_id
    top1_eng = eng_index.get(top1_id, {})
    top1_fp = top1_eng.get("vec_fourier_v2")
    if top1_fp is None:
        top1_fp = top1_eng.get("vec_fourier")

    if top1_fp is not None:
        # Bend geodesic toward Stage-1 top-1
        refined_fp = F.normalize(
            (1 - correction_weight) * query_fp.float()
            + correction_weight * top1_fp.float(),
            dim=-1,
        )
        s2_results = hnsw_index.search(refined_fp, top_k=top_k)
        if not s2_results:
            # BUG FIX: the original indexed s2_results[0] unguarded
            # below; an empty Stage-2 search would raise IndexError.
            # Fall back to the Stage-1 answer at LOW confidence.
            return GeodesicResult(
                doc_id=top1_id,
                score=s1_results[0].score,
                margin=s1_margin,
                confidence=RetrievalConfidence.LOW,
                stages_used=2,
            )
        s2_margin = s2_results[0].margin if len(s2_results) >= 2 else 0.0

        # Stage 3: check confusion_flag on Stage-2 top result
        s2_top_id = s2_results[0].doc_id
        s2_top_eng = eng_index.get(s2_top_id, {})

        if s2_top_eng.get("confusion_flag") and eng_index:
            # Activate negative constraint: find confusion partner fps.
            # v2 fingerprint preferred; fall back to v1.
            def _pick_fp(d: dict) -> torch.Tensor | None:
                v = d.get("vec_fourier_v2")
                return v if v is not None else d.get("vec_fourier")

            confusion_fps = [
                _pick_fp(d)
                for did, d in eng_index.items()
                if d.get("confusion_flag")
                and did != s2_top_id
                and _pick_fp(d) is not None
            ]
            if confusion_fps:
                # Build flat index for constrained_retrieve
                flat_index = {
                    did: _pick_fp(d)
                    for did, d in eng_index.items()
                    if _pick_fp(d) is not None
                }
                q_constrained = EngramQuery(
                    like=refined_fp,
                    unlike=confusion_fps[:3],  # top 3 confusion partners
                    min_margin=margin_threshold,
                )
                s3_results = constrained_retrieve(
                    q_constrained,
                    flat_index,
                )
                if s3_results:
                    s3_margin = s3_results[0].margin
                    s3_conf = (
                        RetrievalConfidence.MEDIUM
                        if s3_margin >= margin_threshold
                        else RetrievalConfidence.LOW
                    )
                    return GeodesicResult(
                        doc_id=s3_results[0].doc_id,
                        score=s3_results[0].score,
                        margin=s3_margin,
                        confidence=s3_conf,
                        stages_used=3,
                        constraint_used=True,
                    )

        if s2_margin >= margin_threshold:
            return GeodesicResult(
                doc_id=s2_top_id,
                score=s2_results[0].score,
                margin=s2_margin,
                confidence=RetrievalConfidence.MEDIUM,
                stages_used=2,
            )
        else:
            # Both stages low margin — return LOW confidence
            return GeodesicResult(
                doc_id=s2_top_id,
                score=s2_results[0].score,
                margin=s2_margin,
                confidence=RetrievalConfidence.LOW,
                stages_used=2,
            )
    else:
        # No vector for interpolation — return Stage-1 with LOW confidence
        return GeodesicResult(
            doc_id=top1_id,
            score=s1_results[0].score,
            margin=s1_margin,
            confidence=RetrievalConfidence.LOW,
            stages_used=1,
        )
325
+
326
+
327
+
328
def geodesic_retrieve_with_prior(
    query_fp: torch.Tensor,
    hnsw_index,
    eng_index: dict,
    index_c=None,
    query_doc_id: str | None = None,
    margin_threshold: float = 0.005,
    correction_weight: float = 0.3,
    top_k: int = 5,
) -> GeodesicResult:
    """
    Prior-aware geodesic retrieval. Uses IndexC history to pre-apply
    constraints on known chronic failures — skipping Stages 1 and 2.

    When index_c and query_doc_id are provided:
      - If doc is a chronic failure: apply Stage 3 (constraint) immediately.
        This avoids 2 wasted HNSW passes before getting to the constraint.
      - If doc has prior LOW history (not yet chronic): tighten (raise)
        the margin threshold in proportion to the LOW fraction.
      - If no prior: standard 3-stage geodesic_retrieve().

    Args:
        query_fp: [dim] query fingerprint.
        hnsw_index: Built EngramIndex instance.
        eng_index: {doc_id: eng_data} from .eng files.
        index_c: IndexC instance, or None to disable prior mode.
        query_doc_id: doc_id being queried (for prior lookup).
        margin_threshold: Base margin threshold. Raised if prior is LOW.
        correction_weight: Stage-2 interpolation weight.
        top_k: Candidates per HNSW pass.

    Returns:
        GeodesicResult. For chronic failures: stages_used=0 (preempted).
    """
    if index_c is None or query_doc_id is None:
        return geodesic_retrieve(
            query_fp, hnsw_index, eng_index,
            margin_threshold=margin_threshold,
            correction_weight=correction_weight,
            top_k=top_k,
        )

    prior = index_c.prior(query_doc_id)

    # Preemptive mode: known chronic failure
    if prior.is_chronic_failure and prior.n_total >= 2:
        # Partners come from both sides of each registered confusion pair.
        pairs = index_c.confusion_registry(min_confusions=1)
        partners = [
            p.doc_b for p in pairs if p.doc_a == query_doc_id
        ] + [
            p.doc_a for p in pairs if p.doc_b == query_doc_id
        ]

        # Collect up to 3 partner fingerprints (v2 preferred, v1 fallback).
        unlike_fps = []
        for partner_id in partners[:3]:
            partner_eng = eng_index.get(partner_id, {})
            fp = partner_eng.get("vec_fourier_v2")
            if fp is None:
                fp = partner_eng.get("vec_fourier")
            if fp is not None:
                unlike_fps.append(fp)

        if unlike_fps:
            # Flat {doc_id: fingerprint} view for constrained_retrieve().
            flat_index: dict[str, torch.Tensor] = {}
            for did, d in eng_index.items():
                v = d.get("vec_fourier_v2")
                if v is None:
                    v = d.get("vec_fourier")
                if v is not None:
                    flat_index[did] = v

            q_constrained = EngramQuery(
                like=query_fp,
                unlike=unlike_fps,
                min_margin=margin_threshold,
            )
            s3 = constrained_retrieve(q_constrained, flat_index)
            if s3:
                s3_margin = s3[0].margin
                return GeodesicResult(
                    doc_id=s3[0].doc_id,
                    score=s3[0].score,
                    margin=s3_margin,
                    confidence=(
                        RetrievalConfidence.MEDIUM
                        if s3_margin >= margin_threshold
                        else RetrievalConfidence.LOW
                    ),
                    stages_used=0,
                    constraint_used=True,
                )

    # Prior LOW but not yet chronic: tighten threshold
    # (a 100% LOW history doubles the required margin).
    if prior.n_low > 0 and prior.n_total > 0:
        low_frac = prior.n_low / prior.n_total
        margin_threshold = margin_threshold * (1 + low_frac)

    return geodesic_retrieve(
        query_fp, hnsw_index, eng_index,
        margin_threshold=margin_threshold,
        correction_weight=correction_weight,
        top_k=top_k,
    )
430
+
431
+
432
def geodesic_retrieve_stage4(
    query_fp: torch.Tensor,
    hnsw_index,
    eng_index: dict,
    query_metadata: dict | None = None,
    index_c=None,
    query_doc_id: str | None = None,
    margin_threshold: float = 0.005,
    correction_weight: float = 0.3,
    top_k: int = 5,
) -> GeodesicResult:
    """
    Full pipeline: prior-aware geodesic retrieval with Stage 4 fallback.

    Extends geodesic_retrieve_with_prior() with a Stage 4 metadata
    disambiguation layer. When confidence is LOW and query_metadata
    is provided, calls metadata_disambiguate() on top candidates
    from the last HNSW pass before giving up.

    Confidence tier:
        HIGH -> fingerprint, 0% error rate target
        MEDIUM -> fingerprint, 0% error rate target
        LOW -> fingerprint failed, Stage 4 unavailable
        low-metadata -> fingerprint failed, Stage 4 used secondary signal

    Args:
        query_fp: [dim] query fingerprint (vec_fourier_v2).
        hnsw_index: Built EngramIndex.
        eng_index: {doc_id: eng_data} from .eng files.
        query_metadata: dict with task_description, context_len, l2_norm,
            metadata.domain -- from the query doc's .eng data.
            If None, Stage 4 is disabled.
        index_c: IndexC instance for prior lookup.
        query_doc_id: doc_id of the query source (for priors).
        margin_threshold, correction_weight, top_k: as in base function.
    """
    # Local import avoids a circular dependency at module load time.
    from kvcos.engram.metadata_disambiguate import metadata_disambiguate

    base = geodesic_retrieve_with_prior(
        query_fp, hnsw_index, eng_index,
        index_c=index_c,
        query_doc_id=query_doc_id,
        margin_threshold=margin_threshold,
        correction_weight=correction_weight,
        top_k=top_k,
    )

    # Only activate Stage 4 on LOW confidence with metadata available
    if base.confidence != RetrievalConfidence.LOW:
        return base
    if query_metadata is None:
        return base

    # Get top candidates from HNSW for metadata scoring.
    # BUG FIX: honor the caller's top_k — this search previously
    # hard-coded top_k=5, silently ignoring the parameter.
    candidates_hnsw = hnsw_index.search(query_fp, top_k=top_k)
    candidates_meta = [
        eng_index[r.doc_id]
        for r in candidates_hnsw
        if r.doc_id in eng_index
    ]

    if not candidates_meta:
        return base

    s4 = metadata_disambiguate(candidates_meta, query_metadata)
    if s4 is None:
        return base

    # Note: confidence stays LOW -- Stage 4 is a tiebreaker, not a promotion.
    # Callers should check stage4_used=True to distinguish "failed silently"
    # from "failed with secondary signal". The confidence tier string
    # 'low-metadata' is available via Stage4Result for logging.
    return GeodesicResult(
        doc_id=s4.doc_id,
        score=base.score,
        margin=base.margin,
        confidence=RetrievalConfidence.LOW,  # still LOW
        stages_used=base.stages_used,
        constraint_used=base.constraint_used,
        stage4_used=True,
        stage4_doc_id=s4.doc_id,
    )
kvcos/engram/session_propagator.py ADDED
@@ -0,0 +1,159 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ kvcos/engram/session_propagator.py — Session start/end for ENGRAM.
3
+
4
+ Bridges geodesic_retrieve() results and IndexC persistence.
5
+ Call session_start() at the top of each session to load priors.
6
+ Call session_end() at the bottom to persist results.
7
+ """
8
+
9
+ from __future__ import annotations
10
+
11
+ import time
12
+ from dataclasses import dataclass
13
+ from pathlib import Path
14
+
15
+ from kvcos.engram.index_c import IndexC, DocPrior
16
+
17
+
18
@dataclass
class SessionSummary:
    """Aggregate outcome of one retrieval session, returned by session_end()."""

    session_id: str                              # id passed to SessionPropagator.__init__
    n_total: int                                 # number of buffered retrieval records
    n_correct: int                               # records flagged correct=True
    n_high: int                                  # records with confidence == "high"
    n_medium: int                                # records with confidence == "medium"
    n_low: int                                   # records with confidence == "low"
    n_preempted: int                             # records with stages_used == 0
    new_confusion_pairs: list[tuple[str, str]]   # confusion pairs first seen this session
    recall: float                                # n_correct / n_total (0.0 when n_total == 0)
    duration_s: float                            # wall-clock seconds since session_start()
30
+
31
+
32
class SessionPropagator:
    """
    Session-scoped buffer for IndexC writes.

    Retrieval outcomes are collected in memory via record() during the
    session and flushed to IndexC in a single pass by session_end().
    """

    def __init__(self, db_path: str, session_id: str):
        self._db_path = str(db_path)
        self._session_id = session_id
        self._ic: IndexC | None = None
        self._records: list[dict] = []
        self._start_ts: float = 0.0
        self._started: bool = False

    def session_start(self) -> dict[str, DocPrior]:
        """
        Open IndexC and return {doc_id: DocPrior} for every known doc.
        Call once at the top of each session.
        """
        self._ic = IndexC.open(self._db_path)
        self._start_ts = time.time()
        self._started = True

        # One prior per doc currently present in the reliability map.
        priors: dict[str, DocPrior] = {}
        for doc_id in self._ic.reliability_map():
            priors[doc_id] = self._ic.prior(doc_id)
        return priors

    @property
    def index_c(self) -> IndexC:
        """The live IndexC handle; only valid between start and end."""
        if self._ic is None:
            raise RuntimeError("Call session_start() first.")
        return self._ic

    def record(
        self,
        query_doc_id: str,
        result_doc_id: str,
        confidence: str,
        margin: float,
        stages_used: int = 1,
        constraint_used: bool = False,
        correct: bool = True,
    ) -> None:
        """Buffer one retrieval result for this session."""
        entry = {
            "query_doc_id": query_doc_id,
            "result_doc_id": result_doc_id,
            "confidence": confidence,
            "margin": float(margin),
            "stages_used": int(stages_used),
            "constraint_used": bool(constraint_used),
            "correct": bool(correct),
            "ts": time.time(),
        }
        self._records.append(entry)

    def session_end(self) -> SessionSummary:
        """Flush all buffered records to IndexC and return a summary."""
        if not self._started or self._ic is None:
            raise RuntimeError("Call session_start() before session_end().")

        def _pairs() -> set[tuple[str, str]]:
            # Snapshot of the confusion registry as (doc_a, doc_b) tuples.
            return {
                (p.doc_a, p.doc_b)
                for p in self._ic.confusion_registry(min_confusions=1)
            }

        pairs_before = _pairs()

        for entry in self._records:
            self._ic.record(
                session_id=self._session_id,
                query_doc_id=entry["query_doc_id"],
                result_doc_id=entry["result_doc_id"],
                confidence=entry["confidence"],
                margin=entry["margin"],
                stages_used=entry["stages_used"],
                constraint_used=entry["constraint_used"],
                correct=entry["correct"],
                ts=entry["ts"],
            )

        # Pairs that appeared only after this session's writes.
        fresh_pairs = list(_pairs() - pairs_before)

        total = len(self._records)
        correct_n = sum(1 for e in self._records if e["correct"])
        tier_counts = {"high": 0, "medium": 0, "low": 0}
        preempted = 0
        for e in self._records:
            tier_counts[e["confidence"]] = tier_counts.get(e["confidence"], 0) + 1
            if e["stages_used"] == 0:
                preempted += 1

        summary = SessionSummary(
            session_id=self._session_id,
            n_total=total,
            n_correct=correct_n,
            n_high=tier_counts["high"],
            n_medium=tier_counts["medium"],
            n_low=tier_counts["low"],
            n_preempted=preempted,
            new_confusion_pairs=fresh_pairs,
            recall=correct_n / total if total > 0 else 0.0,
            duration_s=time.time() - self._start_ts,
        )

        # Close and reset so the propagator can host a fresh session.
        self._ic.close()
        self._ic = None
        self._started = False
        self._records = []

        return summary

    def summary_str(self, s: SessionSummary) -> str:
        """One-line human-readable rendering of a SessionSummary."""
        parts = [
            f"{s.n_total} retrievals",
            f"recall={s.recall:.1%}",
            f"H={s.n_high}/M={s.n_medium}/L={s.n_low}",
            f"preempted={s.n_preempted}",
            f"new_pairs={len(s.new_confusion_pairs)}",
            f"{s.duration_s:.1f}s",
        ]
        return f"Session {s.session_id}: " + " | ".join(parts)
kvcos/engram/writer.py ADDED
@@ -0,0 +1,134 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ EIGENGRAM writer: text + model -> .eng file
3
+ """
4
+
5
+ from __future__ import annotations
6
+
7
+ import gc
8
+ import hashlib
9
+ import os
10
+ from pathlib import Path
11
+
12
+ import torch
13
+ from llama_cpp import Llama
14
+
15
+ from kvcos.core.blob_parser import parse_state_blob
16
+ from .format import EigramEncoder
17
+
18
+ _encoder = EigramEncoder()
19
+
20
+ _DEFAULT_LR = (8, 24)
21
+ _DEFAULT_GATE = 6
22
+ _DEFAULT_RANK = 116
23
+
24
+
25
+ def _get_model_id(model_path: str) -> str:
26
+ name = os.path.basename(model_path)
27
+ if "3B" in name or "3b" in name:
28
+ return "Llama-3.2-3B"
29
+ if "8B" in name or "8b" in name:
30
+ return "Llama-3.1-8B"
31
+ return name[:15]
32
+
33
+
34
+ def _corpus_hash(basis_path: str) -> str:
35
+ raw = Path(basis_path).read_bytes()
36
+ return hashlib.sha256(raw).hexdigest()[:32]
37
+
38
+
39
def write_eigengram(
    model_path: str,
    text: str,
    output_path: str,
    cache_id: str = "",
    task_description: str = "",
    layer_range: tuple[int, int] = _DEFAULT_LR,
    gate: int = _DEFAULT_GATE,
    rank_perdoc: int = _DEFAULT_RANK,
    basis_path: str = "results/corpus_basis_fcdb_v2.pt",
) -> dict:
    """Encode a document as an EIGENGRAM (.eng) file.

    Pipeline: evaluate `text` through the GGUF model, snapshot the
    llama.cpp KV-cache state, pool key vectors from `layer_range`, derive
    two unit fingerprints (per-doc SVD projection and corpus FCDB
    projection), and serialize them via EigramEncoder.

    Args:
        model_path: path to the GGUF model file.
        text: document text to fingerprint (stripped before evaluation).
        output_path: destination .eng file; parent dirs are created.
        cache_id: optional identifier stored in the certificate.
        task_description: optional description; defaults to text[:100].
        layer_range: (l0, l1) half-open slice of layers whose keys are pooled.
        gate: number of leading singular directions skipped for the
            per-doc fingerprint.
        rank_perdoc: number of singular directions kept after the gate.
        basis_path: torch checkpoint holding keys "basis", "joint_center",
            and "n_docs" (the shared FCDB corpus basis).

    Returns:
        dict summarizing the written file: output_path, model_id,
        corpus_hash, basis_rank, n_corpus, file_size_bytes, scs, l2_norm,
        layer_range.
    """
    # NOTE(review): weights_only=False unpickles arbitrary objects — only
    # load basis checkpoints from trusted sources.
    saved = torch.load(basis_path, weights_only=False)
    P_fcdb = saved["basis"]          # projection matrix; rows = basis_rank
    center = saved["joint_center"]   # corpus center vector
    n_corpus = int(saved["n_docs"])
    basis_rank = P_fcdb.shape[0]

    # Read attention geometry from GGUF metadata; string defaults match an
    # 8B-class Llama if the keys are absent.
    llm = Llama(model_path=model_path, n_ctx=2048, n_gpu_layers=-1, verbose=False)
    meta = llm.metadata
    n_kv = int(meta.get("llama.attention.head_count_kv", "8"))
    hd = int(meta.get("llama.embedding_length", "4096")) // int(
        meta.get("llama.attention.head_count", "32")
    )

    # Force a forward pass (a single generated token suffices), snapshot
    # the KV state, then free the model before the heavy tensor work.
    llm.reset()
    llm(text.strip(), max_tokens=1, temperature=0.0)
    state_bytes = bytes(llm.save_state().llama_state)
    del llm
    gc.collect()

    p = parse_state_blob(state_bytes, n_kv_heads=n_kv, head_dim=hd)
    l0, l1 = layer_range
    # Pool the selected layers' keys into rows of width head_dim.
    k = p.keys[l0:l1].float().reshape(-1, hd)
    mean_v = k.mean(0)
    l2_norm = float(mean_v.norm().item())

    # Per-doc SVD fingerprint: cap SVD input at 8192 rows via a seeded
    # permutation (deterministic across runs), then project onto singular
    # directions [gate, gate + rank_perdoc) and average.
    if k.shape[0] > 8192:
        gen = torch.Generator()
        gen.manual_seed(42)
        idx = torch.randperm(k.shape[0], generator=gen)[:8192]
        svd_input = k[idx]
    else:
        svd_input = k
    _, _, Vh = torch.linalg.svd(svd_input, full_matrices=False)
    proj = (svd_input @ Vh[gate : gate + rank_perdoc].T).mean(0)
    vec_perdoc = proj / (proj.norm() + 1e-8)

    # FCDB fingerprint: unit direction of the mean key relative to the
    # corpus center, projected onto the shared basis and re-normalized.
    delta = mean_v - center
    delta = delta / (delta.norm() + 1e-8)
    vec_fcdb = delta @ P_fcdb.T
    vec_fcdb = vec_fcdb / (vec_fcdb.norm() + 1e-8)

    # SCS: energy of delta's image under P_fcdb^T P_fcdb relative to
    # delta's energy (the subspace-captured fraction, assuming the basis
    # rows are orthonormal — TODO confirm against basis construction).
    scs = float(
        ((delta @ P_fcdb.T @ P_fcdb) ** 2).sum().item()
        / ((delta**2).sum().item() + 1e-12)
    )

    corpus_h = _corpus_hash(basis_path)
    model_id = _get_model_id(model_path)

    cert = _encoder.encode(
        vec_perdoc=vec_perdoc,
        vec_fcdb=vec_fcdb,
        joint_center=center,
        corpus_hash=corpus_h,
        model_id=model_id,
        basis_rank=basis_rank,
        n_corpus=n_corpus,
        layer_range=layer_range,
        context_len=int(k.shape[0]),
        l2_norm=l2_norm,
        scs=scs,
        margin_proof=0.0,  # not computed at write time
        task_description=task_description or text[:100],
        cache_id=cache_id or "",
    )

    os.makedirs(os.path.dirname(output_path) or ".", exist_ok=True)
    with open(output_path, "wb") as f:
        f.write(cert)

    return {
        "output_path": output_path,
        "model_id": model_id,
        "corpus_hash": corpus_h,
        "basis_rank": basis_rank,
        "n_corpus": n_corpus,
        "file_size_bytes": len(cert),
        "scs": round(scs, 4),
        "l2_norm": round(l2_norm, 4),
        "layer_range": layer_range,
    }
kvcos/mar/__init__.py ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """ENGRAM Protocol — MAR (Manifold Attention Retrieval)
2
+
3
+ Backward compatibility re-exports. All classes have moved to kvcos.core.
4
+ Import from kvcos.core directly for new code.
5
+ """
6
+
7
+ from kvcos.core.manifold_index import IndexEntry, ManifoldIndex
8
+ from kvcos.core.retriever import EGRRetriever, RetrievalResponse, RetrievalResult
9
+ from kvcos.core.state_extractor import ExtractionResult, MARStateExtractor, SVDProjection
10
+
11
+ __all__ = [
12
+ "IndexEntry",
13
+ "ManifoldIndex",
14
+ "EGRRetriever",
15
+ "RetrievalResponse",
16
+ "RetrievalResult",
17
+ "ExtractionResult",
18
+ "MARStateExtractor",
19
+ "SVDProjection",
20
+ ]
kvcos/storage/__init__.py ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ """ENGRAM Protocol — Storage backends for .eng files."""
2
+
3
+ from kvcos.storage.backends import StorageBackend
4
+ from kvcos.storage.local import LocalStorageBackend
5
+
6
+ __all__ = [
7
+ "StorageBackend",
8
+ "LocalStorageBackend",
9
+ ]
kvcos/storage/backends.py ADDED
@@ -0,0 +1,71 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ ENGRAM Protocol — Abstract Storage Backend
3
+
4
+
5
+ All storage backends (local, redis, S3) implement this interface.
6
+ Phase 1 uses local disk only.
7
+ """
8
+
9
+ from __future__ import annotations
10
+
11
+ from abc import ABC, abstractmethod
12
+ from pathlib import Path
13
+
14
+ from kvcos.core.types import CacheStats, EngramMetadata
15
+
16
+
17
class StorageBackend(ABC):
    """Abstract interface for engram storage backends.

    Implemented by LocalStorageBackend in Phase 1; other backends (redis,
    S3) plug in behind the same interface later. All operations are
    synchronous in Phase 1.
    """

    @abstractmethod
    def store(self, cache_id: str, data: bytes, metadata: EngramMetadata) -> str:
        """Store a .eng file from raw bytes. Returns storage path/key."""
        ...

    @abstractmethod
    def store_file(self, cache_id: str, source_path: Path, metadata: EngramMetadata) -> str:
        """Store a .eng file from a local path (zero-copy when possible)."""
        ...

    @abstractmethod
    def get(self, cache_id: str) -> bytes | None:
        """Retrieve a .eng file as bytes. None if not found."""
        ...

    @abstractmethod
    def get_path(self, cache_id: str) -> Path | None:
        """Get local filesystem path for a cache entry. None if not found."""
        ...

    @abstractmethod
    def get_metadata(self, cache_id: str) -> EngramMetadata | None:
        """Read only metadata (header-only, no tensor data loaded)."""
        ...

    @abstractmethod
    def delete(self, cache_id: str) -> bool:
        """Delete a cache entry. Returns True if something was deleted."""
        ...

    @abstractmethod
    def list_entries(
        self,
        agent_id: str | None = None,
        model_family: str | None = None,
        limit: int = 100,
    ) -> list[EngramMetadata]:
        """List up to `limit` cache entries, optionally filtered by
        agent_id and/or model_family."""
        ...

    @abstractmethod
    def exists(self, cache_id: str) -> bool:
        """Check if a cache entry exists."""
        ...

    @abstractmethod
    def stats(self) -> CacheStats:
        """Get aggregate statistics for the store."""
        ...
kvcos/storage/local.py ADDED
@@ -0,0 +1,202 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ ENGRAM Protocol — Local Disk Storage Backend
3
+
4
+
5
+ Directory layout:
6
+ {data_dir}/{model_family}/{agent_id}/{date}/{cache_id}.eng
7
+
8
+ Phase 1 production backend. Zero infrastructure dependencies.
9
+ Uses safetensors header-only read for metadata operations.
10
+ D7: One safetensors file per 256-token block.
11
+ """
12
+
13
+ from __future__ import annotations
14
+
15
+ import logging
16
+ import shutil
17
+ from collections import defaultdict
18
+ from datetime import datetime, timezone
19
+ from pathlib import Path
20
+
21
+ logger = logging.getLogger(__name__)
22
+
23
+ from kvcos.core.serializer import EngramSerializer
24
+ from kvcos.core.types import ENG_FILE_EXTENSION, CacheStats, EngramMetadata
25
+ from kvcos.storage.backends import StorageBackend
26
+
27
+
28
class LocalStorageBackend(StorageBackend):
    """Local filesystem storage for .eng files.

    Layout: {data_dir}/{model_family}/{agent_id}/{YYYY-MM-DD}/{cache_id}.eng
    An in-memory {cache_id: Path} index is rebuilt at construction time.
    """

    def __init__(self, data_dir: Path):
        self.data_dir = data_dir
        self.data_dir.mkdir(parents=True, exist_ok=True)
        self._serializer = EngramSerializer()
        self._index: dict[str, Path] = {}  # cache_id → file path
        self._rebuild_index()

    def _rebuild_index(self) -> None:
        """Scan data directory and rebuild the in-memory path index."""
        self._index.clear()
        for eng_file in self.data_dir.rglob(f"*{ENG_FILE_EXTENSION}"):
            cache_id = eng_file.stem
            try:
                meta = self._serializer.read_metadata_only(eng_file)
                if "cache_id" in meta:
                    cache_id = meta["cache_id"]
            except Exception as e:
                # Fall back to the filename stem when the header is unreadable.
                logger.debug("Skipping metadata for %s: %s", eng_file.name, e)
            self._index[cache_id] = eng_file

    def _resolve_path(self, metadata: EngramMetadata) -> Path:
        """Determine storage path from metadata; creates parent dirs."""
        model_family = metadata.get("model_family", "unknown")
        agent_id = metadata.get("agent_id", "default")
        date_str = datetime.now(timezone.utc).strftime("%Y-%m-%d")
        cache_id = metadata.get("cache_id", "unknown")
        path = self.data_dir / model_family / agent_id / date_str / f"{cache_id}{ENG_FILE_EXTENSION}"
        path.parent.mkdir(parents=True, exist_ok=True)
        return path

    def store(self, cache_id: str, data: bytes, metadata: EngramMetadata) -> str:
        """Store raw .eng bytes atomically. Returns the final file path."""
        metadata_copy = dict(metadata)
        metadata_copy["cache_id"] = cache_id
        path = self._resolve_path(metadata_copy)  # type: ignore[arg-type]

        # Write-then-replace for atomicity. Path.replace (unlike rename)
        # also overwrites an existing destination on Windows.
        tmp_path = path.with_name(path.name + ".tmp")
        try:
            tmp_path.write_bytes(data)
            tmp_path.replace(path)
        except Exception:
            tmp_path.unlink(missing_ok=True)
            raise

        self._index[cache_id] = path
        return str(path)

    def store_file(self, cache_id: str, source_path: Path, metadata: EngramMetadata) -> str:
        """Copy a local .eng file into the store atomically."""
        metadata_copy = dict(metadata)
        metadata_copy["cache_id"] = cache_id
        dest_path = self._resolve_path(metadata_copy)  # type: ignore[arg-type]

        # Source already in place: just (re)register it.
        if source_path == dest_path:
            self._index[cache_id] = dest_path
            return str(dest_path)

        # Copy-then-replace; replace() overwrites cross-platform.
        tmp_path = dest_path.with_name(dest_path.name + ".tmp")
        try:
            shutil.copy2(str(source_path), str(tmp_path))
            tmp_path.replace(dest_path)
        except Exception:
            tmp_path.unlink(missing_ok=True)
            raise

        self._index[cache_id] = dest_path
        return str(dest_path)

    def get(self, cache_id: str) -> bytes | None:
        """Full file contents, or None if unknown/deleted on disk."""
        path = self._index.get(cache_id)
        if path is None or not path.exists():
            return None
        return path.read_bytes()

    def get_path(self, cache_id: str) -> Path | None:
        """Filesystem path for an entry, or None if unknown/deleted."""
        path = self._index.get(cache_id)
        if path is None or not path.exists():
            return None
        return path

    def get_metadata(self, cache_id: str) -> EngramMetadata | None:
        """Header-only metadata read; None on missing or unreadable file."""
        path = self._index.get(cache_id)
        if path is None or not path.exists():
            return None
        try:
            return self._serializer.read_metadata_only(path)
        except Exception as e:
            logger.warning("Failed to read metadata for %s: %s", cache_id, e)
            return None

    def delete(self, cache_id: str) -> bool:
        """Delete an entry and prune now-empty parent directories."""
        path = self._index.pop(cache_id, None)
        if path is None or not path.exists():
            return False

        path.unlink()

        # Walk upward removing empty dirs, stopping at data_dir.
        parent = path.parent
        try:
            while parent != self.data_dir:
                if not any(parent.iterdir()):
                    parent.rmdir()
                    parent = parent.parent
                else:
                    break
        except OSError:
            # Best-effort cleanup; a non-removable dir is not an error.
            pass

        return True

    def list_entries(
        self,
        agent_id: str | None = None,
        model_family: str | None = None,
        limit: int = 100,
    ) -> list[EngramMetadata]:
        """List entries newest-first, optionally filtered.

        Fix: the limit was previously applied before sorting, so callers
        got an arbitrary subset rather than the most recent entries. All
        matches are now collected, sorted by created_at descending, and
        only then truncated to `limit`.
        """
        results: list[EngramMetadata] = []

        for cache_id, path in self._index.items():
            if not path.exists():
                continue
            try:
                meta = self._serializer.read_metadata_only(path)
            except Exception as e:
                logger.debug("Skipping %s in list_entries: %s", cache_id, e)
                continue
            if agent_id and meta.get("agent_id") != agent_id:
                continue
            if model_family and meta.get("model_family") != model_family:
                continue
            results.append(meta)

        results.sort(key=lambda m: m.get("created_at", ""), reverse=True)
        return results[:limit]

    def exists(self, cache_id: str) -> bool:
        """True if the entry is indexed and still present on disk."""
        path = self._index.get(cache_id)
        return path is not None and path.exists()

    def stats(self) -> CacheStats:
        """Aggregate entry count, total bytes, and per-model breakdown."""
        total_entries = 0
        total_size = 0
        model_counts: dict[str, int] = defaultdict(int)

        for cache_id, path in self._index.items():
            if not path.exists():
                continue
            total_entries += 1
            total_size += path.stat().st_size
            try:
                meta = self._serializer.read_metadata_only(path)
                model_counts[meta.get("model_family", "unknown")] += 1
            except Exception as e:
                logger.debug("Metadata read failed for %s: %s", cache_id, e)
                model_counts["unknown"] += 1

        return CacheStats(
            total_entries=total_entries,
            total_size_bytes=total_size,
            avg_compression_ratio=0.0,
            model_breakdown=dict(model_counts),
        )

    def vacuum(self) -> int:
        """Remove stale index entries for deleted files. Returns count removed."""
        stale = [cid for cid, path in self._index.items() if not path.exists()]
        for cid in stale:
            del self._index[cid]
        return len(stale)
+ return len(stale)