# File size: 5,976 Bytes

# 0769ff3

"""
ENGRAM Protocol — Model Architecture Registry


Contains ModelCacheSpec definitions for known models and utilities
to look up specs by model_id or infer model family from string.

D3: extraction_layers set to middle-to-deep (8-31 for 32-layer models)
per ShadowKV validation. Early layers (0-7) and final layer preserved.
"""

from __future__ import annotations

from kvcos.core.types import AttentionType, CacheSection, ModelCacheSpec

# ── Pre-registered Model Specs ────────────────────────────────────────────────

# Llama 3.1 8B Instruct — Primary Phase 1 target (D1, D6)
# GQA: 32 query heads, 8 KV heads (4 query heads per KV head), head_dim 128
LLAMA_3_1_8B = ModelCacheSpec(
    model_id="meta-llama/Llama-3.1-8B-Instruct",  # also the key used in _REGISTRY
    model_family="llama",
    n_layers=32,
    n_heads=32,
    n_kv_heads=8,
    head_dim=128,
    rope_enabled=True,
    # Layers 8-31 (D3): 24 layers, starting at 32 // 4 = 8 — the same start
    # index make_spec_from_metadata() would compute for a 32-layer model.
    extraction_layers=tuple(range(8, 32)),  # layers 8-31 (D3)
)

# Llama 3.1 8B base (non-instruct)
# Every field except model_id is identical to LLAMA_3_1_8B; it is registered
# separately because get_model_spec() matches model_id exactly.
LLAMA_3_1_8B_BASE = ModelCacheSpec(
    model_id="meta-llama/Llama-3.1-8B",
    model_family="llama",
    n_layers=32,
    n_heads=32,
    n_kv_heads=8,
    head_dim=128,
    rope_enabled=True,
    extraction_layers=tuple(range(8, 32)),  # layers 8-31 (D3)
)

# Phi-3-Mini-128K — Secondary Phase 1 target
# ShadowKV validated SVD on this model (D3)
# MHA: 32 query heads, 32 KV heads (no GQA), head_dim 96
PHI_3_MINI = ModelCacheSpec(
    model_id="microsoft/Phi-3-mini-128k-instruct",
    model_family="phi",
    n_layers=32,
    n_heads=32,
    n_kv_heads=32,  # Phi-3-Mini uses MHA, not GQA: n_kv_heads equals n_heads
    head_dim=96,
    rope_enabled=True,
    extraction_layers=tuple(range(8, 32)),  # layers 8-31, start = 32 // 4
)

# Gemma 2 2B — NOTE: QK-Norm model, SVD behavior may differ (T3 caveat)
# 8 query heads over 4 KV heads (2 query heads per KV head), head_dim 256
GEMMA_2_2B = ModelCacheSpec(
    model_id="google/gemma-2-2b-it",
    model_family="gemma",
    n_layers=26,
    n_heads=8,
    n_kv_heads=4,
    head_dim=256,
    rope_enabled=True,
    extraction_layers=tuple(range(6, 26)),  # layers 6-25, start = 26 // 4
)

# Qwen 2.5 7B Instruct
# 28 query heads over 4 KV heads (7 query heads per KV head), head_dim 128
QWEN_2_5_7B = ModelCacheSpec(
    model_id="Qwen/Qwen2.5-7B-Instruct",
    model_family="qwen",
    n_layers=28,
    n_heads=28,
    n_kv_heads=4,
    head_dim=128,
    rope_enabled=True,
    extraction_layers=tuple(range(7, 28)),  # layers 7-27, start = 28 // 4
)

# Mistral 7B Instruct v0.3
# Same layer/head/head_dim values as the Llama 3.1 8B specs in this file
# (32 layers, 32 query heads, 8 KV heads, head_dim 128).
MISTRAL_7B = ModelCacheSpec(
    model_id="mistralai/Mistral-7B-Instruct-v0.3",
    model_family="mistral",
    n_layers=32,
    n_heads=32,
    n_kv_heads=8,
    head_dim=128,
    rope_enabled=True,
    extraction_layers=tuple(range(8, 32)),  # layers 8-31, start = 32 // 4
)


# Gemma 4 26B-A4B — ISWA model (Interleaved Sliding Window Attention)
# Dual KV cache: Global (full context) + SWA (sliding window 1024 tokens)
# MoE: 128 experts, 8 active — does NOT affect KV cache (FFN-only)
# Reverse-engineered from llama.cpp b5200+ state blob format.
#
# This is the only pre-registered spec that sets cache_sections, so it is the
# only one for which is_iswa_spec() returns True.
#
# NOTE(review): validate_kv_shape() compares only the top-level n_layers /
# n_kv_heads / head_dim below, i.e. the SWA ("dominant") values; tensors from
# the global section (2 KV heads, head_dim 512) would not match — confirm
# callers validate per section where needed.
GEMMA_4_26B_A4B = ModelCacheSpec(
    model_id="google/gemma-4-26b-a4b-it",
    model_family="gemma",
    n_layers=30,   # total: 5 global + 25 SWA (matches the two sections below)
    n_heads=32,
    n_kv_heads=8,  # dominant section (SWA)
    head_dim=256,  # dominant section (SWA)
    rope_enabled=True,
    # NOTE(review): starts at layer 8, whereas the n_layers // 4 rule used by
    # make_spec_from_metadata() would give 30 // 4 = 7 — confirm intentional.
    extraction_layers=tuple(range(8, 30)),
    cache_sections=(
        # Global (full-context) section; no window_size is given.
        CacheSection(
            attention_type=AttentionType.FULL,
            n_layers=5,
            n_kv_heads=2,
            head_dim=512,
        ),
        # Sliding-window section; its n_kv_heads / head_dim are the values
        # mirrored in the top-level fields above.
        CacheSection(
            attention_type=AttentionType.SLIDING,
            n_layers=25,
            n_kv_heads=8,
            head_dim=256,
            window_size=1024,
        ),
    ),
)


# ── Registry ──────────────────────────────────────────────────────────────────

# Runtime lookup table: model_id -> spec. Seeded at import time with every
# pre-registered spec above (in this order); register_model_spec() can add
# or replace entries later.
_REGISTRY: dict[str, ModelCacheSpec] = dict(
    (builtin["model_id"], builtin)
    for builtin in (
        LLAMA_3_1_8B,
        LLAMA_3_1_8B_BASE,
        PHI_3_MINI,
        GEMMA_2_2B,
        GEMMA_4_26B_A4B,
        QWEN_2_5_7B,
        MISTRAL_7B,
    )
)

# Lowercase search key -> model family name, consumed by infer_model_family().
# Insertion order is significant: infer_model_family() walks the keys in this
# order and returns the family of the first key it matches.
# Each org-qualified key ("meta-llama", "microsoft/phi", "google/gemma")
# contains the bare family key listed just before it and maps to the same
# family, so it never changes the result; it is kept for readability.
_FAMILY_MAP: dict[str, str] = {
    "llama": "llama",
    "meta-llama": "llama",
    "phi": "phi",
    "microsoft/phi": "phi",
    "gemma": "gemma",
    "google/gemma": "gemma",
    "qwen": "qwen",
    "mistral": "mistral",
    "deepseek": "deepseek",  # family is recognised, but no spec is pre-registered in this file
}


def get_model_spec(model_id: str) -> ModelCacheSpec | None:
    """Return the registered spec whose key equals ``model_id`` exactly.

    The comparison is an exact, case-sensitive dictionary lookup; no family
    inference or fuzzy matching is attempted.

    Args:
        model_id: Identifier to look up in the registry.

    Returns:
        The matching spec, or ``None`` when nothing is registered under
        that id.
    """
    try:
        return _REGISTRY[model_id]
    except KeyError:
        return None


def register_model_spec(spec: ModelCacheSpec) -> None:
    """Add ``spec`` to the runtime registry, keyed by its ``model_id``.

    An existing entry with the same ``model_id`` is replaced.

    Args:
        spec: The spec to store; its ``"model_id"`` entry becomes the key.
    """
    registry_key = spec["model_id"]
    _REGISTRY[registry_key] = spec


def infer_model_family(model_id: str) -> str:
    """Infer the model family from a model_id string.

    Matching is case-insensitive and walks ``_FAMILY_MAP`` in insertion
    order, in two passes:

    1. A key counts only when it occurs at the very start of the id or
       directly after a non-letter (``/``, ``-``, ``_``, a digit, ...).
       This stops short keys from firing inside unrelated words, e.g.
       ``"phi"`` inside ``"dolphin-2.6-mistral-7b"``, which should resolve
       to ``"mistral"``.
    2. If no key matched that way, fall back to plain substring matching so
       fused names such as ``"CodeLlama"`` or ``"TinyLlama"`` still resolve
       to their family.

    Args:
        model_id: Model identifier, e.g. ``"meta-llama/Llama-3.1-8B"``.

    Returns:
        The family name of the first matching key, or ``"unknown"`` when no
        key matches in either pass.
    """
    model_id_lower = model_id.lower()

    def starts_token(key: str) -> bool:
        """Return True if ``key`` occurs where no letter precedes it."""
        pos = model_id_lower.find(key)
        while pos != -1:
            if pos == 0 or not model_id_lower[pos - 1].isalpha():
                return True
            # This hit was buried inside a longer word; look for a later one.
            pos = model_id_lower.find(key, pos + 1)
        return False

    # Pass 1: boundary-anchored matches take priority.
    for key, family in _FAMILY_MAP.items():
        if starts_token(key):
            return family

    # Pass 2: substring fallback for fused names.
    for key, family in _FAMILY_MAP.items():
        if key in model_id_lower:
            return family

    return "unknown"


def make_spec_from_metadata(
    model_id: str,
    n_layers: int,
    n_heads: int,
    n_kv_heads: int,
    head_dim: int,
    rope_enabled: bool = True,
) -> ModelCacheSpec:
    """Build a ModelCacheSpec for a model that is not pre-registered.

    The family is derived from ``model_id`` via ``infer_model_family`` and
    ``extraction_layers`` is filled in with the middle-to-deep range (D3):
    the first quarter of the layers (at least one) is skipped and every
    remaining layer index up to ``n_layers - 1`` is included. For
    ``n_layers <= 1`` the resulting tuple is empty.

    Args:
        model_id: Identifier stored in the spec and used for family inference.
        n_layers: Total number of layers.
        n_heads: Number of query heads.
        n_kv_heads: Number of KV heads.
        head_dim: Per-head dimension.
        rope_enabled: Value stored in the spec's ``rope_enabled`` field.

    Returns:
        A new spec; it is not added to the registry.
    """
    # Skip the first quarter of the stack, but always at least layer 0.
    first_extracted = max(1, n_layers // 4)
    family = infer_model_family(model_id)

    return ModelCacheSpec(
        model_id=model_id,
        model_family=family,
        n_layers=n_layers,
        n_heads=n_heads,
        n_kv_heads=n_kv_heads,
        head_dim=head_dim,
        rope_enabled=rope_enabled,
        extraction_layers=tuple(range(first_extracted, n_layers)),
    )


def is_iswa_spec(spec: ModelCacheSpec) -> bool:
    """Tell whether ``spec`` describes an ISWA (multi-section) cache.

    The check is purely on key presence: a spec counts as ISWA as soon as it
    carries a ``"cache_sections"`` entry, whatever that entry contains.

    Args:
        spec: The spec to inspect.

    Returns:
        ``True`` if ``"cache_sections"`` is present in ``spec``.
    """
    has_sections = "cache_sections" in spec
    return has_sections


def validate_kv_shape(
    spec: ModelCacheSpec,
    n_layers: int,
    n_kv_heads: int,
    head_dim: int,
) -> bool:
    """Check observed KV tensor dimensions against a model spec.

    Only the spec's top-level ``n_layers``, ``n_kv_heads`` and ``head_dim``
    entries are compared; any ``cache_sections`` are not consulted.

    Args:
        spec: The spec to validate against.
        n_layers: Observed layer count.
        n_kv_heads: Observed number of KV heads.
        head_dim: Observed per-head dimension.

    Returns:
        ``True`` only if all three values equal the spec's.
    """
    observed = (
        ("n_layers", n_layers),
        ("n_kv_heads", n_kv_heads),
        ("head_dim", head_dim),
    )
    # Lazy generator: stops at the first mismatch, in the order listed above.
    return all(spec[field] == value for field, value in observed)