from __future__ import annotations from dataclasses import asdict, dataclass from typing import Literal ModelFamily = Literal["llama", "qwen2", "qwen3_5_hybrid"] SourceFormat = Literal["hf", "gguf"] RuntimeName = Literal["transformers", "dotcache_hf", "vllm", "llama_cpp"] LocalTier = Literal["works_here", "stretch_here", "reference_only"] @dataclass(frozen=True, slots=True) class ModelSpec: key: str display_name: str model_id: str tokenizer_model_id: str | None family: ModelFamily source_format: SourceFormat runtime: RuntimeName context_window: int local_tier: LocalTier dotcache_ready: bool benchmark_harness: str | None prompt_lengths: tuple[int, ...] notes: str gguf_hf_file: str | None = None def to_dict(self) -> dict[str, object]: return asdict(self) _MODEL_REGISTRY: dict[str, ModelSpec] = { "tinyllama_hf": ModelSpec( key="tinyllama_hf", display_name="TinyLlama 1.1B Chat", model_id="TinyLlama/TinyLlama-1.1B-Chat-v1.0", tokenizer_model_id="TinyLlama/TinyLlama-1.1B-Chat-v1.0", family="llama", source_format="hf", runtime="dotcache_hf", context_window=2048, local_tier="works_here", dotcache_ready=True, benchmark_harness="llama_compare", prompt_lengths=(289, 577, 1536), notes="Current smallest real-model regression lane for exact HF DotCache on this Mac.", ), "smollm2_360m_hf": ModelSpec( key="smollm2_360m_hf", display_name="SmolLM2 360M Instruct", model_id="HuggingFaceTB/SmolLM2-360M-Instruct", tokenizer_model_id="HuggingFaceTB/SmolLM2-360M-Instruct", family="llama", source_format="hf", runtime="dotcache_hf", context_window=8192, local_tier="works_here", dotcache_ready=True, benchmark_harness="llama_compare", prompt_lengths=(1024, 2048), notes="Best higher-context real-model lane currently working on this Mac.", ), "smollm2_1p7b_hf": ModelSpec( key="smollm2_1p7b_hf", display_name="SmolLM2 1.7B Instruct", model_id="HuggingFaceTB/SmolLM2-1.7B-Instruct", tokenizer_model_id="HuggingFaceTB/SmolLM2-1.7B-Instruct", family="llama", source_format="hf", runtime="dotcache_hf", context_window=8192, local_tier="stretch_here", dotcache_ready=True, benchmark_harness="llama_compare", prompt_lengths=(1024, 2048), notes="Larger SmolLM2 CUDA lane. Current recorded runs keep 1.0 agreement on plain M0/M0 at 1024 and 2048.", ), "llama32_3b_hf": ModelSpec( key="llama32_3b_hf", display_name="Llama 3.2 3B Instruct", model_id="meta-llama/Llama-3.2-3B-Instruct", tokenizer_model_id="meta-llama/Llama-3.2-3B-Instruct", family="llama", source_format="hf", runtime="dotcache_hf", context_window=131072, local_tier="stretch_here", dotcache_ready=True, benchmark_harness="llama_compare", prompt_lengths=(1024, 2048, 4096), notes="Best next proper-model HF target because it matches the current Llama-family integration path.", ), "qwen25_3b_hf": ModelSpec( key="qwen25_3b_hf", display_name="Qwen2.5 3B Instruct", model_id="Qwen/Qwen2.5-3B-Instruct", tokenizer_model_id="Qwen/Qwen2.5-3B-Instruct", family="qwen2", source_format="hf", runtime="dotcache_hf", context_window=32768, local_tier="stretch_here", dotcache_ready=True, benchmark_harness="qwen2_compare", prompt_lengths=(1024, 2048, 4096), notes="First non-Llama native-weight DotCache target on the HF path. On CUDA, the recommended lane is key-exact K=M3 / V=M0 because default M0/M0 drifts on Qwen2.5.", ), "qwen25_1p5b_hf": ModelSpec( key="qwen25_1p5b_hf", display_name="Qwen2.5 1.5B Instruct", model_id="Qwen/Qwen2.5-1.5B-Instruct", tokenizer_model_id="Qwen/Qwen2.5-1.5B-Instruct", family="qwen2", source_format="hf", runtime="dotcache_hf", context_window=32768, local_tier="stretch_here", dotcache_ready=True, benchmark_harness="qwen2_compare", prompt_lengths=(1024, 2048), notes="Smaller Qwen2.5 CUDA lane. Current recorded runs show that layer:0 selective exact-K restores 2048 agreement with about 3.6% exact K pages.", ), "qwen25_7b_hf": ModelSpec( key="qwen25_7b_hf", display_name="Qwen2.5 7B Instruct", model_id="Qwen/Qwen2.5-7B-Instruct", tokenizer_model_id="Qwen/Qwen2.5-7B-Instruct", family="qwen2", source_format="hf", runtime="dotcache_hf", context_window=32768, local_tier="stretch_here", dotcache_ready=True, benchmark_harness="qwen2_compare", prompt_lengths=(1024, 2048, 4096), notes="First larger CUDA-native Qwen2 scale-up lane. On the 5090 pod, key-exact K=M3 / V=M0 restored 1024/2048 agreement where default M0/M0 drifted.", ), "qwen35_4b_hf": ModelSpec( key="qwen35_4b_hf", display_name="Qwen3.5 4B", model_id="Qwen/Qwen3.5-4B", tokenizer_model_id="Qwen/Qwen3.5-4B", family="qwen3_5_hybrid", source_format="hf", runtime="transformers", context_window=262144, local_tier="reference_only", dotcache_ready=False, benchmark_harness="qwen35_text", prompt_lengths=(512, 1024), notes="Dense text lane on non-CUDA backends and the default CUDA 4B StateCache lane on this pod. The current recommended CUDA runtime is DeltaNet StateCache post_update_m0 at 8-bit with renorm disabled plus recurrent M3 escapes on layers 0, 1, and 2.", ), "qwen35_0p8b_hf": ModelSpec( key="qwen35_0p8b_hf", display_name="Qwen3.5 0.8B", model_id="Qwen/Qwen3.5-0.8B", tokenizer_model_id="Qwen/Qwen3.5-0.8B", family="qwen3_5_hybrid", source_format="hf", runtime="transformers", context_window=262144, local_tier="stretch_here", dotcache_ready=False, benchmark_harness="qwen35_text", prompt_lengths=(512, 1024), notes="Dense text lane on non-CUDA backends and the default CUDA StateCache lane on this pod. The current recommended CUDA runtime is DeltaNet StateCache post_update_m0 at 8-bit with renorm disabled.", ), "llama32_3b_gguf": ModelSpec( key="llama32_3b_gguf", display_name="Llama 3.2 3B Instruct GGUF", model_id="bartowski/Llama-3.2-3B-Instruct-GGUF", tokenizer_model_id="meta-llama/Llama-3.2-3B-Instruct", family="llama", source_format="gguf", runtime="llama_cpp", context_window=131072, local_tier="reference_only", dotcache_ready=False, benchmark_harness="gguf_external", prompt_lengths=(1024, 2048, 4096), notes="External reference baseline for llama.cpp / GGUF comparisons.", gguf_hf_file="Llama-3.2-3B-Instruct-Q4_K_M.gguf", ), "qwen25_3b_gguf": ModelSpec( key="qwen25_3b_gguf", display_name="Qwen2.5 3B Instruct GGUF", model_id="Qwen/Qwen2.5-3B-Instruct-GGUF", tokenizer_model_id="Qwen/Qwen2.5-3B-Instruct", family="qwen2", source_format="gguf", runtime="llama_cpp", context_window=32768, local_tier="reference_only", dotcache_ready=False, benchmark_harness="gguf_external", prompt_lengths=(1024, 2048, 4096), notes="External GGUF reference lane for future TurboQuant / llama.cpp comparisons.", gguf_hf_file="qwen2.5-3b-instruct-q4_k_m.gguf", ), "qwen25_7b_gguf": ModelSpec( key="qwen25_7b_gguf", display_name="Qwen2.5 7B Instruct GGUF", model_id="Qwen/Qwen2.5-7B-Instruct-GGUF", tokenizer_model_id="Qwen/Qwen2.5-7B-Instruct", family="qwen2", source_format="gguf", runtime="llama_cpp", context_window=32768, local_tier="reference_only", dotcache_ready=False, benchmark_harness="gguf_external", prompt_lengths=(1024, 2048, 4096), notes="External GGUF reference lane for larger Qwen2.5 CUDA comparisons.", gguf_hf_file="qwen2.5-7b-instruct-q4_k_m-00001-of-00002.gguf", ), } def list_model_specs() -> tuple[ModelSpec, ...]: return tuple(_MODEL_REGISTRY.values()) def get_model_spec(key: str) -> ModelSpec: try: return _MODEL_REGISTRY[key] except KeyError as exc: raise KeyError(f"unknown model registry key: {key}") from exc