# DotCache-Arena / dotcache /model_registry.py
# DeanoCalver's picture
# Initial DotCache Arena Space upload
# 751ad26 verified
# Raw
# History Blame Contribute Delete
# 8.77 kB
from __future__ import annotations
from dataclasses import asdict, dataclass
from typing import Literal
# Allowed values for the ``family`` field of a registry entry.
ModelFamily = Literal[
    "llama",
    "qwen2",
    "qwen3_5_hybrid",
]

# Allowed values for the ``source_format`` field of a registry entry.
SourceFormat = Literal[
    "hf",
    "gguf",
]

# Allowed values for the ``runtime`` field of a registry entry.
RuntimeName = Literal[
    "transformers",
    "dotcache_hf",
    "vllm",
    "llama_cpp",
]

# Allowed values for the ``local_tier`` field of a registry entry.
LocalTier = Literal[
    "works_here",
    "stretch_here",
    "reference_only",
]
@dataclass(frozen=True, slots=True)
class ModelSpec:
key: str
display_name: str
model_id: str
tokenizer_model_id: str | None
family: ModelFamily
source_format: SourceFormat
runtime: RuntimeName
context_window: int
local_tier: LocalTier
dotcache_ready: bool
benchmark_harness: str | None
prompt_lengths: tuple[int, ...]
notes: str
gguf_hf_file: str | None = None
def to_dict(self) -> dict[str, object]:
return asdict(self)
# Registry of known models, keyed by each spec's own ``key`` so the lookup key
# and the spec can never disagree. Insertion order follows the tuple below.
_MODEL_REGISTRY: dict[str, ModelSpec] = {
    spec.key: spec
    for spec in (
        # ---- Llama-family entries in "hf" format on the dotcache_hf runtime ----
        ModelSpec(
            key="tinyllama_hf",
            display_name="TinyLlama 1.1B Chat",
            model_id="TinyLlama/TinyLlama-1.1B-Chat-v1.0",
            tokenizer_model_id="TinyLlama/TinyLlama-1.1B-Chat-v1.0",
            family="llama",
            source_format="hf",
            runtime="dotcache_hf",
            context_window=2048,
            local_tier="works_here",
            dotcache_ready=True,
            benchmark_harness="llama_compare",
            prompt_lengths=(289, 577, 1536),
            notes="Current smallest real-model regression lane for exact HF DotCache on this Mac.",
        ),
        ModelSpec(
            key="smollm2_360m_hf",
            display_name="SmolLM2 360M Instruct",
            model_id="HuggingFaceTB/SmolLM2-360M-Instruct",
            tokenizer_model_id="HuggingFaceTB/SmolLM2-360M-Instruct",
            family="llama",
            source_format="hf",
            runtime="dotcache_hf",
            context_window=8192,
            local_tier="works_here",
            dotcache_ready=True,
            benchmark_harness="llama_compare",
            prompt_lengths=(1024, 2048),
            notes="Best higher-context real-model lane currently working on this Mac.",
        ),
        ModelSpec(
            key="smollm2_1p7b_hf",
            display_name="SmolLM2 1.7B Instruct",
            model_id="HuggingFaceTB/SmolLM2-1.7B-Instruct",
            tokenizer_model_id="HuggingFaceTB/SmolLM2-1.7B-Instruct",
            family="llama",
            source_format="hf",
            runtime="dotcache_hf",
            context_window=8192,
            local_tier="stretch_here",
            dotcache_ready=True,
            benchmark_harness="llama_compare",
            prompt_lengths=(1024, 2048),
            notes="Larger SmolLM2 CUDA lane. Current recorded runs keep 1.0 agreement on plain M0/M0 at 1024 and 2048.",
        ),
        ModelSpec(
            key="llama32_3b_hf",
            display_name="Llama 3.2 3B Instruct",
            model_id="meta-llama/Llama-3.2-3B-Instruct",
            tokenizer_model_id="meta-llama/Llama-3.2-3B-Instruct",
            family="llama",
            source_format="hf",
            runtime="dotcache_hf",
            context_window=131072,
            local_tier="stretch_here",
            dotcache_ready=True,
            benchmark_harness="llama_compare",
            prompt_lengths=(1024, 2048, 4096),
            notes="Best next proper-model HF target because it matches the current Llama-family integration path.",
        ),
        # ---- Qwen2-family entries in "hf" format on the dotcache_hf runtime ----
        ModelSpec(
            key="qwen25_3b_hf",
            display_name="Qwen2.5 3B Instruct",
            model_id="Qwen/Qwen2.5-3B-Instruct",
            tokenizer_model_id="Qwen/Qwen2.5-3B-Instruct",
            family="qwen2",
            source_format="hf",
            runtime="dotcache_hf",
            context_window=32768,
            local_tier="stretch_here",
            dotcache_ready=True,
            benchmark_harness="qwen2_compare",
            prompt_lengths=(1024, 2048, 4096),
            notes="First non-Llama native-weight DotCache target on the HF path. On CUDA, the recommended lane is key-exact K=M3 / V=M0 because default M0/M0 drifts on Qwen2.5.",
        ),
        ModelSpec(
            key="qwen25_1p5b_hf",
            display_name="Qwen2.5 1.5B Instruct",
            model_id="Qwen/Qwen2.5-1.5B-Instruct",
            tokenizer_model_id="Qwen/Qwen2.5-1.5B-Instruct",
            family="qwen2",
            source_format="hf",
            runtime="dotcache_hf",
            context_window=32768,
            local_tier="stretch_here",
            dotcache_ready=True,
            benchmark_harness="qwen2_compare",
            prompt_lengths=(1024, 2048),
            notes="Smaller Qwen2.5 CUDA lane. Current recorded runs show that layer:0 selective exact-K restores 2048 agreement with about 3.6% exact K pages.",
        ),
        ModelSpec(
            key="qwen25_7b_hf",
            display_name="Qwen2.5 7B Instruct",
            model_id="Qwen/Qwen2.5-7B-Instruct",
            tokenizer_model_id="Qwen/Qwen2.5-7B-Instruct",
            family="qwen2",
            source_format="hf",
            runtime="dotcache_hf",
            context_window=32768,
            local_tier="stretch_here",
            dotcache_ready=True,
            benchmark_harness="qwen2_compare",
            prompt_lengths=(1024, 2048, 4096),
            notes="First larger CUDA-native Qwen2 scale-up lane. On the 5090 pod, key-exact K=M3 / V=M0 restored 1024/2048 agreement where default M0/M0 drifted.",
        ),
        # ---- Qwen3.5 hybrid entries on the transformers runtime ----
        ModelSpec(
            key="qwen35_4b_hf",
            display_name="Qwen3.5 4B",
            model_id="Qwen/Qwen3.5-4B",
            tokenizer_model_id="Qwen/Qwen3.5-4B",
            family="qwen3_5_hybrid",
            source_format="hf",
            runtime="transformers",
            context_window=262144,
            local_tier="reference_only",
            dotcache_ready=False,
            benchmark_harness="qwen35_text",
            prompt_lengths=(512, 1024),
            notes="Dense text lane on non-CUDA backends and the default CUDA 4B StateCache lane on this pod. The current recommended CUDA runtime is DeltaNet StateCache post_update_m0 at 8-bit with renorm disabled plus recurrent M3 escapes on layers 0, 1, and 2.",
        ),
        ModelSpec(
            key="qwen35_0p8b_hf",
            display_name="Qwen3.5 0.8B",
            model_id="Qwen/Qwen3.5-0.8B",
            tokenizer_model_id="Qwen/Qwen3.5-0.8B",
            family="qwen3_5_hybrid",
            source_format="hf",
            runtime="transformers",
            context_window=262144,
            local_tier="stretch_here",
            dotcache_ready=False,
            benchmark_harness="qwen35_text",
            prompt_lengths=(512, 1024),
            notes="Dense text lane on non-CUDA backends and the default CUDA StateCache lane on this pod. The current recommended CUDA runtime is DeltaNet StateCache post_update_m0 at 8-bit with renorm disabled.",
        ),
        # ---- GGUF entries on the llama_cpp runtime (all reference_only) ----
        ModelSpec(
            key="llama32_3b_gguf",
            display_name="Llama 3.2 3B Instruct GGUF",
            model_id="bartowski/Llama-3.2-3B-Instruct-GGUF",
            tokenizer_model_id="meta-llama/Llama-3.2-3B-Instruct",
            family="llama",
            source_format="gguf",
            runtime="llama_cpp",
            context_window=131072,
            local_tier="reference_only",
            dotcache_ready=False,
            benchmark_harness="gguf_external",
            prompt_lengths=(1024, 2048, 4096),
            notes="External reference baseline for llama.cpp / GGUF comparisons.",
            gguf_hf_file="Llama-3.2-3B-Instruct-Q4_K_M.gguf",
        ),
        ModelSpec(
            key="qwen25_3b_gguf",
            display_name="Qwen2.5 3B Instruct GGUF",
            model_id="Qwen/Qwen2.5-3B-Instruct-GGUF",
            tokenizer_model_id="Qwen/Qwen2.5-3B-Instruct",
            family="qwen2",
            source_format="gguf",
            runtime="llama_cpp",
            context_window=32768,
            local_tier="reference_only",
            dotcache_ready=False,
            benchmark_harness="gguf_external",
            prompt_lengths=(1024, 2048, 4096),
            notes="External GGUF reference lane for future TurboQuant / llama.cpp comparisons.",
            gguf_hf_file="qwen2.5-3b-instruct-q4_k_m.gguf",
        ),
        ModelSpec(
            key="qwen25_7b_gguf",
            display_name="Qwen2.5 7B Instruct GGUF",
            model_id="Qwen/Qwen2.5-7B-Instruct-GGUF",
            tokenizer_model_id="Qwen/Qwen2.5-7B-Instruct",
            family="qwen2",
            source_format="gguf",
            runtime="llama_cpp",
            context_window=32768,
            local_tier="reference_only",
            dotcache_ready=False,
            benchmark_harness="gguf_external",
            prompt_lengths=(1024, 2048, 4096),
            notes="External GGUF reference lane for larger Qwen2.5 CUDA comparisons.",
            gguf_hf_file="qwen2.5-7b-instruct-q4_k_m-00001-of-00002.gguf",
        ),
    )
}
def list_model_specs() -> tuple[ModelSpec, ...]:
    """Return every registered model spec as a tuple, in registry insertion order."""
    return (*_MODEL_REGISTRY.values(),)
def get_model_spec(key: str) -> ModelSpec:
    """Look up a single model spec by its registry key.

    Args:
        key: Registry key of the wanted entry.

    Returns:
        The ``ModelSpec`` stored under ``key``.

    Raises:
        KeyError: If ``key`` is not present in the registry. The original
            lookup failure is chained as the exception's cause.
    """
    try:
        spec = _MODEL_REGISTRY[key]
    except KeyError as missing:
        message = f"unknown model registry key: {key}"
        raise KeyError(message) from missing
    return spec