Spaces:

DeanoCalver
/

DotCache-Arena

Paused

App Files Files Community

DotCache-Arena / dotcache /model_registry.py

DeanoCalver

Initial DotCache Arena Space upload

751ad26 verified 3 months ago

Raw

History Blame Contribute Delete

8.77 kB

	from __future__ import annotations

	from dataclasses import asdict, dataclass
	from typing import Literal

	# Closed sets of string tags; ModelSpec uses them to annotate its
	# family / source_format / runtime / local_tier fields.
	ModelFamily = Literal[
	    "llama",
	    "qwen2",
	    "qwen3_5_hybrid",
	]
	SourceFormat = Literal[
	    "hf",
	    "gguf",
	]
	RuntimeName = Literal[
	    "transformers",
	    "dotcache_hf",
	    "vllm",
	    "llama_cpp",
	]
	LocalTier = Literal[
	    "works_here",
	    "stretch_here",
	    "reference_only",
	]


	@dataclass(frozen=True, slots=True)
	class ModelSpec:
	key: str
	display_name: str
	model_id: str
	tokenizer_model_id: str \| None
	family: ModelFamily
	source_format: SourceFormat
	runtime: RuntimeName
	context_window: int
	local_tier: LocalTier
	dotcache_ready: bool
	benchmark_harness: str \| None
	prompt_lengths: tuple[int, ...]
	notes: str
	gguf_hf_file: str \| None = None

	def to_dict(self) -> dict[str, object]:
	return asdict(self)


	# Registry of known models. Each mapping key is taken from the entry's own
	# ``key`` field, and the tuple order below is the registry's iteration order.
	_MODEL_REGISTRY: dict[str, ModelSpec] = {
	    spec.key: spec
	    for spec in (
	        # --- Llama-family HF entries (runtime="dotcache_hf") ---
	        ModelSpec(
	            key="tinyllama_hf",
	            display_name="TinyLlama 1.1B Chat",
	            model_id="TinyLlama/TinyLlama-1.1B-Chat-v1.0",
	            tokenizer_model_id="TinyLlama/TinyLlama-1.1B-Chat-v1.0",
	            family="llama",
	            source_format="hf",
	            runtime="dotcache_hf",
	            context_window=2048,
	            local_tier="works_here",
	            dotcache_ready=True,
	            benchmark_harness="llama_compare",
	            prompt_lengths=(289, 577, 1536),
	            notes="Current smallest real-model regression lane for exact HF DotCache on this Mac.",
	        ),
	        ModelSpec(
	            key="smollm2_360m_hf",
	            display_name="SmolLM2 360M Instruct",
	            model_id="HuggingFaceTB/SmolLM2-360M-Instruct",
	            tokenizer_model_id="HuggingFaceTB/SmolLM2-360M-Instruct",
	            family="llama",
	            source_format="hf",
	            runtime="dotcache_hf",
	            context_window=8192,
	            local_tier="works_here",
	            dotcache_ready=True,
	            benchmark_harness="llama_compare",
	            prompt_lengths=(1024, 2048),
	            notes="Best higher-context real-model lane currently working on this Mac.",
	        ),
	        ModelSpec(
	            key="smollm2_1p7b_hf",
	            display_name="SmolLM2 1.7B Instruct",
	            model_id="HuggingFaceTB/SmolLM2-1.7B-Instruct",
	            tokenizer_model_id="HuggingFaceTB/SmolLM2-1.7B-Instruct",
	            family="llama",
	            source_format="hf",
	            runtime="dotcache_hf",
	            context_window=8192,
	            local_tier="stretch_here",
	            dotcache_ready=True,
	            benchmark_harness="llama_compare",
	            prompt_lengths=(1024, 2048),
	            notes="Larger SmolLM2 CUDA lane. Current recorded runs keep 1.0 agreement on plain M0/M0 at 1024 and 2048.",
	        ),
	        ModelSpec(
	            key="llama32_3b_hf",
	            display_name="Llama 3.2 3B Instruct",
	            model_id="meta-llama/Llama-3.2-3B-Instruct",
	            tokenizer_model_id="meta-llama/Llama-3.2-3B-Instruct",
	            family="llama",
	            source_format="hf",
	            runtime="dotcache_hf",
	            context_window=131072,
	            local_tier="stretch_here",
	            dotcache_ready=True,
	            benchmark_harness="llama_compare",
	            prompt_lengths=(1024, 2048, 4096),
	            notes="Best next proper-model HF target because it matches the current Llama-family integration path.",
	        ),
	        # --- Qwen2.5 HF entries (runtime="dotcache_hf") ---
	        ModelSpec(
	            key="qwen25_3b_hf",
	            display_name="Qwen2.5 3B Instruct",
	            model_id="Qwen/Qwen2.5-3B-Instruct",
	            tokenizer_model_id="Qwen/Qwen2.5-3B-Instruct",
	            family="qwen2",
	            source_format="hf",
	            runtime="dotcache_hf",
	            context_window=32768,
	            local_tier="stretch_here",
	            dotcache_ready=True,
	            benchmark_harness="qwen2_compare",
	            prompt_lengths=(1024, 2048, 4096),
	            notes="First non-Llama native-weight DotCache target on the HF path. On CUDA, the recommended lane is key-exact K=M3 / V=M0 because default M0/M0 drifts on Qwen2.5.",
	        ),
	        ModelSpec(
	            key="qwen25_1p5b_hf",
	            display_name="Qwen2.5 1.5B Instruct",
	            model_id="Qwen/Qwen2.5-1.5B-Instruct",
	            tokenizer_model_id="Qwen/Qwen2.5-1.5B-Instruct",
	            family="qwen2",
	            source_format="hf",
	            runtime="dotcache_hf",
	            context_window=32768,
	            local_tier="stretch_here",
	            dotcache_ready=True,
	            benchmark_harness="qwen2_compare",
	            prompt_lengths=(1024, 2048),
	            notes="Smaller Qwen2.5 CUDA lane. Current recorded runs show that layer:0 selective exact-K restores 2048 agreement with about 3.6% exact K pages.",
	        ),
	        ModelSpec(
	            key="qwen25_7b_hf",
	            display_name="Qwen2.5 7B Instruct",
	            model_id="Qwen/Qwen2.5-7B-Instruct",
	            tokenizer_model_id="Qwen/Qwen2.5-7B-Instruct",
	            family="qwen2",
	            source_format="hf",
	            runtime="dotcache_hf",
	            context_window=32768,
	            local_tier="stretch_here",
	            dotcache_ready=True,
	            benchmark_harness="qwen2_compare",
	            prompt_lengths=(1024, 2048, 4096),
	            notes="First larger CUDA-native Qwen2 scale-up lane. On the 5090 pod, key-exact K=M3 / V=M0 restored 1024/2048 agreement where default M0/M0 drifted.",
	        ),
	        # --- Qwen3.5 hybrid entries (runtime="transformers", dotcache_ready=False) ---
	        ModelSpec(
	            key="qwen35_4b_hf",
	            display_name="Qwen3.5 4B",
	            model_id="Qwen/Qwen3.5-4B",
	            tokenizer_model_id="Qwen/Qwen3.5-4B",
	            family="qwen3_5_hybrid",
	            source_format="hf",
	            runtime="transformers",
	            context_window=262144,
	            local_tier="reference_only",
	            dotcache_ready=False,
	            benchmark_harness="qwen35_text",
	            prompt_lengths=(512, 1024),
	            notes="Dense text lane on non-CUDA backends and the default CUDA 4B StateCache lane on this pod. The current recommended CUDA runtime is DeltaNet StateCache post_update_m0 at 8-bit with renorm disabled plus recurrent M3 escapes on layers 0, 1, and 2.",
	        ),
	        ModelSpec(
	            key="qwen35_0p8b_hf",
	            display_name="Qwen3.5 0.8B",
	            model_id="Qwen/Qwen3.5-0.8B",
	            tokenizer_model_id="Qwen/Qwen3.5-0.8B",
	            family="qwen3_5_hybrid",
	            source_format="hf",
	            runtime="transformers",
	            context_window=262144,
	            local_tier="stretch_here",
	            dotcache_ready=False,
	            benchmark_harness="qwen35_text",
	            prompt_lengths=(512, 1024),
	            notes="Dense text lane on non-CUDA backends and the default CUDA StateCache lane on this pod. The current recommended CUDA runtime is DeltaNet StateCache post_update_m0 at 8-bit with renorm disabled.",
	        ),
	        # --- GGUF entries (runtime="llama_cpp", local_tier="reference_only") ---
	        ModelSpec(
	            key="llama32_3b_gguf",
	            display_name="Llama 3.2 3B Instruct GGUF",
	            model_id="bartowski/Llama-3.2-3B-Instruct-GGUF",
	            tokenizer_model_id="meta-llama/Llama-3.2-3B-Instruct",
	            family="llama",
	            source_format="gguf",
	            runtime="llama_cpp",
	            context_window=131072,
	            local_tier="reference_only",
	            dotcache_ready=False,
	            benchmark_harness="gguf_external",
	            prompt_lengths=(1024, 2048, 4096),
	            notes="External reference baseline for llama.cpp / GGUF comparisons.",
	            gguf_hf_file="Llama-3.2-3B-Instruct-Q4_K_M.gguf",
	        ),
	        ModelSpec(
	            key="qwen25_3b_gguf",
	            display_name="Qwen2.5 3B Instruct GGUF",
	            model_id="Qwen/Qwen2.5-3B-Instruct-GGUF",
	            tokenizer_model_id="Qwen/Qwen2.5-3B-Instruct",
	            family="qwen2",
	            source_format="gguf",
	            runtime="llama_cpp",
	            context_window=32768,
	            local_tier="reference_only",
	            dotcache_ready=False,
	            benchmark_harness="gguf_external",
	            prompt_lengths=(1024, 2048, 4096),
	            notes="External GGUF reference lane for future TurboQuant / llama.cpp comparisons.",
	            gguf_hf_file="qwen2.5-3b-instruct-q4_k_m.gguf",
	        ),
	        ModelSpec(
	            key="qwen25_7b_gguf",
	            display_name="Qwen2.5 7B Instruct GGUF",
	            model_id="Qwen/Qwen2.5-7B-Instruct-GGUF",
	            tokenizer_model_id="Qwen/Qwen2.5-7B-Instruct",
	            family="qwen2",
	            source_format="gguf",
	            runtime="llama_cpp",
	            context_window=32768,
	            local_tier="reference_only",
	            dotcache_ready=False,
	            benchmark_harness="gguf_external",
	            prompt_lengths=(1024, 2048, 4096),
	            notes="External GGUF reference lane for larger Qwen2.5 CUDA comparisons.",
	            gguf_hf_file="qwen2.5-7b-instruct-q4_k_m-00001-of-00002.gguf",
	        ),
	    )
	}


	def list_model_specs() -> tuple[ModelSpec, ...]:
	    """Return all registered specs as a tuple, in registry insertion order."""
	    return (*_MODEL_REGISTRY.values(),)


	def get_model_spec(key: str) -> ModelSpec:
	    """Look up the spec registered under ``key``.

	    Raises:
	        KeyError: If ``key`` is not in the registry. The message names the
	            missing key and the original lookup error is chained as the cause.
	    """
	    try:
	        found = _MODEL_REGISTRY[key]
	    except KeyError as missing:
	        raise KeyError(f"unknown model registry key: {key}") from missing
	    return found