Spaces:
Paused
Paused
| from __future__ import annotations | |
| from dataclasses import asdict, dataclass | |
| from typing import Literal | |
| ModelFamily = Literal["llama", "qwen2", "qwen3_5_hybrid"] | |
| SourceFormat = Literal["hf", "gguf"] | |
| RuntimeName = Literal["transformers", "dotcache_hf", "vllm", "llama_cpp"] | |
| LocalTier = Literal["works_here", "stretch_here", "reference_only"] | |
| class ModelSpec: | |
| key: str | |
| display_name: str | |
| model_id: str | |
| tokenizer_model_id: str | None | |
| family: ModelFamily | |
| source_format: SourceFormat | |
| runtime: RuntimeName | |
| context_window: int | |
| local_tier: LocalTier | |
| dotcache_ready: bool | |
| benchmark_harness: str | None | |
| prompt_lengths: tuple[int, ...] | |
| notes: str | |
| gguf_hf_file: str | None = None | |
| def to_dict(self) -> dict[str, object]: | |
| return asdict(self) | |
# Registry of known model specs, indexed by each spec's own ``key`` field so
# the identifier is written exactly once per entry. Insertion order follows
# the order of the tuple below.
_MODEL_REGISTRY: dict[str, ModelSpec] = {
    spec.key: spec
    for spec in (
        ModelSpec(
            key="tinyllama_hf",
            display_name="TinyLlama 1.1B Chat",
            model_id="TinyLlama/TinyLlama-1.1B-Chat-v1.0",
            tokenizer_model_id="TinyLlama/TinyLlama-1.1B-Chat-v1.0",
            family="llama",
            source_format="hf",
            runtime="dotcache_hf",
            context_window=2048,
            local_tier="works_here",
            dotcache_ready=True,
            benchmark_harness="llama_compare",
            prompt_lengths=(289, 577, 1536),
            notes="Current smallest real-model regression lane for exact HF DotCache on this Mac.",
        ),
        ModelSpec(
            key="smollm2_360m_hf",
            display_name="SmolLM2 360M Instruct",
            model_id="HuggingFaceTB/SmolLM2-360M-Instruct",
            tokenizer_model_id="HuggingFaceTB/SmolLM2-360M-Instruct",
            family="llama",
            source_format="hf",
            runtime="dotcache_hf",
            context_window=8192,
            local_tier="works_here",
            dotcache_ready=True,
            benchmark_harness="llama_compare",
            prompt_lengths=(1024, 2048),
            notes="Best higher-context real-model lane currently working on this Mac.",
        ),
        ModelSpec(
            key="smollm2_1p7b_hf",
            display_name="SmolLM2 1.7B Instruct",
            model_id="HuggingFaceTB/SmolLM2-1.7B-Instruct",
            tokenizer_model_id="HuggingFaceTB/SmolLM2-1.7B-Instruct",
            family="llama",
            source_format="hf",
            runtime="dotcache_hf",
            context_window=8192,
            local_tier="stretch_here",
            dotcache_ready=True,
            benchmark_harness="llama_compare",
            prompt_lengths=(1024, 2048),
            notes="Larger SmolLM2 CUDA lane. Current recorded runs keep 1.0 agreement on plain M0/M0 at 1024 and 2048.",
        ),
        ModelSpec(
            key="llama32_3b_hf",
            display_name="Llama 3.2 3B Instruct",
            model_id="meta-llama/Llama-3.2-3B-Instruct",
            tokenizer_model_id="meta-llama/Llama-3.2-3B-Instruct",
            family="llama",
            source_format="hf",
            runtime="dotcache_hf",
            context_window=131072,
            local_tier="stretch_here",
            dotcache_ready=True,
            benchmark_harness="llama_compare",
            prompt_lengths=(1024, 2048, 4096),
            notes="Best next proper-model HF target because it matches the current Llama-family integration path.",
        ),
        ModelSpec(
            key="qwen25_3b_hf",
            display_name="Qwen2.5 3B Instruct",
            model_id="Qwen/Qwen2.5-3B-Instruct",
            tokenizer_model_id="Qwen/Qwen2.5-3B-Instruct",
            family="qwen2",
            source_format="hf",
            runtime="dotcache_hf",
            context_window=32768,
            local_tier="stretch_here",
            dotcache_ready=True,
            benchmark_harness="qwen2_compare",
            prompt_lengths=(1024, 2048, 4096),
            notes="First non-Llama native-weight DotCache target on the HF path. On CUDA, the recommended lane is key-exact K=M3 / V=M0 because default M0/M0 drifts on Qwen2.5.",
        ),
        ModelSpec(
            key="qwen25_1p5b_hf",
            display_name="Qwen2.5 1.5B Instruct",
            model_id="Qwen/Qwen2.5-1.5B-Instruct",
            tokenizer_model_id="Qwen/Qwen2.5-1.5B-Instruct",
            family="qwen2",
            source_format="hf",
            runtime="dotcache_hf",
            context_window=32768,
            local_tier="stretch_here",
            dotcache_ready=True,
            benchmark_harness="qwen2_compare",
            prompt_lengths=(1024, 2048),
            notes="Smaller Qwen2.5 CUDA lane. Current recorded runs show that layer:0 selective exact-K restores 2048 agreement with about 3.6% exact K pages.",
        ),
        ModelSpec(
            key="qwen25_7b_hf",
            display_name="Qwen2.5 7B Instruct",
            model_id="Qwen/Qwen2.5-7B-Instruct",
            tokenizer_model_id="Qwen/Qwen2.5-7B-Instruct",
            family="qwen2",
            source_format="hf",
            runtime="dotcache_hf",
            context_window=32768,
            local_tier="stretch_here",
            dotcache_ready=True,
            benchmark_harness="qwen2_compare",
            prompt_lengths=(1024, 2048, 4096),
            notes="First larger CUDA-native Qwen2 scale-up lane. On the 5090 pod, key-exact K=M3 / V=M0 restored 1024/2048 agreement where default M0/M0 drifted.",
        ),
        ModelSpec(
            key="qwen35_4b_hf",
            display_name="Qwen3.5 4B",
            model_id="Qwen/Qwen3.5-4B",
            tokenizer_model_id="Qwen/Qwen3.5-4B",
            family="qwen3_5_hybrid",
            source_format="hf",
            runtime="transformers",
            context_window=262144,
            local_tier="reference_only",
            dotcache_ready=False,
            benchmark_harness="qwen35_text",
            prompt_lengths=(512, 1024),
            notes="Dense text lane on non-CUDA backends and the default CUDA 4B StateCache lane on this pod. The current recommended CUDA runtime is DeltaNet StateCache post_update_m0 at 8-bit with renorm disabled plus recurrent M3 escapes on layers 0, 1, and 2.",
        ),
        ModelSpec(
            key="qwen35_0p8b_hf",
            display_name="Qwen3.5 0.8B",
            model_id="Qwen/Qwen3.5-0.8B",
            tokenizer_model_id="Qwen/Qwen3.5-0.8B",
            family="qwen3_5_hybrid",
            source_format="hf",
            runtime="transformers",
            context_window=262144,
            local_tier="stretch_here",
            dotcache_ready=False,
            benchmark_harness="qwen35_text",
            prompt_lengths=(512, 1024),
            notes="Dense text lane on non-CUDA backends and the default CUDA StateCache lane on this pod. The current recommended CUDA runtime is DeltaNet StateCache post_update_m0 at 8-bit with renorm disabled.",
        ),
        ModelSpec(
            key="llama32_3b_gguf",
            display_name="Llama 3.2 3B Instruct GGUF",
            model_id="bartowski/Llama-3.2-3B-Instruct-GGUF",
            tokenizer_model_id="meta-llama/Llama-3.2-3B-Instruct",
            family="llama",
            source_format="gguf",
            runtime="llama_cpp",
            context_window=131072,
            local_tier="reference_only",
            dotcache_ready=False,
            benchmark_harness="gguf_external",
            prompt_lengths=(1024, 2048, 4096),
            notes="External reference baseline for llama.cpp / GGUF comparisons.",
            gguf_hf_file="Llama-3.2-3B-Instruct-Q4_K_M.gguf",
        ),
        ModelSpec(
            key="qwen25_3b_gguf",
            display_name="Qwen2.5 3B Instruct GGUF",
            model_id="Qwen/Qwen2.5-3B-Instruct-GGUF",
            tokenizer_model_id="Qwen/Qwen2.5-3B-Instruct",
            family="qwen2",
            source_format="gguf",
            runtime="llama_cpp",
            context_window=32768,
            local_tier="reference_only",
            dotcache_ready=False,
            benchmark_harness="gguf_external",
            prompt_lengths=(1024, 2048, 4096),
            notes="External GGUF reference lane for future TurboQuant / llama.cpp comparisons.",
            gguf_hf_file="qwen2.5-3b-instruct-q4_k_m.gguf",
        ),
        ModelSpec(
            key="qwen25_7b_gguf",
            display_name="Qwen2.5 7B Instruct GGUF",
            model_id="Qwen/Qwen2.5-7B-Instruct-GGUF",
            tokenizer_model_id="Qwen/Qwen2.5-7B-Instruct",
            family="qwen2",
            source_format="gguf",
            runtime="llama_cpp",
            context_window=32768,
            local_tier="reference_only",
            dotcache_ready=False,
            benchmark_harness="gguf_external",
            prompt_lengths=(1024, 2048, 4096),
            notes="External GGUF reference lane for larger Qwen2.5 CUDA comparisons.",
            gguf_hf_file="qwen2.5-7b-instruct-q4_k_m-00001-of-00002.gguf",
        ),
    )
}
def list_model_specs() -> tuple[ModelSpec, ...]:
    """Return every registered spec as a tuple, in registry insertion order."""
    return (*_MODEL_REGISTRY.values(),)
def get_model_spec(key: str) -> ModelSpec:
    """Look up the spec registered under *key*.

    Raises:
        KeyError: If *key* is not in the registry. The error carries a
            descriptive message and is chained to the original lookup
            failure.
    """
    try:
        spec = _MODEL_REGISTRY[key]
    except KeyError as missing:
        message = f"unknown model registry key: {key}"
        raise KeyError(message) from missing
    return spec