Spaces:
Running on Zero
Running on Zero
| """ | |
| specialist_presets.py -- ModularMind-on-V2 specialist sizing. | |
| DENSE ~80M specialists (Supra-50M-style, scaled up): a dense Llama-ish transformer with | |
| NO MoE / Engram / Hyper-Connections / HRM, so the parameters go into language modeling | |
| instead of machinery -> coherent generation (the lesson from SupraLabs/Supra-50M-Base, | |
| a dense model that produces coherent multi-paragraph text on FineWeb-Edu). | |
| Shape (shared across domains; only the vocab differs, read from registry.py): | |
| hidden 640, 16 layers, 10 heads / 5 KV (GQA), dense FFN 1728, ctx 1024, d_latent 256. | |
| -> ~81.5M params at vocab 16384 (the shared length-max tokenizer). | |
| The bridge bus (d_latent=256) and latent IO are kept, so train_link.py / the Gradio | |
| adapter still work after retraining. | |
| """ | |
| from config import SpikeWhaleConfig | |
| from registry import spec | |
def _dense_80m(vocab_size: int) -> SpikeWhaleConfig:
    """Build the shared dense ~80M specialist config for ``vocab_size``.

    Every field other than ``vocab_size`` is a fixed constant, so all callers
    get the same model shape and differ only in vocabulary size.

    Args:
        vocab_size: Vocabulary size forwarded unchanged to ``SpikeWhaleConfig``.

    Returns:
        A ``SpikeWhaleConfig`` with MoE, Engram, Hyper-Connections, HRM refine
        and next-token-prediction layers switched off, and latent IO kept on.
    """
    # Backbone: 16 layers, width 640, 10 query heads over 5 KV heads (GQA).
    backbone = dict(
        vocab_size=vocab_size,
        hidden_size=640,
        num_hidden_layers=16,
        num_attention_heads=10,
        num_key_value_heads=5,
        head_dim=64,
        qk_rope_head_dim=16,
        q_lora_rank=320,
        o_lora_rank=160,
        tie_word_embeddings=True,
    )

    # Dense feed-forward: MoE is disabled. Per the original author's note,
    # moe_intermediate_size still sizes the DenseFFN in model.py.
    feed_forward = dict(
        use_moe=False,
        moe_intermediate_size=1728,
    )

    # Heavy extras are switched off so the parameter budget goes to the LM
    # itself; use_xsa is the one optional feature left enabled.
    extras = dict(
        use_engram=False,
        use_hyper_connections=False,
        hc_mult=1,
        use_hrm_refine=False,
        num_nextn_predict_layers=0,
        use_derf=False,
        use_xsa=True,
    )

    # The ModularMind bridge bus is kept so train_link.py / the adapter keep
    # working (original author's rationale).
    bridge = dict(
        use_latent_io=True,
        d_latent=256,
    )

    # Original constraint: base_context MUST be >= the training --seq-len.
    # NOTE(review): the module docstring and the original comment describe a
    # uniform 1024 context, yet base_context is 4096 here -- confirm which
    # value is intended.
    context = dict(
        chain_position=0,
        base_context=4096,
        base_rope_theta=10000.0,
    )

    return SpikeWhaleConfig(
        **backbone,
        **feed_forward,
        **extras,
        **bridge,
        **context,
    )
def specialist_config(domain: str = "language", position: int = 0) -> SpikeWhaleConfig:
    """Return the dense ~80M specialist config for a registered domain.

    Args:
        domain: Domain name passed to ``registry.spec``; the ``"vocab"`` entry
            of the returned spec supplies the vocabulary size.
        position: Accepted but not used by this function; every specialist is
            built with the same fixed shape.

    Returns:
        The ``SpikeWhaleConfig`` produced by ``_dense_80m`` for that vocab.
    """
    domain_vocab = spec(domain)["vocab"]
    return _dense_80m(domain_vocab)
def generic_specialist_config(vocab_size: int, position: int = 0) -> SpikeWhaleConfig:
    """Return the dense ~80M specialist config for an arbitrary vocabulary size.

    Unlike ``specialist_config`` this does not consult the registry, so it can
    be used for domains that have no registry entry.

    Args:
        vocab_size: Vocabulary size to build the config for.
        position: Accepted but not used by this function.

    Returns:
        The ``SpikeWhaleConfig`` produced by ``_dense_80m(vocab_size)``.
    """
    config = _dense_80m(vocab_size)
    return config
# Foundation chain ordering: maps chain position -> domain name. It is derived
# from the registry so that it grows automatically as specialists are added.
try:
    from registry import SPECIALISTS as _REG

    FOUNDATION_ORDER = {v["position"]: k for k, v in _REG.items()}
except Exception as _exc:
    # Deliberate best-effort: keep the module importable with a built-in
    # ordering, but report why the registry-derived ordering was unavailable
    # instead of silently masking a missing or malformed registry.
    import logging

    logging.getLogger(__name__).warning(
        "could not derive FOUNDATION_ORDER from registry (%r); "
        "falling back to the built-in ordering",
        _exc,
    )
    FOUNDATION_ORDER = {0: "language", 1: "reasoning", 2: "tool_use"}