"""
specialist_presets.py -- ModularMind-on-V2 specialist sizing.

DENSE ~80M specialists (Supra-50M-style, scaled up): a dense Llama-ish
transformer with NO MoE / Engram / Hyper-Connections / HRM, so the parameters
go into language modeling instead of machinery -> coherent generation (the
lesson from SupraLabs/Supra-50M-Base, a dense model that produces coherent
multi-paragraph text on FineWeb-Edu).

Shape (shared across domains; only the vocab differs, read from registry.py):
    hidden 640, 16 layers, 10 heads / 5 KV (GQA), dense FFN 1728, ctx 1024,
    d_latent 256.
    -> ~81.5M params at vocab 16384 (the shared length-max tokenizer).

The bridge bus (d_latent=256) and latent IO are kept, so train_link.py / the
Gradio adapter still work after retraining.
"""

from config import SpikeWhaleConfig
from registry import spec


def _dense_80m(vocab_size: int) -> SpikeWhaleConfig:
    """Build the shared dense ~80M specialist config for ``vocab_size``.

    Every field except ``vocab_size`` is a fixed constant, so all specialists
    produced by this module share one architecture shape.

    Args:
        vocab_size: Tokenizer vocabulary size, forwarded unchanged.

    Returns:
        A new ``SpikeWhaleConfig`` with the dense preset applied.
    """
    return SpikeWhaleConfig(
        vocab_size=vocab_size,
        # --- core transformer shape ---
        hidden_size=640,
        num_hidden_layers=16,
        num_attention_heads=10,
        num_key_value_heads=5,  # GQA: 10 query heads over 5 KV heads
        head_dim=64,
        qk_rope_head_dim=16,
        q_lora_rank=320,
        o_lora_rank=160,
        tie_word_embeddings=True,
        # --- dense feed-forward: MoE is off, but moe_intermediate_size is
        # still the knob that sizes the DenseFFN (see model.py) ---
        use_moe=False,
        moe_intermediate_size=1728,
        # --- heavy extras stripped so the parameters go to the LM itself ---
        use_engram=False,
        use_hyper_connections=False,
        hc_mult=1,
        use_hrm_refine=False,
        num_nextn_predict_layers=0,
        use_derf=False,
        use_xsa=True,
        # --- ModularMind bridge bus, kept so train_link.py / the adapter
        # still work ---
        use_latent_io=True,
        d_latent=256,
        # --- context: uniform across specialists (Supra used 1024).
        # base_context MUST be >= the training --seq-len.
        # NOTE(review): the module docstring and the original comment here
        # both say 1024, while the value set is 4096 -- confirm which one is
        # intended. ---
        chain_position=0,
        base_context=4096,
        base_rope_theta=10000.0,
    )


def specialist_config(domain: str = "language", position: int = 0) -> SpikeWhaleConfig:
    """Return the dense ~80M config for a registered ``domain``.

    The vocab size is looked up in registry.py (the single source of truth)
    via ``spec(domain)["vocab"]``.

    Args:
        domain: Registry key of the specialist.
        position: Accepted but not used by this function; the preset always
            sets ``chain_position=0``.

    Returns:
        The dense preset ``SpikeWhaleConfig`` sized for that domain's vocab.
    """
    vocab_size = spec(domain)["vocab"]
    return _dense_80m(vocab_size)


def generic_specialist_config(vocab_size: int, position: int = 0) -> SpikeWhaleConfig:
    """Return the same dense ~80M shape for an arbitrary vocab size.

    Lets new domains 'just work' by supplying only a vocab size.

    Args:
        vocab_size: Tokenizer vocabulary size to build the config for.
        position: Accepted but not used by this function; the preset always
            sets ``chain_position=0``.

    Returns:
        The dense preset ``SpikeWhaleConfig`` for ``vocab_size``.
    """
    return _dense_80m(vocab_size)


# Foundation chain ordering: position -> domain name. It is derived from the
# registry so it grows automatically as specialists are added. If the import
# or the mapping build raises, fall back to the fixed three-domain order.
try:
    from registry import SPECIALISTS as _REG

    FOUNDATION_ORDER = {
        entry["position"]: domain_name for domain_name, entry in _REG.items()
    }
except Exception:
    FOUNDATION_ORDER = {0: "language", 1: "reasoning", 2: "tool_use"}