ModuleMind / agents /modmind /specialist_presets.py
Quazim0t0's picture
Add files using upload-large-folder tool
45e7dfb verified
Raw
History Blame Contribute Delete
2.71 kB
"""
specialist_presets.py -- ModularMind-on-V2 specialist sizing.
DENSE ~80M specialists (Supra-50M-style, scaled up): a dense Llama-ish transformer with
NO MoE / Engram / Hyper-Connections / HRM, so the parameters go into language modeling
instead of machinery -> coherent generation (the lesson from SupraLabs/Supra-50M-Base,
a dense model that produces coherent multi-paragraph text on FineWeb-Edu).
Shape (shared across domains; only the vocab differs, read from registry.py):
hidden 640, 16 layers, 10 heads / 5 KV (GQA), dense FFN 1728, ctx 1024, d_latent 256.
-> ~81.5M params at vocab 16384 (the shared length-max tokenizer).
The bridge bus (d_latent=256) and latent IO are kept, so train_link.py / the Gradio
adapter still work after retraining.
"""
from config import SpikeWhaleConfig
from registry import spec
def _dense_80m(vocab_size: int) -> SpikeWhaleConfig:
    """Build the shared dense ~80M specialist config for one vocabulary size.

    Every setting except ``vocab_size`` is fixed in this function, so all
    specialists produced from it have the same shape.

    Args:
        vocab_size: Forwarded unchanged as the config's ``vocab_size``.

    Returns:
        A ``SpikeWhaleConfig`` populated with the settings below.
    """
    # Transformer backbone: 640 hidden, 16 layers, 10 query heads / 5 KV heads.
    backbone = {
        "hidden_size": 640,
        "num_hidden_layers": 16,
        "num_attention_heads": 10,
        "num_key_value_heads": 5,  # GQA
        "head_dim": 64,
        "qk_rope_head_dim": 16,
        "q_lora_rank": 320,
        "o_lora_rank": 160,
        "tie_word_embeddings": True,
    }

    # DENSE: no MoE. moe_intermediate_size still sizes the DenseFFN (model.py).
    feed_forward = {
        "use_moe": False,
        "moe_intermediate_size": 1728,
    }

    # Strip the heavy extras so the parameters go to the LM, not machinery.
    # Everything in this group is switched off except use_xsa.
    extras = {
        "use_engram": False,
        "use_hyper_connections": False,
        "hc_mult": 1,
        "use_hrm_refine": False,
        "num_nextn_predict_layers": 0,
        "use_derf": False,
        "use_xsa": True,
    }

    # Keep the ModularMind bridge bus so train_link.py / the adapter still work.
    bridge = {
        "use_latent_io": True,
        "d_latent": 256,
    }

    # Uniform 1024 context (Supra used 1024). base_context MUST be >= the
    # training --seq-len.
    # NOTE(review): the module docstring and the line above both say 1024, but
    # base_context is set to 4096 here -- confirm which value is intended.
    positional = {
        "chain_position": 0,
        "base_context": 4096,
        "base_rope_theta": 10000.0,
    }

    return SpikeWhaleConfig(
        vocab_size=vocab_size,
        **backbone,
        **feed_forward,
        **extras,
        **bridge,
        **positional,
    )
def specialist_config(domain: str = "language", position: int = 0) -> SpikeWhaleConfig:
    """Return the dense ~80M specialist config for a registered domain.

    The vocabulary size is read from the ``"vocab"`` entry of
    ``registry.spec(domain)``, so registry.py stays the single source of truth.

    Args:
        domain: Domain name passed to ``registry.spec``.
        position: Accepted but not read by this function; the resulting
            config does not depend on it.

    Returns:
        The ``SpikeWhaleConfig`` built by ``_dense_80m`` for that vocab size.
    """
    domain_entry = spec(domain)
    return _dense_80m(vocab_size=domain_entry["vocab"])
def generic_specialist_config(vocab_size: int, position: int = 0) -> SpikeWhaleConfig:
    """Return the dense ~80M specialist config for an arbitrary vocab size.

    Unlike ``specialist_config`` this does not consult the registry, so new
    domains 'just work' as long as the caller knows the vocabulary size.

    Args:
        vocab_size: Vocabulary size forwarded to ``_dense_80m``.
        position: Accepted but not read by this function; the resulting
            config does not depend on it.

    Returns:
        The ``SpikeWhaleConfig`` built by ``_dense_80m``.
    """
    config = _dense_80m(vocab_size=vocab_size)
    return config
# Foundation chain ordering: maps each specialist's "position" to its domain
# name. Derived from the registry, so it grows automatically as domains are
# added there.
try:
    from registry import SPECIALISTS as _REG

    FOUNDATION_ORDER = dict(
        (entry["position"], domain_name) for domain_name, entry in _REG.items()
    )
except Exception:
    # Best-effort fallback: any failure while importing or reading the
    # registry leaves the hard-coded three-specialist ordering in place.
    FOUNDATION_ORDER = {0: "language", 1: "reasoning", 2: "tool_use"}