File size: 2,713 Bytes
45e7dfb
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
"""
specialist_presets.py -- ModularMind-on-V2 specialist sizing.

DENSE ~80M specialists (Supra-50M-style, scaled up): a dense Llama-ish transformer with
NO MoE / Engram / Hyper-Connections / HRM, so the parameters go into language modeling
instead of machinery -> coherent generation (the lesson from SupraLabs/Supra-50M-Base,
a dense model that produces coherent multi-paragraph text on FineWeb-Edu).

Shape (shared across domains; only the vocab differs, read from registry.py):
  hidden 640, 16 layers, 10 heads / 5 KV (GQA), dense FFN 1728, ctx 1024 (base_context 4096), d_latent 256.
  -> ~81.5M params at vocab 16384 (the shared length-max tokenizer).

The bridge bus (d_latent=256) and latent IO are kept, so train_link.py / the Gradio
adapter still work after retraining.
"""
from config import SpikeWhaleConfig
from registry import spec


def _dense_80m(vocab_size: int) -> SpikeWhaleConfig:
    """Build the shared dense ~80M specialist config for ``vocab_size``.

    Every specialist uses the same shape; only ``vocab_size`` varies. The
    settings are grouped by concern below and handed to ``SpikeWhaleConfig``
    as keyword arguments in a single call.
    """
    # Backbone / attention geometry: 10 query heads sharing 5 KV heads (GQA).
    backbone = dict(
        vocab_size=vocab_size,
        hidden_size=640,
        num_hidden_layers=16,
        num_attention_heads=10,
        num_key_value_heads=5,
        head_dim=64,
        qk_rope_head_dim=16,
        q_lora_rank=320,
        o_lora_rank=160,
        tie_word_embeddings=True,
    )

    # Dense feed-forward: MoE is off, but moe_intermediate_size is still the
    # knob that sizes the DenseFFN (per the note pointing at model.py).
    feed_forward = dict(
        use_moe=False,
        moe_intermediate_size=1728,
    )

    # Heavy extras are switched off so the parameter budget goes to the LM
    # itself rather than auxiliary machinery.
    extras = dict(
        use_engram=False,
        use_hyper_connections=False,
        hc_mult=1,
        use_hrm_refine=False,
        num_nextn_predict_layers=0,
        use_derf=False,
        use_xsa=True,
    )

    # The ModularMind bridge bus stays on so train_link.py and the adapter
    # keep working.
    bridge = dict(
        use_latent_io=True,
        d_latent=256,
    )

    # Context settings. base_context MUST be >= the training --seq-len.
    # NOTE(review): the original comment here described a uniform 1024 context
    # (Supra used 1024) while base_context is 4096 -- confirm which is intended.
    context = dict(
        chain_position=0,
        base_context=4096,
        base_rope_theta=10000.0,
    )

    return SpikeWhaleConfig(
        **backbone,
        **feed_forward,
        **extras,
        **bridge,
        **context,
    )


def specialist_config(domain: str = "language", position: int = 0) -> SpikeWhaleConfig:
    """Return the dense ~80M specialist config for a registered ``domain``.

    The vocab size is looked up through ``registry.spec`` (the single source
    of truth) and forwarded to the shared dense preset.

    ``position`` is accepted but not used by this function.
    """
    domain_vocab = spec(domain)["vocab"]
    return _dense_80m(domain_vocab)


def generic_specialist_config(vocab_size: int, position: int = 0) -> SpikeWhaleConfig:
    """Return the dense ~80M specialist config for an arbitrary ``vocab_size``.

    Lets a new domain reuse the shared shape without a registry entry.

    ``position`` is accepted but not used by this function.
    """
    config = _dense_80m(vocab_size=vocab_size)
    return config


# Position -> domain map for the foundation chain. It is built from the
# registry, so newly registered specialists show up without editing this file.
try:
    from registry import SPECIALISTS as _REG
    FOUNDATION_ORDER = {
        entry["position"]: domain for domain, entry in _REG.items()
    }
except Exception:
    # Any failure while reading the registry falls back to the static order.
    FOUNDATION_ORDER = {0: "language", 1: "reasoning", 2: "tool_use"}