File size: 2,108 Bytes
45e7dfb
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
"""
registry.py -- the SINGLE SOURCE OF TRUTH for specialists.

To add a new specialist later, add ONE entry here, then:
    python train_tokenizer.py  --domains <name>
    python train_specialist.py --domain  <name> --muon --normuon --dean-schedule --compile
    python train_link.py       --asker language --consultant <name>   # so it can be consulted

Every other script (tokenizer, data prep, model sizing, link trainer) reads from
here, so nothing else needs editing. `position` is the chain slot (context-doubling
/ chain ordering in the V2 config); give each new specialist the next index.
"""

# Registry table: one entry per specialist, keyed by specialist name.
#   dataset / config -- dataset identifier and its config name (None when there is none)
#   field            -- example key holding the text; a tuple means several keys are combined
#   vocab            -- tokenizer vocabulary size
#   position         -- chain slot index
# vocab=16384 is the shared length-max tokenizer (covers prose + math; all 256 bytes).
# Both active specialists use the SAME tokenizer, so their identical vocab makes routing
# directly comparable. Specialization is PURE (language=100% FineWeb-Edu,
# reasoning=100% FineMath) to keep routing crisp.
SPECIALISTS = {
    "language": {
        "dataset": "HuggingFaceFW/fineweb-edu",
        "config": "sample-100BT",
        "field": "text",
        "vocab": 16384,
        "position": 0,
    },
    "reasoning": {
        "dataset": "HuggingFaceTB/finemath",
        "config": "finemath-3plus",
        "field": "text",
        "vocab": 16384,
        "position": 1,
    },
    # dormant -- not listed in ACTIVE; ignore for now (no tool calling this round)
    "tool_use": {
        "dataset": "glaiveai/glaive-function-calling-v2",
        "config": None,
        "field": ("system", "chat"),
        "vocab": 16384,
        "position": 2,
    },
}

# The "foundation" set: the specialists being actively trained right now.
ACTIVE = ["language", "reasoning"]


def spec(name):
    """Return the SPECIALISTS entry registered under ``name``.

    Raises:
        KeyError: if ``name`` is not a key of SPECIALISTS; the message lists
            the names that are registered.
    """
    if name in SPECIALISTS:
        return SPECIALISTS[name]
    known = list(SPECIALISTS)
    raise KeyError(f"unknown specialist {name!r}; add it to registry.SPECIALISTS. "
                   f"known: {known}")


def text_of(name_or_spec, ex):
    """Extract the training text from a streamed example (handles multi-field specs).

    Args:
        name_or_spec: either a spec dict (used as-is) or a specialist name,
            which is resolved through ``spec``.
        ex: the example; it is read with ``.get``, so it must be dict-like.

    Returns:
        A ``str`` in every case. When the spec's ``"field"`` is a tuple or
        list, the values of those keys are joined with newlines; otherwise the
        single field's value is returned. Missing or falsy values (e.g. ``None``)
        contribute an empty string.

    Raises:
        KeyError: if a name is given that ``spec`` does not know, or the spec
            has no ``"field"`` entry.
    """
    entry = name_or_spec if isinstance(name_or_spec, dict) else spec(name_or_spec)
    field = entry["field"]
    # Treat a single field as a one-element sequence so both shapes share one
    # code path and non-string values are coerced with str() in both.
    keys = field if isinstance(field, (tuple, list)) else (field,)
    return "\n".join(str(ex.get(key, "") or "") for key in keys)