# Spaces: Running on Zero
# File size: 2,108 Bytes
# 45e7dfb
"""
registry.py -- the SINGLE SOURCE OF TRUTH for specialists.
To add a new specialist later, add ONE entry here, then run:
    python train_tokenizer.py --domains <name>
    python train_specialist.py --domain <name> --muon --normuon --dean-schedule --compile
    python train_link.py --asker language --consultant <name>   # so it can be consulted
Every other script (tokenizer, data prep, model sizing, link trainer) reads from
here, so nothing else needs editing. `position` is the chain slot (context-doubling
/ chain ordering in the V2 config); give each new specialist the next index.
"""
# vocab=16384: the shared length-max tokenizer (covers prose + math; all 256 bytes). Both
# active specialists use the SAME tokenizer -> identical vocab makes routing directly comparable.
# PURE specialization (language=100% FineWeb-Edu, reasoning=100% FineMath) -> crisp routing.
# Registry of every known specialist, keyed by name. Each entry records the
# dataset id and config to read, which example field(s) hold the training
# text, the tokenizer vocab size, and the specialist's chain slot (`position`).
SPECIALISTS = {
    "language": {
        "dataset": "HuggingFaceFW/fineweb-edu",
        "config": "sample-100BT",
        "field": "text",
        "vocab": 16384,
        "position": 0,
    },
    "reasoning": {
        "dataset": "HuggingFaceTB/finemath",
        "config": "finemath-3plus",
        "field": "text",
        "vocab": 16384,
        "position": 1,
    },
    # dormant -- not in ACTIVE; ignore for now (no tool calling this round).
    # Its `field` is a tuple: text_of() joins those fields with newlines.
    "tool_use": {
        "dataset": "glaiveai/glaive-function-calling-v2",
        "config": None,
        "field": ("system", "chat"),
        "vocab": 16384,
        "position": 2,
    },
}

# Names of the specialists being actively trained right now (the "foundation" set).
ACTIVE = ["language", "reasoning"]
def spec(name):
    """Return the registry entry for the specialist called *name*.

    Args:
        name: a key of ``SPECIALISTS``.

    Returns:
        The entry stored in ``SPECIALISTS`` (the same object, not a copy).

    Raises:
        KeyError: if *name* is not registered; the message lists the known names.
    """
    if name in SPECIALISTS:
        return SPECIALISTS[name]
    message = (f"unknown specialist {name!r}; add it to registry.SPECIALISTS. "
               f"known: {list(SPECIALISTS)}")
    raise KeyError(message)
def text_of(name_or_spec, ex):
    """Extract the training text from a streamed example (handles multi-field specs).

    Args:
        name_or_spec: either a spec dict (anything with a ``"field"`` key) or a
            specialist name, which is resolved through ``spec()``.
        ex: the example; only its ``.get(key, default)`` method is used.

    Returns:
        Always a ``str``. For a single field name, that field's value; for a
        tuple/list of field names, the values joined with newlines. Missing,
        ``None`` or otherwise falsy values contribute an empty string.

    Raises:
        KeyError: propagated from ``spec()`` when given an unknown specialist name.
    """
    s = name_or_spec if isinstance(name_or_spec, dict) else spec(name_or_spec)
    fields = s["field"]
    if not isinstance(fields, (tuple, list)):
        # Treat a lone field name as a one-element sequence so both shapes go
        # through the same str() coercion; previously the single-field path
        # returned non-string values (ints, bytes, lists) unconverted.
        fields = (fields,)
    return "\n".join(str(ex.get(k, "") or "") for k in fields)
|