""" registry.py -- the SINGLE SOURCE OF TRUTH for specialists. To add a new specialist later, add ONE entry here, then: python train_tokenizer.py --domains python train_specialist.py --domain --muon --normuon --dean-schedule --compile python train_link.py --asker language --consultant # so it can be consulted Every other script (tokenizer, data prep, model sizing, link trainer) reads from here, so nothing else needs editing. `position` is the chain slot (context-doubling / chain ordering in the V2 config); give each new specialist the next index. """ # vocab=16384: the shared length-max tokenizer (covers prose + math; all 256 bytes). Both # active specialists use the SAME tokenizer -> identical vocab makes routing directly comparable. # PURE specialization (language=100% FineWeb-Edu, reasoning=100% FineMath) -> crisp routing. SPECIALISTS = { "language": dict(dataset="HuggingFaceFW/fineweb-edu", config="sample-100BT", field="text", vocab=16384, position=0), "reasoning": dict(dataset="HuggingFaceTB/finemath", config="finemath-3plus", field="text", vocab=16384, position=1), # dormant -- not in ACTIVE; ignore for now (no tool calling this round) "tool_use": dict(dataset="glaiveai/glaive-function-calling-v2", config=None, field=("system", "chat"), vocab=16384, position=2), } # which specialists you are actively training right now (the "foundation" set). ACTIVE = ["language", "reasoning"] def spec(name): if name not in SPECIALISTS: raise KeyError(f"unknown specialist {name!r}; add it to registry.SPECIALISTS. " f"known: {list(SPECIALISTS)}") return SPECIALISTS[name] def text_of(name_or_spec, ex): """Extract the training text from a streamed example (handles multi-field specs).""" s = name_or_spec if isinstance(name_or_spec, dict) else spec(name_or_spec) f = s["field"] if isinstance(f, (tuple, list)): return "\n".join(str(ex.get(k, "") or "") for k in f) return ex.get(f, "") or ""