Spaces:
Running on Zero
Running on Zero
"""
registry.py -- the SINGLE SOURCE OF TRUTH for specialists.

To add a new specialist later, add ONE entry here, then:

    python train_tokenizer.py --domains <name>
    python train_specialist.py --domain <name> --muon --normuon --dean-schedule --compile
    python train_link.py --asker language --consultant <name>   # so it can be consulted

Every other script (tokenizer, data prep, model sizing, link trainer) reads from
here, so nothing else needs editing. `position` is the chain slot (context-doubling
/ chain ordering in the V2 config); give each new specialist the next index.
"""
# Every entry uses vocab=16384, the shared length-max tokenizer (covers prose + math;
# all 256 bytes). Because both active specialists share that one tokenizer, their
# vocabularies are identical, which makes routing directly comparable.
# Specialization is PURE (language = 100% FineWeb-Edu, reasoning = 100% FineMath)
# so that routing stays crisp.
#
# Per-entry keys:
#   dataset  -- dataset identifier
#   config   -- dataset config name, or None when there is none
#   field    -- example key holding the text; a tuple means several keys are combined
#   vocab    -- tokenizer vocabulary size
#   position -- chain slot; each new specialist takes the next index
SPECIALISTS = {
    "language": {
        "dataset": "HuggingFaceFW/fineweb-edu",
        "config": "sample-100BT",
        "field": "text",
        "vocab": 16384,
        "position": 0,
    },
    "reasoning": {
        "dataset": "HuggingFaceTB/finemath",
        "config": "finemath-3plus",
        "field": "text",
        "vocab": 16384,
        "position": 1,
    },
    # dormant -- not in ACTIVE; ignore for now (no tool calling this round)
    "tool_use": {
        "dataset": "glaiveai/glaive-function-calling-v2",
        "config": None,
        "field": ("system", "chat"),
        "vocab": 16384,
        "position": 2,
    },
}

# The specialists being actively trained right now (the "foundation" set).
ACTIVE = ["language", "reasoning"]
def spec(name):
    """Return the registry entry for the specialist called ``name``.

    Raises:
        KeyError: if ``name`` is not a key of ``SPECIALISTS``; the message
            lists the known specialist names.
    """
    if name in SPECIALISTS:
        return SPECIALISTS[name]
    known = list(SPECIALISTS)
    raise KeyError(
        f"unknown specialist {name!r}; add it to registry.SPECIALISTS. "
        f"known: {known}"
    )
def text_of(name_or_spec, ex):
    """Extract the training text from a streamed example (handles multi-field specs).

    Args:
        name_or_spec: either a spec dict, which is used as-is, or a specialist
            name, which is resolved through ``spec()``.
        ex: the example to read from; it must provide ``.get(key, default)``.

    Returns:
        Always a ``str``. For a single ``field`` it is that field's value; for a
        tuple/list ``field`` the values are joined with newlines in the listed
        order. A missing or falsy value contributes the empty string.

    Raises:
        KeyError: if a name is unknown to ``spec()`` or the spec has no
            ``"field"`` key.
    """
    entry = name_or_spec if isinstance(name_or_spec, dict) else spec(name_or_spec)
    field = entry["field"]
    # Normalise to a sequence of keys so both spec shapes share one code path.
    # This guarantees the single-field case is coerced with str() exactly like
    # the multi-field case, instead of handing back a raw non-string value.
    keys = field if isinstance(field, (tuple, list)) else (field,)
    return "\n".join(str(ex.get(key, "") or "") for key in keys)