""" panel.py -- the Gradio section for the bottom of the boss app: a live demo of the Modular-Mind mixture-of-experts. For the SpikeWhale backend it leads with the *latent bridge* (the real result) and organizes the three demos into tabs. Output is rendered as rich HTML (animated routing cards, a latent-bus strip, character-diff key recovery, live token streaming) instead of markdown tables. Every handler is a generator that yields an instant "loading" notice first, so the first run never looks frozen while the ~80M models lazy-load. Hot-reloads checkpoints. """ from __future__ import annotations import html as _h import os import sys import gradio as gr # ZeroGPU: @spaces.GPU allocates a GPU only for the decorated call (CUDA is never touched at # import/startup). Falls back to a no-op decorator when `spaces` isn't installed (local / plain CPU). try: import spaces _gpu = spaces.GPU except Exception: def _gpu(fn=None, **kw): return fn if callable(fn) else (lambda f: f) def _to_gpu(moe): if hasattr(moe, "to_gpu_if_available"): moe.to_gpu_if_available() return moe EMOJI = {"language": "📖 Language", "math": "➗ Math", "tool": "🛠️ Tool-use"} COLOR = {"language": "#6aa9ff", "math": "#58d68d", "tool": "#f5b041"} DEVICE = os.environ.get("MM_AGENTS_DEVICE", "cpu") # Self-contained SpikeWhale bundle that ships next to this file (agents/modmind/: the 80M # specialists + bridge + inference code). If it's present we default to the SpikeWhale backend # so the HuggingFace Space "just works" with no env config. Env vars still override. 
_BUNDLED_MODMIND = os.path.join(os.path.dirname(os.path.abspath(__file__)), "modmind")
# SpikeWhale becomes the default only when the bundled directory exists on disk.
_DEFAULT_BACKEND = "spikewhale" if os.path.isdir(_BUNDLED_MODMIND) else "bytegpt"
# True when the selected backend (env override, else the default) is a SpikeWhale alias.
_SPIKEWHALE = os.environ.get("MM_MOE_BACKEND", _DEFAULT_BACKEND).lower() in ("spikewhale", "modmind")
_WARMED = {"done": False}   # so the "loading the models" notice only shows on the first run
# Footer text shown under the routing demo; the variant is fixed at import time.
_FOOTER = (
    "Two ~80M dense specialists — 📖 Language (FineWeb-Edu) and ➗ Math (FineMath) — sharing a "
    "16k length-max tokenizer. A coordinator routes by bits-per-byte, and a trained RecursiveLink "
    "lets them communicate in latent space (proven in the Bridge tab). Hot-reloads checkpoints."
    if _SPIKEWHALE else
    "Three byte-level ~10M specialists, streamed-trained on FineWeb-Edu / FineMath / "
    "glaive-function-calling. Tiny + early-trained, so generations are rough — the routing "
    "(which expert is most confident) is the point. It hot-reloads as training continues."
)


def _get_moe():
    """Pick the MoE backend and return its ``get_moe`` factory.

    Defaults to the bundled SpikeWhale 80M specialists (agents/modmind/) when present,
    else the byte-level ByteGPT experts. MM_MOE_BACKEND and MODMIND_DIR override.
    The environment is re-read on every call, and the backend module is imported lazily.
    """
    backend = os.environ.get("MM_MOE_BACKEND", _DEFAULT_BACKEND).lower()
    if backend in ("spikewhale", "modmind"):
        mm_dir = os.environ.get("MODMIND_DIR", _BUNDLED_MODMIND)
        if mm_dir and mm_dir not in sys.path:
            sys.path.insert(0, mm_dir)   # front: ModMind's model.py wins over agents/model.py
        from moe_gradio import get_moe
        return get_moe
    from orchestrator import get_moe
    return get_moe


# ---- HTML rendering -------------------------------------------------------------
# NOTE(review): _CSS is empty here, and many string literals below contain raw line breaks
# where markup would be expected — confirm against the canonical copy of this file.
_CSS = """"""


def _wrap(body):
    """Return ``body`` prefixed with ``_CSS`` and enclosed by the wrapper literals below."""
    return _CSS + "

" + body + "

"


def _esc(s):
    """HTML-escape ``s`` (a falsy value is treated as "") and substitute every newline
    with the replacement literal below."""
    return _h.escape(s or "").replace("\n", "
")


def _notice(action="Generating"):
    """First-run popup + in-place message so nothing ever looks frozen.

    Before the first successful run (``_WARMED["done"]`` is False) this raises a
    ``gr.Info`` toast and returns the long "loading" message; afterwards it returns a
    short "<action>…" message.
    """
    if not _WARMED["done"]:
        try:
            gr.Info("First run — loading the models (~20–40s on CPU). After this, it's quick.")
        except Exception:
            # Best effort: the toast is optional, the returned message still appears.
            pass
        return _wrap(f"

⏳ Loading the ~80M specialists + {action.lower()}… "
                     "first run can take ~20–40s on CPU; every run after is fast.

")
    return _wrap(f"

⏳ {action}…

")


def _msg(title, body):
    """Render a titled message. Neither argument is escaped here, so callers must pass
    text that is already safe to embed."""
    return _wrap(f"

{title}
{body}

")


def _cards(winner, weights, bits, steps):
    """One animated card per expert: fluency, routing weight bar, winner badge + glow.

    ``weights`` drives the iteration (name -> routing weight); ``bits`` must contain
    every name in ``weights``; ``steps`` may omit names (0 is shown).
    """
    out = []
    for n, wv in weights.items():
        c = COLOR.get(n, "#9b59b6")   # fallback colour for unknown expert names
        win = (n == winner)
        style = f"border-color:{c};box-shadow:0 0 16px {c}40" if win else ""
        badge = f"ROUTED ✓" if win else ""
        # NOTE(review): `style` is not referenced by the literals below as they appear
        # here — presumably it belonged to markup that is missing; confirm.
        out.append(
            f"

{badge}"
            f"

{EMOJI.get(n, n)}

"
            f"

{steps.get(n, 0):,} train steps · {bits[n]:.2f} bits/byte (lower = more fluent)

"
            f"

routing weight {wv*100:.1f}%

")
    return "

" + "".join(out) + "

"


def _latent(shared, n=48):
    """The shared latent bus as a strip of signed bars (like the piano's latent strip).

    Uses at most the first ``n`` values of ``shared``; returns "" when there are none.
    """
    vals = list(shared or [])[:n]
    if not vals:
        return ""
    mx = max(1e-6, max(abs(v) for v in vals))   # floor avoids dividing by zero
    # NOTE(review): the f-string below looks truncated (it starts mid-expression and
    # never uses `mx`) — restore it from the canonical file before relying on it.
    cells = "".join(
        f"= 0 else '#df7a5b'}'>" for v in vals)
    return (f"

{cells}

"
            f"

the shared latent bus — every expert's output latent, fused by the "
            f"RecursiveLink (first {len(vals)} of 256 dims; blue = +, orange = −)

")


def _gen_box(prompt, gen, live=False):
    """Render the escaped prompt followed by the escaped generation, plus a caret."""
    # NOTE(review): both branches are the empty string here — presumably the live
    # caret markup is missing; confirm.
    caret = "" if live else ""
    return (f"

{_esc(prompt)}"
            f"{_esc(gen)}{caret}

")


def _key_rows(examples):
    """Wordle-style per-character diff: secret key -> what the asker recovered.

    ``examples`` is an iterable of ``(key, recovered, ok)``; positions past the end of
    ``recovered`` are shown as '·', and rows with a truthy ``ok`` get a ✅.
    """
    rows = []
    for k, rec, ok in examples:
        sec = "".join(f"{_h.escape(ch)}" for ch in k)
        got = "".join(
            f""
            f"{_h.escape(rec[i]) if i < len(rec) else '·'}"
            for i, ch in enumerate(k))
        rows.append(f"

{sec}→{got}"
                    f"{' ✅' if ok else ''}

")
    return "".join(rows)


def _char_acc(examples):
    """Fraction of key characters reproduced at the same position, pooled over all
    ``(key, recovered, _)`` examples. Returns 0.0 when there are no characters."""
    tot = hit = 0
    for k, rec, _ in examples:
        for i, ch in enumerate(k):
            tot += 1
            hit += int(i < len(rec) and rec[i] == ch)
    return hit / max(1, tot)   # max(1, ...) guards the empty case


# ---- handlers ---------------------------------------------------------------------
@_gpu(duration=120)
def moe_run(query, max_new):
    """Route ``query`` to an expert and generate up to ``max_new`` tokens.

    Generator handler: yields a loading notice first, then either an "unavailable"
    message or the routing header followed by the generation. When the backend has
    ``generate_stream`` every partial generation is yielded; otherwise ``moe.run`` is
    called once. An empty query falls back to "The".
    """
    yield _notice("Routing & generating")
    moe = _to_gpu(_get_moe()(DEVICE))
    if not moe.available():
        if _SPIKEWHALE:
            yield _msg("⏳ No SpikeWhale experts found",
                       "Set MODMIND_DIR to your ModMind folder and make sure "
                       "<domain>/checkpoints/step_*.pt exist (the panel hot-reloads them).")
        else:
            yield _msg("⏳ No experts trained yet",
                       "Run python agents/train.py --expert language (and math, tool).")
        return
    q = (query or "").strip() or "The"
    winner, weights, bits = moe.route(q)
    _, shared = moe.shared_latent(q)
    steps = dict(getattr(moe, "steps", {}) or {})   # tolerate a missing or None attribute
    c = COLOR.get(winner, "#9b59b6")
    head = (f"

🧭 Routed to {EMOJI.get(winner, winner)}"
            f" — the expert most fluent on your text (lowest bits/byte) wins

"
            + _cards(winner, weights, bits, steps) + _latent(shared))
    gen = ""
    if hasattr(moe, "generate_stream"):   # live token streaming
        for _, gen in moe.generate_stream(q, winner, max_new=int(max_new)):
            yield _wrap(head + _gen_box(q, gen, live=True))
    else:
        r = moe.run(q, max_new=int(max_new))
        gen = r.get("generation", "")
    _WARMED["done"] = True   # later notices use the short form
    yield _wrap(head + _gen_box(q, gen, live=False) + f"

{_FOOTER}

")


@_gpu(duration=120)
def moe_key_recall(n):
    """THE PROOF: a random key shown only to the consultant; the asker reproduces it
    from the latent alone (with) vs ablated (without).

    Generator handler: runs ``moe.key_recall`` twice with ``n`` keys (``ablate=False``
    and ``ablate=True``) and yields one report comparing per-character accuracy.
    Yields a "Bridge unavailable" message when the backend lacks the capability.
    """
    yield _notice("Running the proof")
    moe = _to_gpu(_get_moe()(DEVICE))
    if not getattr(moe, "key_recall_available", lambda: False)():
        yield _msg("🔑 Bridge unavailable",
                   "Needs the SpikeWhale backend and a trained "
                   "links/<asker>__from__<consultant>.pt saved with the full asker.")
        return
    meta = moe.consult_meta()
    a = EMOJI.get(meta["asker"], meta["asker"]); c = EMOJI.get(meta["consultant"], meta["consultant"])
    wr = moe.key_recall(n=int(n), ablate=False)   # with the latent
    ar = moe.key_recall(n=int(n), ablate=True)    # latent ablated
    _WARMED["done"] = True
    cw, ca = _char_acc(wr["examples"]) * 100, _char_acc(ar["examples"]) * 100
    stats = (
        "

"
        f"

{cw:.0f}%

"
        f"

secret characters recovered
WITH the latent

"
        f"

{ca:.0f}%

"
        f"

recovered with the latent
CUT (ablated to zero)

"
        f"

1.6%

"
        f"

chance level
(1 in 62 per character)

"
        "

")
    yield _wrap(
        f"

🔑 {a} read {c}'s mind through the latent bridge

"
        f"

A random secret key is shown only to {c}. {a} never sees it — "
        f"it must reproduce the key purely by reading {c}'s latent through the trained RecursiveLink.

"
        + stats +
        f"

secret key (only {c} saw it) → what {a} recovered, character by character"
        f" · {wr['acc']*100:.0f}% of keys perfectly exact

"
        + _key_rows(wr["examples"]) +
        "

Cut the latent and recovery collapses to chance — that gap is the result: "
        "real information crossing between two models that were trained separately, on different data, "
        "and never met. Routing and generation are the supporting act.

")


def _tile_row(label, chars, classes):
    """Render one labelled row of character tiles; an empty character is shown as '·'.

    ``label`` is inserted unescaped; each character is HTML-escaped.
    """
    # NOTE(review): `cls` is unused in the literal as it appears here — presumably it
    # selected a tile class in markup that is missing; confirm.
    cells = "".join(f"{_h.escape(ch) if ch else '·'}" for ch, cls in zip(chars, classes))
    return f"

{label}{cells}

"


@_gpu(duration=120)
def moe_secret(secret):
    """Interactive bridge demo: the user's secret is shown ONLY to Math; Language
    answers 'what did Math just see?' from the latent alone — legible content, not
    steered babble.

    Generator handler: calls ``moe.relay_secret`` with and without ablation and yields
    a three-row comparison (secret / recovered / recovered with the latent cut). An
    ``error`` entry in the first result is shown (escaped) instead.
    """
    yield _notice("Transmitting through the latent bridge")
    moe = _to_gpu(_get_moe()(DEVICE))
    if not getattr(moe, "relay_secret", None) or not getattr(moe, "key_recall_available", lambda: False)():
        yield _msg("📨 Bridge unavailable",
                   "Needs the SpikeWhale backend and a trained bridge saved with the full asker.")
        return
    meta = moe.consult_meta()
    a = EMOJI.get(meta["asker"], meta["asker"]); c = EMOJI.get(meta["consultant"], meta["consultant"])
    wr = moe.relay_secret(secret, ablate=False)
    if wr.get("error"):
        yield _msg("📨 " + _h.escape(wr["error"]),
                   "Type exactly 6 characters, letters and digits only — e.g. Xy9Qz2.")
        return
    ar = moe.relay_secret(secret, ablate=True)
    _WARMED["done"] = True
    s, got, abl = wr["secret"], wr["recovered"], ar["recovered"]
    nok = sum(wr["ok"])   # number of positions flagged ok
    rows = (
        _tile_row(f"you told {c} (only {c} saw this):", list(s), ["k"] * len(s))
        + _tile_row(f"{a} read from {c}'s latent:",
                    [got[i] if i < len(got) else "" for i in range(len(s))],
                    ["g" if ok else "r" for ok in wr["ok"]])
        + _tile_row("same question, latent cut:",
                    [abl[i] if i < len(abl) else "" for i in range(len(s))],
                    ["g" if ok else "r" for ok in ar["ok"]])
    )
    align_note = "" if wr["aligned"] else (
        "

⚠️ The tokenizer fused some of those characters into multi-character tokens "
        "the bridge never saw in training (it was trained on random-looking keys), so transmission "
        "degrades. Random-looking mixes of letters and digits — like Xy9Qz2 — transmit best.

")
    yield _wrap(
        f"

📨 {a} read your secret out of {c}'s mind — "
        f"{nok}/{len(s)} characters arrived intact

"
        f"

{a} never saw your text. It answered one question — “what did {c} just "
        f"see?” — using only {c}'s latent, passed through the trained RecursiveLink.

"
        + rows + align_note +
        f"

The bridge is a noisy channel (~4–5 of 6 characters usually survive), but cut "
        f"the latent and the answer collapses to gibberish — the content is genuinely crossing in latent "
        f"space, never as text. Two models, trained separately on different data, sharing a thought.

")


@_gpu(duration=120)
def moe_ask(a, op, b):
    """The Q->A bridge: an arithmetic question is shown ONLY to Math; Language answers
    it reading nothing but Math's latent (trained by train_qa_link.py,
    held-out-validated).

    Generator handler. ``op`` is normalised ("×"/"x" -> "*", "−" -> "-"), ``a`` and
    ``b`` are coerced to int, and operands outside the ranges checked below are
    rejected with a message. For subtraction the operands are swapped when ``a < b``.
    """
    yield _notice("Asking Math through the bridge")
    moe = _to_gpu(_get_moe()(DEVICE))
    if not getattr(moe, "qa_available", lambda: False)():
        yield _msg("🧮 The question→answer bridge isn't trained yet",
                   "Run python agents/modmind/train_qa_link.py — the panel "
                   "hot-reloads the result as soon as a checkpoint is saved.")
        return
    op = {"×": "*", "−": "-", "x": "*"}.get(str(op), str(op))
    try:
        a, b = int(a), int(b)
    except (TypeError, ValueError):
        yield _msg("🧮 Need two whole numbers", "Pick a and b first.")
        return
    if op == "*" and not (2 <= a <= 12 and 2 <= b <= 12):
        yield _msg("🧮 Outside the trained range", "Multiplication was trained on 2–12 × 2–12.")
        return
    if op in ("+", "-") and not (10 <= a <= 99 and 10 <= b <= 99):
        yield _msg("🧮 Outside the trained range", "Addition and subtraction were trained on 10–99.")
        return
    if op == "-" and a < b:
        a, b = b, a   # trained on non-negative answers
    wr = moe.ask_math(a, op, b)
    if wr.get("error"):
        yield _msg("🧮 " + _h.escape(wr["error"]), "Try a different problem.")
        return
    ar = moe.ask_math(a, op, b, ablate=True)
    _WARMED["done"] = True
    info = moe.qa_info() or {}
    A = EMOJI.get(info.get("asker", "language"), "📖 Language")
    C = EMOJI.get(info.get("consultant", "math"), "➗ Math")
    # A missing "holdout_exact" becomes NaN, which the scorecard formats as "nan".
    acc = info.get("holdout_exact", float("nan")) * 100
    memorize = info.get("mode", "memorize") == "memorize"
    # NOTE(review): this lookup raises KeyError for any op other than + - * that
    # reaches this point without an "error" from ask_math — confirm that cannot happen.
    opd = {"+": "+", "-": "−", "*": "×"}[op]
    verdict = ("✅ correct" if wr["exact"]
               else f"❌ not quite (it's {wr['truth']})")
    if memorize:
        scorecard = (
            f"Honest scorecard: this bridge was trained on the whole "
            f"table of two-digit problems (10–99 for + and −, 2–12 for ×) and answers "
            f"~{acc:.0f}% of them correctly. 
It's a lookup table transmitted through the "
            f"latent, not learned arithmetic — {C} stays frozen and never computes; the bridge + "
            f"{A}'s fine-tune memorized every answer and the question only ever travels in latent "
            f"space. Cut the latent and {A} has no question at all.")
    else:
        scorecard = (
            f"Honest scorecard: this bridge solves {acc:.0f}% of problems it has never seen "
            f"in training exactly (held-out validation — generalization). {C} stays frozen; the "
            f"arithmetic skill lives in the bridge + {A}'s fine-tune, and the question only ever "
            f"travels in latent space. Cut the latent and {A} has no question at all.")
    rows = (
        _tile_row(f"the right answer (never shown to anyone):", list(wr["want"]), ["k"] * len(wr["want"]))
        + _tile_row(f"{A} answered, reading {C}'s latent:",
                    [wr["digits"][i] if i < len(wr["digits"]) else "" for i in range(len(wr["want"]))],
                    ["g" if ok else "r" for ok in wr["ok"]])
        + _tile_row("same prompt, latent cut:",
                    [ar["digits"][i] if i < len(ar["digits"]) else "" for i in range(len(wr["want"]))],
                    ["g" if ok else "r" for ok in ar["ok"]])
    )
    yield _wrap(
        f"

🧮 Only {C} saw {a} {opd} {b} — "
        f"{A} answered {_h.escape(wr['answer'])} · {verdict}

"
        f"

{A}'s entire input was the prompt ANS>. The question "
        f"existed only in {C}'s mind — it crossed to {A} as a 256-dim latent through a RecursiveLink "
        f"trained for question→answer (zero-padded to {len(wr['want'])} digits).

"
        + rows +
        f"

{scorecard}

")


@_gpu(duration=120)
def moe_combine(query, max_new, blend, consult):
    """Two blends compared at the same mix ratio: a real WEIGHT-MERGE (one merged
    model) vs an OUTPUT-BLEND (two models run separately, distributions averaged).

    Generator handler: calls ``moe.merge_generate`` and ``moe.combine`` with the same
    prompt, ``float(blend)`` ratio, token budget and ``consult`` flag, then yields both
    generations together. An empty query falls back to "The water cycle works by".
    """
    yield _notice("Building merge + blending")
    moe = _to_gpu(_get_moe()(DEVICE))
    if not getattr(moe, "merge_available", lambda: False)():
        yield _msg("🧬 Unavailable", "Needs both specialists loaded.")
        return
    q = (query or "").strip() or "The water cycle works by"
    a = float(blend)
    merged_gen = moe.merge_generate(q, alpha=a, max_new=int(max_new), consult=bool(consult))
    blend_gen = moe.combine(q, max_new=int(max_new), blend=a, consult=bool(consult))
    _WARMED["done"] = True
    extra = " · +Reasoning's latent (consult)" if consult else ""
    yield _wrap(
        "

🧬 MoE Modular Minds — two ways to blend

"
        f"

{int(round((1-a)*100))}% 📖 Language ⟷ "
        f"{int(round(a*100))}% ➗ Math{extra}

"
        "

"
        f"

① Weight merge — ONE model whose weights are "
        f"(1−α)·Language + α·Math

"
        f"{_esc(q)} {_esc(merged_gen)}

"
        f"

② Output blend — both models run, next-token "
        f"distributions averaged each step

"
        f"{_esc(q)} {_esc(blend_gen)}

"
        "

Same mix ratio, two different mechanisms. Weight merge fuses the actual "
        "parameters into one network (only possible because they're the identical dense architecture); "
        "output blend is an inference-time ensemble of two separate models (only possible because "
        "they share the 16k tokenizer). Tick consult to also route Reasoning's latent into each "
        "through the trained bridge. Exploratory — generations are rough at this scale.

")


# Markdown copy for the SpikeWhale panel header and the tab intros.
HERO = """# 🧩 Modular Mind — two specialists that talk in latent space **Two ~80M models trained completely separately** — 📖 **Language** on FineWeb-Edu, ➗ **Math** on FineMath — that never saw each other's data. A coordinator **routes** your query to the right one, and a trained **RecursiveLink** lets them **communicate through latent space**: Language can read information straight out of Math's "mind." The **🔑 Bridge** tab proves it. > ℹ️ *These specialists were trained only to demonstrate a **verifiable result** — clean routing and a > provable latent-bridge ablation — **not** for production-quality output. The generated text is > intentionally rough at this scale; the mechanism is the point.*"""

QA_INTRO = """### Ask ➗ Math a question — 📖 Language answers it without ever seeing it Pick an arithmetic problem. It is shown **only to ➗ Math** (which stays frozen). 📖 Language receives nothing but Math's 256-dim latent, passed through a RecursiveLink trained for **question→answer** — and types out the answer digits. Language's only text input is the prompt `ANS>`; the question itself crosses purely as a latent. The bridge has **memorized the whole table** of two-digit problems (a lookup table transmitted through latent space, not learned arithmetic) — cut the latent and Language has no question at all."""

SECRET_INTRO = """### Tell ➗ Math a secret — then watch 📖 Language read it out of Math's mind Type a 6-character code. It is shown **only to ➗ Math** — 📖 Language never sees your text. Language must answer one question: *“what did Math just see?”* — reading **only Math's latent** through the trained RecursiveLink. No text crosses between the models; the content arrives in latent space, legibly, character by character. (The channel is noisy — random-looking mixes of letters and digits transmit best.)"""

BRIDGE_INTRO = """### The proof: two independent models, one latent channel A random secret key is shown **only to ➗ Math**. 
📖 Language never sees it — but by reading Math's latent through the trained RecursiveLink, it **reproduces the key, character by character**. Zero out the latent and it collapses to chance. That gap *is* the result: real information crossing between two models that were trained on different data and never met. **Hit the button.**"""

# Intro for the byte-level (non-SpikeWhale) backend.
INTRO_BYTE = """## 🧩 Experiment — Modular Mind as a Mixture of Experts Three tiny ~10M byte-level specialists (language, math, tool-use), each streamed-trained on its own dataset. A coordinator **routes** your query to whichever expert is most fluent (perplexity-based MoE) and fuses their latents through a **RecursiveLink**. Try a math problem vs. a sentence."""


def _routing_block():
    """Build the routing demo widgets (prompt, token slider, button, HTML output,
    examples) in the current Gradio context and wire the button to ``moe_run``."""
    # NOTE(review): which widgets sit inside gr.Row() was inferred from their `scale=`
    # arguments — confirm the layout against the canonical file.
    with gr.Row():
        q = gr.Textbox(label="Your prompt", value="Solve for x: 2x + 3 = 11", scale=4,
                       placeholder="a sentence or a math problem…")
        n = gr.Slider(40, 300, value=80, step=20, label="generate tokens", scale=1)
    btn = gr.Button("🧭 Route & generate", variant="primary")
    out = gr.HTML()
    btn.click(moe_run, [q, n], out)
    gr.Examples(examples=[["The theory of evolution explains", 80],
                          ["Compute the derivative of x^2 + 3x", 80],
                          ["The history of the Roman Empire began", 80]], inputs=[q, n])


def build_moe_panel():
    """Create the MoE demo components inside the current gr.Blocks context.

    For a non-SpikeWhale backend only a collapsed accordion with the routing demo is
    built. Otherwise an open accordion is built with five tabs: bridge proof, secret
    relay, Q->A, routing, and the two-way blend.
    """
    # NOTE(review): Row/Tab nesting below was inferred from `scale=` arguments and
    # statement order — confirm the layout against the canonical file.
    if not _SPIKEWHALE:
        with gr.Accordion("🧩 Experiment: Modular Mind = Mixture of Experts (3 specialists)", open=False):
            gr.Markdown(INTRO_BYTE)
            _routing_block()
        return
    with gr.Accordion("🧩 Modular Mind — independent specialists communicating in latent space", open=True):
        gr.Markdown(HERO)
        with gr.Tabs():
            # The headline result, FIRST.
            with gr.Tab("🔑 The latent bridge — the proof"):
                gr.Markdown(BRIDGE_INTRO)
                with gr.Row():
                    kn = gr.Slider(4, 16, value=8, step=1, label="keys to test", scale=3)
                    kbtn = gr.Button("🔑 Run the proof", variant="primary", scale=1)
                kout = gr.HTML()
                kbtn.click(moe_key_recall, [kn], kout)
            # Interactive: the user's own secret crosses the bridge.
            with gr.Tab("📨 Tell Math a secret"):
                gr.Markdown(SECRET_INTRO)
                with gr.Row():
                    sq = gr.Textbox(label="Your 6-character secret (letters & digits)",
                                    value="Xy9Qz2", max_length=12, scale=3)
                    sbtn = gr.Button("📨 Show it ONLY to Math → let Language read it",
                                     variant="primary", scale=2)
                sout = gr.HTML()
                sbtn.click(moe_secret, [sq], sout)
                gr.Examples(examples=[["Xy9Qz2"], ["Tk7Bn2"], ["q0t0Mz"], ["gG5hH6"]], inputs=[sq])
            # Q->A: Language answers a question only Math ever saw.
            with gr.Tab("🧮 Ask Math a question"):
                gr.Markdown(QA_INTRO)
                with gr.Row():
                    qa_a = gr.Number(value=23, precision=0, label="a", scale=1)
                    qa_op = gr.Dropdown(["+", "−", "×"], value="+", label="op", scale=1)
                    qa_b = gr.Number(value=54, precision=0, label="b", scale=1)
                    qa_btn = gr.Button("🧮 Show ONLY Math the question → Language answers",
                                       variant="primary", scale=2)
                qa_out = gr.HTML()
                qa_btn.click(moe_ask, [qa_a, qa_op, qa_b], qa_out)
                gr.Examples(examples=[[23, "+", 54], [81, "−", 27], [7, "×", 8], [62, "+", 39]],
                            inputs=[qa_a, qa_op, qa_b])
            # Routing — the supporting act.
            with gr.Tab("🧭 Routing & generation"):
                gr.Markdown("Type a math problem vs. a sentence and watch the **route flip** — each "
                            "expert is most fluent (lowest bits/byte) on its own domain. Generation "
                            "streams in live.")
                _routing_block()
            # MoE Modular Minds — TWO ways to blend the specialists, compared side by side.
            with gr.Tab("🧬 MoE Modular Minds"):
                gr.Markdown(
                    "**Two ways to blend the two specialists**, shown side by side at the same mix ratio:\n"
                    "- **① Weight merge** — fuse the *parameters* into one model `(1-α)·Language + α·Math` "
                    "(works because they're the identical dense architecture).\n"
                    "- **② Output blend** — run both models separately and average their next-token "
                    "distributions (works because they share the 16k tokenizer).\n\n"
                    "Slide the mix, and tick *consult* to also route Reasoning's latent into each through the "
                    "trained bridge.")
                with gr.Row():
                    mq = gr.Textbox(label="Prompt", value="The water cycle works by", scale=4)
                    mn = gr.Slider(40, 160, value=70, step=10, label="generate tokens", scale=1)
                with gr.Row():
                    mblend = gr.Slider(0.0, 1.0, value=0.5, step=0.1,
                                       label="mix α: 0 = 📖 Language ⟷ 1 = ➗ Math", scale=3)
                    mconsult = gr.Checkbox(value=False, label="consult (inject Reasoning's latent)", scale=1)
                mbtn = gr.Button("🧬 Blend both ways (weight-merge vs output-blend)", variant="primary")
                mout = gr.HTML()
                mbtn.click(moe_combine, [mq, mn, mblend, mconsult], mout)