"""
panel.py -- the Gradio section for the bottom of the boss app: a live demo of the
Modular-Mind mixture-of-experts.
For the SpikeWhale backend it leads with the *latent bridge* (the real result) and
organizes the three demos into tabs. Output is rendered as rich HTML (animated routing
cards, a latent-bus strip, character-diff key recovery, live token streaming) instead
of markdown tables. Every handler is a generator that yields an instant "loading"
notice first, so the first run never looks frozen while the ~80M models lazy-load.
Hot-reloads checkpoints.
"""
from __future__ import annotations
import html as _h
import os
import sys
import gradio as gr
# ZeroGPU: @spaces.GPU allocates a GPU only for the decorated call (CUDA is never touched at
# import/startup). When `spaces` cannot be imported (local / plain CPU) `_gpu` becomes a
# pass-through decorator instead.
try:
    import spaces
    _gpu = spaces.GPU
except Exception:
    def _gpu(fn=None, **kw):
        """Pass-through stand-in for ``spaces.GPU``.

        Usable bare (``@_gpu``) or with options (``@_gpu(duration=120)``); the options are
        accepted and ignored, and the decorated function is returned unchanged.
        """
        if callable(fn):
            return fn
        return lambda f: f
def _to_gpu(moe):
    """Call ``moe.to_gpu_if_available()`` when the object has that attribute.

    Returns:
        The same ``moe`` object, so the call can be chained inline.
    """
    try:
        mover = moe.to_gpu_if_available
    except AttributeError:
        return moe
    mover()
    return moe
# Display label and accent colour per expert, keyed by expert name.
EMOJI = {
    "language": "📖 Language",
    "math": "➗ Math",
    "tool": "🛠️ Tool-use",
}
COLOR = {
    "language": "#6aa9ff",
    "math": "#58d68d",
    "tool": "#f5b041",
}
DEVICE = os.environ.get("MM_AGENTS_DEVICE", "cpu")

# Self-contained SpikeWhale bundle that ships next to this file (agents/modmind/: the 80M
# specialists + bridge + inference code). If it's present we default to the SpikeWhale backend
# so the HuggingFace Space "just works" with no env config. Env vars still override.
_HERE = os.path.dirname(os.path.abspath(__file__))
_BUNDLED_MODMIND = os.path.join(_HERE, "modmind")
_DEFAULT_BACKEND = "bytegpt" if not os.path.isdir(_BUNDLED_MODMIND) else "spikewhale"
_SPIKEWHALE = os.environ.get("MM_MOE_BACKEND", _DEFAULT_BACKEND).lower() in ("spikewhale", "modmind")

# Flipped to True by the handlers after their first successful run, so the
# "loading the models" notice only shows on the first run.
_WARMED = {"done": False}

# Footer text shown under the routing demo; the wording depends on the active backend.
if _SPIKEWHALE:
    _FOOTER = (
        "Two ~80M dense specialists — 📖 Language (FineWeb-Edu) and ➗ Math (FineMath) — sharing a "
        "16k length-max tokenizer. A coordinator routes by bits-per-byte, and a trained RecursiveLink "
        "lets them communicate in latent space (proven in the Bridge tab). Hot-reloads checkpoints."
    )
else:
    _FOOTER = (
        "Three byte-level ~10M specialists, streamed-trained on FineWeb-Edu / FineMath / "
        "glaive-function-calling. Tiny + early-trained, so generations are rough — the routing "
        "(which expert is most confident) is the point. It hot-reloads as training continues."
    )
def _get_moe():
    """Return the ``get_moe`` factory of the active backend.

    ``MM_MOE_BACKEND`` selects the backend (default: ``_DEFAULT_BACKEND``). For
    "spikewhale"/"modmind" the factory is imported from ``moe_gradio`` after the ModMind
    directory (``MODMIND_DIR``, default: the bundled folder) has been put first on
    ``sys.path``; any other value imports it from ``orchestrator``.
    """
    chosen = os.environ.get("MM_MOE_BACKEND", _DEFAULT_BACKEND).lower()
    if chosen not in ("spikewhale", "modmind"):
        from orchestrator import get_moe
        return get_moe

    bundle_dir = os.environ.get("MODMIND_DIR", _BUNDLED_MODMIND)
    if bundle_dir and bundle_dir not in sys.path:
        # front: ModMind's model.py wins over agents/model.py
        sys.path.insert(0, bundle_dir)
    from moe_gradio import get_moe
    return get_moe
# ---- HTML rendering -------------------------------------------------------------
# NOTE(review): this copy of the file has had its HTML tags stripped out of the string
# literals. The stylesheet that presumably lived here is gone and cannot be recovered from
# this view, so _CSS stays empty — restore the original <style> block if it is available.
_CSS = ""


def _wrap(body):
    """Wrap an HTML fragment in the panel's root container, prefixed by ``_CSS``.

    Args:
        body: HTML fragment (already escaped where needed).

    Returns:
        The string handed to a ``gr.HTML`` component.
    """
    # NOTE(review): the wrapper element was lost with the stripped markup; a plain <div> with
    # a hook class is a reconstruction — confirm the original class name.
    return _CSS + "<div class='mm-panel'>" + body + "</div>"
def _esc(s):
    """HTML-escape ``s`` and turn newlines into ``<br>`` so line breaks survive in HTML.

    Args:
        s: Text to render; ``None`` or empty yields ``""``.
    """
    # The replacement literal had been reduced to a raw line break (the tag was stripped),
    # which is both a syntax error and a no-op; <br> is what an HTML sink needs.
    return _h.escape(s or "").replace("\n", "<br>")
def _notice(action="Generating"):
    """First-run popup + in-place message so nothing ever looks frozen.

    Until a handler has set ``_WARMED["done"]``, this raises a ``gr.Info`` toast and returns
    the long "loading the specialists" message; afterwards it returns a short "⏳ <action>…".

    Args:
        action: What is about to happen, used verbatim (and lower-cased in the long message).

    Returns:
        Wrapped HTML for a ``gr.HTML`` output.
    """
    # NOTE(review): the tags in these literals were stripped in this copy of the file; the
    # <div> wrappers are a reconstruction around the surviving text.
    if _WARMED["done"]:
        return _wrap(f"<div style='opacity:.85'>⏳ {action}…</div>")
    try:
        gr.Info("First run — loading the models (~20–40s on CPU). After this, it's quick.")
    except Exception:
        # Best-effort: the toast is optional, the in-place message below is returned either way.
        pass
    return _wrap(f"<div style='opacity:.85'>⏳ Loading the ~80M specialists + {action.lower()}… "
                 "first run can take ~20–40s on CPU; every run after is fast.</div>")
def _msg(title, body):
    """Render a titled status/error message.

    Args:
        title: Heading, inserted as HTML (callers escape untrusted text themselves).
        body: Explanation, inserted as HTML.

    Returns:
        Wrapped HTML for a ``gr.HTML`` output.
    """
    # NOTE(review): markup reconstructed — the original tags were stripped from this copy.
    return _wrap(f"<div style='font-weight:700;margin-bottom:4px'>{title}</div>"
                 f"<div style='opacity:.85'>{body}</div>")
def _cards(winner, weights, bits, steps):
    """One card per expert: name, train steps, fluency, routing-weight bar, winner badge + glow.

    Args:
        winner: Name of the routed expert; its card gets the coloured border/glow and badge.
        weights: Mapping expert name -> routing weight (rendered as a percentage).
        bits: Mapping expert name -> bits/byte on the query.
        steps: Mapping expert name -> training steps (missing names show 0).

    Returns:
        HTML fragment containing all cards.
    """
    # NOTE(review): the tags in these literals were stripped in this copy of the file. The
    # element structure and inline styles are a reconstruction around the surviving text and
    # the values the code still computes (c, style, badge); any CSS animation is not restored.
    card_css = "flex:1;min-width:180px;padding:10px 12px;border:1px solid #8884;border-radius:10px;"
    sub_css = "font-size:.85em;opacity:.8"
    out = []
    for n, wv in weights.items():
        c = COLOR.get(n, "#9b59b6")
        win = (n == winner)
        style = f"border-color:{c};box-shadow:0 0 16px {c}40" if win else ""
        badge = (f"<span style='float:right;font-size:.75em;font-weight:700;color:{c}'>ROUTED ✓</span>"
                 if win else "")
        out.append(
            f"<div style='{card_css}{style}'>{badge}"
            f"<div style='font-weight:700'>{EMOJI.get(n, n)}</div>"
            f"<div style='{sub_css}'>{steps.get(n, 0):,} train steps · {bits[n]:.2f} bits/byte "
            f"(lower = more fluent)</div>"
            f"<div style='height:6px;margin:6px 0;border-radius:3px;background:#8883'>"
            f"<i style='display:block;height:100%;border-radius:3px;width:{wv*100:.1f}%;background:{c}'></i></div>"
            f"<div style='{sub_css}'>routing weight {wv*100:.1f}%</div></div>")
    return "<div style='display:flex;gap:10px;flex-wrap:wrap;margin:8px 0'>" + "".join(out) + "</div>"
def _latent(shared, n=48):
    """The shared latent bus as a strip of signed bars (like the piano's latent strip).

    Args:
        shared: Sequence of numbers (or an object with ``tolist()``); ``None``/empty -> "".
        n: Maximum number of leading dimensions to draw.

    Returns:
        HTML fragment: one bar per value, height scaled by ``|v| / max|v|``, plus a caption.
    """
    if shared is None:
        return ""
    if hasattr(shared, "tolist"):
        # Array-like inputs refuse plain truth-testing (`shared or []` raises on them).
        shared = shared.tolist()
    vals = [float(v) for v in list(shared or [])[:n]]
    if not vals:
        return ""
    mx = max(1e-6, max(abs(v) for v in vals))
    # NOTE(review): the bar element was eaten by a tag stripper in this copy of the file (only
    # the tail of the colour expression survived). Geometry and the blue for positive values
    # are a reconstruction; the orange and the "blue = +, orange = −" legend are original.
    cells = "".join(
        f"<i style='display:inline-block;width:4px;margin-right:1px;vertical-align:bottom;"
        f"height:{max(2.0, abs(v) / mx * 28):.1f}px;"
        f"background:{'#6aa9ff' if v >= 0 else '#df7a5b'}'></i>" for v in vals)
    return (f"<div style='margin:8px 0 2px;line-height:0'>{cells}</div>"
            f"<div style='font-size:.8em;opacity:.75'>the shared latent bus — every expert's output latent, fused by the "
            f"RecursiveLink (first {len(vals)} of 256 dims; blue = +, orange = −)</div>")
def _gen_box(prompt, gen, live=False):
    """Render the prompt followed by the generated continuation.

    Args:
        prompt: The user's prompt (escaped here).
        gen: Text generated so far (escaped here).
        live: When true, append a caret marker to signal that streaming is in progress.

    Returns:
        HTML fragment.
    """
    # NOTE(review): markup reconstructed — the original tags (including the caret element,
    # which had no text content) were stripped from this copy of the file.
    caret = ("<span style='display:inline-block;width:7px;height:1em;margin-left:2px;"
             "vertical-align:text-bottom;background:currentColor;opacity:.6'></span>") if live else ""
    return (f"<div style='margin:10px 0;padding:10px 12px;border:1px solid #8884;border-radius:10px'>"
            f"<b>{_esc(prompt)}</b>"
            f"{_esc(gen)}{caret}</div>")
def _key_rows(examples):
    """Wordle-style per-character diff: secret key -> what the asker recovered.

    Args:
        examples: Iterable of ``(key, recovered, ok)``; ``ok`` adds a ✅ to the row.

    Returns:
        HTML fragment, one row per example. Each recovered character is tinted as a hit when
        it equals the key character at the same position, otherwise as a miss; positions past
        the end of ``recovered`` show ``·``.
    """
    # NOTE(review): the tile markup was stripped from this copy of the file (the opening tag
    # had collapsed to an empty f-string). The hit/miss rule mirrors _char_acc; tags and
    # colours are a reconstruction.
    tile = ("display:inline-block;min-width:1.5em;margin:1px;padding:2px 0;border-radius:4px;"
            "text-align:center;font-family:monospace;")
    key_css = tile + "background:#8883"
    hit_css = tile + "background:#58d68d66"
    miss_css = tile + "background:#df7a5b66"
    rows = []
    for k, rec, ok in examples:
        sec = "".join(f"<span style='{key_css}'>{_h.escape(ch)}</span>" for ch in k)
        got = "".join(
            f"<span style='{hit_css if i < len(rec) and rec[i] == ch else miss_css}'>"
            f"{_h.escape(rec[i]) if i < len(rec) else '·'}</span>"
            for i, ch in enumerate(k))
        rows.append(f"<div style='margin:3px 0'>{sec}→{got}"
                    f"{' ✅' if ok else ''}</div>")
    return "".join(rows)
def _char_acc(examples):
    """Fraction of key characters reproduced at the same position, across all examples.

    Args:
        examples: Iterable of ``(key, recovered, _)``; the third item is ignored.

    Returns:
        ``hits / total`` as a float; ``0.0`` when there are no key characters at all.
    """
    matches = [
        i < len(rec) and rec[i] == ch
        for k, rec, _ in examples
        for i, ch in enumerate(k)
    ]
    return sum(matches) / max(1, len(matches))
# ---- handlers ---------------------------------------------------------------------
@_gpu(duration=120)
def moe_run(query, max_new):
    """Route ``query`` to the most fluent expert and stream that expert's continuation.

    Generator handler for a ``gr.HTML`` output. Yields the loading notice first, then either
    an "experts missing" message, or one frame per streamed chunk followed by a final frame
    that also carries the footer.

    Args:
        query: Prompt text; blank or ``None`` falls back to ``"The"``.
        max_new: Number of tokens to generate (coerced with ``int``).
    """
    # NOTE(review): the HTML tags inside these literals were stripped in this copy of the
    # file; the markup below is a reconstruction around the surviving text.
    yield _notice("Routing & generating")
    moe = _to_gpu(_get_moe()(DEVICE))
    if not moe.available():
        if _SPIKEWHALE:
            # The placeholder must be written as entities, or the browser drops it as a tag.
            yield _msg("⏳ No SpikeWhale experts found",
                       "Set <code>MODMIND_DIR</code> to your ModMind folder and make sure "
                       "<code>&lt;domain&gt;/checkpoints/step_*.pt</code> exist (the panel hot-reloads them).")
        else:
            yield _msg("⏳ No experts trained yet",
                       "Run <code>python agents/train.py --expert language</code> (and math, tool).")
        return
    q = (query or "").strip() or "The"
    winner, weights, bits = moe.route(q)
    _, shared = moe.shared_latent(q)
    steps = dict(getattr(moe, "steps", {}) or {})
    c = COLOR.get(winner, "#9b59b6")
    head = (f"<div style='margin-bottom:6px'>🧭 Routed to <b style='color:{c}'>{EMOJI.get(winner, winner)}</b>"
            f" — the expert most fluent on your text (lowest bits/byte) wins</div>"
            + _cards(winner, weights, bits, steps) + _latent(shared))
    gen = ""
    if hasattr(moe, "generate_stream"):  # live token streaming
        for _, gen in moe.generate_stream(q, winner, max_new=int(max_new)):
            yield _wrap(head + _gen_box(q, gen, live=True))
    else:
        r = moe.run(q, max_new=int(max_new))
        gen = r.get("generation", "")
    _WARMED["done"] = True
    yield _wrap(head + _gen_box(q, gen, live=False)
                + f"<div style='font-size:.8em;opacity:.7;margin-top:8px'>{_FOOTER}</div>")
@_gpu(duration=120)
def moe_key_recall(n):
    """THE PROOF: a random key shown only to the consultant; the asker reproduces it from the
    latent alone (with) vs ablated (without).

    Generator handler for a ``gr.HTML`` output. Yields the loading notice, then either a
    "bridge unavailable" message or the result: three stat tiles (character recovery with the
    latent, with it ablated, and chance level) followed by the per-key character diff.

    Args:
        n: Number of keys to test (coerced with ``int``).
    """
    # NOTE(review): the HTML tags inside these literals were stripped in this copy of the
    # file; the markup below is a reconstruction around the surviving text.
    yield _notice("Running the proof")
    moe = _to_gpu(_get_moe()(DEVICE))
    if not getattr(moe, "key_recall_available", lambda: False)():
        # Placeholders are written as entities, or the browser drops them as unknown tags.
        yield _msg("🔑 Bridge unavailable",
                   "Needs the SpikeWhale backend and a trained "
                   "<code>links/&lt;asker&gt;__from__&lt;consultant&gt;.pt</code> saved with the full asker.")
        return
    meta = moe.consult_meta()
    a = EMOJI.get(meta["asker"], meta["asker"])
    c = EMOJI.get(meta["consultant"], meta["consultant"])
    wr = moe.key_recall(n=int(n), ablate=False)
    ar = moe.key_recall(n=int(n), ablate=True)
    _WARMED["done"] = True
    cw, ca = _char_acc(wr["examples"]) * 100, _char_acc(ar["examples"]) * 100
    # (big number, first caption line, second caption line, accent colour)
    tiles = (
        (f"{cw:.0f}%", "secret characters recovered", "WITH the latent", "#58d68d"),
        (f"{ca:.0f}%", "recovered with the latent", "CUT (ablated to zero)", "#df7a5b"),
        ("1.6%", "chance level", "(1 in 62 per character)", "#8a8a8a"),
    )
    stats = (
        "<div style='display:flex;gap:10px;flex-wrap:wrap;margin:10px 0'>"
        + "".join(
            f"<div style='flex:1;min-width:150px;padding:10px;text-align:center;"
            f"border:1px solid #8884;border-radius:10px'>"
            f"<div style='font-size:1.8em;font-weight:700;color:{col}'>{big}</div>"
            f"<div style='font-size:.85em;opacity:.8'>{line1}<br>{line2}</div></div>"
            for big, line1, line2, col in tiles)
        + "</div>")
    yield _wrap(
        f"<div style='font-size:1.15em;font-weight:700'>🔑 {a} read {c}'s mind through the latent bridge</div>"
        f"<div style='margin:6px 0'>A random secret key is shown only to {c}. {a} never sees it — "
        f"it must reproduce the key purely by reading {c}'s latent through the trained RecursiveLink.</div>"
        + stats
        + f"<div style='font-size:.85em;opacity:.8;margin:8px 0 4px'>"
        f"secret key (only {c} saw it) → what {a} recovered, character by character"
        f" · {wr['acc']*100:.0f}% of keys perfectly exact</div>"
        + _key_rows(wr["examples"])
        + "<div style='margin-top:10px'>Cut the latent and recovery collapses to chance — that gap is the result: "
        "real information crossing between two models that were trained separately, on different data, "
        "and never met. Routing and generation are the supporting act.</div>")
def _tile_row(label, chars, classes):
    """Render a label followed by one tile per character.

    Args:
        label: Row caption, inserted as HTML.
        chars: Characters to show; an empty entry is drawn as ``·``. Escaped here.
        classes: One class per tile — the callers in this file pass "k" (key), "g" (hit) or
            "r" (miss). Pairs are formed with ``zip``, so the shorter sequence wins.

    Returns:
        HTML fragment for the row.
    """
    # NOTE(review): the tile markup was stripped from this copy of the file, which left the
    # `cls` argument unused. Tags and colours are a reconstruction; the class is also kept
    # on the element so an external stylesheet can still target it.
    palette = {"k": "#8883", "g": "#58d68d66", "r": "#df7a5b66"}
    tile = ("display:inline-block;min-width:1.5em;margin:1px;padding:2px 0;border-radius:4px;"
            "text-align:center;font-family:monospace;")
    cells = "".join(
        f"<span class='{cls}' style='{tile}background:{palette.get(cls, '#8883')}'>"
        f"{_h.escape(ch) if ch else '·'}</span>"
        for ch, cls in zip(chars, classes))
    return (f"<div style='margin:4px 0'>"
            f"<span style='display:inline-block;min-width:18em;font-size:.9em;opacity:.85'>{label}</span>"
            f"{cells}</div>")
@_gpu(duration=120)
def moe_secret(secret):
    """Interactive bridge demo: the user's secret is shown ONLY to Math; Language answers
    'what did Math just see?' from the latent alone — legible content, not steered babble.

    Generator handler for a ``gr.HTML`` output. Yields the loading notice, then one of: a
    "bridge unavailable" message, the validation error reported by ``relay_secret``, or the
    result (three tile rows: the secret, the recovery with the latent, the recovery with the
    latent ablated).

    Args:
        secret: The user's text, passed unchanged to ``moe.relay_secret``.
    """
    # NOTE(review): the HTML tags inside these literals were stripped in this copy of the
    # file; the markup below is a reconstruction around the surviving text.
    yield _notice("Transmitting through the latent bridge")
    moe = _to_gpu(_get_moe()(DEVICE))
    if not getattr(moe, "relay_secret", None) or not getattr(moe, "key_recall_available", lambda: False)():
        yield _msg("📨 Bridge unavailable",
                   "Needs the SpikeWhale backend and a trained bridge saved with the full asker.")
        return
    meta = moe.consult_meta()
    a = EMOJI.get(meta["asker"], meta["asker"])
    c = EMOJI.get(meta["consultant"], meta["consultant"])
    wr = moe.relay_secret(secret, ablate=False)
    if wr.get("error"):
        yield _msg("📨 " + _h.escape(wr["error"]),
                   "Type exactly 6 characters, letters and digits only — e.g. <code>Xy9Qz2</code>.")
        return
    ar = moe.relay_secret(secret, ablate=True)
    _WARMED["done"] = True
    s, got, abl = wr["secret"], wr["recovered"], ar["recovered"]
    nok = sum(wr["ok"])
    rows = (
        _tile_row(f"you told {c} (only {c} saw this):", list(s), ["k"] * len(s))
        + _tile_row(f"{a} read from {c}'s latent:",
                    [got[i] if i < len(got) else "" for i in range(len(s))],
                    ["g" if ok else "r" for ok in wr["ok"]])
        + _tile_row("same question, latent cut:",
                    [abl[i] if i < len(abl) else "" for i in range(len(s))],
                    ["g" if ok else "r" for ok in ar["ok"]])
    )
    align_note = "" if wr["aligned"] else (
        "<div style='margin:8px 0;font-size:.9em'>⚠️ The tokenizer fused some of those characters "
        "into multi-character tokens "
        "the bridge never saw in training (it was trained on random-looking keys), so transmission "
        "degrades. Random-looking mixes of letters and digits — like <code>Xy9Qz2</code> — transmit best.</div>")
    yield _wrap(
        f"<div style='font-size:1.15em;font-weight:700'>📨 {a} read your secret out of {c}'s mind — "
        f"{nok}/{len(s)} characters arrived intact</div>"
        f"<div style='margin:6px 0 10px'>{a} never saw your text. It answered one question — “what did {c} just "
        f"see?” — using only {c}'s latent, passed through the trained RecursiveLink.</div>"
        + rows + align_note
        + f"<div style='margin-top:10px'>The bridge is a noisy channel (~4–5 of 6 characters usually survive), but cut "
        f"the latent and the answer collapses to gibberish — the content is genuinely crossing in latent "
        f"space, never as text. Two models, trained separately on different data, sharing a thought.</div>")
@_gpu(duration=120)
def moe_ask(a, op, b):
    """The Q->A bridge: an arithmetic question is shown ONLY to Math; Language answers it
    reading nothing but Math's latent (trained by train_qa_link.py, held-out-validated).

    Generator handler for a ``gr.HTML`` output. Yields the loading notice, then either a
    message (bridge not trained, bad operands, unsupported operator, out-of-range operands,
    backend error) or the result with a scorecard whose wording depends on
    ``moe.qa_info()["mode"]``.

    Args:
        a: Left operand (coerced with ``int``).
        op: Operator; "×"/"x" are treated as "*" and "−" as "-".
        b: Right operand (coerced with ``int``).
    """
    # NOTE(review): the HTML tags inside these literals were stripped in this copy of the
    # file; the markup below is a reconstruction around the surviving text.
    yield _notice("Asking Math through the bridge")
    moe = _to_gpu(_get_moe()(DEVICE))
    if not getattr(moe, "qa_available", lambda: False)():
        yield _msg("🧮 The question→answer bridge isn't trained yet",
                   "Run <code>python agents/modmind/train_qa_link.py</code> — the panel "
                   "hot-reloads the result as soon as a checkpoint is saved.")
        return
    op = {"×": "*", "−": "-", "x": "*"}.get(str(op), str(op))
    try:
        a, b = int(a), int(b)
    except (TypeError, ValueError):
        yield _msg("🧮 Need two whole numbers", "Pick a and b first.")
        return
    if op not in ("+", "-", "*"):
        # Any other operator used to skip both range checks and then fail with a KeyError at
        # the display-symbol lookup further down.
        yield _msg("🧮 Unsupported operation", "Pick +, − or ×.")
        return
    if op == "*" and not (2 <= a <= 12 and 2 <= b <= 12):
        yield _msg("🧮 Outside the trained range", "Multiplication was trained on 2–12 × 2–12.")
        return
    if op in ("+", "-") and not (10 <= a <= 99 and 10 <= b <= 99):
        yield _msg("🧮 Outside the trained range", "Addition and subtraction were trained on 10–99.")
        return
    if op == "-" and a < b:
        a, b = b, a  # trained on non-negative answers
    wr = moe.ask_math(a, op, b)
    if wr.get("error"):
        yield _msg("🧮 " + _h.escape(wr["error"]), "Try a different problem.")
        return
    ar = moe.ask_math(a, op, b, ablate=True)
    _WARMED["done"] = True
    info = moe.qa_info() or {}
    A = EMOJI.get(info.get("asker", "language"), "📖 Language")
    C = EMOJI.get(info.get("consultant", "math"), "➗ Math")
    acc = info.get("holdout_exact", float("nan")) * 100
    memorize = info.get("mode", "memorize") == "memorize"
    opd = {"+": "+", "-": "−", "*": "×"}[op]
    verdict = ("✅ correct" if wr["exact"] else f"❌ not quite (it's {wr['truth']})")
    if memorize:
        scorecard = (
            f"<b>Honest scorecard:</b> this bridge was trained on the whole "
            f"table of two-digit problems (10–99 for + and −, 2–12 for ×) and answers "
            f"~{acc:.0f}% of them correctly. It's a lookup table transmitted through the "
            f"latent, not learned arithmetic — {C} stays frozen and never computes; the bridge + "
            f"{A}'s fine-tune memorized every answer and the question only ever travels in latent "
            f"space. Cut the latent and {A} has no question at all.")
    else:
        scorecard = (
            f"<b>Honest scorecard:</b> this bridge solves {acc:.0f}% of problems it has never seen "
            f"in training exactly (held-out validation — generalization). {C} stays frozen; the "
            f"arithmetic skill lives in the bridge + {A}'s fine-tune, and the question only ever "
            f"travels in latent space. Cut the latent and {A} has no question at all.")
    width = len(wr["want"])
    rows = (
        _tile_row("the right answer (never shown to anyone):", list(wr["want"]), ["k"] * width)
        + _tile_row(f"{A} answered, reading {C}'s latent:",
                    [wr["digits"][i] if i < len(wr["digits"]) else "" for i in range(width)],
                    ["g" if ok else "r" for ok in wr["ok"]])
        + _tile_row("same prompt, latent cut:",
                    [ar["digits"][i] if i < len(ar["digits"]) else "" for i in range(width)],
                    ["g" if ok else "r" for ok in ar["ok"]])
    )
    yield _wrap(
        f"<div style='font-size:1.15em;font-weight:700'>🧮 Only {C} saw {a} {opd} {b} — "
        f"{A} answered {_h.escape(wr['answer'])} · {verdict}</div>"
        f"<div style='margin:6px 0 10px'>{A}'s entire input was the prompt <code>ANS&gt;</code>. The question "
        f"existed only in {C}'s mind — it crossed to {A} as a 256-dim latent through a RecursiveLink "
        f"trained for question→answer (zero-padded to {width} digits).</div>"
        + rows
        + f"<div style='margin-top:10px;font-size:.9em'>{scorecard}</div>")
@_gpu(duration=120)
def moe_combine(query, max_new, blend, consult):
    """Two blends compared at the same mix ratio: a real WEIGHT-MERGE (one merged model) vs an
    OUTPUT-BLEND (two models run separately, distributions averaged).

    Generator handler for a ``gr.HTML`` output. Yields the loading notice, then either an
    "unavailable" message or both generations side by side.

    Args:
        query: Prompt text; blank or ``None`` falls back to "The water cycle works by".
        max_new: Number of tokens to generate (coerced with ``int``).
        blend: Mix ratio α, passed to both ``merge_generate`` and ``combine``.
        consult: Truthy to pass ``consult=True`` to both calls.
    """
    # NOTE(review): the HTML tags inside these literals were stripped in this copy of the
    # file; the markup below is a reconstruction around the surviving text.
    yield _notice("Building merge + blending")
    moe = _to_gpu(_get_moe()(DEVICE))
    if not getattr(moe, "merge_available", lambda: False)():
        yield _msg("🧬 Unavailable", "Needs both specialists loaded.")
        return
    q = (query or "").strip() or "The water cycle works by"
    a = float(blend)
    merged_gen = moe.merge_generate(q, alpha=a, max_new=int(max_new), consult=bool(consult))
    blend_gen = moe.combine(q, max_new=int(max_new), blend=a, consult=bool(consult))
    _WARMED["done"] = True
    extra = " · +Reasoning's latent (consult)" if consult else ""
    col_css = "flex:1;min-width:260px;padding:10px 12px;border:1px solid #8884;border-radius:10px"
    head_css = "font-weight:700;margin-bottom:6px"
    yield _wrap(
        "<div style='font-size:1.15em;font-weight:700'>🧬 MoE Modular Minds — two ways to blend</div>"
        f"<div style='margin:6px 0 10px'>"
        f"{int(round((1-a)*100))}% 📖 Language ⟷ "
        f"{int(round(a*100))}% ➗ Math{extra}</div>"
        "<div style='display:flex;gap:10px;flex-wrap:wrap'>"
        f"<div style='{col_css}'><div style='{head_css}'>① Weight merge — ONE model whose weights are "
        f"(1−α)·Language + α·Math</div>"
        f"<div><b>{_esc(q)}</b> {_esc(merged_gen)}</div></div>"
        f"<div style='{col_css}'><div style='{head_css}'>② Output blend — both models run, next-token "
        f"distributions averaged each step</div>"
        f"<div><b>{_esc(q)}</b> {_esc(blend_gen)}</div></div>"
        "</div>"
        "<div style='margin-top:10px;font-size:.9em'>Same mix ratio, two different mechanisms. Weight merge fuses the actual "
        "parameters into one network (only possible because they're the identical dense architecture); "
        "output blend is an inference-time ensemble of two separate models (only possible because "
        "they share the 16k tokenizer). Tick consult to also route Reasoning's latent into each "
        "through the trained bridge. Exploratory — generations are rough at this scale.</div>")
# ---- Markdown copy for the panel ---------------------------------------------------
# Each constant below is passed to gr.Markdown() by build_moe_panel().

# Headline shown at the top of the SpikeWhale accordion.
HERO = """# 🧩 Modular Mind — two specialists that talk in latent space
**Two ~80M models trained completely separately** — 📖 **Language** on FineWeb-Edu, ➗ **Math** on
FineMath — that never saw each other's data. A coordinator **routes** your query to the right one,
and a trained **RecursiveLink** lets them **communicate through latent space**: Language can read
information straight out of Math's "mind." The **🔑 Bridge** tab proves it.
> ℹ️ *These specialists were trained only to demonstrate a **verifiable result** — clean routing and a
> provable latent-bridge ablation — **not** for production-quality output. The generated text is
> intentionally rough at this scale; the mechanism is the point.*"""
# Intro for the "Ask Math a question" tab.
# NOTE(review): this text states the bridge memorized the table, while moe_ask() switches its
# scorecard on qa_info()["mode"] — confirm the intro is still right when mode != "memorize".
QA_INTRO = """### Ask ➗ Math a question — 📖 Language answers it without ever seeing it
Pick an arithmetic problem. It is shown **only to ➗ Math** (which stays frozen). 📖 Language
receives nothing but Math's 256-dim latent, passed through a RecursiveLink trained for
**question→answer** — and types out the answer digits. Language's only text input is the prompt
`ANS>`; the question itself crosses purely as a latent. The bridge has **memorized the whole table**
of two-digit problems (a lookup table transmitted through latent space, not learned arithmetic) —
cut the latent and Language has no question at all."""
# Intro for the "Tell Math a secret" tab.
SECRET_INTRO = """### Tell ➗ Math a secret — then watch 📖 Language read it out of Math's mind
Type a 6-character code. It is shown **only to ➗ Math** — 📖 Language never sees your text.
Language must answer one question: *“what did Math just see?”* — reading **only Math's latent**
through the trained RecursiveLink. No text crosses between the models; the content arrives in
latent space, legibly, character by character. (The channel is noisy — random-looking mixes of
letters and digits transmit best.)"""
# Intro for the "latent bridge — the proof" tab.
BRIDGE_INTRO = """### The proof: two independent models, one latent channel
A random secret key is shown **only to ➗ Math**. 📖 Language never sees it — but by reading Math's
latent through the trained RecursiveLink, it **reproduces the key, character by character**. Zero out
the latent and it collapses to chance. That gap *is* the result: real information crossing between two
models that were trained on different data and never met. **Hit the button.**"""
# Intro used when the SpikeWhale backend is not active (byte-level experts).
INTRO_BYTE = """## 🧩 Experiment — Modular Mind as a Mixture of Experts
Three tiny ~10M byte-level specialists (language, math, tool-use), each streamed-trained on its own
dataset. A coordinator **routes** your query to whichever expert is most fluent (perplexity-based MoE)
and fuses their latents through a **RecursiveLink**. Try a math problem vs. a sentence."""
def _routing_block():
    """Build the routing demo: prompt box + token slider, a run button, the HTML output and
    three example prompts. Must be called inside an active ``gr.Blocks`` context.
    """
    # NOTE(review): indentation was lost in this copy of the file; the button is placed below
    # the row because, unlike the two row children, it carries no `scale` — confirm.
    sample_prompts = [
        ["The theory of evolution explains", 80],
        ["Compute the derivative of x^2 + 3x", 80],
        ["The history of the Roman Empire began", 80],
    ]
    with gr.Row():
        prompt_box = gr.Textbox(
            label="Your prompt",
            value="Solve for x: 2x + 3 = 11",
            scale=4,
            placeholder="a sentence or a math problem…",
        )
        token_slider = gr.Slider(40, 300, value=80, step=20, label="generate tokens", scale=1)
    run_button = gr.Button("🧭 Route & generate", variant="primary")
    result_html = gr.HTML()
    run_button.click(moe_run, [prompt_box, token_slider], result_html)
    gr.Examples(examples=sample_prompts, inputs=[prompt_box, token_slider])
# NOTE(review): indentation was lost in this copy of the file; in the helpers below a component
# is placed inside a gr.Row when it carries a `scale`, and below the row otherwise — confirm.


def _tab_bridge():
    """The headline result: key recall through the latent bridge."""
    with gr.Tab("🔑 The latent bridge — the proof"):
        gr.Markdown(BRIDGE_INTRO)
        with gr.Row():
            key_count = gr.Slider(4, 16, value=8, step=1, label="keys to test", scale=3)
            run_proof = gr.Button("🔑 Run the proof", variant="primary", scale=1)
        proof_html = gr.HTML()
        run_proof.click(moe_key_recall, [key_count], proof_html)


def _tab_secret():
    """Interactive: the user's own secret crosses the bridge."""
    with gr.Tab("📨 Tell Math a secret"):
        gr.Markdown(SECRET_INTRO)
        with gr.Row():
            secret_box = gr.Textbox(label="Your 6-character secret (letters & digits)",
                                    value="Xy9Qz2", max_length=12, scale=3)
            send_secret = gr.Button("📨 Show it ONLY to Math → let Language read it",
                                    variant="primary", scale=2)
        secret_html = gr.HTML()
        send_secret.click(moe_secret, [secret_box], secret_html)
        gr.Examples(examples=[["Xy9Qz2"], ["Tk7Bn2"], ["q0t0Mz"], ["gG5hH6"]], inputs=[secret_box])


def _tab_question():
    """Q->A: Language answers a question only Math ever saw."""
    with gr.Tab("🧮 Ask Math a question"):
        gr.Markdown(QA_INTRO)
        with gr.Row():
            left = gr.Number(value=23, precision=0, label="a", scale=1)
            operator = gr.Dropdown(["+", "−", "×"], value="+", label="op", scale=1)
            right = gr.Number(value=54, precision=0, label="b", scale=1)
            ask = gr.Button("🧮 Show ONLY Math the question → Language answers",
                            variant="primary", scale=2)
        answer_html = gr.HTML()
        ask.click(moe_ask, [left, operator, right], answer_html)
        gr.Examples(examples=[[23, "+", 54], [81, "−", 27], [7, "×", 8], [62, "+", 39]],
                    inputs=[left, operator, right])


def _tab_routing():
    """Routing — the supporting act."""
    with gr.Tab("🧭 Routing & generation"):
        gr.Markdown("Type a math problem vs. a sentence and watch the **route flip** — each "
                    "expert is most fluent (lowest bits/byte) on its own domain. Generation "
                    "streams in live.")
        _routing_block()


def _tab_blend():
    """MoE Modular Minds — TWO ways to blend the specialists, compared side by side."""
    with gr.Tab("🧬 MoE Modular Minds"):
        gr.Markdown(
            "**Two ways to blend the two specialists**, shown side by side at the same mix ratio:\n"
            "- **① Weight merge** — fuse the *parameters* into one model `(1-α)·Language + α·Math` "
            "(works because they're the identical dense architecture).\n"
            "- **② Output blend** — run both models separately and average their next-token "
            "distributions (works because they share the 16k tokenizer).\n\n"
            "Slide the mix, and tick *consult* to also route Reasoning's latent into each through the "
            "trained bridge.")
        with gr.Row():
            prompt_box = gr.Textbox(label="Prompt", value="The water cycle works by", scale=4)
            token_slider = gr.Slider(40, 160, value=70, step=10, label="generate tokens", scale=1)
        with gr.Row():
            mix_slider = gr.Slider(0.0, 1.0, value=0.5, step=0.1,
                                   label="mix α: 0 = 📖 Language ⟷ 1 = ➗ Math", scale=3)
            consult_box = gr.Checkbox(value=False, label="consult (inject Reasoning's latent)", scale=1)
        blend_button = gr.Button("🧬 Blend both ways (weight-merge vs output-blend)", variant="primary")
        blend_html = gr.HTML()
        blend_button.click(moe_combine, [prompt_box, token_slider, mix_slider, consult_box], blend_html)


def build_moe_panel():
    """Create the MoE demo components inside the current gr.Blocks context.

    Without the SpikeWhale backend this is a single collapsed accordion holding the routing
    demo. With it, an open accordion holds the hero text and five tabs, bridge proof first.
    """
    if not _SPIKEWHALE:
        with gr.Accordion("🧩 Experiment: Modular Mind = Mixture of Experts (3 specialists)", open=False):
            gr.Markdown(INTRO_BYTE)
            _routing_block()
        return

    with gr.Accordion("🧩 Modular Mind — independent specialists communicating in latent space", open=True):
        gr.Markdown(HERO)
        with gr.Tabs():
            # Tab order is the on-screen order: the headline result comes FIRST.
            for make_tab in (_tab_bridge, _tab_secret, _tab_question, _tab_routing, _tab_blend):
                make_tab()