Spaces:
Runtime error
Runtime error
Commit ·
c699da7
1
Parent(s): 44f11d2
fix: quality demo
Browse files- configs/training_config.yaml +1 -1
- demo/streamlit_app.py +302 -12
- env/spindleflow_env.py +3 -1
configs/training_config.yaml
CHANGED
|
@@ -60,7 +60,7 @@ environment:
|
|
| 60 |
max_delegation_depth: 2 # 2 for hackathon demo; architecture supports 4
|
| 61 |
max_specialists_per_episode: 6
|
| 62 |
specialist_timeout_ms: 8000
|
| 63 |
-
spawn_threshold: 0.
|
| 64 |
auto_spawn_specialists: true # set false to disable spawning entirely
|
| 65 |
spawn_max_total: 8 # hard cap on lifetime spawns — prevents registry bloat over 100k steps
|
| 66 |
spawn_cooldown_episodes: 20 # minimum episodes between consecutive spawns
|
|
|
|
| 60 |
max_delegation_depth: 2 # 2 for hackathon demo; architecture supports 4
|
| 61 |
max_specialists_per_episode: 6
|
| 62 |
specialist_timeout_ms: 8000
|
| 63 |
+
spawn_threshold: 0.40 # spawn when best specialist similarity < 0.40 (ML/adjacent tasks ~0.35, SE tasks ~0.55+)
|
| 64 |
auto_spawn_specialists: true # set false to disable spawning entirely
|
| 65 |
spawn_max_total: 8 # hard cap on lifetime spawns — prevents registry bloat over 100k steps
|
| 66 |
spawn_cooldown_episodes: 20 # minimum episodes between consecutive spawns
|
demo/streamlit_app.py
CHANGED
|
@@ -691,6 +691,131 @@ def fig_training_entropy() -> go.Figure:
|
|
| 691 |
return fig
|
| 692 |
|
| 693 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 694 |
# ββββββββββββββββββββββββββββββοΏ½οΏ½οΏ½ββββββββββββββββββββββββββ
|
| 695 |
# UI helpers
|
| 696 |
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
|
@@ -1218,6 +1343,37 @@ def tab_specialists():
|
|
| 1218 |
specialists = [_SP(d) for d in _load_catalog()]
|
| 1219 |
source_note = "Showing YAML catalog — run an episode to load the live registry (includes dynamic additions)."
|
| 1220 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1221 |
n = len(specialists)
|
| 1222 |
sec(f"Roster — {n} specialist{'s' if n != 1 else ''}, capability-embedded")
|
| 1223 |
if source_note:
|
|
@@ -1363,31 +1519,148 @@ def tab_training():
|
|
| 1363 |
# Tab 4 — Quality Demo
|
| 1364 |
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 1365 |
def tab_quality():
|
| 1366 |
-
|
| 1367 |
-
|
| 1368 |
-
|
| 1369 |
-
|
| 1370 |
-
|
| 1371 |
-
|
| 1372 |
-
|
| 1373 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1374 |
c1, c2 = st.columns(2)
|
| 1375 |
with c1:
|
| 1376 |
st.markdown(
|
| 1377 |
'<div style="font-size:10px;font-weight:700;color:#ef4444;'
|
| 1378 |
'text-transform:uppercase;letter-spacing:1px;margin-bottom:8px;">'
|
| 1379 |
-
'
|
| 1380 |
unsafe_allow_html=True,
|
| 1381 |
)
|
| 1382 |
-
st.code(
|
| 1383 |
with c2:
|
| 1384 |
st.markdown(
|
| 1385 |
'<div style="font-size:10px;font-weight:700;color:#10b981;'
|
| 1386 |
'text-transform:uppercase;letter-spacing:1px;margin-bottom:8px;">'
|
| 1387 |
-
'Specialist-Routed Output (
|
| 1388 |
unsafe_allow_html=True,
|
| 1389 |
)
|
| 1390 |
-
st.code(
|
| 1391 |
|
| 1392 |
sec("Policy Tuning β Quality vs Latency")
|
| 1393 |
c1, c2 = st.columns(2)
|
|
@@ -1645,6 +1918,23 @@ def tab_output():
|
|
| 1645 |
# Keep env alive for delegation-graph rendering
|
| 1646 |
st.session_state.output_env = env
|
| 1647 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1648 |
except Exception as exc:
|
| 1649 |
import traceback
|
| 1650 |
st.error(f"Episode failed: {exc}")
|
|
|
|
| 691 |
return fig
|
| 692 |
|
| 693 |
|
| 694 |
+
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 695 |
+
# Quality-comparison helpers
|
| 696 |
+
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 697 |
+
def _generate_generic_output(task: str) -> str:
|
| 698 |
+
"""Call GPT-4o-mini directly with the task β no specialist routing."""
|
| 699 |
+
import os
|
| 700 |
+
api_key = os.getenv("OPENAI_API_KEY")
|
| 701 |
+
if not api_key:
|
| 702 |
+
return (
|
| 703 |
+
"General problem-solving approach:\n"
|
| 704 |
+
"1. Gather and clarify requirements\n"
|
| 705 |
+
"2. Research common solution patterns\n"
|
| 706 |
+
"3. Draft a high-level architecture\n"
|
| 707 |
+
"4. Implement in small, testable increments\n"
|
| 708 |
+
"5. Validate against acceptance criteria and deploy\n"
|
| 709 |
+
"No specialist domain expertise applied."
|
| 710 |
+
)
|
| 711 |
+
try:
|
| 712 |
+
from openai import OpenAI
|
| 713 |
+
resp = OpenAI(api_key=api_key).chat.completions.create(
|
| 714 |
+
model="gpt-4o-mini",
|
| 715 |
+
max_tokens=600,
|
| 716 |
+
messages=[
|
| 717 |
+
{"role": "system",
|
| 718 |
+
"content": "You are a general-purpose software engineering assistant."},
|
| 719 |
+
{"role": "user",
|
| 720 |
+
"content": f"Provide a detailed solution approach for this task:\n\n{task}"},
|
| 721 |
+
],
|
| 722 |
+
)
|
| 723 |
+
return resp.choices[0].message.content
|
| 724 |
+
except Exception as exc:
|
| 725 |
+
return f"(Generic output generation failed: {exc})"
|
| 726 |
+
|
| 727 |
+
|
| 728 |
+
def _t1_relevance(task: str, output: str, registry) -> float:
|
| 729 |
+
"""Cosine similarity between task and output embeddings, scaled 0–10."""
|
| 730 |
+
try:
|
| 731 |
+
import numpy as np
|
| 732 |
+
t = registry.embed_query(task)
|
| 733 |
+
o = registry.embed_query(output[:800])
|
| 734 |
+
if t is None or o is None:
|
| 735 |
+
return 0.0
|
| 736 |
+
cos = float(np.dot(t, o) / (np.linalg.norm(t) * np.linalg.norm(o) + 1e-8))
|
| 737 |
+
return round(max(0.0, cos) * 10, 2)
|
| 738 |
+
except Exception:
|
| 739 |
+
return 0.0
|
| 740 |
+
|
| 741 |
+
|
| 742 |
+
def _judge_compare(task: str, generic: str, specialist: str) -> dict | None:
|
| 743 |
+
"""GPT-4o-mini rates both outputs on 4 dimensions. Returns {dim: [generic, specialist]}."""
|
| 744 |
+
import os, json
|
| 745 |
+
api_key = os.getenv("OPENAI_API_KEY")
|
| 746 |
+
if not api_key:
|
| 747 |
+
return None
|
| 748 |
+
prompt = (
|
| 749 |
+
f"Task:\n{task[:400]}\n\n"
|
| 750 |
+
f"Output A (generic, no specialist routing):\n{generic[:700]}\n\n"
|
| 751 |
+
f"Output B (specialist-routed by trained policy):\n{specialist[:700]}\n\n"
|
| 752 |
+
"Rate each output 1–10 on: technical_depth, specificity, actionability, coverage.\n"
|
| 753 |
+
'Return JSON only: {"technical_depth":[A,B],"specificity":[A,B],'
|
| 754 |
+
'"actionability":[A,B],"coverage":[A,B]}'
|
| 755 |
+
)
|
| 756 |
+
try:
|
| 757 |
+
from openai import OpenAI
|
| 758 |
+
resp = OpenAI(api_key=api_key).chat.completions.create(
|
| 759 |
+
model="gpt-4o-mini",
|
| 760 |
+
max_tokens=150,
|
| 761 |
+
response_format={"type": "json_object"},
|
| 762 |
+
messages=[{"role": "user", "content": prompt}],
|
| 763 |
+
)
|
| 764 |
+
return json.loads(resp.choices[0].message.content)
|
| 765 |
+
except Exception:
|
| 766 |
+
return None
|
| 767 |
+
|
| 768 |
+
|
| 769 |
+
def fig_radar_comparison(
|
| 770 |
+
gen_scores: dict,
|
| 771 |
+
spec_scores: dict,
|
| 772 |
+
) -> go.Figure:
|
| 773 |
+
dims = list(gen_scores.keys())
|
| 774 |
+
g_vals = [gen_scores[d] for d in dims]
|
| 775 |
+
s_vals = [spec_scores[d] for d in dims]
|
| 776 |
+
dims_c = dims + [dims[0]]
|
| 777 |
+
g_c = g_vals + [g_vals[0]]
|
| 778 |
+
s_c = s_vals + [s_vals[0]]
|
| 779 |
+
|
| 780 |
+
fig = go.Figure()
|
| 781 |
+
fig.add_trace(go.Scatterpolar(
|
| 782 |
+
r=g_c, theta=dims_c, fill="toself",
|
| 783 |
+
fillcolor="rgba(239,68,68,0.10)",
|
| 784 |
+
line=dict(color="#ef4444", width=2),
|
| 785 |
+
name="Generic (no routing)",
|
| 786 |
+
))
|
| 787 |
+
fig.add_trace(go.Scatterpolar(
|
| 788 |
+
r=s_c, theta=dims_c, fill="toself",
|
| 789 |
+
fillcolor="rgba(0,212,255,0.13)",
|
| 790 |
+
line=dict(color="#00d4ff", width=2.5),
|
| 791 |
+
name="Specialist-routed",
|
| 792 |
+
))
|
| 793 |
+
fig.update_layout(
|
| 794 |
+
paper_bgcolor="rgba(0,0,0,0)",
|
| 795 |
+
font=dict(color="#e2e8f0", family="Inter, system-ui, sans-serif"),
|
| 796 |
+
polar=dict(
|
| 797 |
+
bgcolor="rgba(0,0,0,0)",
|
| 798 |
+
radialaxis=dict(
|
| 799 |
+
visible=True, range=[0, 10],
|
| 800 |
+
gridcolor="rgba(255,255,255,0.08)",
|
| 801 |
+
tickfont=dict(size=9, color="#475569"),
|
| 802 |
+
),
|
| 803 |
+
angularaxis=dict(
|
| 804 |
+
gridcolor="rgba(255,255,255,0.08)",
|
| 805 |
+
tickfont=dict(size=11, color="#94a3b8"),
|
| 806 |
+
),
|
| 807 |
+
),
|
| 808 |
+
title=dict(
|
| 809 |
+
text="Quality Radar — Generic vs Specialist-Routed",
|
| 810 |
+
font=dict(size=13, color="#94a3b8"),
|
| 811 |
+
),
|
| 812 |
+
legend=dict(bgcolor="rgba(0,0,0,0)", font=dict(color="#94a3b8", size=11)),
|
| 813 |
+
height=420,
|
| 814 |
+
margin=dict(l=60, r=60, t=60, b=40),
|
| 815 |
+
)
|
| 816 |
+
return fig
|
| 817 |
+
|
| 818 |
+
|
| 819 |
# ββββββββββββββββββββββββββββββοΏ½οΏ½οΏ½ββββββββββββββββββββββββββ
|
| 820 |
# UI helpers
|
| 821 |
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
|
|
|
| 1343 |
specialists = [_SP(d) for d in _load_catalog()]
|
| 1344 |
source_note = "Showing YAML catalog — run an episode to load the live registry (includes dynamic additions)."
|
| 1345 |
|
| 1346 |
+
# ── Dynamically spawned specialists (accumulated from Output tab runs) ──
|
| 1347 |
+
spawned_pool = st.session_state.get("spawned_pool", [])
|
| 1348 |
+
if spawned_pool:
|
| 1349 |
+
sec(f"⚡ Dynamically Spawned · {len(spawned_pool)} new agent{'s' if len(spawned_pool) != 1 else ''}")
|
| 1350 |
+
st.caption(
|
| 1351 |
+
"These specialists were auto-created during Output tab runs — "
|
| 1352 |
+
"triggered when no existing specialist had sufficient domain coverage (similarity < threshold)."
|
| 1353 |
+
)
|
| 1354 |
+
pool_cols = st.columns(min(len(spawned_pool), 4))
|
| 1355 |
+
for i, sp in enumerate(spawned_pool):
|
| 1356 |
+
with pool_cols[i % 4]:
|
| 1357 |
+
st.markdown(f"""
|
| 1358 |
+
<div style="background:rgba(251,191,36,0.06);border:1px solid rgba(251,191,36,0.28);
|
| 1359 |
+
border-left:3px solid #fbbf24;border-radius:12px;
|
| 1360 |
+
padding:14px;margin-bottom:10px;">
|
| 1361 |
+
<div style="font-size:11px;font-weight:700;color:#fbbf24;margin-bottom:5px;">
|
| 1362 |
+
⚡ {_html.escape(sp['role'])}
|
| 1363 |
+
</div>
|
| 1364 |
+
<div style="font-size:10px;color:#475569;margin-bottom:6px;font-style:italic;">
|
| 1365 |
+
Triggered by: {_html.escape(sp['triggered_by'][:70])}…
|
| 1366 |
+
</div>
|
| 1367 |
+
<div style="font-size:11px;color:#64748b;line-height:1.5;">
|
| 1368 |
+
{_html.escape(sp['description'][:100])}…
|
| 1369 |
+
</div>
|
| 1370 |
+
<div style="font-size:10px;color:#334155;margin-top:8px;padding-top:8px;
|
| 1371 |
+
border-top:1px solid rgba(255,255,255,0.05);">
|
| 1372 |
+
{sp['avg_latency_ms']} ms · {', '.join(sp.get('complexity_affinity', []))}
|
| 1373 |
+
</div>
|
| 1374 |
+
</div>""", unsafe_allow_html=True)
|
| 1375 |
+
st.markdown("---")
|
| 1376 |
+
|
| 1377 |
n = len(specialists)
|
| 1378 |
sec(f"Roster — {n} specialist{'s' if n != 1 else ''}, capability-embedded")
|
| 1379 |
if source_note:
|
|
|
|
| 1519 |
# Tab 4 — Quality Demo
|
| 1520 |
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 1521 |
def tab_quality():
|
| 1522 |
+
results = st.session_state.get("output_results")
|
| 1523 |
+
env_obj = st.session_state.get("output_env")
|
| 1524 |
+
|
| 1525 |
+
sec("Live Quality Comparison — Generic vs Specialist-Routed")
|
| 1526 |
+
|
| 1527 |
+
if results is None:
|
| 1528 |
+
st.markdown(
|
| 1529 |
+
'<div style="background:rgba(245,158,11,0.05);border:1px solid rgba(245,158,11,0.2);'
|
| 1530 |
+
'border-radius:12px;padding:28px;text-align:center;">'
|
| 1531 |
+
'<div style="font-size:13px;color:#fbbf24;font-weight:600;margin-bottom:8px;">'
|
| 1532 |
+
'No Output run yet</div>'
|
| 1533 |
+
'<div style="font-size:12px;color:#64748b;">'
|
| 1534 |
+
'Go to the <b>🎯 Output</b> tab, enter a task, and click '
|
| 1535 |
+
'"Run Trained Policy" — then return here to generate the quality comparison.'
|
| 1536 |
+
'</div></div>',
|
| 1537 |
+
unsafe_allow_html=True,
|
| 1538 |
+
)
|
| 1539 |
+
else:
|
| 1540 |
+
task = results["task"]
|
| 1541 |
+
spec_results = results["specialist_results"]
|
| 1542 |
+
specialist_text = "\n\n".join(
|
| 1543 |
+
f"[{sr['id'].upper()}]\n{sr['output'] or ''}"
|
| 1544 |
+
for sr in spec_results if sr.get("output")
|
| 1545 |
+
) or "(no specialist output)"
|
| 1546 |
+
|
| 1547 |
+
# Task banner
|
| 1548 |
+
st.markdown(
|
| 1549 |
+
f'<div style="background:rgba(0,212,255,0.04);border:1px solid rgba(0,212,255,0.18);'
|
| 1550 |
+
f'border-radius:10px;padding:12px 18px;margin-bottom:16px;">'
|
| 1551 |
+
f'<span style="font-size:9px;font-weight:700;color:#475569;text-transform:uppercase;'
|
| 1552 |
+
f'letter-spacing:1px;">Comparing outputs for: </span>'
|
| 1553 |
+
f'<span style="font-size:12px;color:#e2e8f0;">{_html.escape(task[:140])}</span>'
|
| 1554 |
+
f'</div>',
|
| 1555 |
+
unsafe_allow_html=True,
|
| 1556 |
+
)
|
| 1557 |
+
|
| 1558 |
+
comp_data = st.session_state.get("quality_comparison")
|
| 1559 |
+
already_computed = comp_data is not None and comp_data.get("task") == task
|
| 1560 |
+
|
| 1561 |
+
if not already_computed:
|
| 1562 |
+
if st.button("⚡ Generate Quality Comparison", type="primary", key="gen_comp_btn"):
|
| 1563 |
+
with st.spinner("Generating generic output + running GPT-4o-mini judge…"):
|
| 1564 |
+
generic_text = _generate_generic_output(task)
|
| 1565 |
+
registry = env_obj.registry if env_obj else None
|
| 1566 |
+
|
| 1567 |
+
gen_t1 = _t1_relevance(task, generic_text, registry) if registry else 5.0
|
| 1568 |
+
spec_t1 = _t1_relevance(task, specialist_text, registry) if registry else 7.0
|
| 1569 |
+
|
| 1570 |
+
judge = _judge_compare(task, generic_text, specialist_text)
|
| 1571 |
+
|
| 1572 |
+
def _pick(key, fallback_g, fallback_s):
|
| 1573 |
+
pair = (judge or {}).get(key, [fallback_g, fallback_s])
|
| 1574 |
+
return float(pair[0]), float(pair[1])
|
| 1575 |
+
|
| 1576 |
+
td_g, td_s = _pick("technical_depth", 5, 7)
|
| 1577 |
+
sp_g, sp_s = _pick("specificity", 4, 8)
|
| 1578 |
+
ac_g, ac_s = _pick("actionability", 4, 7)
|
| 1579 |
+
cv_g, cv_s = _pick("coverage", 5, 8)
|
| 1580 |
+
|
| 1581 |
+
gen_scores = {"Task Relevance": gen_t1, "Technical Depth": td_g,
|
| 1582 |
+
"Specificity": sp_g, "Actionability": ac_g, "Coverage": cv_g}
|
| 1583 |
+
spec_scores = {"Task Relevance": spec_t1, "Technical Depth": td_s,
|
| 1584 |
+
"Specificity": sp_s, "Actionability": ac_s, "Coverage": cv_s}
|
| 1585 |
+
|
| 1586 |
+
st.session_state.quality_comparison = {
|
| 1587 |
+
"task": task,
|
| 1588 |
+
"generic": generic_text,
|
| 1589 |
+
"specialist": specialist_text,
|
| 1590 |
+
"gen_scores": gen_scores,
|
| 1591 |
+
"spec_scores": spec_scores,
|
| 1592 |
+
}
|
| 1593 |
+
st.rerun()
|
| 1594 |
+
|
| 1595 |
+
comp_data = st.session_state.get("quality_comparison")
|
| 1596 |
+
if comp_data and comp_data.get("task") == task:
|
| 1597 |
+
gen_scores = comp_data["gen_scores"]
|
| 1598 |
+
spec_scores = comp_data["spec_scores"]
|
| 1599 |
+
|
| 1600 |
+
# ββ Score summary strip βββββββββββββββββββββββββββββββββββββ
|
| 1601 |
+
sec("Score Summary")
|
| 1602 |
+
cols = st.columns(len(gen_scores))
|
| 1603 |
+
for i, (dim, g_val) in enumerate(gen_scores.items()):
|
| 1604 |
+
s_val = spec_scores[dim]
|
| 1605 |
+
delta = round(s_val - g_val, 1)
|
| 1606 |
+
cols[i].metric(
|
| 1607 |
+
dim,
|
| 1608 |
+
f"{s_val:.1f} / 10",
|
| 1609 |
+
f"{delta:+.1f} vs generic",
|
| 1610 |
+
)
|
| 1611 |
+
|
| 1612 |
+
# ββ Radar chart βββββββββββββββββββββββββββββββββββββββββββββ
|
| 1613 |
+
sec("Quality Radar")
|
| 1614 |
+
st.plotly_chart(
|
| 1615 |
+
fig_radar_comparison(gen_scores, spec_scores),
|
| 1616 |
+
use_container_width=True,
|
| 1617 |
+
key="quality_radar",
|
| 1618 |
+
)
|
| 1619 |
+
|
| 1620 |
+
# ββ Side-by-side score bars ββββββββββββββββββββββββββββββββββ
|
| 1621 |
+
sec("Per-Dimension Score Breakdown")
|
| 1622 |
+
dims = list(gen_scores.keys())
|
| 1623 |
+
g_vals = [gen_scores[d] for d in dims]
|
| 1624 |
+
s_vals = [spec_scores[d] for d in dims]
|
| 1625 |
+
bar_fig = go.Figure()
|
| 1626 |
+
bar_fig.add_trace(go.Bar(
|
| 1627 |
+
name="Generic", x=dims, y=g_vals,
|
| 1628 |
+
marker_color="rgba(239,68,68,0.75)", marker_line_width=0,
|
| 1629 |
+
text=[f"{v:.1f}" for v in g_vals], textposition="outside",
|
| 1630 |
+
textfont=dict(size=10, color="#94a3b8"),
|
| 1631 |
+
))
|
| 1632 |
+
bar_fig.add_trace(go.Bar(
|
| 1633 |
+
name="Specialist", x=dims, y=s_vals,
|
| 1634 |
+
marker_color="rgba(0,212,255,0.75)", marker_line_width=0,
|
| 1635 |
+
text=[f"{v:.1f}" for v in s_vals], textposition="outside",
|
| 1636 |
+
textfont=dict(size=10, color="#94a3b8"),
|
| 1637 |
+
))
|
| 1638 |
+
bar_fig.update_layout(
|
| 1639 |
+
**DARK, **DARK_AXES, height=300, barmode="group",
|
| 1640 |
+
legend=dict(bgcolor="rgba(0,0,0,0)", font=dict(color="#94a3b8")),
|
| 1641 |
+
)
|
| 1642 |
+
bar_fig.update_yaxes(range=[0, 11], gridcolor="rgba(255,255,255,0.05)")
|
| 1643 |
+
st.plotly_chart(bar_fig, use_container_width=True, key="quality_bars")
|
| 1644 |
+
|
| 1645 |
+
# ββ Side-by-side text ββββββββββββββββββββββββββββββββββββββββ
|
| 1646 |
+
sec("Output Text Comparison")
|
| 1647 |
c1, c2 = st.columns(2)
|
| 1648 |
with c1:
|
| 1649 |
st.markdown(
|
| 1650 |
'<div style="font-size:10px;font-weight:700;color:#ef4444;'
|
| 1651 |
'text-transform:uppercase;letter-spacing:1px;margin-bottom:8px;">'
|
| 1652 |
+
'✗ Generic Output (No Delegation)</div>',
|
| 1653 |
unsafe_allow_html=True,
|
| 1654 |
)
|
| 1655 |
+
st.code(comp_data["generic"][:1200], language=None)
|
| 1656 |
with c2:
|
| 1657 |
st.markdown(
|
| 1658 |
'<div style="font-size:10px;font-weight:700;color:#10b981;'
|
| 1659 |
'text-transform:uppercase;letter-spacing:1px;margin-bottom:8px;">'
|
| 1660 |
+
'✓ Specialist-Routed Output (Trained Policy)</div>',
|
| 1661 |
unsafe_allow_html=True,
|
| 1662 |
)
|
| 1663 |
+
st.code(comp_data["specialist"][:1200], language=None)
|
| 1664 |
|
| 1665 |
sec("Policy Tuning β Quality vs Latency")
|
| 1666 |
c1, c2 = st.columns(2)
|
|
|
|
| 1918 |
# Keep env alive for delegation-graph rendering
|
| 1919 |
st.session_state.output_env = env
|
| 1920 |
|
| 1921 |
+
# Persist spawned specialists to shared pool for Specialists tab
|
| 1922 |
+
if "spawned_pool" not in st.session_state:
|
| 1923 |
+
st.session_state.spawned_pool = []
|
| 1924 |
+
existing_ids = {sp["id"] for sp in st.session_state.spawned_pool}
|
| 1925 |
+
for sid in spawned:
|
| 1926 |
+
if sid not in existing_ids:
|
| 1927 |
+
sp_obj = env.registry.get(sid)
|
| 1928 |
+
if sp_obj:
|
| 1929 |
+
st.session_state.spawned_pool.append({
|
| 1930 |
+
"id": sid,
|
| 1931 |
+
"role": sp_obj.role,
|
| 1932 |
+
"description": sp_obj.description,
|
| 1933 |
+
"complexity_affinity": list(sp_obj.complexity_affinity),
|
| 1934 |
+
"avg_latency_ms": sp_obj.avg_latency_ms,
|
| 1935 |
+
"triggered_by": task_used[:120],
|
| 1936 |
+
})
|
| 1937 |
+
|
| 1938 |
except Exception as exc:
|
| 1939 |
import traceback
|
| 1940 |
st.error(f"Episode failed: {exc}")
|
env/spindleflow_env.py
CHANGED
|
@@ -210,7 +210,9 @@ class SpindleFlowEnv(gym.Env):
|
|
| 210 |
|
| 211 |
self.spawned_this_episode = []
|
| 212 |
self._pending_spawn_records = []
|
| 213 |
-
#
|
|
|
|
|
|
|
| 214 |
|
| 215 |
# ── Build per-episode active roster (top-K by task similarity) ──
|
| 216 |
self.active_specialist_ids = self._select_active_specialists(task_emb)
|
|
|
|
| 210 |
|
| 211 |
self.spawned_this_episode = []
|
| 212 |
self._pending_spawn_records = []
|
| 213 |
+
# Auto-spawn: if no existing specialist covers this task well, create one via LLM.
|
| 214 |
+
if self.auto_spawn:
|
| 215 |
+
self._maybe_spawn_specialist(task_emb, task_desc)
|
| 216 |
|
| 217 |
# ── Build per-episode active roster (top-K by task similarity) ──
|
| 218 |
self.active_specialist_ids = self._select_active_specialists(task_emb)
|