garvitsachdeva committed on
Commit
c699da7
·
1 Parent(s): 44f11d2

fix: quality demo

Browse files
configs/training_config.yaml CHANGED
@@ -60,7 +60,7 @@ environment:
60
  max_delegation_depth: 2 # 2 for hackathon demo; architecture supports 4
61
  max_specialists_per_episode: 6
62
  specialist_timeout_ms: 8000
63
- spawn_threshold: 0.50 # all-MiniLM-L6-v2 related-domain sims are 0.35–0.70; 0.50 triggers meaningfully
64
  auto_spawn_specialists: true # set false to disable spawning entirely
65
  spawn_max_total: 8 # hard cap on lifetime spawns — prevents registry bloat over 100k steps
66
  spawn_cooldown_episodes: 20 # minimum episodes between consecutive spawns
 
60
  max_delegation_depth: 2 # 2 for hackathon demo; architecture supports 4
61
  max_specialists_per_episode: 6
62
  specialist_timeout_ms: 8000
63
+ spawn_threshold: 0.40 # spawn when best specialist similarity < 0.40 (ML/adjacent tasks ~0.35, SE tasks ~0.55+)
64
  auto_spawn_specialists: true # set false to disable spawning entirely
65
  spawn_max_total: 8 # hard cap on lifetime spawns — prevents registry bloat over 100k steps
66
  spawn_cooldown_episodes: 20 # minimum episodes between consecutive spawns
demo/streamlit_app.py CHANGED
@@ -691,6 +691,131 @@ def fig_training_entropy() -> go.Figure:
691
  return fig
692
 
693
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
694
  # ─────────────────────────────────────────────────────────
695
  # UI helpers
696
  # ─────────────────────────────────────────────────────────
@@ -1218,6 +1343,37 @@ def tab_specialists():
1218
  specialists = [_SP(d) for d in _load_catalog()]
1219
  source_note = "Showing YAML catalog β€” run an episode to load the live registry (includes dynamic additions)."
1220
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1221
  n = len(specialists)
1222
  sec(f"Roster β€” {n} specialist{'s' if n != 1 else ''}, capability-embedded")
1223
  if source_note:
@@ -1363,31 +1519,148 @@ def tab_training():
1363
  # Tab 4 β€” Quality Demo
1364
  # ─────────────────────────────────────────────────────────
1365
  def tab_quality():
1366
- sec("Before vs After Delegation Learning")
1367
- if st.button("Load Demo Comparison", type="primary", key="load_demo"):
1368
- p = ASSETS / "demo_moment_1.json"
1369
- if not p.exists():
1370
- st.error("Run `python demo/precompute_demo.py` first to generate demo assets.")
1371
- else:
1372
- with open(p) as f:
1373
- d = json.load(f)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1374
  c1, c2 = st.columns(2)
1375
  with c1:
1376
  st.markdown(
1377
  '<div style="font-size:10px;font-weight:700;color:#ef4444;'
1378
  'text-transform:uppercase;letter-spacing:1px;margin-bottom:8px;">'
1379
- 'Generalist Output (No Delegation)</div>',
1380
  unsafe_allow_html=True,
1381
  )
1382
- st.code(d["generalist_output"][:700], language=None)
1383
  with c2:
1384
  st.markdown(
1385
  '<div style="font-size:10px;font-weight:700;color:#10b981;'
1386
  'text-transform:uppercase;letter-spacing:1px;margin-bottom:8px;">'
1387
- 'Specialist-Routed Output (Learned Policy)</div>',
1388
  unsafe_allow_html=True,
1389
  )
1390
- st.code(d["specialist_output"][:700], language=None)
1391
 
1392
  sec("Policy Tuning β€” Quality vs Latency")
1393
  c1, c2 = st.columns(2)
@@ -1645,6 +1918,23 @@ def tab_output():
1645
  # Keep env alive for delegation-graph rendering
1646
  st.session_state.output_env = env
1647
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1648
  except Exception as exc:
1649
  import traceback
1650
  st.error(f"Episode failed: {exc}")
 
691
  return fig
692
 
693
 
694
+ # ─────────────────────────────────────────────────────────
695
+ # Quality-comparison helpers
696
+ # ─────────────────────────────────────────────────────────
697
+ def _generate_generic_output(task: str) -> str:
698
+ """Call GPT-4o-mini directly with the task β€” no specialist routing."""
699
+ import os
700
+ api_key = os.getenv("OPENAI_API_KEY")
701
+ if not api_key:
702
+ return (
703
+ "General problem-solving approach:\n"
704
+ "1. Gather and clarify requirements\n"
705
+ "2. Research common solution patterns\n"
706
+ "3. Draft a high-level architecture\n"
707
+ "4. Implement in small, testable increments\n"
708
+ "5. Validate against acceptance criteria and deploy\n"
709
+ "No specialist domain expertise applied."
710
+ )
711
+ try:
712
+ from openai import OpenAI
713
+ resp = OpenAI(api_key=api_key).chat.completions.create(
714
+ model="gpt-4o-mini",
715
+ max_tokens=600,
716
+ messages=[
717
+ {"role": "system",
718
+ "content": "You are a general-purpose software engineering assistant."},
719
+ {"role": "user",
720
+ "content": f"Provide a detailed solution approach for this task:\n\n{task}"},
721
+ ],
722
+ )
723
+ return resp.choices[0].message.content
724
+ except Exception as exc:
725
+ return f"(Generic output generation failed: {exc})"
726
+
727
+
728
+ def _t1_relevance(task: str, output: str, registry) -> float:
729
+ """Cosine similarity between task and output embeddings, scaled 0–10."""
730
+ try:
731
+ import numpy as np
732
+ t = registry.embed_query(task)
733
+ o = registry.embed_query(output[:800])
734
+ if t is None or o is None:
735
+ return 0.0
736
+ cos = float(np.dot(t, o) / (np.linalg.norm(t) * np.linalg.norm(o) + 1e-8))
737
+ return round(max(0.0, cos) * 10, 2)
738
+ except Exception:
739
+ return 0.0
740
+
741
+
742
+ def _judge_compare(task: str, generic: str, specialist: str) -> dict | None:
743
+ """GPT-4o-mini rates both outputs on 4 dimensions. Returns {dim: [generic, specialist]}."""
744
+ import os, json
745
+ api_key = os.getenv("OPENAI_API_KEY")
746
+ if not api_key:
747
+ return None
748
+ prompt = (
749
+ f"Task:\n{task[:400]}\n\n"
750
+ f"Output A (generic, no specialist routing):\n{generic[:700]}\n\n"
751
+ f"Output B (specialist-routed by trained policy):\n{specialist[:700]}\n\n"
752
+ "Rate each output 1–10 on: technical_depth, specificity, actionability, coverage.\n"
753
+ 'Return JSON only: {"technical_depth":[A,B],"specificity":[A,B],'
754
+ '"actionability":[A,B],"coverage":[A,B]}'
755
+ )
756
+ try:
757
+ from openai import OpenAI
758
+ resp = OpenAI(api_key=api_key).chat.completions.create(
759
+ model="gpt-4o-mini",
760
+ max_tokens=150,
761
+ response_format={"type": "json_object"},
762
+ messages=[{"role": "user", "content": prompt}],
763
+ )
764
+ return json.loads(resp.choices[0].message.content)
765
+ except Exception:
766
+ return None
767
+
768
+
769
+ def fig_radar_comparison(
770
+ gen_scores: dict,
771
+ spec_scores: dict,
772
+ ) -> go.Figure:
773
+ dims = list(gen_scores.keys())
774
+ g_vals = [gen_scores[d] for d in dims]
775
+ s_vals = [spec_scores[d] for d in dims]
776
+ dims_c = dims + [dims[0]]
777
+ g_c = g_vals + [g_vals[0]]
778
+ s_c = s_vals + [s_vals[0]]
779
+
780
+ fig = go.Figure()
781
+ fig.add_trace(go.Scatterpolar(
782
+ r=g_c, theta=dims_c, fill="toself",
783
+ fillcolor="rgba(239,68,68,0.10)",
784
+ line=dict(color="#ef4444", width=2),
785
+ name="Generic (no routing)",
786
+ ))
787
+ fig.add_trace(go.Scatterpolar(
788
+ r=s_c, theta=dims_c, fill="toself",
789
+ fillcolor="rgba(0,212,255,0.13)",
790
+ line=dict(color="#00d4ff", width=2.5),
791
+ name="Specialist-routed",
792
+ ))
793
+ fig.update_layout(
794
+ paper_bgcolor="rgba(0,0,0,0)",
795
+ font=dict(color="#e2e8f0", family="Inter, system-ui, sans-serif"),
796
+ polar=dict(
797
+ bgcolor="rgba(0,0,0,0)",
798
+ radialaxis=dict(
799
+ visible=True, range=[0, 10],
800
+ gridcolor="rgba(255,255,255,0.08)",
801
+ tickfont=dict(size=9, color="#475569"),
802
+ ),
803
+ angularaxis=dict(
804
+ gridcolor="rgba(255,255,255,0.08)",
805
+ tickfont=dict(size=11, color="#94a3b8"),
806
+ ),
807
+ ),
808
+ title=dict(
809
+ text="Quality Radar β€” Generic vs Specialist-Routed",
810
+ font=dict(size=13, color="#94a3b8"),
811
+ ),
812
+ legend=dict(bgcolor="rgba(0,0,0,0)", font=dict(color="#94a3b8", size=11)),
813
+ height=420,
814
+ margin=dict(l=60, r=60, t=60, b=40),
815
+ )
816
+ return fig
817
+
818
+
819
  # ─────────────────────────────────────────────────────────
820
  # UI helpers
821
  # ─────────────────────────────────────────────────────────
 
1343
  specialists = [_SP(d) for d in _load_catalog()]
1344
  source_note = "Showing YAML catalog β€” run an episode to load the live registry (includes dynamic additions)."
1345
 
1346
+ # ── Dynamically spawned specialists (accumulated from Output tab runs) ──
1347
+ spawned_pool = st.session_state.get("spawned_pool", [])
1348
+ if spawned_pool:
1349
+ sec(f"⚑ Dynamically Spawned · {len(spawned_pool)} new agent{'s' if len(spawned_pool) != 1 else ''}")
1350
+ st.caption(
1351
+ "These specialists were auto-created during Output tab runs β€” "
1352
+ "triggered when no existing specialist had sufficient domain coverage (similarity < threshold)."
1353
+ )
1354
+ pool_cols = st.columns(min(len(spawned_pool), 4))
1355
+ for i, sp in enumerate(spawned_pool):
1356
+ with pool_cols[i % 4]:
1357
+ st.markdown(f"""
1358
+ <div style="background:rgba(251,191,36,0.06);border:1px solid rgba(251,191,36,0.28);
1359
+ border-left:3px solid #fbbf24;border-radius:12px;
1360
+ padding:14px;margin-bottom:10px;">
1361
+ <div style="font-size:11px;font-weight:700;color:#fbbf24;margin-bottom:5px;">
1362
+ ⚑ {_html.escape(sp['role'])}
1363
+ </div>
1364
+ <div style="font-size:10px;color:#475569;margin-bottom:6px;font-style:italic;">
1365
+ Triggered by: {_html.escape(sp['triggered_by'][:70])}…
1366
+ </div>
1367
+ <div style="font-size:11px;color:#64748b;line-height:1.5;">
1368
+ {_html.escape(sp['description'][:100])}…
1369
+ </div>
1370
+ <div style="font-size:10px;color:#334155;margin-top:8px;padding-top:8px;
1371
+ border-top:1px solid rgba(255,255,255,0.05);">
1372
+ {sp['avg_latency_ms']} ms &nbsp;Β·&nbsp; {', '.join(sp.get('complexity_affinity', []))}
1373
+ </div>
1374
+ </div>""", unsafe_allow_html=True)
1375
+ st.markdown("---")
1376
+
1377
  n = len(specialists)
1378
  sec(f"Roster β€” {n} specialist{'s' if n != 1 else ''}, capability-embedded")
1379
  if source_note:
 
1519
  # Tab 4 β€” Quality Demo
1520
  # ─────────────────────────────────────────────────────────
1521
  def tab_quality():
1522
+ results = st.session_state.get("output_results")
1523
+ env_obj = st.session_state.get("output_env")
1524
+
1525
+ sec("Live Quality Comparison β€” Generic vs Specialist-Routed")
1526
+
1527
+ if results is None:
1528
+ st.markdown(
1529
+ '<div style="background:rgba(245,158,11,0.05);border:1px solid rgba(245,158,11,0.2);'
1530
+ 'border-radius:12px;padding:28px;text-align:center;">'
1531
+ '<div style="font-size:13px;color:#fbbf24;font-weight:600;margin-bottom:8px;">'
1532
+ 'No Output run yet</div>'
1533
+ '<div style="font-size:12px;color:#64748b;">'
1534
+ 'Go to the <b>🎯 Output</b> tab, enter a task, and click '
1535
+ '"Run Trained Policy" β€” then return here to generate the quality comparison.'
1536
+ '</div></div>',
1537
+ unsafe_allow_html=True,
1538
+ )
1539
+ else:
1540
+ task = results["task"]
1541
+ spec_results = results["specialist_results"]
1542
+ specialist_text = "\n\n".join(
1543
+ f"[{sr['id'].upper()}]\n{sr['output'] or ''}"
1544
+ for sr in spec_results if sr.get("output")
1545
+ ) or "(no specialist output)"
1546
+
1547
+ # Task banner
1548
+ st.markdown(
1549
+ f'<div style="background:rgba(0,212,255,0.04);border:1px solid rgba(0,212,255,0.18);'
1550
+ f'border-radius:10px;padding:12px 18px;margin-bottom:16px;">'
1551
+ f'<span style="font-size:9px;font-weight:700;color:#475569;text-transform:uppercase;'
1552
+ f'letter-spacing:1px;">Comparing outputs for: </span>'
1553
+ f'<span style="font-size:12px;color:#e2e8f0;">{_html.escape(task[:140])}</span>'
1554
+ f'</div>',
1555
+ unsafe_allow_html=True,
1556
+ )
1557
+
1558
+ comp_data = st.session_state.get("quality_comparison")
1559
+ already_computed = comp_data is not None and comp_data.get("task") == task
1560
+
1561
+ if not already_computed:
1562
+ if st.button("⚑ Generate Quality Comparison", type="primary", key="gen_comp_btn"):
1563
+ with st.spinner("Generating generic output + running GPT-4o-mini judge…"):
1564
+ generic_text = _generate_generic_output(task)
1565
+ registry = env_obj.registry if env_obj else None
1566
+
1567
+ gen_t1 = _t1_relevance(task, generic_text, registry) if registry else 5.0
1568
+ spec_t1 = _t1_relevance(task, specialist_text, registry) if registry else 7.0
1569
+
1570
+ judge = _judge_compare(task, generic_text, specialist_text)
1571
+
1572
+ def _pick(key, fallback_g, fallback_s):
1573
+ pair = (judge or {}).get(key, [fallback_g, fallback_s])
1574
+ return float(pair[0]), float(pair[1])
1575
+
1576
+ td_g, td_s = _pick("technical_depth", 5, 7)
1577
+ sp_g, sp_s = _pick("specificity", 4, 8)
1578
+ ac_g, ac_s = _pick("actionability", 4, 7)
1579
+ cv_g, cv_s = _pick("coverage", 5, 8)
1580
+
1581
+ gen_scores = {"Task Relevance": gen_t1, "Technical Depth": td_g,
1582
+ "Specificity": sp_g, "Actionability": ac_g, "Coverage": cv_g}
1583
+ spec_scores = {"Task Relevance": spec_t1, "Technical Depth": td_s,
1584
+ "Specificity": sp_s, "Actionability": ac_s, "Coverage": cv_s}
1585
+
1586
+ st.session_state.quality_comparison = {
1587
+ "task": task,
1588
+ "generic": generic_text,
1589
+ "specialist": specialist_text,
1590
+ "gen_scores": gen_scores,
1591
+ "spec_scores": spec_scores,
1592
+ }
1593
+ st.rerun()
1594
+
1595
+ comp_data = st.session_state.get("quality_comparison")
1596
+ if comp_data and comp_data.get("task") == task:
1597
+ gen_scores = comp_data["gen_scores"]
1598
+ spec_scores = comp_data["spec_scores"]
1599
+
1600
+ # ── Score summary strip ─────────────────────────────────────
1601
+ sec("Score Summary")
1602
+ cols = st.columns(len(gen_scores))
1603
+ for i, (dim, g_val) in enumerate(gen_scores.items()):
1604
+ s_val = spec_scores[dim]
1605
+ delta = round(s_val - g_val, 1)
1606
+ cols[i].metric(
1607
+ dim,
1608
+ f"{s_val:.1f} / 10",
1609
+ f"{delta:+.1f} vs generic",
1610
+ )
1611
+
1612
+ # ── Radar chart ─────────────────────────────────────────────
1613
+ sec("Quality Radar")
1614
+ st.plotly_chart(
1615
+ fig_radar_comparison(gen_scores, spec_scores),
1616
+ use_container_width=True,
1617
+ key="quality_radar",
1618
+ )
1619
+
1620
+ # ── Side-by-side score bars ──────────────────────────────────
1621
+ sec("Per-Dimension Score Breakdown")
1622
+ dims = list(gen_scores.keys())
1623
+ g_vals = [gen_scores[d] for d in dims]
1624
+ s_vals = [spec_scores[d] for d in dims]
1625
+ bar_fig = go.Figure()
1626
+ bar_fig.add_trace(go.Bar(
1627
+ name="Generic", x=dims, y=g_vals,
1628
+ marker_color="rgba(239,68,68,0.75)", marker_line_width=0,
1629
+ text=[f"{v:.1f}" for v in g_vals], textposition="outside",
1630
+ textfont=dict(size=10, color="#94a3b8"),
1631
+ ))
1632
+ bar_fig.add_trace(go.Bar(
1633
+ name="Specialist", x=dims, y=s_vals,
1634
+ marker_color="rgba(0,212,255,0.75)", marker_line_width=0,
1635
+ text=[f"{v:.1f}" for v in s_vals], textposition="outside",
1636
+ textfont=dict(size=10, color="#94a3b8"),
1637
+ ))
1638
+ bar_fig.update_layout(
1639
+ **DARK, **DARK_AXES, height=300, barmode="group",
1640
+ legend=dict(bgcolor="rgba(0,0,0,0)", font=dict(color="#94a3b8")),
1641
+ )
1642
+ bar_fig.update_yaxes(range=[0, 11], gridcolor="rgba(255,255,255,0.05)")
1643
+ st.plotly_chart(bar_fig, use_container_width=True, key="quality_bars")
1644
+
1645
+ # ── Side-by-side text ────────────────────────────────────────
1646
+ sec("Output Text Comparison")
1647
  c1, c2 = st.columns(2)
1648
  with c1:
1649
  st.markdown(
1650
  '<div style="font-size:10px;font-weight:700;color:#ef4444;'
1651
  'text-transform:uppercase;letter-spacing:1px;margin-bottom:8px;">'
1652
+ 'βœ— Generic Output (No Delegation)</div>',
1653
  unsafe_allow_html=True,
1654
  )
1655
+ st.code(comp_data["generic"][:1200], language=None)
1656
  with c2:
1657
  st.markdown(
1658
  '<div style="font-size:10px;font-weight:700;color:#10b981;'
1659
  'text-transform:uppercase;letter-spacing:1px;margin-bottom:8px;">'
1660
+ 'βœ“ Specialist-Routed Output (Trained Policy)</div>',
1661
  unsafe_allow_html=True,
1662
  )
1663
+ st.code(comp_data["specialist"][:1200], language=None)
1664
 
1665
  sec("Policy Tuning β€” Quality vs Latency")
1666
  c1, c2 = st.columns(2)
 
1918
  # Keep env alive for delegation-graph rendering
1919
  st.session_state.output_env = env
1920
 
1921
+ # Persist spawned specialists to shared pool for Specialists tab
1922
+ if "spawned_pool" not in st.session_state:
1923
+ st.session_state.spawned_pool = []
1924
+ existing_ids = {sp["id"] for sp in st.session_state.spawned_pool}
1925
+ for sid in spawned:
1926
+ if sid not in existing_ids:
1927
+ sp_obj = env.registry.get(sid)
1928
+ if sp_obj:
1929
+ st.session_state.spawned_pool.append({
1930
+ "id": sid,
1931
+ "role": sp_obj.role,
1932
+ "description": sp_obj.description,
1933
+ "complexity_affinity": list(sp_obj.complexity_affinity),
1934
+ "avg_latency_ms": sp_obj.avg_latency_ms,
1935
+ "triggered_by": task_used[:120],
1936
+ })
1937
+
1938
  except Exception as exc:
1939
  import traceback
1940
  st.error(f"Episode failed: {exc}")
env/spindleflow_env.py CHANGED
@@ -210,7 +210,9 @@ class SpindleFlowEnv(gym.Env):
210
 
211
  self.spawned_this_episode = []
212
  self._pending_spawn_records = []
213
- # Spawning is now a learned action; no auto-spawn at reset.
 
 
214
 
215
  # ── Build per-episode active roster (top-K by task similarity) ──
216
  self.active_specialist_ids = self._select_active_specialists(task_emb)
 
210
 
211
  self.spawned_this_episode = []
212
  self._pending_spawn_records = []
213
+ # Auto-spawn: if no existing specialist covers this task well, create one via LLM.
214
+ if self.auto_spawn:
215
+ self._maybe_spawn_specialist(task_emb, task_desc)
216
 
217
  # ── Build per-episode active roster (top-K by task similarity) ──
218
  self.active_specialist_ids = self._select_active_specialists(task_emb)