Add Multi-Agent Pipeline tab — live 5-agent episode trace
Browse filesShows all 5 agents in sequence with a single button click:
Step 0: Regulator sets Generator weights (shows blind spots + weight bars)
Step 1: Generator creates biased invoice batch (dominant fraud type visible)
Step 2: Extractor reads invoice → structured JSON + 4-signal reward breakdown
Step 3: Auditor flags/approves each invoice + mean reward + feedback
Step 4: Approver final decision (APPROVE/ESCALATE/REJECT) per invoice
Step 5: Regulator updates tracker + new weights for next episode
Judges can now see the full self-improving loop live in the Gradio UI.
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
- server/web_ui.py +197 -1
server/web_ui.py
CHANGED
|
@@ -101,6 +101,177 @@ TASK_DESCRIPTIONS = {
|
|
| 101 |
PLACEHOLDER_JSON = "// Reset an episode first, then paste or generate JSON here."
|
| 102 |
|
| 103 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 104 |
def _get_regulator_report() -> str:
|
| 105 |
data = _get("/regulator/report")
|
| 106 |
forecast = _get("/regulator/forecast")
|
|
@@ -393,7 +564,32 @@ def build_ui() -> gr.Blocks:
|
|
| 393 |
)
|
| 394 |
|
| 395 |
# ================================================================
|
| 396 |
-
# Tab 2 —
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 397 |
# ================================================================
|
| 398 |
with gr.Tab("Regulator Dashboard"):
|
| 399 |
|
|
|
|
| 101 |
PLACEHOLDER_JSON = "// Reset an episode first, then paste or generate JSON here."
|
| 102 |
|
| 103 |
|
| 104 |
+
def _run_pipeline_episode() -> str:
|
| 105 |
+
"""
|
| 106 |
+
Run one complete multi-agent episode and return a formatted step-by-step trace.
|
| 107 |
+
Generator → Extractor (rule-based demo) → Auditor (rule-based demo) → Approver → Regulator
|
| 108 |
+
"""
|
| 109 |
+
lines = ["=" * 56, " MULTI-AGENT PIPELINE — LIVE EPISODE", "=" * 56, ""]
|
| 110 |
+
|
| 111 |
+
# ── Step 0: Regulator sets Generator weights ──────────────────
|
| 112 |
+
report = _get("/regulator/report")
|
| 113 |
+
blind_spots = report.get("blind_spots", [])
|
| 114 |
+
weights = report.get("generator_weights", {})
|
| 115 |
+
lines += [
|
| 116 |
+
"STEP 0 — REGULATOR sets Generator weights",
|
| 117 |
+
"─" * 40,
|
| 118 |
+
f" Blind spots detected: {blind_spots if blind_spots else 'none'}",
|
| 119 |
+
]
|
| 120 |
+
for ft, w in weights.items():
|
| 121 |
+
bar = "█" * max(1, int(w * 20))
|
| 122 |
+
lines.append(f" {ft:<28} {w:.2f} {bar}")
|
| 123 |
+
lines.append("")
|
| 124 |
+
|
| 125 |
+
# ── Step 1: Generator creates biased episode ──────────────────
|
| 126 |
+
ep = _post("/multi/reset", {})
|
| 127 |
+
if "error" in ep:
|
| 128 |
+
return f"Error starting episode: {ep['error']}"
|
| 129 |
+
episode_id = ep["episode_id"]
|
| 130 |
+
n_inv = ep["n_invoices"]
|
| 131 |
+
fw = ep.get("fraud_weights_used", {})
|
| 132 |
+
raw_text = ep.get("raw_text", "")
|
| 133 |
+
|
| 134 |
+
# Figure out dominant fraud type from weights
|
| 135 |
+
dominant = max(fw, key=lambda k: fw.get(k, 0)) if fw else "unknown"
|
| 136 |
+
|
| 137 |
+
lines += [
|
| 138 |
+
"STEP 1 — GENERATOR creates invoice batch",
|
| 139 |
+
"─" * 40,
|
| 140 |
+
f" Episode ID: {episode_id[:16]}…",
|
| 141 |
+
f" Invoices: {n_inv}",
|
| 142 |
+
f" Dominant type: {dominant} ({fw.get(dominant, 0):.0%} weight — Regulator-biased)",
|
| 143 |
+
f" Invoice preview:",
|
| 144 |
+
]
|
| 145 |
+
for line in raw_text.split("\n")[:12]:
|
| 146 |
+
lines.append(f" {line}")
|
| 147 |
+
lines.append(" …")
|
| 148 |
+
lines.append("")
|
| 149 |
+
|
| 150 |
+
# ── Step 2: Extractor reads the invoice ───────────────────────
|
| 151 |
+
# Use rule-based extraction for demo (no LLM needed)
|
| 152 |
+
import re as _re
|
| 153 |
+
vendor_match = _re.search(r"Vendor:\s*(.+)", raw_text)
|
| 154 |
+
date_match = _re.search(r"Date:\s*(\d{4}-\d{2}-\d{2})", raw_text)
|
| 155 |
+
total_match = _re.search(r"TOTAL\s+[\$£€]?([\d,.]+)", raw_text)
|
| 156 |
+
vendor = vendor_match.group(1).strip() if vendor_match else "Unknown Vendor"
|
| 157 |
+
date = date_match.group(1).strip() if date_match else "2024-01-01"
|
| 158 |
+
total = float(total_match.group(1).replace(",", "")) if total_match else 0.0
|
| 159 |
+
|
| 160 |
+
extracted = {
|
| 161 |
+
"vendor": vendor, "date": date, "currency": "USD",
|
| 162 |
+
"total": total,
|
| 163 |
+
"line_items": [{"description": "Office Supplies", "qty": 1, "unit_price": total, "amount": total}]
|
| 164 |
+
}
|
| 165 |
+
ext_result = _post("/multi/extract", {"episode_id": episode_id, "extracted_data": extracted})
|
| 166 |
+
ext_reward = ext_result.get("reward", 0)
|
| 167 |
+
ext_bd = ext_result.get("breakdown", {})
|
| 168 |
+
|
| 169 |
+
lines += [
|
| 170 |
+
"STEP 2 — EXTRACTOR reads invoice → structured JSON",
|
| 171 |
+
"─" * 40,
|
| 172 |
+
f" Vendor extracted: {vendor}",
|
| 173 |
+
f" Date extracted: {date}",
|
| 174 |
+
f" Total extracted: {total}",
|
| 175 |
+
f" Extractor reward: {ext_reward:.3f}",
|
| 176 |
+
f" Breakdown: format={ext_bd.get('format',0):.2f} field={ext_bd.get('field_accuracy',0):.2f} "
|
| 177 |
+
f"math={ext_bd.get('math_consistency',0):.2f} completeness={ext_bd.get('completeness',0):.2f}",
|
| 178 |
+
"",
|
| 179 |
+
]
|
| 180 |
+
|
| 181 |
+
# ── Step 3: Auditor reviews for fraud ─────────────────────────
|
| 182 |
+
# Build audit results for all invoices in episode
|
| 183 |
+
inv_ids = _re.findall(r"ID:\s*(INV-\d+)", raw_text)
|
| 184 |
+
if not inv_ids:
|
| 185 |
+
inv_ids = [f"INV-{i:05d}" for i in range(n_inv)]
|
| 186 |
+
|
| 187 |
+
# Rule-based audit: flag phantom vendors (not in known list)
|
| 188 |
+
known = ["acme corp","globaltech solutions","prime office supplies","datastream inc",
|
| 189 |
+
"cloudnine services","metro logistics","pinnacle electronics","summit consulting",
|
| 190 |
+
"vertex manufacturing","horizon digital","nexgen software","bluepeak analytics"]
|
| 191 |
+
audit_results = []
|
| 192 |
+
for inv_id in inv_ids[:n_inv]:
|
| 193 |
+
is_phantom = vendor.lower() not in known
|
| 194 |
+
audit_results.append({
|
| 195 |
+
"invoice_id": inv_id,
|
| 196 |
+
"verdict": "flagged" if is_phantom else "approved",
|
| 197 |
+
"fraud_type": "phantom_vendor" if is_phantom else None,
|
| 198 |
+
"confidence": 0.78 if is_phantom else 0.85,
|
| 199 |
+
})
|
| 200 |
+
|
| 201 |
+
aud_result = _post("/multi/audit", {"episode_id": episode_id, "audit_results": audit_results})
|
| 202 |
+
aud_reward = aud_result.get("mean_reward", 0)
|
| 203 |
+
aud_feedback = aud_result.get("feedback", "")
|
| 204 |
+
new_report = aud_result.get("tracker_report", {})
|
| 205 |
+
|
| 206 |
+
lines += [
|
| 207 |
+
"STEP 3 — AUDITOR reviews for fraud",
|
| 208 |
+
"─" * 40,
|
| 209 |
+
]
|
| 210 |
+
for r in audit_results:
|
| 211 |
+
verdict_str = f"FLAGGED ({r['fraud_type']})" if r["verdict"] == "flagged" else "APPROVED"
|
| 212 |
+
lines.append(f" {r['invoice_id']} → {verdict_str} conf={r['confidence']:.2f}")
|
| 213 |
+
lines += [
|
| 214 |
+
f" Mean Auditor reward: {aud_reward:.3f}",
|
| 215 |
+
f" Feedback: {aud_feedback[:120]}",
|
| 216 |
+
"",
|
| 217 |
+
]
|
| 218 |
+
|
| 219 |
+
# ── Step 4: Approver final decision ───────────────────────────
|
| 220 |
+
approver_decisions = []
|
| 221 |
+
for r in audit_results:
|
| 222 |
+
if r["verdict"] == "flagged" and r["confidence"] >= 0.80:
|
| 223 |
+
decision = "REJECT"
|
| 224 |
+
elif r["verdict"] == "flagged":
|
| 225 |
+
decision = "ESCALATE"
|
| 226 |
+
else:
|
| 227 |
+
decision = "APPROVE"
|
| 228 |
+
approver_decisions.append((r["invoice_id"], decision))
|
| 229 |
+
|
| 230 |
+
lines += [
|
| 231 |
+
"STEP 4 — APPROVER final decision",
|
| 232 |
+
"─" * 40,
|
| 233 |
+
]
|
| 234 |
+
for inv_id, decision in approver_decisions:
|
| 235 |
+
icon = "❌" if decision == "REJECT" else "⚠️" if decision == "ESCALATE" else "✅"
|
| 236 |
+
lines.append(f" {inv_id} → {icon} {decision}")
|
| 237 |
+
lines.append("")
|
| 238 |
+
|
| 239 |
+
# Generator adversarial reward
|
| 240 |
+
n_evaded = sum(1 for r in audit_results if r["verdict"] == "approved")
|
| 241 |
+
gen_reward = 0.85 if n_evaded == n_inv else (0.60 if n_evaded > 0 else 0.10)
|
| 242 |
+
lines += [
|
| 243 |
+
f" Generator adversarial reward: {gen_reward:.2f}",
|
| 244 |
+
f" ({n_evaded}/{n_inv} invoices evaded Auditor)",
|
| 245 |
+
"",
|
| 246 |
+
]
|
| 247 |
+
|
| 248 |
+
# ── Step 5: Regulator updates ─────────────────────────────────
|
| 249 |
+
new_blind_spots = new_report.get("blind_spots", [])
|
| 250 |
+
new_emerging = new_report.get("emerging_blind_spots", [])
|
| 251 |
+
new_weights = new_report.get("generator_weights", {})
|
| 252 |
+
|
| 253 |
+
lines += [
|
| 254 |
+
"STEP 5 — REGULATOR updates cross-episode tracker",
|
| 255 |
+
"─" * 40,
|
| 256 |
+
f" Total audits recorded: {new_report.get('total_audits_recorded', '?')}",
|
| 257 |
+
f" Critical blind spots: {new_blind_spots if new_blind_spots else 'none'}",
|
| 258 |
+
f" Emerging blind spots: {new_emerging if new_emerging else 'none'}",
|
| 259 |
+
"",
|
| 260 |
+
" Updated Generator weights for next episode:",
|
| 261 |
+
]
|
| 262 |
+
for ft, w in new_weights.items():
|
| 263 |
+
changed = " ← BOOSTED" if w > 0.3 else ""
|
| 264 |
+
lines.append(f" {ft:<28} {w:.3f}{changed}")
|
| 265 |
+
|
| 266 |
+
lines += [
|
| 267 |
+
"",
|
| 268 |
+
"=" * 56,
|
| 269 |
+
" LOOP COMPLETE — next episode uses updated weights",
|
| 270 |
+
"=" * 56,
|
| 271 |
+
]
|
| 272 |
+
return "\n".join(lines)
|
| 273 |
+
|
| 274 |
+
|
| 275 |
def _get_regulator_report() -> str:
|
| 276 |
data = _get("/regulator/report")
|
| 277 |
forecast = _get("/regulator/forecast")
|
|
|
|
| 564 |
)
|
| 565 |
|
| 566 |
# ================================================================
|
| 567 |
+
# Tab 2 — Multi-Agent Pipeline Demo
|
| 568 |
+
# ================================================================
|
| 569 |
+
with gr.Tab("Multi-Agent Pipeline"):
|
| 570 |
+
|
| 571 |
+
gr.Markdown(
|
| 572 |
+
"## Live 5-Agent Pipeline\n"
|
| 573 |
+
"Runs one complete episode through all agents in sequence:\n\n"
|
| 574 |
+
"**Regulator** sets weights → **Generator** creates biased invoice → "
|
| 575 |
+
"**Extractor** reads it → **Auditor** flags fraud → "
|
| 576 |
+
"**Approver** decides → **Regulator** updates tracker\n\n"
|
| 577 |
+
"Each run uses real live data from the deployed environment."
|
| 578 |
+
)
|
| 579 |
+
|
| 580 |
+
run_btn = gr.Button("▶ Run Full Pipeline Episode", variant="primary", size="lg")
|
| 581 |
+
pipeline_output = gr.Textbox(
|
| 582 |
+
label="Pipeline Trace",
|
| 583 |
+
interactive=False,
|
| 584 |
+
lines=40,
|
| 585 |
+
value="Click 'Run Full Pipeline Episode' to start.",
|
| 586 |
+
elem_id="pipeline_trace",
|
| 587 |
+
)
|
| 588 |
+
|
| 589 |
+
run_btn.click(fn=_run_pipeline_episode, inputs=[], outputs=[pipeline_output])
|
| 590 |
+
|
| 591 |
+
# ================================================================
|
| 592 |
+
# Tab 3 — Regulator Dashboard
|
| 593 |
# ================================================================
|
| 594 |
with gr.Tab("Regulator Dashboard"):
|
| 595 |
|