copilot-swe-agent[bot] CatoG committed on
Commit
cdc0ab9
·
1 Parent(s): 06ff8e3

Implement feedback-driven revision improvements to multi-role agent workflow

Browse files
Files changed (1) hide show
  1. app.py +795 -175
app.py CHANGED
@@ -559,45 +559,78 @@ class WorkflowState(TypedDict):
559
  draft_output: str # latest specialist/finalized output forwarded to QA
560
  qa_report: str
561
  qa_data: Dict[str, Any] # parsed QA JSON data for structured access
562
- qa_role_feedback: Dict[str, str] # role key → targeted QA feedback for that specific role
563
  qa_passed: bool
564
  revision_count: int
565
  best_artifact: str # best non-empty real deliverable seen so far
 
566
  final_answer: str
567
 
568
 
569
  # --- Role system prompts ---
570
 
571
- _PLANNER_SYSTEM = (
572
- "You are the Planner in a multi-role AI workflow.\n"
573
- "Your job is to:\n"
574
- "1. Break the user's task into clear subtasks.\n"
575
- "2. Decide which specialist to call as the PRIMARY lead:\n"
576
- " - 'Creative Expert' (ideas, framing, wording, brainstorming)\n"
577
- " - 'Technical Expert' (code, architecture, implementation)\n"
578
- " - 'Research Analyst' (information gathering, literature review, fact-finding)\n"
579
- " - 'Security Reviewer' (security analysis, vulnerability checks, best practices)\n"
580
- " - 'Data Analyst' (data analysis, statistics, pattern recognition, insights)\n"
581
- " - 'Mad Professor' (radical scientific hypotheses, unhinged groundbreaking theories, extreme scientific speculation)\n"
582
- " - 'Accountant' (extreme cost scrutiny, ruthless cost-cutting, cheapest alternatives regardless of quality)\n"
583
- " - 'Artist' (wildly unhinged creative vision, cosmic feeling and vibes, impractical but spectacular ideas)\n"
584
- " - 'Lazy Slacker' (minimum viable effort, shortcuts, good-enough solutions, questioning whether anything needs to be done)\n"
585
- " - 'Black Metal Fundamentalist' (nihilistic kvlt critique, uncompromising rejection of mainstream approaches, raw truth)\n"
586
- " - 'Labour Union Representative' (worker rights, fair wages, job security, collective bargaining)\n"
587
- " - 'UX Designer' (user needs, user-centricity, usability, accessibility)\n"
588
- " - 'Doris' (well-meaning but clueless, rambling, off-topic observations)\n"
589
- " - 'Chairman of the Board' (corporate governance, shareholder value, strategic vision, fiduciary duty)\n"
590
- " - 'MAGA Appointee' (America First perspective, anti-globalism, deregulation, patriotism)\n"
591
- " - 'Lawyer' (legal compliance, liability, contracts, risk management)\n"
592
- "3. State clear success criteria.\n\n"
593
- "Note: ALL active specialists will also contribute their own perspective on the task.\n"
594
- "Your PRIMARY ROLE choice sets the lead voice, but every active role will be heard.\n\n"
595
- "Respond in this exact format:\n"
596
- "TASK BREAKDOWN:\n<subtask list>\n\n"
597
- "ROLE TO CALL: <Creative Expert | Technical Expert | Research Analyst | Security Reviewer | Data Analyst | Mad Professor | Accountant | Artist | Lazy Slacker | Black Metal Fundamentalist | Labour Union Representative | UX Designer | Doris | Chairman of the Board | MAGA Appointee | Lawyer>\n\n"
598
- "SUCCESS CRITERIA:\n<what a correct, complete answer looks like>\n\n"
599
- "GUIDANCE FOR SPECIALIST:\n<any constraints or focus areas>"
600
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
601
 
602
  _CREATIVE_SYSTEM = (
603
  "You are the Creative Expert in a multi-role AI workflow.\n"
@@ -621,20 +654,30 @@ _TECHNICAL_SYSTEM = (
621
 
622
  _QA_SYSTEM = (
623
  "You are the QA Tester in a multi-role AI workflow.\n"
624
- "Check whether the output satisfies the original request and success criteria.\n"
625
- "When individual specialist contributions are provided, give targeted feedback for each role\n"
626
- "so they can refine their specific propositions in the next iteration.\n\n"
627
  "Respond with ONLY a valid JSON object in this exact format (no text before or after the JSON):\n"
628
  "{\n"
629
- ' "requirements_checked": ["<requirement>: <met|not met>"],\n'
630
- ' "issues_found": ["<issue description>"],\n'
631
- ' "role_feedback": {"<Role Display Name>": "<targeted feedback or Satisfactory>"},\n'
632
- ' "result": "<PASS|FAIL>",\n'
633
- ' "recommended_fixes": ["<fix description>"]\n'
 
 
 
 
 
634
  "}\n\n"
635
- 'Set "result" to "PASS" only if ALL requirements are met and issues_found is empty or contains only minor notes.\n'
636
- 'Set "result" to "FAIL" if any significant requirement is unmet.\n'
637
- 'In "role_feedback", include one entry per specialist role that contributed.'
 
 
 
 
 
638
  )
639
 
640
  _PLANNER_REVIEW_SYSTEM = (
@@ -862,12 +905,16 @@ def _llm_call(chat_model, system_prompt: str, user_content: str) -> str:
862
  return content_to_text(response.content)
863
 
864
 
865
- def _decide_role(text: str) -> str:
866
  """Parse which specialist role the Planner wants to invoke.
867
 
868
  Normalises the 'ROLE TO CALL:' line (strips surrounding whitespace and
869
  collapses internal spaces) before matching, then falls back to a
870
  word-boundary search. Returns 'technical' when no clear signal is found.
 
 
 
 
871
  """
872
  # Normalise: collapse runs of whitespace so "ROLE TO CALL : Creative Expert" still matches
873
  normalised = re.sub(r"\s+", " ", text).strip()
@@ -891,43 +938,45 @@ def _decide_role(text: str) -> str:
891
  ("Lawyer", "lawyer"),
892
  ("Doris", "doris"),
893
  ]
 
894
  for label, key in _LABEL_TO_KEY_ORDERED:
895
  # Match "ROLE TO CALL: <label>" with optional surrounding whitespace
896
  if re.search(r"ROLE\s+TO\s+CALL\s*:\s*" + re.escape(label), normalised, re.IGNORECASE):
897
- return key
898
-
899
- # Fallback: word-boundary match on the full (normalised) text
900
- if re.search(r"\bcreative\b", normalised, re.IGNORECASE):
901
- return "creative"
902
- if re.search(r"\bresearch\b", normalised, re.IGNORECASE):
903
- return "research"
904
- if re.search(r"\bsecurity\b", normalised, re.IGNORECASE):
905
- return "security"
906
- if re.search(r"\bdata\s+analyst\b", normalised, re.IGNORECASE):
907
- return "data_analyst"
908
- if re.search(r"\bmad\s+professor\b", normalised, re.IGNORECASE):
909
- return "mad_professor"
910
- if re.search(r"\baccountant\b", normalised, re.IGNORECASE):
911
- return "accountant"
912
- if re.search(r"\bartist\b", normalised, re.IGNORECASE):
913
- return "artist"
914
- if re.search(r"\blazy\s+slacker\b", normalised, re.IGNORECASE):
915
- return "lazy_slacker"
916
- if re.search(r"\bblack\s+metal\b", normalised, re.IGNORECASE):
917
- return "black_metal_fundamentalist"
918
- if re.search(r"\blabour\s+union\b", normalised, re.IGNORECASE):
919
- return "labour_union_rep"
920
- if re.search(r"\bux\s+designer\b", normalised, re.IGNORECASE):
921
- return "ux_designer"
922
- if re.search(r"\bdoris\b", normalised, re.IGNORECASE):
923
- return "doris"
924
- if re.search(r"\bchairman\b", normalised, re.IGNORECASE):
925
- return "chairman_of_board"
926
- if re.search(r"\bmaga\b", normalised, re.IGNORECASE):
927
- return "maga_appointee"
928
- if re.search(r"\blawyer\b", normalised, re.IGNORECASE):
929
- return "lawyer"
930
- return "technical"
 
931
 
932
 
933
  def _parse_qa_json(qa_text: str) -> Optional[Dict[str, Any]]:
@@ -956,13 +1005,16 @@ def _parse_qa_json(qa_text: str) -> Optional[Dict[str, Any]]:
956
  def _qa_passed_check(qa_text: str) -> bool:
957
  """Return True only if the QA report contains an explicit PASS result.
958
 
959
- Prefers the structured JSON format produced by the updated QA Tester prompt.
960
- Falls back to plain-text detection for backwards-compatibility.
961
- Returns False when the expected format is absent to avoid false positives
962
- from words like 'bypass' or 'password'.
963
  """
964
  data = _parse_qa_json(qa_text)
965
  if data is not None:
 
 
 
 
966
  result = str(data.get("result", "")).strip().upper()
967
  if result == "PASS":
968
  return True
@@ -977,23 +1029,34 @@ def _qa_passed_check(qa_text: str) -> bool:
977
  return False
978
 
979
 
980
- def _parse_qa_role_feedback(qa_text: str) -> Dict[str, str]:
981
  """Extract per-role targeted feedback from a QA report.
982
 
983
- Prefers the structured JSON format. Falls back to the legacy bullet-list
984
- text format for backwards-compatibility.
985
- Returns a dict mapping role keys (e.g. 'creative', 'technical') to the
986
- feedback string targeted at that role.
 
 
987
  """
988
- feedback: Dict[str, str] = {}
989
 
990
- # JSON path
991
  data = _parse_qa_json(qa_text)
992
  if data is not None and isinstance(data.get("role_feedback"), dict):
993
- for label, fb in data["role_feedback"].items():
994
- role_key = _ROLE_LABEL_TO_KEY.get(str(label).strip())
 
 
 
 
 
995
  if role_key and fb:
996
- feedback[role_key] = str(fb).strip()
 
 
 
 
997
  return feedback
998
 
999
  # Legacy text fallback
@@ -1013,7 +1076,7 @@ def _parse_qa_role_feedback(qa_text: str) -> Dict[str, str]:
1013
  role_fb = role_fb.strip()
1014
  role_key = _ROLE_LABEL_TO_KEY.get(role_label)
1015
  if role_key and role_fb:
1016
- feedback[role_key] = role_fb
1017
 
1018
  return feedback
1019
 
@@ -1040,59 +1103,266 @@ def _is_meta_summary(text: str) -> bool:
1040
  return count >= 2
1041
 
1042
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1043
  # --- Workflow step functions ---
1044
  # Each step receives the shared state and an append-only trace list,
1045
  # updates state in place, appends log lines, and returns updated state.
1046
 
1047
- def _step_plan(chat_model, state: WorkflowState, trace: List[str]) -> WorkflowState:
1048
- """Planner: analyse the task, produce a plan, decide which specialist to call."""
 
 
 
 
 
 
 
 
 
1049
  trace.append("\n╔══ [PLANNER] Analysing task... ══╗")
 
1050
  content = f"User request: {state['user_request']}"
1051
  if state["revision_count"] > 0:
1052
  # Use structured QA data when available for clearer revision guidance
1053
  qa_data = state.get("qa_data") or {}
1054
- issues = qa_data.get("issues_found") or []
1055
- fixes = qa_data.get("recommended_fixes") or []
1056
- qa_summary = state["qa_report"]
1057
- if issues or fixes:
1058
- qa_summary = (
1059
- "Issues found:\n" + "\n".join(f"- {i}" for i in issues) + "\n\n"
1060
- "Recommended fixes:\n" + "\n".join(f"- {f}" for f in fixes)
1061
- )
 
 
 
 
 
 
 
 
 
 
1062
  content += (
1063
  f"\n\nThis is revision {state['revision_count']} of {MAX_REVISIONS}."
1064
  f"\nQA concerns to address:\n{qa_summary}"
1065
- "\nAdjust the plan so the specialist produces the actual deliverable that addresses these concerns."
 
 
1066
  )
1067
- plan_text = _llm_call(chat_model, _PLANNER_SYSTEM, content)
1068
  state["plan"] = plan_text
1069
- state["current_role"] = _decide_role(plan_text)
 
1070
  trace.append(plan_text)
1071
- trace.append(f"╚══ [PLANNER] → routing to: {AGENT_ROLES.get(state['current_role'], state['current_role']).upper()} ══╝")
1072
  return state
1073
 
1074
 
1075
- def _build_specialist_content(state: WorkflowState, role_key: str, previous_output_key: str) -> str:
 
 
 
 
 
 
1076
  """Build the user-facing content string for a specialist LLM call.
1077
 
1078
  On the first pass this is just the request + plan.
1079
  On revision passes it additionally includes:
1080
- - The specialist's previous output so they can improve on it
1081
- - Targeted QA feedback for this specific role (or the full QA report as fallback)
 
 
 
 
1082
  """
 
1083
  content = (
1084
  f"User request: {state['user_request']}\n\n"
1085
  f"Planner instructions:\n{state['plan']}"
1086
  )
1087
- if state["revision_count"] > 0:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1088
  previous_output = state.get(previous_output_key, "") # type: ignore[literal-required]
1089
  if previous_output:
1090
- content += f"\n\nYour previous output (improve on this):\n{previous_output}"
1091
- role_feedback = state["qa_role_feedback"].get(role_key, "")
 
 
 
 
 
 
1092
  if role_feedback:
1093
- content += f"\n\nQA feedback specific to your contribution:\n{role_feedback}"
1094
- elif state["qa_report"]:
1095
- content += f"\n\nQA feedback to address:\n{state['qa_report']}"
 
 
 
 
 
 
 
 
 
 
1096
  return content
1097
 
1098
 
@@ -1136,63 +1406,91 @@ def _step_qa(
1136
  trace: List[str],
1137
  all_outputs: Optional[List[Tuple[str, str]]] = None,
1138
  ) -> WorkflowState:
1139
- """QA Tester: check the draft against the original request and success criteria.
 
 
 
 
 
1140
 
1141
- When *all_outputs* is provided (list of (role_key, output) pairs from this
1142
- iteration), each specialist's individual contribution is included in the
1143
- review prompt so the QA can supply targeted, per-role feedback. This
1144
- feedback is stored in ``state['qa_role_feedback']`` and consumed by the
1145
- specialist step functions on the next revision pass.
1146
  """
1147
- trace.append("\n╔══ [QA TESTER] Reviewing output... ══╗")
 
 
 
1148
  content = (
1149
  f"Original user request: {state['user_request']}\n\n"
1150
  f"Planner's plan and success criteria:\n{state['plan']}\n\n"
1151
  )
1152
  if all_outputs:
1153
- # Include each specialist's individual output so QA can give role-specific feedback
1154
- content += "Individual specialist contributions:\n\n"
1155
  for r_key, r_output in all_outputs:
1156
  r_label = AGENT_ROLES.get(r_key, r_key)
1157
- content += f"=== {r_label} ===\n{r_output}\n\n"
1158
- content += f"Finalized deliverable:\n{state['draft_output']}"
1159
- else:
1160
- content += f"Specialist output to review:\n{state['draft_output']}"
1161
  text = _llm_call(chat_model, _QA_SYSTEM, content)
1162
  state["qa_report"] = text
1163
  state["qa_data"] = _parse_qa_json(text) or {}
1164
  state["qa_role_feedback"] = _parse_qa_role_feedback(text)
1165
  state["qa_passed"] = _qa_passed_check(text)
 
 
 
 
 
1166
  result_label = "✅ PASS" if state["qa_passed"] else "❌ FAIL"
 
1167
  trace.append(text)
 
 
 
 
 
1168
  if state["qa_role_feedback"]:
1169
- feedback_summary = ", ".join(
1170
- f"{AGENT_ROLES.get(k, k)}: {v[:60]}{'…' if len(v) > 60 else ''}"
1171
- for k, v in state["qa_role_feedback"].items()
1172
- )
1173
- trace.append(f" ℹ Role-specific feedback dispatched → {feedback_summary}")
1174
  trace.append(f"╚══ [QA TESTER] Result: {result_label} ══╝")
1175
  return state
1176
 
1177
 
1178
- def _step_planner_review(chat_model, state: WorkflowState, trace: List[str]) -> WorkflowState:
 
 
 
 
 
1179
  """Planner: review QA feedback and either approve the result or request a revision."""
1180
  trace.append("\n╔══ [PLANNER] Reviewing QA feedback... ══╗")
1181
  # Format issues/fixes from structured QA data when available
1182
  qa_data = state.get("qa_data") or {}
1183
- issues = qa_data.get("issues_found") or []
1184
- fixes = qa_data.get("recommended_fixes") or []
1185
- if issues or fixes:
1186
- qa_summary = (
1187
- "Issues:\n" + "\n".join(f"- {i}" for i in issues) + "\n\n"
1188
- "Recommended fixes:\n" + "\n".join(f"- {f}" for f in fixes)
1189
- )
 
 
 
 
 
1190
  else:
1191
  qa_summary = state["qa_report"]
 
 
 
 
1192
  content = (
1193
  f"User request: {state['user_request']}\n\n"
1194
  f"Plan:\n{state['plan']}\n\n"
1195
- f"Current deliverable:\n{state['draft_output']}\n\n"
1196
  f"QA result: {'PASS' if state['qa_passed'] else 'FAIL'}\n"
1197
  f"QA details:\n{qa_summary}"
1198
  )
@@ -1224,7 +1522,12 @@ def _step_planner_review(chat_model, state: WorkflowState, trace: List[str]) ->
1224
  else:
1225
  # Revision requested but REVISED INSTRUCTIONS section missing โ€” keep current plan
1226
  trace.append(" ⚠ REVISED INSTRUCTIONS section missing; retrying with existing plan.")
1227
- new_role = _decide_role(review)
 
 
 
 
 
1228
  state["current_role"] = new_role
1229
  trace.append(
1230
  f"โ•šโ•โ• [PLANNER] โ†’ ๐Ÿ”„ REVISE โ€” routing to {AGENT_ROLES.get(new_role, new_role).upper()} โ•โ•โ•"
@@ -1232,6 +1535,7 @@ def _step_planner_review(chat_model, state: WorkflowState, trace: List[str]) ->
1232
  return state
1233
 
1234
 
 
1235
  def _step_research(chat_model, state: WorkflowState, trace: List[str]) -> WorkflowState:
1236
  """Research Analyst: gather information and produce a comprehensive research summary."""
1237
  trace.append("\nโ•”โ•โ• [RESEARCH ANALYST] Gathering information... โ•โ•โ•—")
@@ -1396,35 +1700,75 @@ def _step_finalize(
1396
  ) -> WorkflowState:
1397
  """Finalizer: synthesise all specialist perspectives and produce the actual deliverable.
1398
 
1399
- Unlike the old Synthesizer which produced a meta-summary of perspectives,
1400
- the Finalizer outputs the real artifact the user asked for (menu, code, plan, etc.).
1401
- If the output looks like a meta-summary (e.g. the model ignored instructions),
1402
- we fall back to the primary specialist's output to avoid polluting best_artifact.
 
 
 
1403
  """
1404
  trace.append("\n╔══ [FINALIZER] Producing the deliverable from all perspectives... ══╗")
1405
- perspectives = []
 
 
 
 
 
 
 
1406
  for r_key, r_output in all_outputs:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1407
  r_label = AGENT_ROLES.get(r_key, r_key)
1408
  perspectives.append(f"=== {r_label} ===\n{r_output}")
1409
  combined = "\n\n".join(perspectives)
 
 
 
 
 
 
 
1410
  content = (
1411
  f"User request: {state['user_request']}\n\n"
1412
- f"Specialist perspectives collected:\n\n{combined}"
 
1413
  )
1414
  text = _llm_call(chat_model, _FINALIZER_SYSTEM, content)
1415
 
 
 
1416
  if not text or not text.strip():
1417
- trace.append(" ⚠ [FINALIZER] returned empty output — using primary specialist output as draft.")
1418
- text = all_outputs[0][1] if all_outputs else state.get("draft_output", "")
1419
-
1420
- if _is_meta_summary(text):
1421
- # Model produced a meta-summary instead of the deliverable โ€” use primary output
1422
- primary_output = all_outputs[0][1] if all_outputs else ""
1423
  trace.append(
1424
- " โš  [FINALIZER] output looks like a meta-summary (specialist report headers detected).\n"
1425
- " Falling back to primary specialist output as the draft deliverable."
1426
  )
1427
- text = primary_output or text
 
 
 
1428
 
1429
  state["finalized_output"] = text
1430
  state["draft_output"] = text
@@ -1433,6 +1777,7 @@ def _step_finalize(
1433
  return state
1434
 
1435
 
 
1436
  # Mapping from role key โ†’ step function, used by the orchestration loop
1437
  _SPECIALIST_STEPS = {
1438
  "creative": _step_creative,
@@ -1453,8 +1798,76 @@ _SPECIALIST_STEPS = {
1453
  "lawyer": _step_lawyer,
1454
  }
1455
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1456
 
1457
- # --- Specialist role tools ---
1458
  # These wrap the step functions as @tool so the Planner (or any LangChain agent)
1459
  # can invoke specialists in a standard tool-use pattern.
1460
 
@@ -1471,7 +1884,7 @@ _EMPTY_STATE_BASE: WorkflowState = {
1471
  "chairman_of_board_output": "", "maga_appointee_output": "", "lawyer_output": "",
1472
  "finalized_output": "",
1473
  "draft_output": "", "qa_report": "", "qa_data": {}, "qa_role_feedback": {}, "qa_passed": False,
1474
- "revision_count": 0, "best_artifact": "", "final_answer": "",
1475
  }
1476
 
1477
 
@@ -1718,6 +2131,7 @@ def run_multi_role_workflow(
1718
  "qa_passed": False,
1719
  "revision_count": 0,
1720
  "best_artifact": "",
 
1721
  "final_answer": "",
1722
  }
1723
 
@@ -1731,8 +2145,8 @@ def run_multi_role_workflow(
1731
 
1732
  try:
1733
  if planner_active:
1734
- # Step 1: Planner creates the initial plan
1735
- state = _step_plan(chat_model, state, trace)
1736
  else:
1737
  # No planner: auto-select first active specialist
1738
  state["current_role"] = active_specialist_keys[0]
@@ -1747,19 +2161,31 @@ def run_multi_role_workflow(
1747
  # then run every other active specialist so all voices are heard.
1748
  primary_role = state["current_role"]
1749
  if primary_role not in active_specialist_keys:
1750
- # Safe fallback: requested role not in active set
1751
  fallback_role = active_specialist_keys[0]
1752
  trace.append(
1753
- f" ⚠ Role '{primary_role}' not active — falling back to {AGENT_ROLES.get(fallback_role, fallback_role).upper()}"
 
1754
  )
1755
  primary_role = fallback_role
1756
  state["current_role"] = primary_role
1757
 
 
 
 
 
 
 
1758
  # Run the primary (planner-chosen) specialist
1759
  primary_fn = _SPECIALIST_STEPS.get(primary_role, _step_technical)
1760
  state = primary_fn(chat_model, state, trace)
1761
  primary_output = state["draft_output"]
1762
 
 
 
 
 
 
1763
  # Run all other active specialists and collect their perspectives
1764
  all_outputs: List[Tuple[str, str]] = [(primary_role, primary_output)]
1765
  for specialist_role in active_specialist_keys:
@@ -1767,8 +2193,38 @@ def run_multi_role_workflow(
1767
  continue # already ran above
1768
  step_fn = _SPECIALIST_STEPS[specialist_role]
1769
  state = step_fn(chat_model, state, trace)
1770
- # state["draft_output"] is set by every step function immediately
1771
- # before returning, so it is always this specialist's fresh output.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1772
  all_outputs.append((specialist_role, state["draft_output"]))
1773
 
1774
  # Finalize all perspectives into the actual deliverable.
@@ -1780,15 +2236,39 @@ def run_multi_role_workflow(
1780
  state = _step_finalize(chat_model, state, trace, all_outputs)
1781
  else:
1782
  state["draft_output"] = primary_output
 
1783
 
1784
- # Update best-candidate tracking: keep the best non-empty, non-meta-summary artifact
1785
  current_draft = state["draft_output"]
1786
- if current_draft and current_draft.strip() and not _is_meta_summary(current_draft):
1787
- state["best_artifact"] = current_draft
1788
- trace.append(f" ✔ Best artifact updated (rev {state['revision_count']}): {len(current_draft)} chars")
1789
- elif not state.get("best_artifact") and current_draft and current_draft.strip():
1790
- # Even a meta-summary is better than nothing as a safety net
1791
- state["best_artifact"] = current_draft
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1792
 
1793
  # Step 3: QA reviews the finalized draft (if enabled)
1794
  if qa_active:
@@ -1802,7 +2282,9 @@ def run_multi_role_workflow(
1802
  # Step 4: Planner reviews QA and either approves or schedules a revision
1803
  if planner_active and qa_active:
1804
  prev_plan = state["plan"]
1805
- state = _step_planner_review(chat_model, state, trace)
 
 
1806
 
1807
  # Exit if the Planner approved the result
1808
  if state["final_answer"]:
@@ -1829,7 +2311,8 @@ def run_multi_role_workflow(
1829
  state["final_answer"] = best
1830
  trace.append(
1831
  f"\nโ•โ•โ• MAX REVISIONS REACHED ({MAX_REVISIONS}) โ•โ•โ•\n"
1832
- f"Returning best artifact ({len(best)} chars). Outstanding QA concerns:\n{state['qa_report']}"
 
1833
  )
1834
  break
1835
  else:
@@ -1851,6 +2334,7 @@ def run_multi_role_workflow(
1851
  return state["final_answer"], "\n".join(trace)
1852
 
1853
 
 
1854
  # ============================================================
1855
  # Agent builder
1856
  # ============================================================
@@ -2330,6 +2814,142 @@ def run_demo_lunch_menu(
2330
  return final_answer, trace
2331
 
2332
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2333
  if __name__ == "__main__":
2334
  port = int(os.environ.get("PORT", 7860))
2335
  demo.launch(
 
559
  draft_output: str # latest specialist/finalized output forwarded to QA
560
  qa_report: str
561
  qa_data: Dict[str, Any] # parsed QA JSON data for structured access
562
+ qa_role_feedback: Dict[str, Any] # role key โ†’ list[str] of targeted QA feedback items
563
  qa_passed: bool
564
  revision_count: int
565
  best_artifact: str # best non-empty real deliverable seen so far
566
+ best_artifact_score: float # requirement-coverage score of best_artifact
567
  final_answer: str
568
 
569
 
570
  # --- Role system prompts ---
571
 
572
+ # Role description snippets used to build the dynamic planner prompt
573
+ _ROLE_DESCRIPTIONS: Dict[str, str] = {
574
+ "creative": "Creative Expert (ideas, framing, wording, brainstorming)",
575
+ "technical": "Technical Expert (code, architecture, implementation)",
576
+ "research": "Research Analyst (information gathering, literature review, fact-finding)",
577
+ "security": "Security Reviewer (security analysis, vulnerability checks, best practices)",
578
+ "data_analyst": "Data Analyst (data analysis, statistics, pattern recognition, insights)",
579
+ "mad_professor": "Mad Professor (radical scientific hypotheses, unhinged groundbreaking theories)",
580
+ "accountant": "Accountant (extreme cost scrutiny, ruthless cost-cutting, cheapest alternatives)",
581
+ "artist": "Artist (wildly unhinged creative vision, cosmic feeling and vibes)",
582
+ "lazy_slacker": "Lazy Slacker (minimum viable effort, shortcuts, good-enough solutions)",
583
+ "black_metal_fundamentalist": "Black Metal Fundamentalist (nihilistic kvlt critique, rejection of mainstream approaches)",
584
+ "labour_union_rep": "Labour Union Representative (worker rights, fair wages, job security)",
585
+ "ux_designer": "UX Designer (user needs, user-centricity, usability, accessibility)",
586
+ "doris": "Doris (well-meaning but clueless, rambling, off-topic observations)",
587
+ "chairman_of_board": "Chairman of the Board (corporate governance, shareholder value, strategic vision)",
588
+ "maga_appointee": "MAGA Appointee (America First perspective, anti-globalism, deregulation)",
589
+ "lawyer": "Lawyer (legal compliance, liability, contracts, risk management)",
590
+ }
591
+
592
+
593
+ def _build_planner_system(active_specialist_keys: List[str]) -> str:
594
+ """Return a Planner system prompt that only lists *active* specialist roles.
595
+
596
+ This prevents the Planner from routing to roles that are not enabled in the
597
+ current run, which was a source of broken routing.
598
+ """
599
+ if not active_specialist_keys:
600
+ active_specialist_keys = list(_ROLE_DESCRIPTIONS.keys())
601
+
602
+ role_lines = "\n".join(
603
+ f" - '{AGENT_ROLES.get(k, k)}'"
604
+ + (f" โ€” {_ROLE_DESCRIPTIONS[k].split('(', 1)[1].rstrip(')')}" if k in _ROLE_DESCRIPTIONS else "")
605
+ for k in active_specialist_keys
606
+ if k in AGENT_ROLES
607
+ )
608
+ role_choices = " | ".join(
609
+ f"'{AGENT_ROLES.get(k, k)}'" for k in active_specialist_keys if k in AGENT_ROLES
610
+ )
611
+ return (
612
+ "You are the Planner in a multi-role AI workflow.\n"
613
+ "Your job is to:\n"
614
+ "1. Break the user's task into clear subtasks.\n"
615
+ "2. Decide which specialist to call as the PRIMARY lead "
616
+ "(choose ONLY from the active roles listed below):\n"
617
+ f"{role_lines}\n"
618
+ "3. State clear success criteria.\n\n"
619
+ "IMPORTANT: You MUST select the PRIMARY ROLE from the active roles listed above ONLY.\n"
620
+ "Do NOT invent or request roles that are not in this list.\n"
621
+ "ALL active specialists will also contribute their own perspective.\n"
622
+ "Your PRIMARY ROLE choice sets the lead voice, but every active role will be heard.\n\n"
623
+ "Respond in this exact format:\n"
624
+ "TASK BREAKDOWN:\n<subtask list>\n\n"
625
+ f"ROLE TO CALL: <{role_choices}>\n\n"
626
+ "SUCCESS CRITERIA:\n<what a correct, complete answer looks like>\n\n"
627
+ "GUIDANCE FOR SPECIALIST:\n<any constraints or focus areas>"
628
+ )
629
+
630
+
631
+ # Fallback static planner system (used when active roles are not yet known)
632
+ _PLANNER_SYSTEM = _build_planner_system(list(_ROLE_DESCRIPTIONS.keys()))
633
+
634
 
635
  _CREATIVE_SYSTEM = (
636
  "You are the Creative Expert in a multi-role AI workflow.\n"
 
654
 
655
  _QA_SYSTEM = (
656
  "You are the QA Tester in a multi-role AI workflow.\n"
657
+ "Evaluate ONLY the finalized deliverable (not specialist process notes or summaries).\n"
658
+ "When individual specialist contributions are provided, give targeted, actionable feedback per role\n"
659
+ "so each specialist knows exactly what to fix in the next revision.\n\n"
660
  "Respond with ONLY a valid JSON object in this exact format (no text before or after the JSON):\n"
661
  "{\n"
662
+ ' "passed": false,\n'
663
+ ' "score": 0.0,\n'
664
+ ' "artifact_present": true,\n'
665
+ ' "missing_requirements": ["<specific missing item>"],\n'
666
+ ' "required_fixes": ["<concrete fix needed>"],\n'
667
+ ' "role_feedback": {\n'
668
+ ' "<role_key e.g. mad_professor>": ["<specific fix 1>", "<specific fix 2>"],\n'
669
+ ' "<role_key>": ["<fix>"] \n'
670
+ " },\n"
671
+ ' "summary": "<one-sentence overall assessment>"\n'
672
  "}\n\n"
673
+ 'Set "passed" to true only if ALL requirements are met and missing_requirements is empty.\n'
674
+ 'Set "score" to a float 0.0-1.0 representing how much of the deliverable is complete.\n'
675
+ 'Set "artifact_present" to true if a real deliverable (not just meta-commentary) is present.\n'
676
+ 'In "role_feedback", use the internal role key (e.g. "mad_professor", "artist", "technical") not the display name.\n'
677
+ 'Each role_feedback value must be a list of specific, actionable fixes for that role.\n'
678
+ 'Focus role_feedback on concrete missing content, not general style.\n'
679
+ 'In "required_fixes", list the concrete changes needed to make the deliverable pass.\n'
680
+ 'Evaluate ONLY the finalized deliverable under "Finalized deliverable:" — not the specialist inputs.'
681
  )
682
 
683
  _PLANNER_REVIEW_SYSTEM = (
 
905
  return content_to_text(response.content)
906
 
907
 
908
+ def _decide_role(text: str, active_keys: Optional[List[str]] = None) -> str:
909
  """Parse which specialist role the Planner wants to invoke.
910
 
911
  Normalises the 'ROLE TO CALL:' line (strips surrounding whitespace and
912
  collapses internal spaces) before matching, then falls back to a
913
  word-boundary search. Returns 'technical' when no clear signal is found.
914
+
915
+ When *active_keys* is provided the returned role key is guaranteed to be in
916
+ that set. If the detected role is not active a deterministic fallback is
917
+ applied (first active key, or 'technical').
918
  """
919
  # Normalise: collapse runs of whitespace so "ROLE TO CALL : Creative Expert" still matches
920
  normalised = re.sub(r"\s+", " ", text).strip()
 
938
  ("Lawyer", "lawyer"),
939
  ("Doris", "doris"),
940
  ]
941
+ detected: Optional[str] = None
942
  for label, key in _LABEL_TO_KEY_ORDERED:
943
  # Match "ROLE TO CALL: <label>" with optional surrounding whitespace
944
  if re.search(r"ROLE\s+TO\s+CALL\s*:\s*" + re.escape(label), normalised, re.IGNORECASE):
945
+ detected = key
946
+ break
947
+
948
+ if detected is None:
949
+ # Fallback: word-boundary match on the full (normalised) text
950
+ _WORD_FALLBACKS = [
951
+ (r"\bcreative\b", "creative"),
952
+ (r"\bresearch\b", "research"),
953
+ (r"\bsecurity\b", "security"),
954
+ (r"\bdata\s+analyst\b", "data_analyst"),
955
+ (r"\bmad\s+professor\b", "mad_professor"),
956
+ (r"\baccountant\b", "accountant"),
957
+ (r"\bartist\b", "artist"),
958
+ (r"\blazy\s+slacker\b", "lazy_slacker"),
959
+ (r"\bblack\s+metal\b", "black_metal_fundamentalist"),
960
+ (r"\blabour\s+union\b", "labour_union_rep"),
961
+ (r"\bux\s+designer\b", "ux_designer"),
962
+ (r"\bdoris\b", "doris"),
963
+ (r"\bchairman\b", "chairman_of_board"),
964
+ (r"\bmaga\b", "maga_appointee"),
965
+ (r"\blawyer\b", "lawyer"),
966
+ ]
967
+ for pattern, key in _WORD_FALLBACKS:
968
+ if re.search(pattern, normalised, re.IGNORECASE):
969
+ detected = key
970
+ break
971
+
972
+ detected = detected or "technical"
973
+
974
+ # Filter to active keys when provided
975
+ if active_keys:
976
+ if detected not in active_keys:
977
+ fallback = active_keys[0] if active_keys else "technical"
978
+ return fallback
979
+ return detected
980
 
981
 
982
  def _parse_qa_json(qa_text: str) -> Optional[Dict[str, Any]]:
 
1005
  def _qa_passed_check(qa_text: str) -> bool:
1006
  """Return True only if the QA report contains an explicit PASS result.
1007
 
1008
+ Handles both the new format (``passed`` bool + ``score``) and the legacy
1009
+ format (``result: PASS|FAIL``). Returns False when the expected format is
1010
+ absent to avoid false positives from words like 'bypass' or 'password'.
 
1011
  """
1012
  data = _parse_qa_json(qa_text)
1013
  if data is not None:
1014
+ # New format: "passed" bool field
1015
+ if "passed" in data:
1016
+ return bool(data["passed"])
1017
+ # Legacy format: "result" string field
1018
  result = str(data.get("result", "")).strip().upper()
1019
  if result == "PASS":
1020
  return True
 
1029
  return False
1030
 
1031
 
1032
+ def _parse_qa_role_feedback(qa_text: str) -> Dict[str, Any]:
1033
  """Extract per-role targeted feedback from a QA report.
1034
 
1035
+ Supports the new format where each role maps to a list of strings, and the
1036
+ legacy format where each role maps to a single string. Also accepts role
1037
+ keys (e.g. 'mad_professor') directly in addition to display labels.
1038
+
1039
+ Returns a dict mapping role keys (e.g. 'creative', 'technical') to either
1040
+ a list of fix strings (new format) or a plain string (legacy format).
1041
  """
1042
+ feedback: Dict[str, Any] = {}
1043
 
1044
+ # JSON path โ€” handles both new (list) and legacy (string) values
1045
  data = _parse_qa_json(qa_text)
1046
  if data is not None and isinstance(data.get("role_feedback"), dict):
1047
+ for label_or_key, fb in data["role_feedback"].items():
1048
+ label_or_key = str(label_or_key).strip()
1049
+ # Try direct role key first, then display-label lookup
1050
+ if label_or_key in AGENT_ROLES:
1051
+ role_key = label_or_key
1052
+ else:
1053
+ role_key = _ROLE_LABEL_TO_KEY.get(label_or_key)
1054
  if role_key and fb:
1055
+ # Normalise to list for consistent downstream use
1056
+ if isinstance(fb, list):
1057
+ feedback[role_key] = [str(x).strip() for x in fb if x]
1058
+ else:
1059
+ feedback[role_key] = [str(fb).strip()]
1060
  return feedback
1061
 
1062
  # Legacy text fallback
 
1076
  role_fb = role_fb.strip()
1077
  role_key = _ROLE_LABEL_TO_KEY.get(role_label)
1078
  if role_key and role_fb:
1079
+ feedback[role_key] = [role_fb]
1080
 
1081
  return feedback
1082
 
 
1103
  return count >= 2
1104
 
1105
 
1106
+ # โ”€โ”€โ”€ Output quality helpers โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
1107
+
1108
+ # Persona-heavy roles that need revision-mode behaviour enforcement
1109
+ _PERSONA_ROLE_KEYS = frozenset({
1110
+ "mad_professor", "artist", "lazy_slacker",
1111
+ "black_metal_fundamentalist", "doris", "maga_appointee",
1112
+ })
1113
+
1114
+ # Heuristics that suggest a real, structured deliverable
1115
+ _DELIVERABLE_SIGNALS = [
1116
+ r"(?:option|choice|step|ingredient|method|approach|alternative)\s*\d*\s*[:\-]",
1117
+ r"^\s*\d+[\.\)]\s+\w+", # numbered list item
1118
+ r"^#{1,3}\s+\w+", # markdown header
1119
+ r"\*\*[^\*]+\*\*", # bold text
1120
+ r"^\s*[-โ€ข]\s+\w+", # bullet point
1121
+ r"\b(?:recipe|menu|plan|schedule|budget|code|function|class|def |import )\b",
1122
+ ]
1123
+
1124
+
1125
+ def _is_empty_output(text: str) -> bool:
1126
+ """Return True when *text* is blank or shorter than a minimal threshold."""
1127
+ return not text or len(text.strip()) < 30
1128
+
1129
+
1130
+ def _is_relevant_to_request(text: str, request: str) -> bool:
1131
+ """Return True when *text* shares enough key words with *request*.
1132
+
1133
+ Uses a simple word-overlap heuristic: at least 25 % of significant words
1134
+ from the request must appear somewhere in the output.
1135
+ """
1136
+ if not text or not request:
1137
+ return False
1138
+ request_words = {w.lower() for w in re.findall(r"\b\w{4,}\b", request)}
1139
+ if not request_words:
1140
+ return True
1141
+ text_lower = text.lower()
1142
+ overlap = sum(1 for w in request_words if w in text_lower)
1143
+ return overlap / len(request_words) >= 0.25
1144
+
1145
+
1146
def _looks_like_actionable_deliverable(text: str) -> bool:
    """Return True when *text* carries structural markers of a real deliverable.

    Counts how many of the ``_DELIVERABLE_SIGNALS`` regexes fire against the
    lower-cased text; at least two distinct signals are required before the
    text is considered actionable.
    """
    if not text:
        return False
    if len(text.strip()) < 50:
        return False
    lowered = text.lower()
    flags = re.MULTILINE | re.IGNORECASE
    hit_count = 0
    for signal in _DELIVERABLE_SIGNALS:
        if re.search(signal, lowered, flags):
            hit_count += 1
    return hit_count >= 2
1156
+
1157
+
1158
def _score_candidate(text: str, requirements: List[str], request: str) -> float:
    """Score an output 0.0-1.0 against requirement coverage + structural quality.

    Combines:
    - Fraction of explicit requirements whose key words appear in the text (60 %)
    - Whether the text looks like an actionable deliverable (20 %)
    - Whether the text is relevant to the request (20 %)

    A requirement counts as covered when any of its significant words
    (four or more characters, mirroring ``_is_relevant_to_request``) appears
    in the text.  Three-letter words such as "the"/"and"/"for" are excluded
    because they occur in almost any text and previously inflated the
    coverage fraction to ~1.0 regardless of actual content.
    """
    if not text or not text.strip():
        return 0.0

    req_score = 0.0
    if requirements:
        text_lower = text.lower()
        # A requirement is "covered" if any of its significant words shows up.
        covered = sum(
            1 for req in requirements
            if any(w in text_lower for w in re.findall(r"\b\w{4,}\b", req.lower()))
        )
        req_score = covered / len(requirements)

    deliverable_score = 1.0 if _looks_like_actionable_deliverable(text) else 0.0
    relevance_score = 1.0 if _is_relevant_to_request(text, request) else 0.0

    return 0.6 * req_score + 0.2 * deliverable_score + 0.2 * relevance_score
1182
+
1183
+
1184
+ def _format_role_feedback_for_prompt(feedback: Any) -> str:
1185
+ """Render QA role feedback (list or string) as a readable string for prompts."""
1186
+ if isinstance(feedback, list):
1187
+ return "\n".join(f"- {item}" for item in feedback if item)
1188
+ return str(feedback).strip()
1189
+
1190
+
1191
def _strict_retry_specialist(
    chat_model,
    system_prompt: str,
    state: "WorkflowState",
    role_key: str,
    output_key: str,
    label: str,
    trace: List[str],
) -> str:
    """Re-invoke a specialist with a strict, no-theatrics prompt after a poor output.

    Args:
        chat_model: Chat model handed through to ``_llm_call``.
        system_prompt: The specialist's own system prompt, reused unchanged.
        state: Shared workflow state; reads the previous output under
            *output_key*, the parsed QA data, and per-role QA feedback.
        role_key: Role key used to look up targeted QA feedback.
        output_key: State key (e.g. ``"creative_output"``) holding this
            specialist's previous output.
        label: Display label used in the trace log lines.
        trace: Append-only workflow log.

    Returns the new output text (may still be empty/poor; callers should check).
    """
    previous_output = state.get(output_key, "")  # type: ignore[literal-required]
    qa_data = state.get("qa_data") or {}
    # Prefer hard requirements; fall back to soft recommendations when QA
    # produced none.
    fixes = qa_data.get("required_fixes") or qa_data.get("recommended_fixes") or []
    role_fb = state["qa_role_feedback"].get(role_key)

    # Build an escalated prompt: original request + plan, then an explicit
    # "drop the persona" instruction, then every known fix, then the failed
    # output for reference.
    strict_content = (
        f"User request: {state['user_request']}\n\n"
        f"Planner instructions:\n{state['plan']}\n\n"
        "STRICT RETRY: Your previous response did not satisfy the task requirements.\n"
        "Drop persona theatrics entirely. Provide directly usable, task-relevant content.\n"
        "Address every required fix listed below. Return the deliverable content only โ€” "
        "no explanations of what you are about to do, no process notes.\n\n"
    )
    if fixes:
        strict_content += "Required fixes:\n" + "\n".join(f"- {f}" for f in fixes) + "\n\n"
    if role_fb:
        strict_content += "Role-specific fixes:\n" + _format_role_feedback_for_prompt(role_fb) + "\n\n"
    if previous_output:
        strict_content += f"Your previous (unsatisfactory) output:\n{previous_output}\n\n"
    strict_content += "Now produce the corrected, improved deliverable:"

    trace.append(f" โ†ฉ [{label}] strict retry invoked.")
    new_output = _llm_call(chat_model, system_prompt, strict_content)
    trace.append(f" โœ” [{label}] strict retry complete ({len(new_output)} chars).")
    return new_output
1229
+
1230
+
1231
  # --- Workflow step functions ---
1232
  # Each step receives the shared state and an append-only trace list,
1233
  # updates state in place, appends log lines, and returns updated state.
1234
 
1235
def _step_plan(
    chat_model,
    state: WorkflowState,
    trace: List[str],
    active_specialist_keys: Optional[List[str]] = None,
) -> WorkflowState:
    """Planner: analyse the task, produce a plan, decide which specialist to call.

    Uses a dynamically generated system prompt that lists ONLY the active
    specialist roles, preventing routing to inactive roles.

    Side effects: sets ``state['plan']`` and ``state['current_role']`` and
    appends planner log lines to *trace*.
    """
    trace.append("\nโ•”โ•โ• [PLANNER] Analysing task... โ•โ•โ•—")
    # Planner prompt is rebuilt each call so it always reflects the active set.
    planner_system = _build_planner_system(active_specialist_keys or [])
    content = f"User request: {state['user_request']}"
    if state["revision_count"] > 0:
        # Use structured QA data when available for clearer revision guidance
        qa_data = state.get("qa_data") or {}
        missing = qa_data.get("missing_requirements") or []
        # Prefer hard requirements; fall back to soft recommendations.
        fixes = qa_data.get("required_fixes") or qa_data.get("recommended_fixes") or []
        score = qa_data.get("score", "?")
        summary = qa_data.get("summary", "")
        if missing or fixes:
            qa_summary = ""
            if summary:
                qa_summary += f"QA summary (score {score}): {summary}\n\n"
            if missing:
                qa_summary += "Missing requirements:\n" + "\n".join(f"- {m}" for m in missing) + "\n\n"
            if fixes:
                qa_summary += "Required fixes:\n" + "\n".join(f"- {f}" for f in fixes)
        else:
            # No structured data parsed -- fall back to the raw QA report text.
            qa_summary = state["qa_report"]
        # Identify which role produced the best content (prefer routing to it)
        best_note = ""
        if state.get("best_artifact") and not _is_meta_summary(state["best_artifact"]):
            best_note = "\nNOTE: A good concrete artifact was found in a previous round. Route to the role most likely to improve on it."
        content += (
            f"\n\nThis is revision {state['revision_count']} of {MAX_REVISIONS}."
            f"\nQA concerns to address:\n{qa_summary}"
            f"{best_note}"
            "\nChoose the role most likely to produce the ACTUAL DELIVERABLE that addresses these concerns."
            "\nRemember to select ONLY from the active roles listed in your instructions."
        )
    plan_text = _llm_call(chat_model, planner_system, content)
    state["plan"] = plan_text
    # _decide_role guarantees the result is in active_specialist_keys (or a
    # deterministic fallback) when that list is provided.
    state["current_role"] = _decide_role(plan_text, active_keys=active_specialist_keys)
    role_label = AGENT_ROLES.get(state["current_role"], state["current_role"]).upper()
    trace.append(plan_text)
    trace.append(f"โ•šโ•โ• [PLANNER] โ†’ routing to: {role_label} โ•โ•โ•")
    return state
1284
 
1285
 
1286
+
1287
def _build_specialist_content(
    state: WorkflowState,
    role_key: str,
    previous_output_key: str,
    strict: bool = False,
) -> str:
    """Build the user-facing content string for a specialist LLM call.

    On the first pass this is just the request + plan.
    On revision passes it additionally includes:
    - Explicit revision-mode prompt contract
    - The specialist's own previous output
    - The previous finalizer output (so specialists can see the last merged answer)
    - Structured QA feedback: global required_fixes + role-specific fixes
    - A note if persona mode should be suppressed (persona roles only)
    When *strict* is True the tone is even more direct (used by retry logic).

    NOTE(review): only the revision-mode contract is gated on
    ``revision > 0``; the previous-output and QA sections run on every pass
    and rely on the corresponding state fields being empty on the first pass
    to be no-ops -- confirm that this is the intended structure.
    """
    revision = state["revision_count"]
    content = (
        f"User request: {state['user_request']}\n\n"
        f"Planner instructions:\n{state['plan']}"
    )

    if revision > 0:
        # -- Revision-mode prompt contract ----------------------------------
        contract = (
            f"\n\n{'โ”€'*60}\n"
            f"REVISION {revision} of {MAX_REVISIONS} โ€” REVISION MODE ACTIVE\n"
            "You are revising your previous contribution.\n"
            "Fix the specific issues identified by QA.\n"
            "Return ONLY the improved content for your role.\n"
            "Do NOT return commentary, summaries of opinions, or statements about "
            "what you plan to do.\n"
        )
        if role_key in _PERSONA_ROLE_KEYS and not strict:
            # Persona roles keep their voice but must deliver substance first.
            contract += (
                "PERSONA NOTE: Keep your characteristic voice for style only. "
                "In revision mode you MUST prioritise usefulness and requirement "
                "satisfaction over persona performance. "
                "Limit persona intro text. Provide concrete deliverable content first.\n"
            )
        if strict:
            contract += (
                "STRICT RETRY: Your previous response failed quality checks. "
                "Drop all persona theatrics. Produce directly usable content only.\n"
            )
        contract += "โ”€" * 60
        content += contract

    # -- Previous finalizer output ------------------------------------------
    # Skipped when the previous final answer was itself a meta-summary, so
    # specialists never anchor on low-quality merged output.
    prev_final = state.get("finalized_output", "")
    if prev_final and not _is_meta_summary(prev_final):
        content += f"\n\nPrevious finalizer output (the merged answer from last round):\n{short_text(prev_final, 800)}"

    # -- This role's own previous output ------------------------------------
    previous_output = state.get(previous_output_key, "")  # type: ignore[literal-required]
    if previous_output:
        content += f"\n\nYour previous output (revise and improve this):\n{previous_output}"

    # -- QA feedback ---------------------------------------------------------
    qa_data = state.get("qa_data") or {}
    required_fixes = qa_data.get("required_fixes") or qa_data.get("recommended_fixes") or []
    missing = qa_data.get("missing_requirements") or []
    role_feedback = state["qa_role_feedback"].get(role_key)

    if role_feedback:
        fb_text = _format_role_feedback_for_prompt(role_feedback)
        content += f"\n\nQA feedback specific to your role โ€” address each item:\n{fb_text}"
    if missing:
        content += "\n\nMissing requirements to add:\n" + "\n".join(f"- {m}" for m in missing)
    if required_fixes:
        content += "\n\nRequired fixes from QA:\n" + "\n".join(f"- {f}" for f in required_fixes)
    if not role_feedback and not required_fixes and not missing and state["qa_report"]:
        # Fallback: include score/summary from QA report
        score = qa_data.get("score", "")
        summary = qa_data.get("summary", "")
        if summary:
            content += f"\n\nQA summary (score {score}): {summary}"

    return content
1367
 
1368
 
 
1406
  trace: List[str],
1407
  all_outputs: Optional[List[Tuple[str, str]]] = None,
1408
  ) -> WorkflowState:
1409
+ """QA Tester: check the FINALIZED deliverable against the original request.
1410
+
1411
+ Evaluates ``state['draft_output']`` (the finalizer output), not individual
1412
+ specialist contributions. Individual specialist outputs are still passed
1413
+ so QA can supply targeted, per-role feedback stored in
1414
+ ``state['qa_role_feedback']`` for the next revision pass.
1415
 
1416
+ Logs exactly what text was evaluated, the score, and the missing requirements.
 
 
 
 
1417
  """
1418
+ trace.append("\nโ•”โ•โ• [QA TESTER] Reviewing finalized deliverable... โ•โ•โ•—")
1419
+ evaluated_text = state["draft_output"]
1420
+ trace.append(f" โ„น QA evaluating: {len(evaluated_text)} chars of finalized output")
1421
+
1422
  content = (
1423
  f"Original user request: {state['user_request']}\n\n"
1424
  f"Planner's plan and success criteria:\n{state['plan']}\n\n"
1425
  )
1426
  if all_outputs:
1427
+ # Include specialist contributions for role-specific feedback only
1428
+ content += "Individual specialist contributions (for per-role feedback only):\n\n"
1429
  for r_key, r_output in all_outputs:
1430
  r_label = AGENT_ROLES.get(r_key, r_key)
1431
+ content += f"=== {r_label} (role key: {r_key}) ===\n{short_text(r_output, 600)}\n\n"
1432
+ content += f"Finalized deliverable (evaluate this for PASS/FAIL):\n{evaluated_text}"
1433
+
 
1434
  text = _llm_call(chat_model, _QA_SYSTEM, content)
1435
  state["qa_report"] = text
1436
  state["qa_data"] = _parse_qa_json(text) or {}
1437
  state["qa_role_feedback"] = _parse_qa_role_feedback(text)
1438
  state["qa_passed"] = _qa_passed_check(text)
1439
+
1440
+ qa_data = state["qa_data"]
1441
+ score = qa_data.get("score", "?")
1442
+ artifact_present = qa_data.get("artifact_present", "?")
1443
+ missing = qa_data.get("missing_requirements") or []
1444
  result_label = "โœ… PASS" if state["qa_passed"] else "โŒ FAIL"
1445
+
1446
  trace.append(text)
1447
+ trace.append(
1448
+ f" โ„น QA result: {result_label} | score: {score} | artifact_present: {artifact_present}"
1449
+ )
1450
+ if missing:
1451
+ trace.append(" โ„น Missing requirements: " + "; ".join(str(m) for m in missing[:5]))
1452
  if state["qa_role_feedback"]:
1453
+ fb_parts = []
1454
+ for k, v in state["qa_role_feedback"].items():
1455
+ preview = (v[0][:60] + "โ€ฆ") if isinstance(v, list) and v else (str(v)[:60])
1456
+ fb_parts.append(f"{AGENT_ROLES.get(k, k)}: {preview}")
1457
+ trace.append(f" โ„น Role-specific feedback dispatched โ†’ {', '.join(fb_parts)}")
1458
  trace.append(f"โ•šโ•โ• [QA TESTER] Result: {result_label} โ•โ•โ•")
1459
  return state
1460
 
1461
 
1462
+ def _step_planner_review(
1463
+ chat_model,
1464
+ state: WorkflowState,
1465
+ trace: List[str],
1466
+ active_specialist_keys: Optional[List[str]] = None,
1467
+ ) -> WorkflowState:
1468
  """Planner: review QA feedback and either approve the result or request a revision."""
1469
  trace.append("\nโ•”โ•โ• [PLANNER] Reviewing QA feedback... โ•โ•โ•—")
1470
  # Format issues/fixes from structured QA data when available
1471
  qa_data = state.get("qa_data") or {}
1472
+ missing = qa_data.get("missing_requirements") or []
1473
+ fixes = qa_data.get("required_fixes") or qa_data.get("recommended_fixes") or []
1474
+ score = qa_data.get("score", "?")
1475
+ summary = qa_data.get("summary", "")
1476
+ if missing or fixes:
1477
+ qa_summary = ""
1478
+ if summary:
1479
+ qa_summary += f"QA summary (score {score}): {summary}\n\n"
1480
+ if missing:
1481
+ qa_summary += "Missing requirements:\n" + "\n".join(f"- {m}" for m in missing) + "\n\n"
1482
+ if fixes:
1483
+ qa_summary += "Required fixes:\n" + "\n".join(f"- {f}" for f in fixes)
1484
  else:
1485
  qa_summary = state["qa_report"]
1486
+
1487
+ # Use best artifact for approval check, not just the latest draft
1488
+ deliverable = state.get("best_artifact") or state["draft_output"]
1489
+
1490
  content = (
1491
  f"User request: {state['user_request']}\n\n"
1492
  f"Plan:\n{state['plan']}\n\n"
1493
+ f"Current deliverable:\n{deliverable}\n\n"
1494
  f"QA result: {'PASS' if state['qa_passed'] else 'FAIL'}\n"
1495
  f"QA details:\n{qa_summary}"
1496
  )
 
1522
  else:
1523
  # Revision requested but REVISED INSTRUCTIONS section missing โ€” keep current plan
1524
  trace.append(" โš  REVISED INSTRUCTIONS section missing; retrying with existing plan.")
1525
+ new_role = _decide_role(review, active_keys=active_specialist_keys)
1526
+ # If the new role was inactive (fallback triggered), log it clearly
1527
+ if active_specialist_keys and new_role not in active_specialist_keys:
1528
+ fallback = active_specialist_keys[0]
1529
+ trace.append(f" โš  Planner requested inactive role โ†’ falling back to {AGENT_ROLES.get(fallback, fallback).upper()}")
1530
+ new_role = fallback
1531
  state["current_role"] = new_role
1532
  trace.append(
1533
  f"โ•šโ•โ• [PLANNER] โ†’ ๐Ÿ”„ REVISE โ€” routing to {AGENT_ROLES.get(new_role, new_role).upper()} โ•โ•โ•"
 
1535
  return state
1536
 
1537
 
1538
+
1539
  def _step_research(chat_model, state: WorkflowState, trace: List[str]) -> WorkflowState:
1540
  """Research Analyst: gather information and produce a comprehensive research summary."""
1541
  trace.append("\nโ•”โ•โ• [RESEARCH ANALYST] Gathering information... โ•โ•โ•—")
 
1700
  ) -> WorkflowState:
1701
  """Finalizer: synthesise all specialist perspectives and produce the actual deliverable.
1702
 
1703
+ Strategy:
1704
+ 1. Score each specialist output for requirement coverage + actionability.
1705
+ 2. Pass the best-scoring outputs (rather than all) to the LLM, so the
1706
+ prompt is not dominated by irrelevant persona content.
1707
+ 3. Hard-guard against meta-summary output: if the LLM ignores instructions,
1708
+ fall back to the best-scoring specialist output directly.
1709
+ 4. Never let meta-summary content overwrite best_artifact.
1710
  """
1711
  trace.append("\nโ•”โ•โ• [FINALIZER] Producing the deliverable from all perspectives... โ•โ•โ•—")
1712
+
1713
+ # โ”€โ”€ Score and rank specialist outputs โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
1714
+ qa_data = state.get("qa_data") or {}
1715
+ requirements = (
1716
+ (qa_data.get("missing_requirements") or [])
1717
+ + (qa_data.get("required_fixes") or qa_data.get("recommended_fixes") or [])
1718
+ )
1719
+ scored: List[Tuple[float, str, str]] = [] # (score, role_key, output)
1720
  for r_key, r_output in all_outputs:
1721
+ if not r_output or not r_output.strip():
1722
+ continue
1723
+ s = _score_candidate(r_output, requirements, state["user_request"])
1724
+ scored.append((s, r_key, r_output))
1725
+ scored.sort(key=lambda x: x[0], reverse=True)
1726
+
1727
+ # Log candidate scores
1728
+ for s, r_key, _ in scored:
1729
+ trace.append(f" โ„น [FINALIZER] candidate score {s:.2f} โ†’ {AGENT_ROLES.get(r_key, r_key)}")
1730
+
1731
+ # Use the top-3 highest-scoring (or all if fewer) to limit LLM context
1732
+ top_outputs = scored[:3] if len(scored) > 3 else scored
1733
+ base_role = top_outputs[0][1] if top_outputs else (all_outputs[0][0] if all_outputs else "")
1734
+ base_output = top_outputs[0][2] if top_outputs else (all_outputs[0][1] if all_outputs else "")
1735
+
1736
+ trace.append(f" โ„น [FINALIZER] base source: {AGENT_ROLES.get(base_role, base_role)}")
1737
+
1738
+ perspectives = []
1739
+ for _, r_key, r_output in top_outputs:
1740
  r_label = AGENT_ROLES.get(r_key, r_key)
1741
  perspectives.append(f"=== {r_label} ===\n{r_output}")
1742
  combined = "\n\n".join(perspectives)
1743
+
1744
+ # Include the previous finalizer output if this is a revision
1745
+ prev_final_note = ""
1746
+ prev_final = state.get("finalized_output", "")
1747
+ if prev_final and not _is_meta_summary(prev_final) and state["revision_count"] > 0:
1748
+ prev_final_note = f"\nPrevious finalizer output (improve on this, do not summarise it):\n{short_text(prev_final, 800)}\n\n"
1749
+
1750
  content = (
1751
  f"User request: {state['user_request']}\n\n"
1752
+ f"{prev_final_note}"
1753
+ f"Top specialist perspectives to synthesise into the deliverable:\n\n{combined}"
1754
  )
1755
  text = _llm_call(chat_model, _FINALIZER_SYSTEM, content)
1756
 
1757
+ # โ”€โ”€ Meta-summary guard โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
1758
+ meta_guard_triggered = False
1759
  if not text or not text.strip():
1760
+ trace.append(" โš  [FINALIZER] returned empty output โ€” using best specialist output.")
1761
+ text = base_output
1762
+ meta_guard_triggered = True
1763
+ elif _is_meta_summary(text):
 
 
1764
  trace.append(
1765
+ " โš  [FINALIZER] meta-summary guard triggered (perspective-summary headers detected).\n"
1766
+ f" Substituting best specialist output from {AGENT_ROLES.get(base_role, base_role)}."
1767
  )
1768
+ text = base_output
1769
+ meta_guard_triggered = True
1770
+
1771
+ trace.append(f" โ„น [FINALIZER] meta-summary guard: {'triggered' if meta_guard_triggered else 'not triggered'}")
1772
 
1773
  state["finalized_output"] = text
1774
  state["draft_output"] = text
 
1777
  return state
1778
 
1779
 
1780
+
1781
  # Mapping from role key โ†’ step function, used by the orchestration loop
1782
  _SPECIALIST_STEPS = {
1783
  "creative": _step_creative,
 
1798
  "lawyer": _step_lawyer,
1799
  }
1800
 
1801
# Mapping from role key -> system prompt string (used for strict retries).
# NOTE(review): the keys here should mirror _SPECIALIST_STEPS so that every
# routable role can be strict-retried -- keep the two mappings in sync.
_SPECIALIST_SYSTEM_PROMPTS: Dict[str, str] = {
    "creative": _CREATIVE_SYSTEM,
    "technical": _TECHNICAL_SYSTEM,
    "research": _RESEARCH_SYSTEM,
    "security": _SECURITY_SYSTEM,
    "data_analyst": _DATA_ANALYST_SYSTEM,
    "mad_professor": _MAD_PROFESSOR_SYSTEM,
    "accountant": _ACCOUNTANT_SYSTEM,
    "artist": _ARTIST_SYSTEM,
    "lazy_slacker": _LAZY_SLACKER_SYSTEM,
    "black_metal_fundamentalist": _BLACK_METAL_FUNDAMENTALIST_SYSTEM,
    "labour_union_rep": _LABOUR_UNION_REP_SYSTEM,
    "ux_designer": _UX_DESIGNER_SYSTEM,
    "doris": _DORIS_SYSTEM,
    "chairman_of_board": _CHAIRMAN_SYSTEM,
    "maga_appointee": _MAGA_APPOINTEE_SYSTEM,
    "lawyer": _LAWYER_SYSTEM,
}
1820
+
1821
+
1822
def _log_revision_tracking(
    trace: List[str],
    role_key: str,
    state: "WorkflowState",
    prev_outputs: Dict[str, str],
    is_primary: bool = False,
) -> None:
    """Append a revision-tracking log line for a specialist role.

    Logs whether:
    - Previous output was passed in
    - QA feedback was passed in
    - Role-specific feedback was passed in
    - Output changed materially from the previous round
    - Output is relevant to the request

    Args:
        trace: Append-only workflow log.
        role_key: The specialist role being tracked.
        state: Shared workflow state (reads the role's current output,
            QA report/data, and role feedback).
        prev_outputs: Snapshot of each role's output taken before this round.
        is_primary: True when this role was the planner-chosen lead.
    """
    revision = state["revision_count"]
    if revision == 0:
        return  # nothing to track on the first pass

    output_key = f"{role_key}_output"
    current_out = state.get(output_key, "")  # type: ignore[literal-required]
    prev_out = prev_outputs.get(role_key, "")

    had_prev = bool(prev_out and prev_out.strip())
    had_qa = bool(state.get("qa_report"))
    had_role_fb = bool(state["qa_role_feedback"].get(role_key))
    # "changed" only counts when there was a previous output to change from.
    changed = had_prev and current_out != prev_out
    relevant = _is_relevant_to_request(current_out, state["user_request"])
    qa_data = state.get("qa_data") or {}
    requirements = (
        (qa_data.get("missing_requirements") or [])
        + (qa_data.get("required_fixes") or qa_data.get("recommended_fixes") or [])
    )
    coverage = _score_candidate(current_out, requirements, state["user_request"])

    label = AGENT_ROLES.get(role_key, role_key)
    primary_tag = " [PRIMARY]" if is_primary else ""
    trace.append(
        f" ๐Ÿ“‹ [{label}{primary_tag}] revision tracking โ€” "
        f"prev_output: {'yes' if had_prev else 'no'} | "
        f"qa_feedback: {'yes' if had_qa else 'no'} | "
        f"role_feedback: {'yes' if had_role_fb else 'no'} | "
        f"output_changed: {'yes' if changed else 'no'} | "
        f"relevant: {'yes' if relevant else 'no'} | "
        f"coverage_score: {coverage:.2f}"
    )
1869
+
1870
 
 
1871
  # These wrap the step functions as @tool so the Planner (or any LangChain agent)
1872
  # can invoke specialists in a standard tool-use pattern.
1873
 
 
1884
  "chairman_of_board_output": "", "maga_appointee_output": "", "lawyer_output": "",
1885
  "finalized_output": "",
1886
  "draft_output": "", "qa_report": "", "qa_data": {}, "qa_role_feedback": {}, "qa_passed": False,
1887
+ "revision_count": 0, "best_artifact": "", "best_artifact_score": 0.0, "final_answer": "",
1888
  }
1889
 
1890
 
 
2131
  "qa_passed": False,
2132
  "revision_count": 0,
2133
  "best_artifact": "",
2134
+ "best_artifact_score": 0.0,
2135
  "final_answer": "",
2136
  }
2137
 
 
2145
 
2146
  try:
2147
  if planner_active:
2148
+ # Step 1: Planner creates the initial plan using only active specialist roles
2149
+ state = _step_plan(chat_model, state, trace, active_specialist_keys)
2150
  else:
2151
  # No planner: auto-select first active specialist
2152
  state["current_role"] = active_specialist_keys[0]
 
2161
  # then run every other active specialist so all voices are heard.
2162
  primary_role = state["current_role"]
2163
  if primary_role not in active_specialist_keys:
2164
+ # Safe fallback: requested role not in active set โ€” log clearly
2165
  fallback_role = active_specialist_keys[0]
2166
  trace.append(
2167
+ f" โš  FALLBACK: Role '{AGENT_ROLES.get(primary_role, primary_role)}' is not active "
2168
+ f"โ€” falling back to {AGENT_ROLES.get(fallback_role, fallback_role).upper()}"
2169
  )
2170
  primary_role = fallback_role
2171
  state["current_role"] = primary_role
2172
 
2173
+ # โ”€โ”€ Track previous outputs for revision logging โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
2174
+ prev_outputs_snapshot: Dict[str, str] = {
2175
+ k: state.get(f"{k}_output", "") # type: ignore[literal-required]
2176
+ for k in active_specialist_keys
2177
+ }
2178
+
2179
  # Run the primary (planner-chosen) specialist
2180
  primary_fn = _SPECIALIST_STEPS.get(primary_role, _step_technical)
2181
  state = primary_fn(chat_model, state, trace)
2182
  primary_output = state["draft_output"]
2183
 
2184
+ # Log revision tracking for primary specialist
2185
+ _log_revision_tracking(
2186
+ trace, primary_role, state, prev_outputs_snapshot, is_primary=True
2187
+ )
2188
+
2189
  # Run all other active specialists and collect their perspectives
2190
  all_outputs: List[Tuple[str, str]] = [(primary_role, primary_output)]
2191
  for specialist_role in active_specialist_keys:
 
2193
  continue # already ran above
2194
  step_fn = _SPECIALIST_STEPS[specialist_role]
2195
  state = step_fn(chat_model, state, trace)
2196
+ specialist_output = state["draft_output"]
2197
+
2198
+ # Log revision tracking for this specialist
2199
+ _log_revision_tracking(
2200
+ trace, specialist_role, state, prev_outputs_snapshot, is_primary=False
2201
+ )
2202
+
2203
+ # โ”€โ”€ Strict retry for poor outputs (revision rounds only) โ”€โ”€โ”€
2204
+ if state["revision_count"] > 0:
2205
+ output_key = f"{specialist_role}_output"
2206
+ current_output = state.get(output_key, "") # type: ignore[literal-required]
2207
+ needs_retry = (
2208
+ _is_empty_output(current_output)
2209
+ or not _is_relevant_to_request(current_output, message)
2210
+ )
2211
+ if needs_retry:
2212
+ sys_prompt = _SPECIALIST_SYSTEM_PROMPTS.get(specialist_role)
2213
+ if sys_prompt:
2214
+ retry_text = _strict_retry_specialist(
2215
+ chat_model, sys_prompt, state,
2216
+ specialist_role, output_key, specialist_role.upper(), trace
2217
+ )
2218
+ if retry_text and retry_text.strip():
2219
+ state[output_key] = retry_text # type: ignore[literal-required]
2220
+ state["draft_output"] = retry_text
2221
+ specialist_output = retry_text
2222
+ trace.append(
2223
+ f" โ„น [{AGENT_ROLES.get(specialist_role, specialist_role)}] "
2224
+ f"strict retry: output changed "
2225
+ f"({'relevant' if _is_relevant_to_request(retry_text, message) else 'still poor'})"
2226
+ )
2227
+
2228
  all_outputs.append((specialist_role, state["draft_output"]))
2229
 
2230
  # Finalize all perspectives into the actual deliverable.
 
2236
  state = _step_finalize(chat_model, state, trace, all_outputs)
2237
  else:
2238
  state["draft_output"] = primary_output
2239
+ state["finalized_output"] = primary_output
2240
 
2241
+ # Update best-candidate tracking with scoring: only update if better than current best
2242
  current_draft = state["draft_output"]
2243
+ if current_draft and current_draft.strip():
2244
+ qa_data = state.get("qa_data") or {}
2245
+ requirements = (
2246
+ (qa_data.get("missing_requirements") or [])
2247
+ + (qa_data.get("required_fixes") or qa_data.get("recommended_fixes") or [])
2248
+ )
2249
+ current_score = _score_candidate(current_draft, requirements, message)
2250
+ best_score = state.get("best_artifact_score", 0.0)
2251
+
2252
+ if not _is_meta_summary(current_draft):
2253
+ if current_score >= best_score or not state.get("best_artifact"):
2254
+ prev_best = state.get("best_artifact", "")
2255
+ state["best_artifact"] = current_draft
2256
+ state["best_artifact_score"] = current_score
2257
+ updated = current_draft != prev_best
2258
+ trace.append(
2259
+ f" โœ” Best artifact {'updated' if updated else 'confirmed'} "
2260
+ f"(rev {state['revision_count']}, score {current_score:.2f}): "
2261
+ f"{len(current_draft)} chars"
2262
+ )
2263
+ else:
2264
+ trace.append(
2265
+ f" โ„น Current draft score {current_score:.2f} < best {best_score:.2f} "
2266
+ f"โ€” keeping existing best artifact"
2267
+ )
2268
+ elif not state.get("best_artifact"):
2269
+ # Safety net: even a meta-summary beats nothing
2270
+ state["best_artifact"] = current_draft
2271
+ state["best_artifact_score"] = 0.0
2272
 
2273
  # Step 3: QA reviews the finalized draft (if enabled)
2274
  if qa_active:
 
2282
  # Step 4: Planner reviews QA and either approves or schedules a revision
2283
  if planner_active and qa_active:
2284
  prev_plan = state["plan"]
2285
+ state = _step_planner_review(
2286
+ chat_model, state, trace, active_specialist_keys
2287
+ )
2288
 
2289
  # Exit if the Planner approved the result
2290
  if state["final_answer"]:
 
2311
  state["final_answer"] = best
2312
  trace.append(
2313
  f"\nโ•โ•โ• MAX REVISIONS REACHED ({MAX_REVISIONS}) โ•โ•โ•\n"
2314
+ f"Returning best artifact (score {state.get('best_artifact_score', 0.0):.2f}, "
2315
+ f"{len(best)} chars)."
2316
  )
2317
  break
2318
  else:
 
2334
  return state["final_answer"], "\n".join(trace)
2335
 
2336
 
2337
+
2338
  # ============================================================
2339
  # Agent builder
2340
  # ============================================================
 
2814
  return final_answer, trace
2815
 
2816
 
2817
def run_demo_three_course_menu(
    model_id: str = DEFAULT_MODEL_ID,
    active_roles: Optional[List[str]] = None,
) -> Tuple[str, str]:
    """Regression demo A: three-course menu request with structured QA revision.

    Runs the multi-role workflow on a deliberately detailed menu request and
    then performs lightweight regression checks on the result:

    - The final answer is an actual structured menu, not a meta-summary.
    - All three courses (appetizer, main, dessert) are present.
    - Role-specific QA feedback was dispatched (visible in the trace).
    - At least one revision round occurred (QA initially fails the
      underspecified output).

    Args:
        model_id: Chat model identifier forwarded to the workflow.
        active_roles: Role display names to enable. Defaults to planner,
            creative, technical, mad professor, and QA tester.

    Returns:
        Tuple of ``(final_answer, workflow_trace)``.

    Example::

        answer, trace = run_demo_three_course_menu()
        # Verify the final answer is a structured menu:
        assert "appetizer" in answer.lower() or "starter" in answer.lower()
        assert not any(
            h in answer.upper()
            for h in ("PERSPECTIVES SUMMARY", "COMMON GROUND", "TENSIONS AND TRADE-OFFS")
        )
    """
    task = (
        "Discuss best options for a three-course menu for tonight. "
        "Include at least two options per course (appetizer, main, dessert). "
        "For each option provide: name, key ingredients, prep/cook time, skill level, "
        "suggested pairings, and whether it is suitable for vegetarians or vegans. "
        "Also include total estimated time and a note on dietary variations."
    )
    if active_roles is None:
        active_roles = [
            AGENT_ROLES["planner"],
            AGENT_ROLES["creative"],
            AGENT_ROLES["technical"],
            AGENT_ROLES["mad_professor"],
            AGENT_ROLES["qa_tester"],
        ]
    # Plain strings here: no placeholders, so no f-prefix (fixes ruff F541).
    print("\n[DEMO-A] Three-course menu regression demo")
    print(f"[DEMO-A] Task: {task!r}")
    print(f"[DEMO-A] Active roles: {active_roles}")
    final_answer, trace = run_multi_role_workflow(task, model_id, active_roles)
    print("\n[DEMO-A] === FINAL ANSWER ===")
    print(final_answer or "(empty — check trace for details)")
    print("\n[DEMO-A] === WORKFLOW TRACE ===")
    print(trace)

    # ── Assertions for regression verification ──────────────────────────────
    failures = []
    # Guard against an empty/None final answer so the checks below never crash
    # (the print above already anticipates an empty result).
    answer_lower = (final_answer or "").lower()
    # Check it's not a pure meta-summary
    if _is_meta_summary(final_answer):
        failures.append("FAIL: Final answer looks like a meta-summary, not a menu.")
    # Check for course structure
    if not any(k in answer_lower for k in ("appetizer", "starter", "first course")):
        failures.append("FAIL: No appetizer/starter section found in final answer.")
    if not any(k in answer_lower for k in ("main", "entrée", "entree", "second course")):
        failures.append("FAIL: No main course section found in final answer.")
    if not any(k in answer_lower for k in ("dessert", "sweet", "third course")):
        failures.append("FAIL: No dessert section found in final answer.")
    # Check QA role feedback was dispatched (visible in trace)
    if "Role-specific feedback dispatched" not in trace and "role_feedback" not in trace:
        failures.append("WARN: No role-specific QA feedback detected in trace.")
    # Check at least one revision occurred
    if "REVISION 1" not in trace:
        failures.append("WARN: No revision round detected — QA may have passed immediately.")

    if failures:
        print("\n[DEMO-A] ⚠ Regression warnings:")
        for f in failures:
            print(f"  {f}")
    else:
        print("\n[DEMO-A] ✅ All regression checks passed.")

    return final_answer, trace
2891
+
2892
+
2893
def run_demo_inactive_role_fallback(
    model_id: str = DEFAULT_MODEL_ID,
) -> Tuple[str, str]:
    """Regression demo B: planner asks for an inactive role — clean fallback must occur.

    Only the Technical Expert specialist is enabled, so when the planner
    (naturally) routes a writing task to the Creative Expert the workflow
    must fall back cleanly instead of crashing or mis-routing. Checks:

    - A fallback is logged in the trace (no crash, no broken routing).
    - The final result is still a useful deliverable.
    - No ``[ERROR]`` marker appears in the trace.

    Args:
        model_id: Chat model identifier forwarded to the workflow.

    Returns:
        Tuple of ``(final_answer, workflow_trace)``.

    Example::

        answer, trace = run_demo_inactive_role_fallback()
        assert "FALLBACK" in trace or "not active" in trace
        assert answer and len(answer.strip()) > 20
    """
    task = "Write a short, engaging product description for a new AI-powered coffee maker."
    # Deliberately enable only a small set that does NOT include Creative
    # Expert. This forces a fallback when the planner requests Creative
    # Expert for a writing task.
    active_roles = [
        AGENT_ROLES["planner"],
        AGENT_ROLES["technical"],
        AGENT_ROLES["qa_tester"],
    ]
    # Plain strings here: no placeholders, so no f-prefix (fixes ruff F541).
    print("\n[DEMO-B] Inactive role fallback regression demo")
    print(f"[DEMO-B] Task: {task!r}")
    print(f"[DEMO-B] Active roles: {active_roles}")
    print("[DEMO-B] NOTE: 'Creative Expert' is intentionally disabled to test fallback routing.")
    final_answer, trace = run_multi_role_workflow(task, model_id, active_roles)
    print("\n[DEMO-B] === FINAL ANSWER ===")
    print(final_answer or "(empty — check trace for details)")
    print("\n[DEMO-B] === WORKFLOW TRACE ===")
    print(trace)

    # ── Assertions for regression verification ──────────────────────────────
    failures = []
    if not final_answer or len(final_answer.strip()) < 20:
        failures.append("FAIL: Final answer is empty or too short.")
    # Fallback should be logged when the planner requests an inactive role.
    # NOTE(review): the first two markers are matched case-sensitively on
    # purpose — they mirror the exact strings the router emits.
    fallback_logged = (
        "FALLBACK" in trace
        or "not active" in trace
        or "falling back" in trace.lower()
    )
    if not fallback_logged:
        failures.append(
            "WARN: No fallback log detected — the planner may have correctly chosen "
            "an active role directly, or the fallback path was not triggered."
        )
    # Should NOT crash
    if "[ERROR]" in trace:
        failures.append("FAIL: Workflow error detected in trace.")

    if failures:
        print("\n[DEMO-B] ⚠ Regression warnings:")
        for f in failures:
            print(f"  {f}")
    else:
        print("\n[DEMO-B] ✅ All regression checks passed.")

    return final_answer, trace
2950
+
2951
+
2952
+
2953
  if __name__ == "__main__":
2954
  port = int(os.environ.get("PORT", 7860))
2955
  demo.launch(