copilot-swe-agent[bot] CatoG commited on
Commit ยท
cdc0ab9
1
Parent(s): 06ff8e3
Implement feedback-driven revision improvements to multi-role agent workflow
Browse files
app.py
CHANGED
|
@@ -559,45 +559,78 @@ class WorkflowState(TypedDict):
|
|
| 559 |
draft_output: str # latest specialist/finalized output forwarded to QA
|
| 560 |
qa_report: str
|
| 561 |
qa_data: Dict[str, Any] # parsed QA JSON data for structured access
|
| 562 |
-
qa_role_feedback: Dict[str,
|
| 563 |
qa_passed: bool
|
| 564 |
revision_count: int
|
| 565 |
best_artifact: str # best non-empty real deliverable seen so far
|
|
|
|
| 566 |
final_answer: str
|
| 567 |
|
| 568 |
|
| 569 |
# --- Role system prompts ---
|
| 570 |
|
| 571 |
-
|
| 572 |
-
|
| 573 |
-
"
|
| 574 |
-
"
|
| 575 |
-
"
|
| 576 |
-
"
|
| 577 |
-
"
|
| 578 |
-
"
|
| 579 |
-
"
|
| 580 |
-
"
|
| 581 |
-
"
|
| 582 |
-
"
|
| 583 |
-
"
|
| 584 |
-
"
|
| 585 |
-
"
|
| 586 |
-
"
|
| 587 |
-
"
|
| 588 |
-
"
|
| 589 |
-
|
| 590 |
-
|
| 591 |
-
|
| 592 |
-
|
| 593 |
-
"
|
| 594 |
-
|
| 595 |
-
|
| 596 |
-
|
| 597 |
-
"
|
| 598 |
-
|
| 599 |
-
|
| 600 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 601 |
|
| 602 |
_CREATIVE_SYSTEM = (
|
| 603 |
"You are the Creative Expert in a multi-role AI workflow.\n"
|
|
@@ -621,20 +654,30 @@ _TECHNICAL_SYSTEM = (
|
|
| 621 |
|
| 622 |
_QA_SYSTEM = (
|
| 623 |
"You are the QA Tester in a multi-role AI workflow.\n"
|
| 624 |
-
"
|
| 625 |
-
"When individual specialist contributions are provided, give targeted feedback
|
| 626 |
-
"so
|
| 627 |
"Respond with ONLY a valid JSON object in this exact format (no text before or after the JSON):\n"
|
| 628 |
"{\n"
|
| 629 |
-
' "
|
| 630 |
-
' "
|
| 631 |
-
' "
|
| 632 |
-
' "
|
| 633 |
-
' "
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 634 |
"}\n\n"
|
| 635 |
-
'Set "
|
| 636 |
-
'Set "
|
| 637 |
-
'
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 638 |
)
|
| 639 |
|
| 640 |
_PLANNER_REVIEW_SYSTEM = (
|
|
@@ -862,12 +905,16 @@ def _llm_call(chat_model, system_prompt: str, user_content: str) -> str:
|
|
| 862 |
return content_to_text(response.content)
|
| 863 |
|
| 864 |
|
| 865 |
-
def _decide_role(text: str) -> str:
|
| 866 |
"""Parse which specialist role the Planner wants to invoke.
|
| 867 |
|
| 868 |
Normalises the 'ROLE TO CALL:' line (strips surrounding whitespace and
|
| 869 |
collapses internal spaces) before matching, then falls back to a
|
| 870 |
word-boundary search. Returns 'technical' when no clear signal is found.
|
|
|
|
|
|
|
|
|
|
|
|
|
| 871 |
"""
|
| 872 |
# Normalise: collapse runs of whitespace so "ROLE TO CALL : Creative Expert" still matches
|
| 873 |
normalised = re.sub(r"\s+", " ", text).strip()
|
|
@@ -891,43 +938,45 @@ def _decide_role(text: str) -> str:
|
|
| 891 |
("Lawyer", "lawyer"),
|
| 892 |
("Doris", "doris"),
|
| 893 |
]
|
|
|
|
| 894 |
for label, key in _LABEL_TO_KEY_ORDERED:
|
| 895 |
# Match "ROLE TO CALL: <label>" with optional surrounding whitespace
|
| 896 |
if re.search(r"ROLE\s+TO\s+CALL\s*:\s*" + re.escape(label), normalised, re.IGNORECASE):
|
| 897 |
-
|
| 898 |
-
|
| 899 |
-
|
| 900 |
-
if
|
| 901 |
-
|
| 902 |
-
|
| 903 |
-
|
| 904 |
-
|
| 905 |
-
|
| 906 |
-
|
| 907 |
-
|
| 908 |
-
|
| 909 |
-
|
| 910 |
-
|
| 911 |
-
|
| 912 |
-
|
| 913 |
-
|
| 914 |
-
|
| 915 |
-
|
| 916 |
-
|
| 917 |
-
|
| 918 |
-
|
| 919 |
-
|
| 920 |
-
|
| 921 |
-
|
| 922 |
-
|
| 923 |
-
|
| 924 |
-
|
| 925 |
-
|
| 926 |
-
|
| 927 |
-
|
| 928 |
-
|
| 929 |
-
|
| 930 |
-
|
|
|
|
| 931 |
|
| 932 |
|
| 933 |
def _parse_qa_json(qa_text: str) -> Optional[Dict[str, Any]]:
|
|
@@ -956,13 +1005,16 @@ def _parse_qa_json(qa_text: str) -> Optional[Dict[str, Any]]:
|
|
| 956 |
def _qa_passed_check(qa_text: str) -> bool:
|
| 957 |
"""Return True only if the QA report contains an explicit PASS result.
|
| 958 |
|
| 959 |
-
|
| 960 |
-
|
| 961 |
-
|
| 962 |
-
from words like 'bypass' or 'password'.
|
| 963 |
"""
|
| 964 |
data = _parse_qa_json(qa_text)
|
| 965 |
if data is not None:
|
|
|
|
|
|
|
|
|
|
|
|
|
| 966 |
result = str(data.get("result", "")).strip().upper()
|
| 967 |
if result == "PASS":
|
| 968 |
return True
|
|
@@ -977,23 +1029,34 @@ def _qa_passed_check(qa_text: str) -> bool:
|
|
| 977 |
return False
|
| 978 |
|
| 979 |
|
| 980 |
-
def _parse_qa_role_feedback(qa_text: str) -> Dict[str,
|
| 981 |
"""Extract per-role targeted feedback from a QA report.
|
| 982 |
|
| 983 |
-
|
| 984 |
-
|
| 985 |
-
|
| 986 |
-
|
|
|
|
|
|
|
| 987 |
"""
|
| 988 |
-
feedback: Dict[str,
|
| 989 |
|
| 990 |
-
# JSON path
|
| 991 |
data = _parse_qa_json(qa_text)
|
| 992 |
if data is not None and isinstance(data.get("role_feedback"), dict):
|
| 993 |
-
for
|
| 994 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 995 |
if role_key and fb:
|
| 996 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 997 |
return feedback
|
| 998 |
|
| 999 |
# Legacy text fallback
|
|
@@ -1013,7 +1076,7 @@ def _parse_qa_role_feedback(qa_text: str) -> Dict[str, str]:
|
|
| 1013 |
role_fb = role_fb.strip()
|
| 1014 |
role_key = _ROLE_LABEL_TO_KEY.get(role_label)
|
| 1015 |
if role_key and role_fb:
|
| 1016 |
-
feedback[role_key] = role_fb
|
| 1017 |
|
| 1018 |
return feedback
|
| 1019 |
|
|
@@ -1040,59 +1103,266 @@ def _is_meta_summary(text: str) -> bool:
|
|
| 1040 |
return count >= 2
|
| 1041 |
|
| 1042 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1043 |
# --- Workflow step functions ---
|
| 1044 |
# Each step receives the shared state and an append-only trace list,
|
| 1045 |
# updates state in place, appends log lines, and returns updated state.
|
| 1046 |
|
| 1047 |
-
def _step_plan(
|
| 1048 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1049 |
trace.append("\nโโโ [PLANNER] Analysing task... โโโ")
|
|
|
|
| 1050 |
content = f"User request: {state['user_request']}"
|
| 1051 |
if state["revision_count"] > 0:
|
| 1052 |
# Use structured QA data when available for clearer revision guidance
|
| 1053 |
qa_data = state.get("qa_data") or {}
|
| 1054 |
-
|
| 1055 |
-
fixes = qa_data.get("recommended_fixes") or []
|
| 1056 |
-
|
| 1057 |
-
|
| 1058 |
-
|
| 1059 |
-
|
| 1060 |
-
|
| 1061 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1062 |
content += (
|
| 1063 |
f"\n\nThis is revision {state['revision_count']} of {MAX_REVISIONS}."
|
| 1064 |
f"\nQA concerns to address:\n{qa_summary}"
|
| 1065 |
-
"
|
|
|
|
|
|
|
| 1066 |
)
|
| 1067 |
-
plan_text = _llm_call(chat_model,
|
| 1068 |
state["plan"] = plan_text
|
| 1069 |
-
state["current_role"] = _decide_role(plan_text)
|
|
|
|
| 1070 |
trace.append(plan_text)
|
| 1071 |
-
trace.append(f"โโโ [PLANNER] โ routing to: {
|
| 1072 |
return state
|
| 1073 |
|
| 1074 |
|
| 1075 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1076 |
"""Build the user-facing content string for a specialist LLM call.
|
| 1077 |
|
| 1078 |
On the first pass this is just the request + plan.
|
| 1079 |
On revision passes it additionally includes:
|
| 1080 |
-
-
|
| 1081 |
-
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1082 |
"""
|
|
|
|
| 1083 |
content = (
|
| 1084 |
f"User request: {state['user_request']}\n\n"
|
| 1085 |
f"Planner instructions:\n{state['plan']}"
|
| 1086 |
)
|
| 1087 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1088 |
previous_output = state.get(previous_output_key, "") # type: ignore[literal-required]
|
| 1089 |
if previous_output:
|
| 1090 |
-
content += f"\n\nYour previous output (
|
| 1091 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1092 |
if role_feedback:
|
| 1093 |
-
|
| 1094 |
-
|
| 1095 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1096 |
return content
|
| 1097 |
|
| 1098 |
|
|
@@ -1136,63 +1406,91 @@ def _step_qa(
|
|
| 1136 |
trace: List[str],
|
| 1137 |
all_outputs: Optional[List[Tuple[str, str]]] = None,
|
| 1138 |
) -> WorkflowState:
|
| 1139 |
-
"""QA Tester: check the
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1140 |
|
| 1141 |
-
|
| 1142 |
-
iteration), each specialist's individual contribution is included in the
|
| 1143 |
-
review prompt so the QA can supply targeted, per-role feedback. This
|
| 1144 |
-
feedback is stored in ``state['qa_role_feedback']`` and consumed by the
|
| 1145 |
-
specialist step functions on the next revision pass.
|
| 1146 |
"""
|
| 1147 |
-
trace.append("\nโโโ [QA TESTER] Reviewing
|
|
|
|
|
|
|
|
|
|
| 1148 |
content = (
|
| 1149 |
f"Original user request: {state['user_request']}\n\n"
|
| 1150 |
f"Planner's plan and success criteria:\n{state['plan']}\n\n"
|
| 1151 |
)
|
| 1152 |
if all_outputs:
|
| 1153 |
-
# Include
|
| 1154 |
-
content += "Individual specialist contributions:\n\n"
|
| 1155 |
for r_key, r_output in all_outputs:
|
| 1156 |
r_label = AGENT_ROLES.get(r_key, r_key)
|
| 1157 |
-
content += f"=== {r_label} ===\n{r_output}\n\n"
|
| 1158 |
-
|
| 1159 |
-
|
| 1160 |
-
content += f"Specialist output to review:\n{state['draft_output']}"
|
| 1161 |
text = _llm_call(chat_model, _QA_SYSTEM, content)
|
| 1162 |
state["qa_report"] = text
|
| 1163 |
state["qa_data"] = _parse_qa_json(text) or {}
|
| 1164 |
state["qa_role_feedback"] = _parse_qa_role_feedback(text)
|
| 1165 |
state["qa_passed"] = _qa_passed_check(text)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1166 |
result_label = "โ
PASS" if state["qa_passed"] else "โ FAIL"
|
|
|
|
| 1167 |
trace.append(text)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1168 |
if state["qa_role_feedback"]:
|
| 1169 |
-
|
| 1170 |
-
|
| 1171 |
-
|
| 1172 |
-
|
| 1173 |
-
trace.append(f" โน Role-specific feedback dispatched โ {
|
| 1174 |
trace.append(f"โโโ [QA TESTER] Result: {result_label} โโโ")
|
| 1175 |
return state
|
| 1176 |
|
| 1177 |
|
| 1178 |
-
def _step_planner_review(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1179 |
"""Planner: review QA feedback and either approve the result or request a revision."""
|
| 1180 |
trace.append("\nโโโ [PLANNER] Reviewing QA feedback... โโโ")
|
| 1181 |
# Format issues/fixes from structured QA data when available
|
| 1182 |
qa_data = state.get("qa_data") or {}
|
| 1183 |
-
|
| 1184 |
-
fixes = qa_data.get("recommended_fixes") or []
|
| 1185 |
-
|
| 1186 |
-
|
| 1187 |
-
|
| 1188 |
-
|
| 1189 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1190 |
else:
|
| 1191 |
qa_summary = state["qa_report"]
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1192 |
content = (
|
| 1193 |
f"User request: {state['user_request']}\n\n"
|
| 1194 |
f"Plan:\n{state['plan']}\n\n"
|
| 1195 |
-
f"Current deliverable:\n{
|
| 1196 |
f"QA result: {'PASS' if state['qa_passed'] else 'FAIL'}\n"
|
| 1197 |
f"QA details:\n{qa_summary}"
|
| 1198 |
)
|
|
@@ -1224,7 +1522,12 @@ def _step_planner_review(chat_model, state: WorkflowState, trace: List[str]) ->
|
|
| 1224 |
else:
|
| 1225 |
# Revision requested but REVISED INSTRUCTIONS section missing โ keep current plan
|
| 1226 |
trace.append(" โ REVISED INSTRUCTIONS section missing; retrying with existing plan.")
|
| 1227 |
-
new_role = _decide_role(review)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1228 |
state["current_role"] = new_role
|
| 1229 |
trace.append(
|
| 1230 |
f"โโโ [PLANNER] โ ๐ REVISE โ routing to {AGENT_ROLES.get(new_role, new_role).upper()} โโโ"
|
|
@@ -1232,6 +1535,7 @@ def _step_planner_review(chat_model, state: WorkflowState, trace: List[str]) ->
|
|
| 1232 |
return state
|
| 1233 |
|
| 1234 |
|
|
|
|
| 1235 |
def _step_research(chat_model, state: WorkflowState, trace: List[str]) -> WorkflowState:
|
| 1236 |
"""Research Analyst: gather information and produce a comprehensive research summary."""
|
| 1237 |
trace.append("\nโโโ [RESEARCH ANALYST] Gathering information... โโโ")
|
|
@@ -1396,35 +1700,75 @@ def _step_finalize(
|
|
| 1396 |
) -> WorkflowState:
|
| 1397 |
"""Finalizer: synthesise all specialist perspectives and produce the actual deliverable.
|
| 1398 |
|
| 1399 |
-
|
| 1400 |
-
|
| 1401 |
-
|
| 1402 |
-
|
|
|
|
|
|
|
|
|
|
| 1403 |
"""
|
| 1404 |
trace.append("\nโโโ [FINALIZER] Producing the deliverable from all perspectives... โโโ")
|
| 1405 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1406 |
for r_key, r_output in all_outputs:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1407 |
r_label = AGENT_ROLES.get(r_key, r_key)
|
| 1408 |
perspectives.append(f"=== {r_label} ===\n{r_output}")
|
| 1409 |
combined = "\n\n".join(perspectives)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1410 |
content = (
|
| 1411 |
f"User request: {state['user_request']}\n\n"
|
| 1412 |
-
f"
|
|
|
|
| 1413 |
)
|
| 1414 |
text = _llm_call(chat_model, _FINALIZER_SYSTEM, content)
|
| 1415 |
|
|
|
|
|
|
|
| 1416 |
if not text or not text.strip():
|
| 1417 |
-
trace.append(" โ [FINALIZER] returned empty output โ using
|
| 1418 |
-
text =
|
| 1419 |
-
|
| 1420 |
-
|
| 1421 |
-
# Model produced a meta-summary instead of the deliverable โ use primary output
|
| 1422 |
-
primary_output = all_outputs[0][1] if all_outputs else ""
|
| 1423 |
trace.append(
|
| 1424 |
-
" โ [FINALIZER]
|
| 1425 |
-
"
|
| 1426 |
)
|
| 1427 |
-
text =
|
|
|
|
|
|
|
|
|
|
| 1428 |
|
| 1429 |
state["finalized_output"] = text
|
| 1430 |
state["draft_output"] = text
|
|
@@ -1433,6 +1777,7 @@ def _step_finalize(
|
|
| 1433 |
return state
|
| 1434 |
|
| 1435 |
|
|
|
|
| 1436 |
# Mapping from role key โ step function, used by the orchestration loop
|
| 1437 |
_SPECIALIST_STEPS = {
|
| 1438 |
"creative": _step_creative,
|
|
@@ -1453,8 +1798,76 @@ _SPECIALIST_STEPS = {
|
|
| 1453 |
"lawyer": _step_lawyer,
|
| 1454 |
}
|
| 1455 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1456 |
|
| 1457 |
-
# --- Specialist role tools ---
|
| 1458 |
# These wrap the step functions as @tool so the Planner (or any LangChain agent)
|
| 1459 |
# can invoke specialists in a standard tool-use pattern.
|
| 1460 |
|
|
@@ -1471,7 +1884,7 @@ _EMPTY_STATE_BASE: WorkflowState = {
|
|
| 1471 |
"chairman_of_board_output": "", "maga_appointee_output": "", "lawyer_output": "",
|
| 1472 |
"finalized_output": "",
|
| 1473 |
"draft_output": "", "qa_report": "", "qa_data": {}, "qa_role_feedback": {}, "qa_passed": False,
|
| 1474 |
-
"revision_count": 0, "best_artifact": "", "final_answer": "",
|
| 1475 |
}
|
| 1476 |
|
| 1477 |
|
|
@@ -1718,6 +2131,7 @@ def run_multi_role_workflow(
|
|
| 1718 |
"qa_passed": False,
|
| 1719 |
"revision_count": 0,
|
| 1720 |
"best_artifact": "",
|
|
|
|
| 1721 |
"final_answer": "",
|
| 1722 |
}
|
| 1723 |
|
|
@@ -1731,8 +2145,8 @@ def run_multi_role_workflow(
|
|
| 1731 |
|
| 1732 |
try:
|
| 1733 |
if planner_active:
|
| 1734 |
-
# Step 1: Planner creates the initial plan
|
| 1735 |
-
state = _step_plan(chat_model, state, trace)
|
| 1736 |
else:
|
| 1737 |
# No planner: auto-select first active specialist
|
| 1738 |
state["current_role"] = active_specialist_keys[0]
|
|
@@ -1747,19 +2161,31 @@ def run_multi_role_workflow(
|
|
| 1747 |
# then run every other active specialist so all voices are heard.
|
| 1748 |
primary_role = state["current_role"]
|
| 1749 |
if primary_role not in active_specialist_keys:
|
| 1750 |
-
# Safe fallback: requested role not in active set
|
| 1751 |
fallback_role = active_specialist_keys[0]
|
| 1752 |
trace.append(
|
| 1753 |
-
f" โ Role '{
|
|
|
|
| 1754 |
)
|
| 1755 |
primary_role = fallback_role
|
| 1756 |
state["current_role"] = primary_role
|
| 1757 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1758 |
# Run the primary (planner-chosen) specialist
|
| 1759 |
primary_fn = _SPECIALIST_STEPS.get(primary_role, _step_technical)
|
| 1760 |
state = primary_fn(chat_model, state, trace)
|
| 1761 |
primary_output = state["draft_output"]
|
| 1762 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1763 |
# Run all other active specialists and collect their perspectives
|
| 1764 |
all_outputs: List[Tuple[str, str]] = [(primary_role, primary_output)]
|
| 1765 |
for specialist_role in active_specialist_keys:
|
|
@@ -1767,8 +2193,38 @@ def run_multi_role_workflow(
|
|
| 1767 |
continue # already ran above
|
| 1768 |
step_fn = _SPECIALIST_STEPS[specialist_role]
|
| 1769 |
state = step_fn(chat_model, state, trace)
|
| 1770 |
-
|
| 1771 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1772 |
all_outputs.append((specialist_role, state["draft_output"]))
|
| 1773 |
|
| 1774 |
# Finalize all perspectives into the actual deliverable.
|
|
@@ -1780,15 +2236,39 @@ def run_multi_role_workflow(
|
|
| 1780 |
state = _step_finalize(chat_model, state, trace, all_outputs)
|
| 1781 |
else:
|
| 1782 |
state["draft_output"] = primary_output
|
|
|
|
| 1783 |
|
| 1784 |
-
# Update best-candidate tracking:
|
| 1785 |
current_draft = state["draft_output"]
|
| 1786 |
-
if current_draft and current_draft.strip()
|
| 1787 |
-
state
|
| 1788 |
-
|
| 1789 |
-
|
| 1790 |
-
|
| 1791 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1792 |
|
| 1793 |
# Step 3: QA reviews the finalized draft (if enabled)
|
| 1794 |
if qa_active:
|
|
@@ -1802,7 +2282,9 @@ def run_multi_role_workflow(
|
|
| 1802 |
# Step 4: Planner reviews QA and either approves or schedules a revision
|
| 1803 |
if planner_active and qa_active:
|
| 1804 |
prev_plan = state["plan"]
|
| 1805 |
-
state = _step_planner_review(
|
|
|
|
|
|
|
| 1806 |
|
| 1807 |
# Exit if the Planner approved the result
|
| 1808 |
if state["final_answer"]:
|
|
@@ -1829,7 +2311,8 @@ def run_multi_role_workflow(
|
|
| 1829 |
state["final_answer"] = best
|
| 1830 |
trace.append(
|
| 1831 |
f"\nโโโ MAX REVISIONS REACHED ({MAX_REVISIONS}) โโโ\n"
|
| 1832 |
-
f"Returning best artifact (
|
|
|
|
| 1833 |
)
|
| 1834 |
break
|
| 1835 |
else:
|
|
@@ -1851,6 +2334,7 @@ def run_multi_role_workflow(
|
|
| 1851 |
return state["final_answer"], "\n".join(trace)
|
| 1852 |
|
| 1853 |
|
|
|
|
| 1854 |
# ============================================================
|
| 1855 |
# Agent builder
|
| 1856 |
# ============================================================
|
|
@@ -2330,6 +2814,142 @@ def run_demo_lunch_menu(
|
|
| 2330 |
return final_answer, trace
|
| 2331 |
|
| 2332 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2333 |
if __name__ == "__main__":
|
| 2334 |
port = int(os.environ.get("PORT", 7860))
|
| 2335 |
demo.launch(
|
|
|
|
| 559 |
draft_output: str # latest specialist/finalized output forwarded to QA
|
| 560 |
qa_report: str
|
| 561 |
qa_data: Dict[str, Any] # parsed QA JSON data for structured access
|
| 562 |
+
qa_role_feedback: Dict[str, Any] # role key โ list[str] of targeted QA feedback items
|
| 563 |
qa_passed: bool
|
| 564 |
revision_count: int
|
| 565 |
best_artifact: str # best non-empty real deliverable seen so far
|
| 566 |
+
best_artifact_score: float # requirement-coverage score of best_artifact
|
| 567 |
final_answer: str
|
| 568 |
|
| 569 |
|
| 570 |
# --- Role system prompts ---
|
| 571 |
|
| 572 |
+
# Role description snippets used to build the dynamic planner prompt
|
| 573 |
+
_ROLE_DESCRIPTIONS: Dict[str, str] = {
|
| 574 |
+
"creative": "Creative Expert (ideas, framing, wording, brainstorming)",
|
| 575 |
+
"technical": "Technical Expert (code, architecture, implementation)",
|
| 576 |
+
"research": "Research Analyst (information gathering, literature review, fact-finding)",
|
| 577 |
+
"security": "Security Reviewer (security analysis, vulnerability checks, best practices)",
|
| 578 |
+
"data_analyst": "Data Analyst (data analysis, statistics, pattern recognition, insights)",
|
| 579 |
+
"mad_professor": "Mad Professor (radical scientific hypotheses, unhinged groundbreaking theories)",
|
| 580 |
+
"accountant": "Accountant (extreme cost scrutiny, ruthless cost-cutting, cheapest alternatives)",
|
| 581 |
+
"artist": "Artist (wildly unhinged creative vision, cosmic feeling and vibes)",
|
| 582 |
+
"lazy_slacker": "Lazy Slacker (minimum viable effort, shortcuts, good-enough solutions)",
|
| 583 |
+
"black_metal_fundamentalist": "Black Metal Fundamentalist (nihilistic kvlt critique, rejection of mainstream approaches)",
|
| 584 |
+
"labour_union_rep": "Labour Union Representative (worker rights, fair wages, job security)",
|
| 585 |
+
"ux_designer": "UX Designer (user needs, user-centricity, usability, accessibility)",
|
| 586 |
+
"doris": "Doris (well-meaning but clueless, rambling, off-topic observations)",
|
| 587 |
+
"chairman_of_board": "Chairman of the Board (corporate governance, shareholder value, strategic vision)",
|
| 588 |
+
"maga_appointee": "MAGA Appointee (America First perspective, anti-globalism, deregulation)",
|
| 589 |
+
"lawyer": "Lawyer (legal compliance, liability, contracts, risk management)",
|
| 590 |
+
}
|
| 591 |
+
|
| 592 |
+
|
| 593 |
+
def _build_planner_system(active_specialist_keys: List[str]) -> str:
|
| 594 |
+
"""Return a Planner system prompt that only lists *active* specialist roles.
|
| 595 |
+
|
| 596 |
+
This prevents the Planner from routing to roles that are not enabled in the
|
| 597 |
+
current run, which was a source of broken routing.
|
| 598 |
+
"""
|
| 599 |
+
if not active_specialist_keys:
|
| 600 |
+
active_specialist_keys = list(_ROLE_DESCRIPTIONS.keys())
|
| 601 |
+
|
| 602 |
+
role_lines = "\n".join(
|
| 603 |
+
f" - '{AGENT_ROLES.get(k, k)}'"
|
| 604 |
+
+ (f" โ {_ROLE_DESCRIPTIONS[k].split('(', 1)[1].rstrip(')')}" if k in _ROLE_DESCRIPTIONS else "")
|
| 605 |
+
for k in active_specialist_keys
|
| 606 |
+
if k in AGENT_ROLES
|
| 607 |
+
)
|
| 608 |
+
role_choices = " | ".join(
|
| 609 |
+
f"'{AGENT_ROLES.get(k, k)}'" for k in active_specialist_keys if k in AGENT_ROLES
|
| 610 |
+
)
|
| 611 |
+
return (
|
| 612 |
+
"You are the Planner in a multi-role AI workflow.\n"
|
| 613 |
+
"Your job is to:\n"
|
| 614 |
+
"1. Break the user's task into clear subtasks.\n"
|
| 615 |
+
"2. Decide which specialist to call as the PRIMARY lead "
|
| 616 |
+
"(choose ONLY from the active roles listed below):\n"
|
| 617 |
+
f"{role_lines}\n"
|
| 618 |
+
"3. State clear success criteria.\n\n"
|
| 619 |
+
"IMPORTANT: You MUST select the PRIMARY ROLE from the active roles listed above ONLY.\n"
|
| 620 |
+
"Do NOT invent or request roles that are not in this list.\n"
|
| 621 |
+
"ALL active specialists will also contribute their own perspective.\n"
|
| 622 |
+
"Your PRIMARY ROLE choice sets the lead voice, but every active role will be heard.\n\n"
|
| 623 |
+
"Respond in this exact format:\n"
|
| 624 |
+
"TASK BREAKDOWN:\n<subtask list>\n\n"
|
| 625 |
+
f"ROLE TO CALL: <{role_choices}>\n\n"
|
| 626 |
+
"SUCCESS CRITERIA:\n<what a correct, complete answer looks like>\n\n"
|
| 627 |
+
"GUIDANCE FOR SPECIALIST:\n<any constraints or focus areas>"
|
| 628 |
+
)
|
| 629 |
+
|
| 630 |
+
|
| 631 |
+
# Fallback static planner system (used when active roles are not yet known)
|
| 632 |
+
_PLANNER_SYSTEM = _build_planner_system(list(_ROLE_DESCRIPTIONS.keys()))
|
| 633 |
+
|
| 634 |
|
| 635 |
_CREATIVE_SYSTEM = (
|
| 636 |
"You are the Creative Expert in a multi-role AI workflow.\n"
|
|
|
|
| 654 |
|
| 655 |
_QA_SYSTEM = (
|
| 656 |
"You are the QA Tester in a multi-role AI workflow.\n"
|
| 657 |
+
"Evaluate ONLY the finalized deliverable (not specialist process notes or summaries).\n"
|
| 658 |
+
"When individual specialist contributions are provided, give targeted, actionable feedback per role\n"
|
| 659 |
+
"so each specialist knows exactly what to fix in the next revision.\n\n"
|
| 660 |
"Respond with ONLY a valid JSON object in this exact format (no text before or after the JSON):\n"
|
| 661 |
"{\n"
|
| 662 |
+
' "passed": false,\n'
|
| 663 |
+
' "score": 0.0,\n'
|
| 664 |
+
' "artifact_present": true,\n'
|
| 665 |
+
' "missing_requirements": ["<specific missing item>"],\n'
|
| 666 |
+
' "required_fixes": ["<concrete fix needed>"],\n'
|
| 667 |
+
' "role_feedback": {\n'
|
| 668 |
+
' "<role_key e.g. mad_professor>": ["<specific fix 1>", "<specific fix 2>"],\n'
|
| 669 |
+
' "<role_key>": ["<fix>"] \n'
|
| 670 |
+
" },\n"
|
| 671 |
+
' "summary": "<one-sentence overall assessment>"\n'
|
| 672 |
"}\n\n"
|
| 673 |
+
'Set "passed" to true only if ALL requirements are met and missing_requirements is empty.\n'
|
| 674 |
+
'Set "score" to a float 0.0-1.0 representing how much of the deliverable is complete.\n'
|
| 675 |
+
'Set "artifact_present" to true if a real deliverable (not just meta-commentary) is present.\n'
|
| 676 |
+
'In "role_feedback", use the internal role key (e.g. "mad_professor", "artist", "technical") not the display name.\n'
|
| 677 |
+
'Each role_feedback value must be a list of specific, actionable fixes for that role.\n'
|
| 678 |
+
'Focus role_feedback on concrete missing content, not general style.\n'
|
| 679 |
+
'In "required_fixes", list the concrete changes needed to make the deliverable pass.\n'
|
| 680 |
+
'Evaluate ONLY the finalized deliverable under "Finalized deliverable:" โ not the specialist inputs.'
|
| 681 |
)
|
| 682 |
|
| 683 |
_PLANNER_REVIEW_SYSTEM = (
|
|
|
|
| 905 |
return content_to_text(response.content)
|
| 906 |
|
| 907 |
|
| 908 |
+
def _decide_role(text: str, active_keys: Optional[List[str]] = None) -> str:
|
| 909 |
"""Parse which specialist role the Planner wants to invoke.
|
| 910 |
|
| 911 |
Normalises the 'ROLE TO CALL:' line (strips surrounding whitespace and
|
| 912 |
collapses internal spaces) before matching, then falls back to a
|
| 913 |
word-boundary search. Returns 'technical' when no clear signal is found.
|
| 914 |
+
|
| 915 |
+
When *active_keys* is provided the returned role key is guaranteed to be in
|
| 916 |
+
that set. If the detected role is not active a deterministic fallback is
|
| 917 |
+
applied (first active key, or 'technical').
|
| 918 |
"""
|
| 919 |
# Normalise: collapse runs of whitespace so "ROLE TO CALL : Creative Expert" still matches
|
| 920 |
normalised = re.sub(r"\s+", " ", text).strip()
|
|
|
|
| 938 |
("Lawyer", "lawyer"),
|
| 939 |
("Doris", "doris"),
|
| 940 |
]
|
| 941 |
+
detected: Optional[str] = None
|
| 942 |
for label, key in _LABEL_TO_KEY_ORDERED:
|
| 943 |
# Match "ROLE TO CALL: <label>" with optional surrounding whitespace
|
| 944 |
if re.search(r"ROLE\s+TO\s+CALL\s*:\s*" + re.escape(label), normalised, re.IGNORECASE):
|
| 945 |
+
detected = key
|
| 946 |
+
break
|
| 947 |
+
|
| 948 |
+
if detected is None:
|
| 949 |
+
# Fallback: word-boundary match on the full (normalised) text
|
| 950 |
+
_WORD_FALLBACKS = [
|
| 951 |
+
(r"\bcreative\b", "creative"),
|
| 952 |
+
(r"\bresearch\b", "research"),
|
| 953 |
+
(r"\bsecurity\b", "security"),
|
| 954 |
+
(r"\bdata\s+analyst\b", "data_analyst"),
|
| 955 |
+
(r"\bmad\s+professor\b", "mad_professor"),
|
| 956 |
+
(r"\baccountant\b", "accountant"),
|
| 957 |
+
(r"\bartist\b", "artist"),
|
| 958 |
+
(r"\blazy\s+slacker\b", "lazy_slacker"),
|
| 959 |
+
(r"\bblack\s+metal\b", "black_metal_fundamentalist"),
|
| 960 |
+
(r"\blabour\s+union\b", "labour_union_rep"),
|
| 961 |
+
(r"\bux\s+designer\b", "ux_designer"),
|
| 962 |
+
(r"\bdoris\b", "doris"),
|
| 963 |
+
(r"\bchairman\b", "chairman_of_board"),
|
| 964 |
+
(r"\bmaga\b", "maga_appointee"),
|
| 965 |
+
(r"\blawyer\b", "lawyer"),
|
| 966 |
+
]
|
| 967 |
+
for pattern, key in _WORD_FALLBACKS:
|
| 968 |
+
if re.search(pattern, normalised, re.IGNORECASE):
|
| 969 |
+
detected = key
|
| 970 |
+
break
|
| 971 |
+
|
| 972 |
+
detected = detected or "technical"
|
| 973 |
+
|
| 974 |
+
# Filter to active keys when provided
|
| 975 |
+
if active_keys:
|
| 976 |
+
if detected not in active_keys:
|
| 977 |
+
fallback = active_keys[0] if active_keys else "technical"
|
| 978 |
+
return fallback
|
| 979 |
+
return detected
|
| 980 |
|
| 981 |
|
| 982 |
def _parse_qa_json(qa_text: str) -> Optional[Dict[str, Any]]:
|
|
|
|
| 1005 |
def _qa_passed_check(qa_text: str) -> bool:
|
| 1006 |
"""Return True only if the QA report contains an explicit PASS result.
|
| 1007 |
|
| 1008 |
+
Handles both the new format (``passed`` bool + ``score``) and the legacy
|
| 1009 |
+
format (``result: PASS|FAIL``). Returns False when the expected format is
|
| 1010 |
+
absent to avoid false positives from words like 'bypass' or 'password'.
|
|
|
|
| 1011 |
"""
|
| 1012 |
data = _parse_qa_json(qa_text)
|
| 1013 |
if data is not None:
|
| 1014 |
+
# New format: "passed" bool field
|
| 1015 |
+
if "passed" in data:
|
| 1016 |
+
return bool(data["passed"])
|
| 1017 |
+
# Legacy format: "result" string field
|
| 1018 |
result = str(data.get("result", "")).strip().upper()
|
| 1019 |
if result == "PASS":
|
| 1020 |
return True
|
|
|
|
| 1029 |
return False
|
| 1030 |
|
| 1031 |
|
| 1032 |
+
def _parse_qa_role_feedback(qa_text: str) -> Dict[str, Any]:
|
| 1033 |
"""Extract per-role targeted feedback from a QA report.
|
| 1034 |
|
| 1035 |
+
Supports the new format where each role maps to a list of strings, and the
|
| 1036 |
+
legacy format where each role maps to a single string. Also accepts role
|
| 1037 |
+
keys (e.g. 'mad_professor') directly in addition to display labels.
|
| 1038 |
+
|
| 1039 |
+
Returns a dict mapping role keys (e.g. 'creative', 'technical') to either
|
| 1040 |
+
a list of fix strings (new format) or a plain string (legacy format).
|
| 1041 |
"""
|
| 1042 |
+
feedback: Dict[str, Any] = {}
|
| 1043 |
|
| 1044 |
+
# JSON path โ handles both new (list) and legacy (string) values
|
| 1045 |
data = _parse_qa_json(qa_text)
|
| 1046 |
if data is not None and isinstance(data.get("role_feedback"), dict):
|
| 1047 |
+
for label_or_key, fb in data["role_feedback"].items():
|
| 1048 |
+
label_or_key = str(label_or_key).strip()
|
| 1049 |
+
# Try direct role key first, then display-label lookup
|
| 1050 |
+
if label_or_key in AGENT_ROLES:
|
| 1051 |
+
role_key = label_or_key
|
| 1052 |
+
else:
|
| 1053 |
+
role_key = _ROLE_LABEL_TO_KEY.get(label_or_key)
|
| 1054 |
if role_key and fb:
|
| 1055 |
+
# Normalise to list for consistent downstream use
|
| 1056 |
+
if isinstance(fb, list):
|
| 1057 |
+
feedback[role_key] = [str(x).strip() for x in fb if x]
|
| 1058 |
+
else:
|
| 1059 |
+
feedback[role_key] = [str(fb).strip()]
|
| 1060 |
return feedback
|
| 1061 |
|
| 1062 |
# Legacy text fallback
|
|
|
|
| 1076 |
role_fb = role_fb.strip()
|
| 1077 |
role_key = _ROLE_LABEL_TO_KEY.get(role_label)
|
| 1078 |
if role_key and role_fb:
|
| 1079 |
+
feedback[role_key] = [role_fb]
|
| 1080 |
|
| 1081 |
return feedback
|
| 1082 |
|
|
|
|
| 1103 |
return count >= 2
|
| 1104 |
|
| 1105 |
|
| 1106 |
+
# --- Output quality helpers -------------------------------------------------

# Persona-heavy roles that need revision-mode behaviour enforcement.
# These personas tend to prioritise voice over substance, so revision-mode
# prompt contracts add extra "usefulness first" constraints for them.
_PERSONA_ROLE_KEYS = frozenset({
    "mad_professor", "artist", "lazy_slacker",
    "black_metal_fundamentalist", "doris", "maga_appointee",
})

# Heuristics that suggest a real, structured deliverable.
# Matched against lower-cased text with re.MULTILINE | re.IGNORECASE by
# _looks_like_actionable_deliverable; two or more hits count as "actionable".
_DELIVERABLE_SIGNALS = [
    r"(?:option|choice|step|ingredient|method|approach|alternative)\s*\d*\s*[:\-]",
    r"^\s*\d+[\.\)]\s+\w+",  # numbered list item, e.g. "1. ..." or "2) ..."
    r"^#{1,3}\s+\w+",  # markdown header
    r"\*\*[^\*]+\*\*",  # bold text
    r"^\s*[-โข]\s+\w+",  # bullet point (dash or bullet glyph)
    r"\b(?:recipe|menu|plan|schedule|budget|code|function|class|def |import )\b",
]
|
| 1123 |
+
|
| 1124 |
+
|
| 1125 |
+
def _is_empty_output(text: str) -> bool:
|
| 1126 |
+
"""Return True when *text* is blank or shorter than a minimal threshold."""
|
| 1127 |
+
return not text or len(text.strip()) < 30
|
| 1128 |
+
|
| 1129 |
+
|
| 1130 |
+
def _is_relevant_to_request(text: str, request: str) -> bool:
|
| 1131 |
+
"""Return True when *text* shares enough key words with *request*.
|
| 1132 |
+
|
| 1133 |
+
Uses a simple word-overlap heuristic: at least 25 % of significant words
|
| 1134 |
+
from the request must appear somewhere in the output.
|
| 1135 |
+
"""
|
| 1136 |
+
if not text or not request:
|
| 1137 |
+
return False
|
| 1138 |
+
request_words = {w.lower() for w in re.findall(r"\b\w{4,}\b", request)}
|
| 1139 |
+
if not request_words:
|
| 1140 |
+
return True
|
| 1141 |
+
text_lower = text.lower()
|
| 1142 |
+
overlap = sum(1 for w in request_words if w in text_lower)
|
| 1143 |
+
return overlap / len(request_words) >= 0.25
|
| 1144 |
+
|
| 1145 |
+
|
| 1146 |
+
def _looks_like_actionable_deliverable(text: str) -> bool:
    """Return True when *text* shows structural markers typical of a real deliverable."""
    if not text:
        return False
    if len(text.strip()) < 50:
        # Too short to be a genuine deliverable regardless of formatting.
        return False
    lowered = text.lower()
    flags = re.MULTILINE | re.IGNORECASE
    hit_count = 0
    for pattern in _DELIVERABLE_SIGNALS:
        if re.search(pattern, lowered, flags):
            hit_count += 1
    # Require at least two distinct structural signals to avoid false positives.
    return hit_count >= 2
|
| 1156 |
+
|
| 1157 |
+
|
| 1158 |
+
def _score_candidate(text: str, requirements: List[str], request: str) -> float:
    """Score an output 0.0-1.0 against requirement coverage + structural quality.

    Combines:
    - Fraction of explicit requirements whose key words appear in the text (60 %)
    - Whether the text looks like an actionable deliverable (20 %)
    - Whether the text is relevant to the request (20 %)
    """
    if not text or not text.strip():
        return 0.0

    req_score = 0.0
    if requirements:
        lowered = text.lower()
        covered = 0
        for requirement in requirements:
            # A requirement counts as covered when any of its significant
            # (3+ letter) words appears anywhere in the output.
            words = re.findall(r"\b\w{3,}\b", requirement.lower())
            if any(word in lowered for word in words):
                covered += 1
        req_score = covered / len(requirements)

    deliverable_score = 1.0 if _looks_like_actionable_deliverable(text) else 0.0
    relevance_score = 1.0 if _is_relevant_to_request(text, request) else 0.0

    return 0.6 * req_score + 0.2 * deliverable_score + 0.2 * relevance_score
|
| 1182 |
+
|
| 1183 |
+
|
| 1184 |
+
def _format_role_feedback_for_prompt(feedback: Any) -> str:
|
| 1185 |
+
"""Render QA role feedback (list or string) as a readable string for prompts."""
|
| 1186 |
+
if isinstance(feedback, list):
|
| 1187 |
+
return "\n".join(f"- {item}" for item in feedback if item)
|
| 1188 |
+
return str(feedback).strip()
|
| 1189 |
+
|
| 1190 |
+
|
| 1191 |
+
def _strict_retry_specialist(
    chat_model,
    system_prompt: str,
    state: "WorkflowState",
    role_key: str,
    output_key: str,
    label: str,
    trace: List[str],
) -> str:
    """Re-invoke a specialist with a strict, no-theatrics prompt after a poor output.

    Args:
        chat_model: LLM handle forwarded to ``_llm_call``.
        system_prompt: The specialist's own system prompt (same as its normal run).
        state: Shared workflow state; read for request, plan, QA data and feedback.
        role_key: Role key used to look up role-specific QA feedback.
        output_key: State key holding this role's previous output (e.g. ``"creative_output"``).
        label: Human-readable role label used in trace log lines.
        trace: Append-only log of workflow events.

    Returns:
        The new output text (may still be empty/poor; callers should check).
    """
    previous_output = state.get(output_key, "")  # type: ignore[literal-required]
    qa_data = state.get("qa_data") or {}
    # Prefer the new-format "required_fixes"; fall back to legacy "recommended_fixes".
    fixes = qa_data.get("required_fixes") or qa_data.get("recommended_fixes") or []
    role_fb = state["qa_role_feedback"].get(role_key)

    # Explicit "drop the persona, produce the deliverable" retry prompt.
    strict_content = (
        f"User request: {state['user_request']}\n\n"
        f"Planner instructions:\n{state['plan']}\n\n"
        "STRICT RETRY: Your previous response did not satisfy the task requirements.\n"
        "Drop persona theatrics entirely. Provide directly usable, task-relevant content.\n"
        "Address every required fix listed below. Return the deliverable content only โ "
        "no explanations of what you are about to do, no process notes.\n\n"
    )
    # Append whichever feedback sections are available; each is optional.
    if fixes:
        strict_content += "Required fixes:\n" + "\n".join(f"- {f}" for f in fixes) + "\n\n"
    if role_fb:
        strict_content += "Role-specific fixes:\n" + _format_role_feedback_for_prompt(role_fb) + "\n\n"
    if previous_output:
        strict_content += f"Your previous (unsatisfactory) output:\n{previous_output}\n\n"
    strict_content += "Now produce the corrected, improved deliverable:"

    trace.append(f" โฉ [{label}] strict retry invoked.")
    new_output = _llm_call(chat_model, system_prompt, strict_content)
    trace.append(f" โ [{label}] strict retry complete ({len(new_output)} chars).")
    return new_output
|
| 1229 |
+
|
| 1230 |
+
|
| 1231 |
# --- Workflow step functions ---
|
| 1232 |
# Each step receives the shared state and an append-only trace list,
|
| 1233 |
# updates state in place, appends log lines, and returns updated state.
|
| 1234 |
|
| 1235 |
+
def _step_plan(
    chat_model,
    state: WorkflowState,
    trace: List[str],
    active_specialist_keys: Optional[List[str]] = None,
) -> WorkflowState:
    """Planner: analyse the task, produce a plan, decide which specialist to call.

    Uses a dynamically generated system prompt that lists ONLY the active
    specialist roles, preventing routing to inactive roles.

    Args:
        chat_model: LLM handle forwarded to ``_llm_call``.
        state: Shared workflow state; ``plan`` and ``current_role`` are updated in place.
        trace: Append-only log of workflow events.
        active_specialist_keys: Role keys currently enabled; also passed to
            ``_decide_role`` so the routing decision is constrained to them.

    Returns:
        The updated state (same object, mutated in place).
    """
    trace.append("\nโโโ [PLANNER] Analysing task... โโโ")
    # System prompt is rebuilt each call so it reflects only the active roles.
    planner_system = _build_planner_system(active_specialist_keys or [])
    content = f"User request: {state['user_request']}"
    if state["revision_count"] > 0:
        # Use structured QA data when available for clearer revision guidance
        qa_data = state.get("qa_data") or {}
        missing = qa_data.get("missing_requirements") or []
        fixes = qa_data.get("required_fixes") or qa_data.get("recommended_fixes") or []
        score = qa_data.get("score", "?")
        summary = qa_data.get("summary", "")
        if missing or fixes:
            # Structured path: compose a compact summary from the parsed QA JSON.
            qa_summary = ""
            if summary:
                qa_summary += f"QA summary (score {score}): {summary}\n\n"
            if missing:
                qa_summary += "Missing requirements:\n" + "\n".join(f"- {m}" for m in missing) + "\n\n"
            if fixes:
                qa_summary += "Required fixes:\n" + "\n".join(f"- {f}" for f in fixes)
        else:
            # Fallback: no structured fields parsed; pass the raw QA report through.
            qa_summary = state["qa_report"]
        # Identify which role produced the best content (prefer routing to it)
        best_note = ""
        if state.get("best_artifact") and not _is_meta_summary(state["best_artifact"]):
            best_note = "\nNOTE: A good concrete artifact was found in a previous round. Route to the role most likely to improve on it."
        content += (
            f"\n\nThis is revision {state['revision_count']} of {MAX_REVISIONS}."
            f"\nQA concerns to address:\n{qa_summary}"
            f"{best_note}"
            "\nChoose the role most likely to produce the ACTUAL DELIVERABLE that addresses these concerns."
            "\nRemember to select ONLY from the active roles listed in your instructions."
        )
    plan_text = _llm_call(chat_model, planner_system, content)
    state["plan"] = plan_text
    # Routing decision is filtered to the active role set by _decide_role.
    state["current_role"] = _decide_role(plan_text, active_keys=active_specialist_keys)
    role_label = AGENT_ROLES.get(state["current_role"], state["current_role"]).upper()
    trace.append(plan_text)
    trace.append(f"โโโ [PLANNER] โ routing to: {role_label} โโโ")
    return state
|
| 1284 |
|
| 1285 |
|
| 1286 |
+
|
| 1287 |
+
def _build_specialist_content(
    state: WorkflowState,
    role_key: str,
    previous_output_key: str,
    strict: bool = False,
) -> str:
    """Build the user-facing content string for a specialist LLM call.

    On the first pass this is just the request + plan.
    On revision passes it additionally includes:
    - Explicit revision-mode prompt contract
    - The specialist's own previous output
    - The previous finalizer output (so specialists can see the last merged answer)
    - Structured QA feedback: global required_fixes + role-specific fixes
    - A note if persona mode should be suppressed (persona roles only)
    When *strict* is True the tone is even more direct (used by retry logic).

    Args:
        state: Shared workflow state (read-only here).
        role_key: Role key; controls persona-note inclusion and feedback lookup.
        previous_output_key: State key holding this role's prior output.
        strict: True on strict-retry passes; replaces the persona note with a
            harder "drop all theatrics" instruction.

    Returns:
        The fully composed prompt content string.
    """
    revision = state["revision_count"]
    content = (
        f"User request: {state['user_request']}\n\n"
        f"Planner instructions:\n{state['plan']}"
    )

    if revision > 0:
        # -- Revision-mode prompt contract ------------------------------------
        contract = (
            f"\n\n{'โ'*60}\n"
            f"REVISION {revision} of {MAX_REVISIONS} โ REVISION MODE ACTIVE\n"
            "You are revising your previous contribution.\n"
            "Fix the specific issues identified by QA.\n"
            "Return ONLY the improved content for your role.\n"
            "Do NOT return commentary, summaries of opinions, or statements about "
            "what you plan to do.\n"
        )
        if role_key in _PERSONA_ROLE_KEYS and not strict:
            # Persona roles get a softer nudge: keep voice, prioritise substance.
            contract += (
                "PERSONA NOTE: Keep your characteristic voice for style only. "
                "In revision mode you MUST prioritise usefulness and requirement "
                "satisfaction over persona performance. "
                "Limit persona intro text. Provide concrete deliverable content first.\n"
            )
        if strict:
            contract += (
                "STRICT RETRY: Your previous response failed quality checks. "
                "Drop all persona theatrics. Produce directly usable content only.\n"
            )
        contract += "โ" * 60
        content += contract

        # -- Previous finalizer output ----------------------------------------
        prev_final = state.get("finalized_output", "")
        if prev_final and not _is_meta_summary(prev_final):
            content += f"\n\nPrevious finalizer output (the merged answer from last round):\n{short_text(prev_final, 800)}"

        # -- This role's own previous output ----------------------------------
        previous_output = state.get(previous_output_key, "")  # type: ignore[literal-required]
        if previous_output:
            content += f"\n\nYour previous output (revise and improve this):\n{previous_output}"

        # -- QA feedback --------------------------------------------------------
        qa_data = state.get("qa_data") or {}
        required_fixes = qa_data.get("required_fixes") or qa_data.get("recommended_fixes") or []
        missing = qa_data.get("missing_requirements") or []
        role_feedback = state["qa_role_feedback"].get(role_key)

        if role_feedback:
            fb_text = _format_role_feedback_for_prompt(role_feedback)
            content += f"\n\nQA feedback specific to your role โ address each item:\n{fb_text}"
        if missing:
            content += "\n\nMissing requirements to add:\n" + "\n".join(f"- {m}" for m in missing)
        if required_fixes:
            content += "\n\nRequired fixes from QA:\n" + "\n".join(f"- {f}" for f in required_fixes)
        if not role_feedback and not required_fixes and not missing and state["qa_report"]:
            # Fallback: include score/summary from QA report
            score = qa_data.get("score", "")
            summary = qa_data.get("summary", "")
            if summary:
                content += f"\n\nQA summary (score {score}): {summary}"

    return content
|
| 1367 |
|
| 1368 |
|
|
|
|
| 1406 |
trace: List[str],
|
| 1407 |
all_outputs: Optional[List[Tuple[str, str]]] = None,
|
| 1408 |
) -> WorkflowState:
|
| 1409 |
+
"""QA Tester: check the FINALIZED deliverable against the original request.
|
| 1410 |
+
|
| 1411 |
+
Evaluates ``state['draft_output']`` (the finalizer output), not individual
|
| 1412 |
+
specialist contributions. Individual specialist outputs are still passed
|
| 1413 |
+
so QA can supply targeted, per-role feedback stored in
|
| 1414 |
+
``state['qa_role_feedback']`` for the next revision pass.
|
| 1415 |
|
| 1416 |
+
Logs exactly what text was evaluated, the score, and the missing requirements.
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1417 |
"""
|
| 1418 |
+
trace.append("\nโโโ [QA TESTER] Reviewing finalized deliverable... โโโ")
|
| 1419 |
+
evaluated_text = state["draft_output"]
|
| 1420 |
+
trace.append(f" โน QA evaluating: {len(evaluated_text)} chars of finalized output")
|
| 1421 |
+
|
| 1422 |
content = (
|
| 1423 |
f"Original user request: {state['user_request']}\n\n"
|
| 1424 |
f"Planner's plan and success criteria:\n{state['plan']}\n\n"
|
| 1425 |
)
|
| 1426 |
if all_outputs:
|
| 1427 |
+
# Include specialist contributions for role-specific feedback only
|
| 1428 |
+
content += "Individual specialist contributions (for per-role feedback only):\n\n"
|
| 1429 |
for r_key, r_output in all_outputs:
|
| 1430 |
r_label = AGENT_ROLES.get(r_key, r_key)
|
| 1431 |
+
content += f"=== {r_label} (role key: {r_key}) ===\n{short_text(r_output, 600)}\n\n"
|
| 1432 |
+
content += f"Finalized deliverable (evaluate this for PASS/FAIL):\n{evaluated_text}"
|
| 1433 |
+
|
|
|
|
| 1434 |
text = _llm_call(chat_model, _QA_SYSTEM, content)
|
| 1435 |
state["qa_report"] = text
|
| 1436 |
state["qa_data"] = _parse_qa_json(text) or {}
|
| 1437 |
state["qa_role_feedback"] = _parse_qa_role_feedback(text)
|
| 1438 |
state["qa_passed"] = _qa_passed_check(text)
|
| 1439 |
+
|
| 1440 |
+
qa_data = state["qa_data"]
|
| 1441 |
+
score = qa_data.get("score", "?")
|
| 1442 |
+
artifact_present = qa_data.get("artifact_present", "?")
|
| 1443 |
+
missing = qa_data.get("missing_requirements") or []
|
| 1444 |
result_label = "โ
PASS" if state["qa_passed"] else "โ FAIL"
|
| 1445 |
+
|
| 1446 |
trace.append(text)
|
| 1447 |
+
trace.append(
|
| 1448 |
+
f" โน QA result: {result_label} | score: {score} | artifact_present: {artifact_present}"
|
| 1449 |
+
)
|
| 1450 |
+
if missing:
|
| 1451 |
+
trace.append(" โน Missing requirements: " + "; ".join(str(m) for m in missing[:5]))
|
| 1452 |
if state["qa_role_feedback"]:
|
| 1453 |
+
fb_parts = []
|
| 1454 |
+
for k, v in state["qa_role_feedback"].items():
|
| 1455 |
+
preview = (v[0][:60] + "โฆ") if isinstance(v, list) and v else (str(v)[:60])
|
| 1456 |
+
fb_parts.append(f"{AGENT_ROLES.get(k, k)}: {preview}")
|
| 1457 |
+
trace.append(f" โน Role-specific feedback dispatched โ {', '.join(fb_parts)}")
|
| 1458 |
trace.append(f"โโโ [QA TESTER] Result: {result_label} โโโ")
|
| 1459 |
return state
|
| 1460 |
|
| 1461 |
|
| 1462 |
+
def _step_planner_review(
|
| 1463 |
+
chat_model,
|
| 1464 |
+
state: WorkflowState,
|
| 1465 |
+
trace: List[str],
|
| 1466 |
+
active_specialist_keys: Optional[List[str]] = None,
|
| 1467 |
+
) -> WorkflowState:
|
| 1468 |
"""Planner: review QA feedback and either approve the result or request a revision."""
|
| 1469 |
trace.append("\nโโโ [PLANNER] Reviewing QA feedback... โโโ")
|
| 1470 |
# Format issues/fixes from structured QA data when available
|
| 1471 |
qa_data = state.get("qa_data") or {}
|
| 1472 |
+
missing = qa_data.get("missing_requirements") or []
|
| 1473 |
+
fixes = qa_data.get("required_fixes") or qa_data.get("recommended_fixes") or []
|
| 1474 |
+
score = qa_data.get("score", "?")
|
| 1475 |
+
summary = qa_data.get("summary", "")
|
| 1476 |
+
if missing or fixes:
|
| 1477 |
+
qa_summary = ""
|
| 1478 |
+
if summary:
|
| 1479 |
+
qa_summary += f"QA summary (score {score}): {summary}\n\n"
|
| 1480 |
+
if missing:
|
| 1481 |
+
qa_summary += "Missing requirements:\n" + "\n".join(f"- {m}" for m in missing) + "\n\n"
|
| 1482 |
+
if fixes:
|
| 1483 |
+
qa_summary += "Required fixes:\n" + "\n".join(f"- {f}" for f in fixes)
|
| 1484 |
else:
|
| 1485 |
qa_summary = state["qa_report"]
|
| 1486 |
+
|
| 1487 |
+
# Use best artifact for approval check, not just the latest draft
|
| 1488 |
+
deliverable = state.get("best_artifact") or state["draft_output"]
|
| 1489 |
+
|
| 1490 |
content = (
|
| 1491 |
f"User request: {state['user_request']}\n\n"
|
| 1492 |
f"Plan:\n{state['plan']}\n\n"
|
| 1493 |
+
f"Current deliverable:\n{deliverable}\n\n"
|
| 1494 |
f"QA result: {'PASS' if state['qa_passed'] else 'FAIL'}\n"
|
| 1495 |
f"QA details:\n{qa_summary}"
|
| 1496 |
)
|
|
|
|
| 1522 |
else:
|
| 1523 |
# Revision requested but REVISED INSTRUCTIONS section missing โ keep current plan
|
| 1524 |
trace.append(" โ REVISED INSTRUCTIONS section missing; retrying with existing plan.")
|
| 1525 |
+
new_role = _decide_role(review, active_keys=active_specialist_keys)
|
| 1526 |
+
# If the new role was inactive (fallback triggered), log it clearly
|
| 1527 |
+
if active_specialist_keys and new_role not in active_specialist_keys:
|
| 1528 |
+
fallback = active_specialist_keys[0]
|
| 1529 |
+
trace.append(f" โ Planner requested inactive role โ falling back to {AGENT_ROLES.get(fallback, fallback).upper()}")
|
| 1530 |
+
new_role = fallback
|
| 1531 |
state["current_role"] = new_role
|
| 1532 |
trace.append(
|
| 1533 |
f"โโโ [PLANNER] โ ๐ REVISE โ routing to {AGENT_ROLES.get(new_role, new_role).upper()} โโโ"
|
|
|
|
| 1535 |
return state
|
| 1536 |
|
| 1537 |
|
| 1538 |
+
|
| 1539 |
def _step_research(chat_model, state: WorkflowState, trace: List[str]) -> WorkflowState:
|
| 1540 |
"""Research Analyst: gather information and produce a comprehensive research summary."""
|
| 1541 |
trace.append("\nโโโ [RESEARCH ANALYST] Gathering information... โโโ")
|
|
|
|
| 1700 |
) -> WorkflowState:
|
| 1701 |
"""Finalizer: synthesise all specialist perspectives and produce the actual deliverable.
|
| 1702 |
|
| 1703 |
+
Strategy:
|
| 1704 |
+
1. Score each specialist output for requirement coverage + actionability.
|
| 1705 |
+
2. Pass the best-scoring outputs (rather than all) to the LLM, so the
|
| 1706 |
+
prompt is not dominated by irrelevant persona content.
|
| 1707 |
+
3. Hard-guard against meta-summary output: if the LLM ignores instructions,
|
| 1708 |
+
fall back to the best-scoring specialist output directly.
|
| 1709 |
+
4. Never let meta-summary content overwrite best_artifact.
|
| 1710 |
"""
|
| 1711 |
trace.append("\nโโโ [FINALIZER] Producing the deliverable from all perspectives... โโโ")
|
| 1712 |
+
|
| 1713 |
+
# โโ Score and rank specialist outputs โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
| 1714 |
+
qa_data = state.get("qa_data") or {}
|
| 1715 |
+
requirements = (
|
| 1716 |
+
(qa_data.get("missing_requirements") or [])
|
| 1717 |
+
+ (qa_data.get("required_fixes") or qa_data.get("recommended_fixes") or [])
|
| 1718 |
+
)
|
| 1719 |
+
scored: List[Tuple[float, str, str]] = [] # (score, role_key, output)
|
| 1720 |
for r_key, r_output in all_outputs:
|
| 1721 |
+
if not r_output or not r_output.strip():
|
| 1722 |
+
continue
|
| 1723 |
+
s = _score_candidate(r_output, requirements, state["user_request"])
|
| 1724 |
+
scored.append((s, r_key, r_output))
|
| 1725 |
+
scored.sort(key=lambda x: x[0], reverse=True)
|
| 1726 |
+
|
| 1727 |
+
# Log candidate scores
|
| 1728 |
+
for s, r_key, _ in scored:
|
| 1729 |
+
trace.append(f" โน [FINALIZER] candidate score {s:.2f} โ {AGENT_ROLES.get(r_key, r_key)}")
|
| 1730 |
+
|
| 1731 |
+
# Use the top-3 highest-scoring (or all if fewer) to limit LLM context
|
| 1732 |
+
top_outputs = scored[:3] if len(scored) > 3 else scored
|
| 1733 |
+
base_role = top_outputs[0][1] if top_outputs else (all_outputs[0][0] if all_outputs else "")
|
| 1734 |
+
base_output = top_outputs[0][2] if top_outputs else (all_outputs[0][1] if all_outputs else "")
|
| 1735 |
+
|
| 1736 |
+
trace.append(f" โน [FINALIZER] base source: {AGENT_ROLES.get(base_role, base_role)}")
|
| 1737 |
+
|
| 1738 |
+
perspectives = []
|
| 1739 |
+
for _, r_key, r_output in top_outputs:
|
| 1740 |
r_label = AGENT_ROLES.get(r_key, r_key)
|
| 1741 |
perspectives.append(f"=== {r_label} ===\n{r_output}")
|
| 1742 |
combined = "\n\n".join(perspectives)
|
| 1743 |
+
|
| 1744 |
+
# Include the previous finalizer output if this is a revision
|
| 1745 |
+
prev_final_note = ""
|
| 1746 |
+
prev_final = state.get("finalized_output", "")
|
| 1747 |
+
if prev_final and not _is_meta_summary(prev_final) and state["revision_count"] > 0:
|
| 1748 |
+
prev_final_note = f"\nPrevious finalizer output (improve on this, do not summarise it):\n{short_text(prev_final, 800)}\n\n"
|
| 1749 |
+
|
| 1750 |
content = (
|
| 1751 |
f"User request: {state['user_request']}\n\n"
|
| 1752 |
+
f"{prev_final_note}"
|
| 1753 |
+
f"Top specialist perspectives to synthesise into the deliverable:\n\n{combined}"
|
| 1754 |
)
|
| 1755 |
text = _llm_call(chat_model, _FINALIZER_SYSTEM, content)
|
| 1756 |
|
| 1757 |
+
# โโ Meta-summary guard โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
| 1758 |
+
meta_guard_triggered = False
|
| 1759 |
if not text or not text.strip():
|
| 1760 |
+
trace.append(" โ [FINALIZER] returned empty output โ using best specialist output.")
|
| 1761 |
+
text = base_output
|
| 1762 |
+
meta_guard_triggered = True
|
| 1763 |
+
elif _is_meta_summary(text):
|
|
|
|
|
|
|
| 1764 |
trace.append(
|
| 1765 |
+
" โ [FINALIZER] meta-summary guard triggered (perspective-summary headers detected).\n"
|
| 1766 |
+
f" Substituting best specialist output from {AGENT_ROLES.get(base_role, base_role)}."
|
| 1767 |
)
|
| 1768 |
+
text = base_output
|
| 1769 |
+
meta_guard_triggered = True
|
| 1770 |
+
|
| 1771 |
+
trace.append(f" โน [FINALIZER] meta-summary guard: {'triggered' if meta_guard_triggered else 'not triggered'}")
|
| 1772 |
|
| 1773 |
state["finalized_output"] = text
|
| 1774 |
state["draft_output"] = text
|
|
|
|
| 1777 |
return state
|
| 1778 |
|
| 1779 |
|
| 1780 |
+
|
| 1781 |
# Mapping from role key โ step function, used by the orchestration loop
|
| 1782 |
_SPECIALIST_STEPS = {
|
| 1783 |
"creative": _step_creative,
|
|
|
|
| 1798 |
"lawyer": _step_lawyer,
|
| 1799 |
}
|
| 1800 |
|
| 1801 |
+
# Mapping from role key โ system prompt string (used for strict retries)
|
| 1802 |
+
_SPECIALIST_SYSTEM_PROMPTS: Dict[str, str] = {
|
| 1803 |
+
"creative": _CREATIVE_SYSTEM,
|
| 1804 |
+
"technical": _TECHNICAL_SYSTEM,
|
| 1805 |
+
"research": _RESEARCH_SYSTEM,
|
| 1806 |
+
"security": _SECURITY_SYSTEM,
|
| 1807 |
+
"data_analyst": _DATA_ANALYST_SYSTEM,
|
| 1808 |
+
"mad_professor": _MAD_PROFESSOR_SYSTEM,
|
| 1809 |
+
"accountant": _ACCOUNTANT_SYSTEM,
|
| 1810 |
+
"artist": _ARTIST_SYSTEM,
|
| 1811 |
+
"lazy_slacker": _LAZY_SLACKER_SYSTEM,
|
| 1812 |
+
"black_metal_fundamentalist": _BLACK_METAL_FUNDAMENTALIST_SYSTEM,
|
| 1813 |
+
"labour_union_rep": _LABOUR_UNION_REP_SYSTEM,
|
| 1814 |
+
"ux_designer": _UX_DESIGNER_SYSTEM,
|
| 1815 |
+
"doris": _DORIS_SYSTEM,
|
| 1816 |
+
"chairman_of_board": _CHAIRMAN_SYSTEM,
|
| 1817 |
+
"maga_appointee": _MAGA_APPOINTEE_SYSTEM,
|
| 1818 |
+
"lawyer": _LAWYER_SYSTEM,
|
| 1819 |
+
}
|
| 1820 |
+
|
| 1821 |
+
|
| 1822 |
+
def _log_revision_tracking(
    trace: List[str],
    role_key: str,
    state: "WorkflowState",
    prev_outputs: Dict[str, str],
    is_primary: bool = False,
) -> None:
    """Append a revision-tracking log line for a specialist role.

    Logs whether:
    - Previous output was passed in
    - QA feedback was passed in
    - Role-specific feedback was passed in
    - Output changed materially from the previous round
    - Output is relevant to the request

    Args:
        trace: Append-only workflow log; one summary line is added per call.
        role_key: Specialist role key (e.g. ``"creative"``).
        state: Shared workflow state (read-only here).
        prev_outputs: Snapshot of each role's output taken before this round,
            used to detect whether the role actually changed its contribution.
        is_primary: True when this role was the planner-chosen primary specialist.
    """
    revision = state["revision_count"]
    if revision == 0:
        return  # nothing to track on the first pass

    output_key = f"{role_key}_output"
    current_out = state.get(output_key, "")  # type: ignore[literal-required]
    prev_out = prev_outputs.get(role_key, "")

    had_prev = bool(prev_out and prev_out.strip())
    had_qa = bool(state.get("qa_report"))
    had_role_fb = bool(state["qa_role_feedback"].get(role_key))
    # "Changed" only counts when there was a previous output to diff against.
    changed = had_prev and current_out != prev_out
    relevant = _is_relevant_to_request(current_out, state["user_request"])
    qa_data = state.get("qa_data") or {}
    # Requirement list mirrors the finalizer's scoring inputs: missing
    # requirements plus required (or legacy recommended) fixes.
    requirements = (
        (qa_data.get("missing_requirements") or [])
        + (qa_data.get("required_fixes") or qa_data.get("recommended_fixes") or [])
    )
    coverage = _score_candidate(current_out, requirements, state["user_request"])

    label = AGENT_ROLES.get(role_key, role_key)
    primary_tag = " [PRIMARY]" if is_primary else ""
    trace.append(
        f" ๐ [{label}{primary_tag}] revision tracking โ "
        f"prev_output: {'yes' if had_prev else 'no'} | "
        f"qa_feedback: {'yes' if had_qa else 'no'} | "
        f"role_feedback: {'yes' if had_role_fb else 'no'} | "
        f"output_changed: {'yes' if changed else 'no'} | "
        f"relevant: {'yes' if relevant else 'no'} | "
        f"coverage_score: {coverage:.2f}"
    )
|
| 1869 |
+
|
| 1870 |
|
|
|
|
| 1871 |
# These wrap the step functions as @tool so the Planner (or any LangChain agent)
|
| 1872 |
# can invoke specialists in a standard tool-use pattern.
|
| 1873 |
|
|
|
|
| 1884 |
"chairman_of_board_output": "", "maga_appointee_output": "", "lawyer_output": "",
|
| 1885 |
"finalized_output": "",
|
| 1886 |
"draft_output": "", "qa_report": "", "qa_data": {}, "qa_role_feedback": {}, "qa_passed": False,
|
| 1887 |
+
"revision_count": 0, "best_artifact": "", "best_artifact_score": 0.0, "final_answer": "",
|
| 1888 |
}
|
| 1889 |
|
| 1890 |
|
|
|
|
| 2131 |
"qa_passed": False,
|
| 2132 |
"revision_count": 0,
|
| 2133 |
"best_artifact": "",
|
| 2134 |
+
"best_artifact_score": 0.0,
|
| 2135 |
"final_answer": "",
|
| 2136 |
}
|
| 2137 |
|
|
|
|
| 2145 |
|
| 2146 |
try:
|
| 2147 |
if planner_active:
|
| 2148 |
+
# Step 1: Planner creates the initial plan using only active specialist roles
|
| 2149 |
+
state = _step_plan(chat_model, state, trace, active_specialist_keys)
|
| 2150 |
else:
|
| 2151 |
# No planner: auto-select first active specialist
|
| 2152 |
state["current_role"] = active_specialist_keys[0]
|
|
|
|
| 2161 |
# then run every other active specialist so all voices are heard.
|
| 2162 |
primary_role = state["current_role"]
|
| 2163 |
if primary_role not in active_specialist_keys:
|
| 2164 |
+
# Safe fallback: requested role not in active set โ log clearly
|
| 2165 |
fallback_role = active_specialist_keys[0]
|
| 2166 |
trace.append(
|
| 2167 |
+
f" โ FALLBACK: Role '{AGENT_ROLES.get(primary_role, primary_role)}' is not active "
|
| 2168 |
+
f"โ falling back to {AGENT_ROLES.get(fallback_role, fallback_role).upper()}"
|
| 2169 |
)
|
| 2170 |
primary_role = fallback_role
|
| 2171 |
state["current_role"] = primary_role
|
| 2172 |
|
| 2173 |
+
# โโ Track previous outputs for revision logging โโโโโโโโโโโโโโโโ
|
| 2174 |
+
prev_outputs_snapshot: Dict[str, str] = {
|
| 2175 |
+
k: state.get(f"{k}_output", "") # type: ignore[literal-required]
|
| 2176 |
+
for k in active_specialist_keys
|
| 2177 |
+
}
|
| 2178 |
+
|
| 2179 |
# Run the primary (planner-chosen) specialist
|
| 2180 |
primary_fn = _SPECIALIST_STEPS.get(primary_role, _step_technical)
|
| 2181 |
state = primary_fn(chat_model, state, trace)
|
| 2182 |
primary_output = state["draft_output"]
|
| 2183 |
|
| 2184 |
+
# Log revision tracking for primary specialist
|
| 2185 |
+
_log_revision_tracking(
|
| 2186 |
+
trace, primary_role, state, prev_outputs_snapshot, is_primary=True
|
| 2187 |
+
)
|
| 2188 |
+
|
| 2189 |
# Run all other active specialists and collect their perspectives
|
| 2190 |
all_outputs: List[Tuple[str, str]] = [(primary_role, primary_output)]
|
| 2191 |
for specialist_role in active_specialist_keys:
|
|
|
|
| 2193 |
continue # already ran above
|
| 2194 |
step_fn = _SPECIALIST_STEPS[specialist_role]
|
| 2195 |
state = step_fn(chat_model, state, trace)
|
| 2196 |
+
specialist_output = state["draft_output"]
|
| 2197 |
+
|
| 2198 |
+
# Log revision tracking for this specialist
|
| 2199 |
+
_log_revision_tracking(
|
| 2200 |
+
trace, specialist_role, state, prev_outputs_snapshot, is_primary=False
|
| 2201 |
+
)
|
| 2202 |
+
|
| 2203 |
+
# โโ Strict retry for poor outputs (revision rounds only) โโโ
|
| 2204 |
+
if state["revision_count"] > 0:
|
| 2205 |
+
output_key = f"{specialist_role}_output"
|
| 2206 |
+
current_output = state.get(output_key, "") # type: ignore[literal-required]
|
| 2207 |
+
needs_retry = (
|
| 2208 |
+
_is_empty_output(current_output)
|
| 2209 |
+
or not _is_relevant_to_request(current_output, message)
|
| 2210 |
+
)
|
| 2211 |
+
if needs_retry:
|
| 2212 |
+
sys_prompt = _SPECIALIST_SYSTEM_PROMPTS.get(specialist_role)
|
| 2213 |
+
if sys_prompt:
|
| 2214 |
+
retry_text = _strict_retry_specialist(
|
| 2215 |
+
chat_model, sys_prompt, state,
|
| 2216 |
+
specialist_role, output_key, specialist_role.upper(), trace
|
| 2217 |
+
)
|
| 2218 |
+
if retry_text and retry_text.strip():
|
| 2219 |
+
state[output_key] = retry_text # type: ignore[literal-required]
|
| 2220 |
+
state["draft_output"] = retry_text
|
| 2221 |
+
specialist_output = retry_text
|
| 2222 |
+
trace.append(
|
| 2223 |
+
f" โน [{AGENT_ROLES.get(specialist_role, specialist_role)}] "
|
| 2224 |
+
f"strict retry: output changed "
|
| 2225 |
+
f"({'relevant' if _is_relevant_to_request(retry_text, message) else 'still poor'})"
|
| 2226 |
+
)
|
| 2227 |
+
|
| 2228 |
all_outputs.append((specialist_role, state["draft_output"]))
|
| 2229 |
|
| 2230 |
# Finalize all perspectives into the actual deliverable.
|
|
|
|
| 2236 |
state = _step_finalize(chat_model, state, trace, all_outputs)
|
| 2237 |
else:
|
| 2238 |
state["draft_output"] = primary_output
|
| 2239 |
+
state["finalized_output"] = primary_output
|
| 2240 |
|
| 2241 |
+
# Update best-candidate tracking with scoring: only update if better than current best
|
| 2242 |
current_draft = state["draft_output"]
|
| 2243 |
+
if current_draft and current_draft.strip():
|
| 2244 |
+
qa_data = state.get("qa_data") or {}
|
| 2245 |
+
requirements = (
|
| 2246 |
+
(qa_data.get("missing_requirements") or [])
|
| 2247 |
+
+ (qa_data.get("required_fixes") or qa_data.get("recommended_fixes") or [])
|
| 2248 |
+
)
|
| 2249 |
+
current_score = _score_candidate(current_draft, requirements, message)
|
| 2250 |
+
best_score = state.get("best_artifact_score", 0.0)
|
| 2251 |
+
|
| 2252 |
+
if not _is_meta_summary(current_draft):
|
| 2253 |
+
if current_score >= best_score or not state.get("best_artifact"):
|
| 2254 |
+
prev_best = state.get("best_artifact", "")
|
| 2255 |
+
state["best_artifact"] = current_draft
|
| 2256 |
+
state["best_artifact_score"] = current_score
|
| 2257 |
+
updated = current_draft != prev_best
|
| 2258 |
+
trace.append(
|
| 2259 |
+
f" โ Best artifact {'updated' if updated else 'confirmed'} "
|
| 2260 |
+
f"(rev {state['revision_count']}, score {current_score:.2f}): "
|
| 2261 |
+
f"{len(current_draft)} chars"
|
| 2262 |
+
)
|
| 2263 |
+
else:
|
| 2264 |
+
trace.append(
|
| 2265 |
+
f" โน Current draft score {current_score:.2f} < best {best_score:.2f} "
|
| 2266 |
+
f"โ keeping existing best artifact"
|
| 2267 |
+
)
|
| 2268 |
+
elif not state.get("best_artifact"):
|
| 2269 |
+
# Safety net: even a meta-summary beats nothing
|
| 2270 |
+
state["best_artifact"] = current_draft
|
| 2271 |
+
state["best_artifact_score"] = 0.0
|
| 2272 |
|
| 2273 |
# Step 3: QA reviews the finalized draft (if enabled)
|
| 2274 |
if qa_active:
|
|
|
|
| 2282 |
# Step 4: Planner reviews QA and either approves or schedules a revision
|
| 2283 |
if planner_active and qa_active:
|
| 2284 |
prev_plan = state["plan"]
|
| 2285 |
+
state = _step_planner_review(
|
| 2286 |
+
chat_model, state, trace, active_specialist_keys
|
| 2287 |
+
)
|
| 2288 |
|
| 2289 |
# Exit if the Planner approved the result
|
| 2290 |
if state["final_answer"]:
|
|
|
|
| 2311 |
state["final_answer"] = best
|
| 2312 |
trace.append(
|
| 2313 |
f"\nโโโ MAX REVISIONS REACHED ({MAX_REVISIONS}) โโโ\n"
|
| 2314 |
+
f"Returning best artifact (score {state.get('best_artifact_score', 0.0):.2f}, "
|
| 2315 |
+
f"{len(best)} chars)."
|
| 2316 |
)
|
| 2317 |
break
|
| 2318 |
else:
|
|
|
|
| 2334 |
return state["final_answer"], "\n".join(trace)
|
| 2335 |
|
| 2336 |
|
| 2337 |
+
|
| 2338 |
# ============================================================
|
| 2339 |
# Agent builder
|
| 2340 |
# ============================================================
|
|
|
|
| 2814 |
return final_answer, trace
|
| 2815 |
|
| 2816 |
|
| 2817 |
+
def run_demo_three_course_menu(
    model_id: str = DEFAULT_MODEL_ID,
    active_roles: Optional[List[str]] = None,
) -> Tuple[str, str]:
    """Regression demo A: three-course menu request with structured QA revision.

    Runs the full multi-role workflow on a deliberately detailed menu task and
    then checks the result against a small set of regression expectations.

    Verifies:
    - At least one revision round occurs (QA initially fails the underspecified output)
    - Some role gets role-specific QA feedback
    - Finalizer returns an actual structured menu, not a meta-summary
    - Final result contains at least two options per course and practical details

    Args:
        model_id: Chat model identifier passed through to the workflow.
        active_roles: Role display names to enable; defaults to planner,
            creative, technical, mad professor, and QA tester.

    Returns:
        Tuple of (final_answer, workflow_trace) exactly as produced by
        ``run_multi_role_workflow``.

    Example::

        answer, trace = run_demo_three_course_menu()
        # Verify the final answer is a structured menu:
        assert "appetizer" in answer.lower() or "starter" in answer.lower()
        assert not any(
            h in answer.upper()
            for h in ("PERSPECTIVES SUMMARY", "COMMON GROUND", "TENSIONS AND TRADE-OFFS")
        )
    """
    task = (
        "Discuss best options for a three-course menu for tonight. "
        "Include at least two options per course (appetizer, main, dessert). "
        "For each option provide: name, key ingredients, prep/cook time, skill level, "
        "suggested pairings, and whether it is suitable for vegetarians or vegans. "
        "Also include total estimated time and a note on dietary variations."
    )
    if active_roles is None:
        active_roles = [
            AGENT_ROLES["planner"],
            AGENT_ROLES["creative"],
            AGENT_ROLES["technical"],
            AGENT_ROLES["mad_professor"],
            AGENT_ROLES["qa_tester"],
        ]
    # Plain strings here: no placeholders, so no f-prefix (ruff F541).
    print("\n[DEMO-A] Three-course menu regression demo")
    print(f"[DEMO-A] Task: {task!r}")
    print(f"[DEMO-A] Active roles: {active_roles}")
    final_answer, trace = run_multi_role_workflow(task, model_id, active_roles)
    print("\n[DEMO-A] === FINAL ANSWER ===")
    print(final_answer or "(empty — check trace for details)")
    print("\n[DEMO-A] === WORKFLOW TRACE ===")
    print(trace)

    # ── Assertions for regression verification ────────────────────────────
    failures = []
    answer_lower = final_answer.lower()
    # Check it's not a pure meta-summary
    if _is_meta_summary(final_answer):
        failures.append("FAIL: Final answer looks like a meta-summary, not a menu.")
    # Check for course structure
    if not any(k in answer_lower for k in ("appetizer", "starter", "first course")):
        failures.append("FAIL: No appetizer/starter section found in final answer.")
    if not any(k in answer_lower for k in ("main", "entrée", "entree", "second course")):
        failures.append("FAIL: No main course section found in final answer.")
    if not any(k in answer_lower for k in ("dessert", "sweet", "third course")):
        failures.append("FAIL: No dessert section found in final answer.")
    # Check QA role feedback was dispatched (visible in trace)
    if "Role-specific feedback dispatched" not in trace and "role_feedback" not in trace:
        failures.append("WARN: No role-specific QA feedback detected in trace.")
    # Check at least one revision occurred
    if "REVISION 1" not in trace:
        failures.append("WARN: No revision round detected — QA may have passed immediately.")

    if failures:
        print("\n[DEMO-A] ⚠ Regression warnings:")
        # "note" instead of "f" to avoid shadowing a common file-handle name.
        for note in failures:
            print(f"  {note}")
    else:
        print("\n[DEMO-A] ✅ All regression checks passed.")

    return final_answer, trace
|
| 2891 |
+
|
| 2892 |
+
|
| 2893 |
+
def run_demo_inactive_role_fallback(
    model_id: str = DEFAULT_MODEL_ID,
) -> Tuple[str, str]:
    """Regression demo B: planner asks for inactive role — clean fallback must occur.

    Runs the workflow with 'Creative Expert' deliberately disabled so a
    writing-flavored task tempts the planner into routing to an inactive role,
    exercising the fallback path.

    Verifies:
    - Planner attempts to route to 'Creative Expert' (only active specialist role is 'Technical Expert')
    - Clean fallback is logged (no crash, no broken routing)
    - Final result still produces a useful deliverable

    Args:
        model_id: Chat model identifier passed through to the workflow.

    Returns:
        Tuple of (final_answer, workflow_trace) exactly as produced by
        ``run_multi_role_workflow``.

    Example::

        answer, trace = run_demo_inactive_role_fallback()
        assert "FALLBACK" in trace or "not active" in trace
        assert answer and len(answer.strip()) > 20
    """
    task = "Write a short, engaging product description for a new AI-powered coffee maker."
    # Deliberately enable only a small set that does NOT include Creative Expert.
    # This forces a fallback when the planner (naturally) requests Creative Expert
    # for a writing task.
    active_roles = [
        AGENT_ROLES["planner"],
        AGENT_ROLES["technical"],
        AGENT_ROLES["qa_tester"],
    ]
    # Plain strings here: no placeholders, so no f-prefix (ruff F541).
    print("\n[DEMO-B] Inactive role fallback regression demo")
    print(f"[DEMO-B] Task: {task!r}")
    print(f"[DEMO-B] Active roles: {active_roles}")
    print("[DEMO-B] NOTE: 'Creative Expert' is intentionally disabled to test fallback routing.")
    final_answer, trace = run_multi_role_workflow(task, model_id, active_roles)
    print("\n[DEMO-B] === FINAL ANSWER ===")
    print(final_answer or "(empty — check trace for details)")
    print("\n[DEMO-B] === WORKFLOW TRACE ===")
    print(trace)

    # ── Assertions for regression verification ────────────────────────────
    failures = []
    if not final_answer or len(final_answer.strip()) < 20:
        failures.append("FAIL: Final answer is empty or too short.")
    # Fallback should be logged when the planner requests an inactive role
    fallback_logged = (
        "FALLBACK" in trace or "not active" in trace or "falling back" in trace.lower()
    )
    if not fallback_logged:
        failures.append(
            "WARN: No fallback log detected — the planner may have correctly chosen "
            "an active role directly, or the fallback path was not triggered."
        )
    # Should NOT crash
    if "[ERROR]" in trace:
        failures.append("FAIL: Workflow error detected in trace.")

    if failures:
        print("\n[DEMO-B] ⚠ Regression warnings:")
        # "note" instead of "f" to avoid shadowing a common file-handle name.
        for note in failures:
            print(f"  {note}")
    else:
        print("\n[DEMO-B] ✅ All regression checks passed.")

    return final_answer, trace
|
| 2950 |
+
|
| 2951 |
+
|
| 2952 |
+
|
| 2953 |
if __name__ == "__main__":
|
| 2954 |
port = int(os.environ.get("PORT", 7860))
|
| 2955 |
demo.launch(
|