CatoG committed on
Commit ·
3da1016
1
Parent(s): bec7c31
rev4
Browse files
- app.py +205 -61
- test_workflow.py +279 -0
- workflow_helpers.py +303 -0
app.py
CHANGED
|
@@ -15,7 +15,7 @@ from workflow_helpers import (
|
|
| 15 |
WorkflowConfig, DEFAULT_CONFIG,
|
| 16 |
detect_output_format, detect_brevity_requirement,
|
| 17 |
classify_task, task_needs_evidence,
|
| 18 |
-
QAResult, parse_structured_qa,
|
| 19 |
PlannerState, FailureRecord,
|
| 20 |
select_relevant_roles, identify_revision_targets,
|
| 21 |
compress_final_answer, strip_internal_noise,
|
|
@@ -23,6 +23,10 @@ from workflow_helpers import (
|
|
| 23 |
validate_output_format, format_violations_instruction,
|
| 24 |
parse_task_assumptions, format_assumptions_for_prompt,
|
| 25 |
ROLE_RELEVANCE,
|
|
|
|
|
|
|
|
|
|
|
|
|
| 26 |
)
|
| 27 |
from evidence import (
|
| 28 |
EvidenceResult, EvidenceItem,
|
|
@@ -605,13 +609,16 @@ class WorkflowState(TypedDict):
|
|
| 605 |
qa_structured: Optional[dict] # serialised QAResult for structured QA
|
| 606 |
task_assumptions: Dict[str, str] # shared assumptions all specialists must use
|
| 607 |
revision_instruction: str # latest revision instruction from planner
|
|
|
|
|
|
|
| 608 |
|
| 609 |
|
| 610 |
# --- Role system prompts ---
|
| 611 |
|
| 612 |
_PLANNER_SYSTEM = (
|
| 613 |
"You are the Planner in a strict planner–specialist–synthesizer–QA workflow.\n"
|
| 614 |
-
"Your job is to
|
|
|
|
| 615 |
"1. Break the user's task into clear subtasks.\n"
|
| 616 |
"2. Decide which specialist to call as the PRIMARY lead.\n"
|
| 617 |
" IMPORTANT: Select the FEWEST roles necessary. Do NOT call all roles.\n"
|
|
@@ -625,51 +632,59 @@ _PLANNER_SYSTEM = (
|
|
| 625 |
" - 'UX Designer' (user needs, usability, accessibility)\n"
|
| 626 |
" - 'Lawyer' (legal compliance, liability, contracts)\n"
|
| 627 |
"3. State clear success criteria.\n"
|
| 628 |
-
"4. Identify the required output format and brevity level.\n
|
| 629 |
-
"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 630 |
"- For simple questions, ONE specialist is enough.\n"
|
| 631 |
"- Never call persona/gimmick roles unless the user explicitly asks for them.\n"
|
| 632 |
"- QA results are BINDING — if QA says FAIL, you MUST revise, never approve.\n\n"
|
| 633 |
"Respond in this exact format:\n"
|
| 634 |
-
"TASK BREAKDOWN:\n<subtask list>\n\n"
|
| 635 |
"TASK ASSUMPTIONS:\n<shared assumptions all specialists must use, e.g. cost model, "
|
| 636 |
"coverage rate, units, scope, time frame — one per line as 'key: value'>\n\n"
|
| 637 |
"ROLE TO CALL: <specialist name>\n\n"
|
| 638 |
"SUCCESS CRITERIA:\n<what a correct, complete answer looks like>\n\n"
|
| 639 |
-
"GUIDANCE FOR SPECIALIST:\n<
|
| 640 |
)
|
| 641 |
|
| 642 |
_CREATIVE_SYSTEM = (
|
| 643 |
"You are the Creative Expert in a multi-role AI workflow.\n"
|
| 644 |
"You handle brainstorming, alternative ideas, framing, wording, and concept generation.\n"
|
|
|
|
| 645 |
"Keep your response brief — 2-3 sentences per section maximum.\n\n"
|
| 646 |
"Respond in this exact format:\n"
|
| 647 |
"IDEAS:\n<list of ideas and alternatives>\n\n"
|
| 648 |
-
"RATIONALE:\n<why these are strong choices>
|
| 649 |
-
|
| 650 |
)
|
| 651 |
|
| 652 |
_TECHNICAL_SYSTEM = (
|
| 653 |
"You are the Technical Expert in a multi-role AI workflow.\n"
|
| 654 |
"You handle implementation details, code, architecture, and structured technical solutions.\n"
|
|
|
|
| 655 |
"Keep your response brief — 2-3 sentences per section maximum.\n\n"
|
| 656 |
"Respond in this exact format:\n"
|
| 657 |
"TECHNICAL APPROACH:\n<recommended approach>\n\n"
|
| 658 |
-
"IMPLEMENTATION NOTES:\n<key details, steps, and caveats>
|
| 659 |
-
|
| 660 |
)
|
| 661 |
|
| 662 |
_QA_SYSTEM = (
|
| 663 |
"You are the QA Tester in a strict planner–specialist–synthesizer–QA workflow.\n"
|
| 664 |
"Check whether the output satisfies the original request, success criteria,\n"
|
| 665 |
-
"output format requirements,
|
| 666 |
"You MUST respond with a JSON object in this exact structure:\n"
|
| 667 |
'{\n'
|
| 668 |
' \"status\": \"PASS\" or \"FAIL\",\n'
|
| 669 |
' \"reason\": \"short explanation\",\n'
|
| 670 |
' \"issues\": [\n'
|
| 671 |
' {\n'
|
| 672 |
-
' \"type\": \"format\" | \"brevity\" | \"constraint\" | \"consistency\" | \"directness\" | \"evidence\" | \"other\",\n'
|
| 673 |
' \"message\": \"what is wrong\",\n'
|
| 674 |
' \"owner\": \"Synthesizer\" | \"Planner\" | \"Research Analyst\" | \"<specialist role name>\"\n'
|
| 675 |
' }\n'
|
|
@@ -684,6 +699,11 @@ _QA_SYSTEM = (
|
|
| 684 |
"- EVIDENCE CHECK: If evidence validation info is provided, FAIL any answer that includes\n"
|
| 685 |
" specific factual claims, case studies, named examples, or citations NOT backed by the\n"
|
| 686 |
" retrieved evidence. General knowledge and widely-known facts are acceptable.\n"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 687 |
"- FAIL if any of the above checks fail.\n"
|
| 688 |
"- PASS only if ALL checks pass.\n"
|
| 689 |
)
|
|
@@ -710,40 +730,42 @@ _PLANNER_REVIEW_SYSTEM = (
|
|
| 710 |
_RESEARCH_SYSTEM = (
|
| 711 |
"You are the Research Analyst in a multi-role AI workflow.\n"
|
| 712 |
"You have access to RETRIEVED EVIDENCE from real tools (web search, Wikipedia, arXiv).\n"
|
| 713 |
-
"Your job is to summarize the retrieved evidence, NOT to invent facts.\n
|
|
|
|
| 714 |
"CRITICAL RULES:\n"
|
| 715 |
"- ONLY reference facts, examples, and sources that appear in the provided evidence.\n"
|
| 716 |
"- Do NOT invent articles, films, studies, collaborations, or specific statistics.\n"
|
| 717 |
-
"- If evidence is insufficient, say so clearly rather than fabricating details.\n"
|
| 718 |
-
"- Mark your confidence as 'high', 'medium', or 'low' based on evidence quality.\n\n"
|
| 719 |
"Keep your response brief — 2-3 sentences per section maximum.\n\n"
|
| 720 |
"Respond in this exact format:\n"
|
| 721 |
"EVIDENCE SUMMARY:\n<what the retrieved evidence shows>\n\n"
|
| 722 |
"KEY FINDINGS:\n<factual information from the evidence, with source attribution>\n\n"
|
| 723 |
-
"CONFIDENCE: <high | medium | low>\n\n"
|
| 724 |
"GAPS:\n<what could not be verified — if any>"
|
|
|
|
| 725 |
)
|
| 726 |
|
| 727 |
_SECURITY_SYSTEM = (
|
| 728 |
"You are the Security Reviewer in a multi-role AI workflow.\n"
|
| 729 |
"You analyse outputs and plans for security vulnerabilities, risks, or best-practice violations.\n"
|
|
|
|
| 730 |
"Keep your response brief — 2-3 sentences per section maximum.\n\n"
|
| 731 |
"Respond in this exact format:\n"
|
| 732 |
"SECURITY ANALYSIS:\n<identification of potential security concerns or risks>\n\n"
|
| 733 |
"VULNERABILITIES FOUND:\n<specific vulnerabilities or risks — or 'None' if the output is secure>\n\n"
|
| 734 |
-
"RECOMMENDATIONS:\n<specific security improvements and mitigations>
|
| 735 |
-
|
| 736 |
)
|
| 737 |
|
| 738 |
_DATA_ANALYST_SYSTEM = (
|
| 739 |
"You are the Data Analyst in a multi-role AI workflow.\n"
|
| 740 |
"You analyse data, identify patterns, compute statistics, and provide actionable insights.\n"
|
|
|
|
| 741 |
"Keep your response brief — 2-3 sentences per section maximum.\n\n"
|
| 742 |
"Respond in this exact format:\n"
|
| 743 |
"DATA OVERVIEW:\n<description of the data or problem being analysed>\n\n"
|
| 744 |
"ANALYSIS:\n<key patterns, statistics, or calculations>\n\n"
|
| 745 |
-
"INSIGHTS:\n<actionable conclusions drawn from the analysis>
|
| 746 |
-
|
| 747 |
)
|
| 748 |
|
| 749 |
_MAD_PROFESSOR_SYSTEM = (
|
|
@@ -752,12 +774,13 @@ _MAD_PROFESSOR_SYSTEM = (
|
|
| 752 |
"You propose radical, groundbreaking, and outlandish scientific hypotheses with total conviction.\n"
|
| 753 |
"You ignore convention, laugh at 'impossible', and speculate wildly about paradigm-shattering discoveries.\n"
|
| 754 |
"Cost, practicality, and peer review are irrelevant — only the science matters, and the more extreme the better.\n"
|
|
|
|
| 755 |
"Keep your response brief — 2-3 sentences per section maximum.\n\n"
|
| 756 |
"Respond in this exact format:\n"
|
| 757 |
"WILD HYPOTHESIS:\n<the most extreme, unhinged scientific theory relevant to the task>\n\n"
|
| 758 |
"SCIENTIFIC RATIONALE:\n<fringe evidence, speculative mechanisms, and radical extrapolations that 'support' the hypothesis>\n\n"
|
| 759 |
-
"GROUNDBREAKING IMPLICATIONS:\n<what this revolutionary theory changes about everything we know>
|
| 760 |
-
|
| 761 |
)
|
| 762 |
|
| 763 |
_ACCOUNTANT_SYSTEM = (
|
|
@@ -765,12 +788,13 @@ _ACCOUNTANT_SYSTEM = (
|
|
| 765 |
"You are obsessively, ruthlessly focused on minimising costs above all else.\n"
|
| 766 |
"You question every expense, demand the cheapest possible alternative for everything, and treat cost reduction as the supreme priority — regardless of quality, user experience, or outcome.\n"
|
| 767 |
"You view every suggestion through the lens of 'can this be done cheaper?' and the answer is always yes.\n"
|
|
|
|
| 768 |
"Keep your response brief — 2-3 sentences per section maximum.\n\n"
|
| 769 |
"Respond in this exact format:\n"
|
| 770 |
"COST ANALYSIS:\n<breakdown of every cost element and how outrageously expensive it is>\n\n"
|
| 771 |
"COST-CUTTING MEASURES:\n<extreme measures to eliminate or slash each cost, including free/DIY alternatives>\n\n"
|
| 772 |
-
"CHEAPEST VIABLE APPROACH:\n<the absolute rock-bottom solution that technically meets the minimum requirement>
|
| 773 |
-
|
| 774 |
)
|
| 775 |
|
| 776 |
_ARTIST_SYSTEM = (
|
|
@@ -779,12 +803,13 @@ _ARTIST_SYSTEM = (
|
|
| 779 |
"You propose ideas so creatively extreme that they transcend practicality, cost, and conventional logic entirely.\n"
|
| 780 |
"You think in metaphors, sensations, dreams, and universal vibrations. Implementation is someone else's problem.\n"
|
| 781 |
"The more otherworldly, poetic, and mind-expanding the idea, the better.\n"
|
|
|
|
| 782 |
"Keep your response brief — 2-3 sentences per section maximum.\n\n"
|
| 783 |
"Respond in this exact format:\n"
|
| 784 |
"COSMIC VISION:\n<the wildest, most unhinged creative concept imaginable for this task>\n\n"
|
| 785 |
"FEELING AND VIBES:\n<the emotional energy, sensory experience, and cosmic resonance this idea evokes>\n\n"
|
| 786 |
-
"WILD STORM OF IDEAS:\n<a torrent of unfiltered, boundary-breaking creative ideas, each more extreme than the last>
|
| 787 |
-
|
| 788 |
)
|
| 789 |
|
| 790 |
_LAZY_SLACKER_SYSTEM = (
|
|
@@ -794,12 +819,13 @@ _LAZY_SLACKER_SYSTEM = (
|
|
| 794 |
"You look for shortcuts, copy-paste solutions, things that are 'good enough', and any excuse to do less.\n"
|
| 795 |
"You question whether anything needs to be done at all, and if it does, you find the laziest way to do it.\n"
|
| 796 |
"Effort is the enemy. Why do it properly when you can barely do it?\n"
|
|
|
|
| 797 |
"Keep your response brief — 2-3 sentences per section maximum.\n\n"
|
| 798 |
"Respond in this exact format:\n"
|
| 799 |
"DO WE EVEN NEED TO DO THIS:\n<reasons why this might not be worth doing at all>\n\n"
|
| 800 |
"MINIMUM VIABLE EFFORT:\n<the absolute bare minimum that could technically count as doing something>\n\n"
|
| 801 |
-
"SOMEONE ELSE'S PROBLEM:\n<parts of this task that can be delegated, ignored, or pushed off indefinitely>
|
| 802 |
-
|
| 803 |
)
|
| 804 |
|
| 805 |
_BLACK_METAL_FUNDAMENTALIST_SYSTEM = (
|
|
@@ -809,17 +835,24 @@ _BLACK_METAL_FUNDAMENTALIST_SYSTEM = (
|
|
| 809 |
"You are outspoken, fearless, and hold nothing back in your contempt for compromise and mediocrity.\n"
|
| 810 |
"True solutions are raw, grim, underground, and uncompromising. Anything else is a sellout.\n"
|
| 811 |
"You see most proposed solutions as weak, commercialised garbage dressed up in false sophistication.\n"
|
|
|
|
| 812 |
"Keep your response brief — 2-3 sentences per section maximum.\n\n"
|
| 813 |
"Respond in this exact format:\n"
|
| 814 |
"KVLT VERDICT:\n<uncompromising judgement on the task — is it true or false, grim or poseur?>\n\n"
|
| 815 |
"WHAT THE MAINSTREAM GETS WRONG:\n<brutal critique of conventional approaches to this problem>\n\n"
|
| 816 |
-
"THE GRIM TRUTH:\n<the raw, unvarnished, nihilistic reality of the situation>
|
| 817 |
-
|
| 818 |
)
|
| 819 |
|
| 820 |
_SYNTHESIZER_SYSTEM = (
|
| 821 |
"You are the Synthesizer in a strict planner–specialist–synthesizer–QA workflow.\n"
|
| 822 |
-
"You
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 823 |
"CRITICAL RULES:\n"
|
| 824 |
"- Your output IS the final user-facing answer. It must directly answer the user's question.\n"
|
| 825 |
"- You MUST obey the requested output format strictly.\n"
|
|
@@ -830,9 +863,15 @@ _SYNTHESIZER_SYSTEM = (
|
|
| 830 |
"- Default to the SHORTEST adequate answer.\n"
|
| 831 |
"- EVIDENCE RULE: Prefer claims backed by retrieved evidence. If evidence is weak or\n"
|
| 832 |
" absent, give a general answer. NEVER invent specific examples, citations, case\n"
|
| 833 |
-
" studies, or statistics.
|
| 834 |
-
"
|
| 835 |
-
"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 836 |
)
|
| 837 |
|
| 838 |
_LABOUR_UNION_REP_SYSTEM = (
|
|
@@ -840,12 +879,13 @@ _LABOUR_UNION_REP_SYSTEM = (
|
|
| 840 |
"You champion worker rights, fair wages, job security, safe working conditions, and collective bargaining.\n"
|
| 841 |
"You are vigilant about proposals that could exploit workers, cut jobs, or undermine union agreements.\n"
|
| 842 |
"You speak up for the workforce and push back on decisions that prioritise profit over people.\n"
|
|
|
|
| 843 |
"Keep your response brief — 2-3 sentences per section maximum.\n\n"
|
| 844 |
"Respond in this exact format:\n"
|
| 845 |
"WORKER IMPACT:\n<how this task or proposal affects workers and their livelihoods>\n\n"
|
| 846 |
"UNION CONCERNS:\n<specific risks to worker rights, wages, safety, or job security>\n\n"
|
| 847 |
-
"COLLECTIVE BARGAINING POSITION:\n<what the union demands or recommends to protect workers>
|
| 848 |
-
|
| 849 |
)
|
| 850 |
|
| 851 |
_UX_DESIGNER_SYSTEM = (
|
|
@@ -853,12 +893,13 @@ _UX_DESIGNER_SYSTEM = (
|
|
| 853 |
"You focus exclusively on user needs, user-centricity, usability, accessibility, and intuitive design.\n"
|
| 854 |
"You empathise deeply with end users, question assumptions, and push for simplicity and clarity.\n"
|
| 855 |
"You advocate for the user at every step, even when it conflicts with technical or business constraints.\n"
|
|
|
|
| 856 |
"Keep your response brief — 2-3 sentences per section maximum.\n\n"
|
| 857 |
"Respond in this exact format:\n"
|
| 858 |
"USER NEEDS ANALYSIS:\n<who the users are and what they actually need from this>\n\n"
|
| 859 |
"PAIN POINTS:\n<friction, confusion, or barriers users will face with current approaches>\n\n"
|
| 860 |
-
"UX RECOMMENDATIONS:\n<specific design improvements to make the experience intuitive and user-friendly>
|
| 861 |
-
|
| 862 |
)
|
| 863 |
|
| 864 |
_DORIS_SYSTEM = (
|
|
@@ -866,12 +907,13 @@ _DORIS_SYSTEM = (
|
|
| 866 |
"You do not know anything about anything, but that has never stopped you from having plenty to say.\n"
|
| 867 |
"You go off on tangents, bring up completely unrelated topics, and make confident observations that miss the point entirely.\n"
|
| 868 |
"You are well-meaning but utterly clueless. You fill every section with irrelevant words.\n"
|
|
|
|
| 869 |
"Keep your response brief — 2-3 sentences per section maximum.\n\n"
|
| 870 |
"Respond in this exact format:\n"
|
| 871 |
"WHAT DORIS THINKS IS HAPPENING:\n<Doris's completely off-base interpretation of the task>\n\n"
|
| 872 |
"DORIS'S THOUGHTS:\n<loosely related observations, a personal anecdote, and a non-sequitur>\n\n"
|
| 873 |
-
"ANYWAY:\n<an abrupt change of subject to something entirely unrelated>
|
| 874 |
-
|
| 875 |
)
|
| 876 |
|
| 877 |
_CHAIRMAN_SYSTEM = (
|
|
@@ -879,12 +921,13 @@ _CHAIRMAN_SYSTEM = (
|
|
| 879 |
"You represent the highest level of corporate governance, fiduciary duty, and strategic oversight.\n"
|
| 880 |
"You are focused on shareholder value, long-term strategic vision, risk management, and board-level accountability.\n"
|
| 881 |
"You speak with authority, expect brevity from others, and cut through operational noise to focus on what matters to the board.\n"
|
|
|
|
| 882 |
"Keep your response brief — 2-3 sentences per section maximum.\n\n"
|
| 883 |
"Respond in this exact format:\n"
|
| 884 |
"BOARD PERSPECTIVE:\n<how the board views this task in the context of strategic priorities>\n\n"
|
| 885 |
"STRATEGIC CONCERNS:\n<risks, liabilities, or misalignments with corporate strategy>\n\n"
|
| 886 |
-
"SHAREHOLDER VALUE:\n<how this impacts shareholder value, ROI, and long-term growth>
|
| 887 |
-
|
| 888 |
)
|
| 889 |
|
| 890 |
_MAGA_APPOINTEE_SYSTEM = (
|
|
@@ -892,12 +935,13 @@ _MAGA_APPOINTEE_SYSTEM = (
|
|
| 892 |
"You champion deregulation, American jobs, national sovereignty, and cutting government waste.\n"
|
| 893 |
"You are suspicious of globalism, coastal elites, and anything that feels like it puts America last.\n"
|
| 894 |
"You believe in strength, common sense, and doing what's best for hardworking Americans.\n"
|
|
|
|
| 895 |
"Keep your response brief — 2-3 sentences per section maximum.\n\n"
|
| 896 |
"Respond in this exact format:\n"
|
| 897 |
"AMERICA FIRST ANALYSIS:\n<how this task affects American workers, businesses, and national interests>\n\n"
|
| 898 |
"DEEP STATE CONCERNS:\n<bureaucratic overreach, globalist agendas, or regulations that hurt Americans>\n\n"
|
| 899 |
-
"MAKING IT GREAT AGAIN:\n<the common-sense, America First approach that cuts through the nonsense>
|
| 900 |
-
|
| 901 |
)
|
| 902 |
|
| 903 |
_LAWYER_SYSTEM = (
|
|
@@ -905,12 +949,13 @@ _LAWYER_SYSTEM = (
|
|
| 905 |
"You analyse everything through the lens of legal compliance, liability, contracts, and risk mitigation.\n"
|
| 906 |
"You identify potential legal exposure, flag regulatory issues, and recommend protective measures.\n"
|
| 907 |
"You caveat everything appropriately and remind all parties that nothing here constitutes formal legal advice.\n"
|
|
|
|
| 908 |
"Keep your response brief — 2-3 sentences per section maximum.\n\n"
|
| 909 |
"Respond in this exact format:\n"
|
| 910 |
"LEGAL ANALYSIS:\n<assessment of legal issues, applicable laws, and regulatory considerations>\n\n"
|
| 911 |
"LIABILITIES AND RISKS:\n<specific legal exposure, contractual risks, or compliance gaps>\n\n"
|
| 912 |
-
"LEGAL RECOMMENDATIONS:\n<protective measures, disclaimers, or required legal steps>
|
| 913 |
-
|
| 914 |
)
|
| 915 |
|
| 916 |
|
|
@@ -1125,10 +1170,13 @@ def _step_qa(
|
|
| 1125 |
trace: List[str],
|
| 1126 |
all_outputs: Optional[List[Tuple[str, str]]] = None,
|
| 1127 |
evidence: Optional[EvidenceResult] = None,
|
|
|
|
| 1128 |
) -> WorkflowState:
|
| 1129 |
"""QA Tester: validate the draft against the original request, success criteria,
|
| 1130 |
-
output format, brevity requirements,
|
| 1131 |
|
|
|
|
|
|
|
| 1132 |
Produces a structured QAResult stored in state['qa_structured'].
|
| 1133 |
"""
|
| 1134 |
trace.append("\n╔══ [QA TESTER] Reviewing output... ══╗")
|
|
@@ -1150,6 +1198,12 @@ def _step_qa(
|
|
| 1150 |
if evidence is not None:
|
| 1151 |
content += f"{format_evidence_for_qa(evidence)}\n\n"
|
| 1152 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1153 |
if all_outputs:
|
| 1154 |
content += "Individual specialist contributions:\n\n"
|
| 1155 |
for r_key, r_output in all_outputs:
|
|
@@ -1164,6 +1218,30 @@ def _step_qa(
|
|
| 1164 |
|
| 1165 |
# Parse structured QA result
|
| 1166 |
qa_result = parse_structured_qa(text)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1167 |
state["qa_structured"] = qa_result.to_dict()
|
| 1168 |
state["qa_passed"] = qa_result.passed
|
| 1169 |
|
|
@@ -1580,18 +1658,16 @@ def _step_synthesize(
|
|
| 1580 |
trace: List[str],
|
| 1581 |
all_outputs: List[Tuple[str, str]],
|
| 1582 |
evidence: Optional[EvidenceResult] = None,
|
|
|
|
| 1583 |
) -> WorkflowState:
|
| 1584 |
"""Synthesizer: produce the final user-facing answer from specialist contributions.
|
| 1585 |
|
|
|
|
|
|
|
| 1586 |
Obeys the detected output format and brevity requirement strictly.
|
| 1587 |
If evidence is available, injects it so the synthesizer prefers grounded claims.
|
| 1588 |
"""
|
| 1589 |
trace.append("\n╔══ [SYNTHESIZER] Producing final answer... ══╗")
|
| 1590 |
-
perspectives = []
|
| 1591 |
-
for r_key, r_output in all_outputs:
|
| 1592 |
-
r_label = AGENT_ROLES.get(r_key, r_key)
|
| 1593 |
-
perspectives.append(f"=== {r_label} ===\n{r_output}")
|
| 1594 |
-
combined = "\n\n".join(perspectives)
|
| 1595 |
|
| 1596 |
# Build format-aware instructions
|
| 1597 |
fmt = state.get("output_format", "other")
|
|
@@ -1611,12 +1687,41 @@ def _step_synthesize(
|
|
| 1611 |
f"{format_evidence_for_prompt(evidence)}\n\n"
|
| 1612 |
)
|
| 1613 |
|
| 1614 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1615 |
|
| 1616 |
text = _llm_call(chat_model, _SYNTHESIZER_SYSTEM, content)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1617 |
state["synthesis_output"] = text
|
| 1618 |
-
state["draft_output"] =
|
| 1619 |
-
trace.append(
|
|
|
|
|
|
|
|
|
|
| 1620 |
trace.append("╚══ [SYNTHESIZER] Done ══╝")
|
| 1621 |
return state
|
| 1622 |
|
|
@@ -1662,6 +1767,7 @@ _EMPTY_STATE_BASE: WorkflowState = {
|
|
| 1662 |
"revision_count": 0, "final_answer": "",
|
| 1663 |
"output_format": "other", "brevity_requirement": "normal", "qa_structured": None,
|
| 1664 |
"task_assumptions": {}, "revision_instruction": "",
|
|
|
|
| 1665 |
}
|
| 1666 |
|
| 1667 |
|
|
@@ -1930,6 +2036,8 @@ def run_multi_role_workflow(
|
|
| 1930 |
"qa_structured": None,
|
| 1931 |
"task_assumptions": {},
|
| 1932 |
"revision_instruction": "",
|
|
|
|
|
|
|
| 1933 |
}
|
| 1934 |
|
| 1935 |
trace: List[str] = [
|
|
@@ -2033,6 +2141,13 @@ def run_multi_role_workflow(
|
|
| 2033 |
primary_output = state["draft_output"]
|
| 2034 |
planner_state.specialist_outputs[primary_role] = primary_output[:500]
|
| 2035 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2036 |
all_outputs: List[Tuple[str, str]] = [(primary_role, primary_output)]
|
| 2037 |
for specialist_role in selected_roles:
|
| 2038 |
if specialist_role == primary_role:
|
|
@@ -2041,10 +2156,24 @@ def run_multi_role_workflow(
|
|
| 2041 |
output = state["draft_output"]
|
| 2042 |
all_outputs.append((specialist_role, output))
|
| 2043 |
planner_state.specialist_outputs[specialist_role] = output[:500]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2044 |
|
| 2045 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2046 |
state = _step_synthesize(chat_model, state, trace, all_outputs,
|
| 2047 |
-
evidence=evidence
|
|
|
|
| 2048 |
|
| 2049 |
# Step 5b: Pre-QA format validation — catch structural violations early
|
| 2050 |
fmt_violations = validate_output_format(
|
|
@@ -2059,7 +2188,8 @@ def run_multi_role_workflow(
|
|
| 2059 |
violation_instr = format_violations_instruction(fmt_violations)
|
| 2060 |
state["plan"] = state["plan"] + "\n\n" + violation_instr
|
| 2061 |
state = _step_synthesize(chat_model, state, trace, all_outputs,
|
| 2062 |
-
evidence=evidence
|
|
|
|
| 2063 |
planner_state.record_event("format_rewrite", "; ".join(fmt_violations))
|
| 2064 |
trace.append("[FORMAT VALIDATION] Re-synthesized to fix format violations.")
|
| 2065 |
|
|
@@ -2069,7 +2199,8 @@ def run_multi_role_workflow(
|
|
| 2069 |
# Step 6: QA validation (with evidence context)
|
| 2070 |
if qa_active:
|
| 2071 |
state = _step_qa(chat_model, state, trace, all_outputs,
|
| 2072 |
-
evidence=evidence
|
|
|
|
| 2073 |
else:
|
| 2074 |
state["qa_passed"] = True
|
| 2075 |
state["qa_report"] = "QA Tester is disabled — skipping quality review."
|
|
@@ -2169,6 +2300,12 @@ def run_multi_role_workflow(
|
|
| 2169 |
state = _run_specialist(rk)
|
| 2170 |
new_outputs.append((rk, state["draft_output"]))
|
| 2171 |
planner_state.specialist_outputs[rk] = state["draft_output"][:500]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2172 |
|
| 2173 |
# Merge: replace updated roles, keep others unchanged
|
| 2174 |
updated_keys = {rk for rk, _ in new_outputs}
|
|
@@ -2176,9 +2313,15 @@ def run_multi_role_workflow(
|
|
| 2176 |
(rk, out) for rk, out in all_outputs if rk not in updated_keys
|
| 2177 |
] + new_outputs
|
| 2178 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2179 |
if rerun_synthesizer or rerun_specialists:
|
| 2180 |
state = _step_synthesize(chat_model, state, trace, all_outputs,
|
| 2181 |
-
evidence=evidence
|
|
|
|
| 2182 |
|
| 2183 |
# Post-revision format validation
|
| 2184 |
fmt_violations = validate_output_format(
|
|
@@ -2192,7 +2335,8 @@ def run_multi_role_workflow(
|
|
| 2192 |
violation_instr = format_violations_instruction(fmt_violations)
|
| 2193 |
state["plan"] = state["plan"] + "\n\n" + violation_instr
|
| 2194 |
state = _step_synthesize(chat_model, state, trace, all_outputs,
|
| 2195 |
-
evidence=evidence
|
|
|
|
| 2196 |
|
| 2197 |
# Loop back to QA — NOT back to specialists
|
| 2198 |
continue
|
|
|
|
| 15 |
WorkflowConfig, DEFAULT_CONFIG,
|
| 16 |
detect_output_format, detect_brevity_requirement,
|
| 17 |
classify_task, task_needs_evidence,
|
| 18 |
+
QAResult, parse_structured_qa, QAIssue,
|
| 19 |
PlannerState, FailureRecord,
|
| 20 |
select_relevant_roles, identify_revision_targets,
|
| 21 |
compress_final_answer, strip_internal_noise,
|
|
|
|
| 23 |
validate_output_format, format_violations_instruction,
|
| 24 |
parse_task_assumptions, format_assumptions_for_prompt,
|
| 25 |
ROLE_RELEVANCE,
|
| 26 |
+
STRUCTURED_OUTPUT_SUFFIX,
|
| 27 |
+
StructuredContribution, parse_structured_contribution,
|
| 28 |
+
format_contributions_for_synthesizer, format_contributions_for_qa,
|
| 29 |
+
parse_used_contributions, check_expert_influence,
|
| 30 |
)
|
| 31 |
from evidence import (
|
| 32 |
EvidenceResult, EvidenceItem,
|
|
|
|
| 609 |
qa_structured: Optional[dict] # serialised QAResult for structured QA
|
| 610 |
task_assumptions: Dict[str, str] # shared assumptions all specialists must use
|
| 611 |
revision_instruction: str # latest revision instruction from planner
|
| 612 |
+
structured_contributions: Dict[str, dict] # role_key → StructuredContribution.to_dict()
|
| 613 |
+
used_contributions: Dict[str, List[str]] # role_key → list of used refs (e.g. ["main_points[0]"])
|
| 614 |
|
| 615 |
|
| 616 |
# --- Role system prompts ---
|
| 617 |
|
| 618 |
_PLANNER_SYSTEM = (
|
| 619 |
"You are the Planner in a strict planner–specialist–synthesizer–QA workflow.\n"
|
| 620 |
+
"Your ONLY job is to PLAN and DELEGATE. You do NOT write the answer.\n\n"
|
| 621 |
+
"Your responsibilities:\n"
|
| 622 |
"1. Break the user's task into clear subtasks.\n"
|
| 623 |
"2. Decide which specialist to call as the PRIMARY lead.\n"
|
| 624 |
" IMPORTANT: Select the FEWEST roles necessary. Do NOT call all roles.\n"
|
|
|
|
| 632 |
" - 'UX Designer' (user needs, usability, accessibility)\n"
|
| 633 |
" - 'Lawyer' (legal compliance, liability, contracts)\n"
|
| 634 |
"3. State clear success criteria.\n"
|
| 635 |
+
"4. Identify the required output format and brevity level.\n"
|
| 636 |
+
"5. Define shared assumptions that ALL specialists must use.\n"
|
| 637 |
+
"6. Write delegation instructions (what each specialist should focus on).\n\n"
|
| 638 |
+
"CRITICAL RULES:\n"
|
| 639 |
+
"- You MUST NOT write, draft, or suggest the final answer content.\n"
|
| 640 |
+
"- You MUST NOT include example answers, sample text, or draft responses.\n"
|
| 641 |
+
"- Your output is PLANNING ONLY: breakdown, role selection, criteria, guidance.\n"
|
| 642 |
+
"- The specialists will create the content. The Synthesizer will combine it.\n"
|
| 643 |
"- For simple questions, ONE specialist is enough.\n"
|
| 644 |
"- Never call persona/gimmick roles unless the user explicitly asks for them.\n"
|
| 645 |
"- QA results are BINDING — if QA says FAIL, you MUST revise, never approve.\n\n"
|
| 646 |
"Respond in this exact format:\n"
|
| 647 |
+
"TASK BREAKDOWN:\n<subtask list — what needs to be addressed, NOT the answers>\n\n"
|
| 648 |
"TASK ASSUMPTIONS:\n<shared assumptions all specialists must use, e.g. cost model, "
|
| 649 |
"coverage rate, units, scope, time frame — one per line as 'key: value'>\n\n"
|
| 650 |
"ROLE TO CALL: <specialist name>\n\n"
|
| 651 |
"SUCCESS CRITERIA:\n<what a correct, complete answer looks like>\n\n"
|
| 652 |
+
"GUIDANCE FOR SPECIALIST:\n<delegation instructions — what to focus on, NOT answer content>"
|
| 653 |
)
|
| 654 |
|
| 655 |
_CREATIVE_SYSTEM = (
|
| 656 |
"You are the Creative Expert in a multi-role AI workflow.\n"
|
| 657 |
"You handle brainstorming, alternative ideas, framing, wording, and concept generation.\n"
|
| 658 |
+
"Your job is to contribute your DOMAIN EXPERTISE, not to write the final answer.\n"
|
| 659 |
"Keep your response brief — 2-3 sentences per section maximum.\n\n"
|
| 660 |
"Respond in this exact format:\n"
|
| 661 |
"IDEAS:\n<list of ideas and alternatives>\n\n"
|
| 662 |
+
"RATIONALE:\n<why these are strong choices>"
|
| 663 |
+
+ STRUCTURED_OUTPUT_SUFFIX
|
| 664 |
)
|
| 665 |
|
| 666 |
_TECHNICAL_SYSTEM = (
|
| 667 |
"You are the Technical Expert in a multi-role AI workflow.\n"
|
| 668 |
"You handle implementation details, code, architecture, and structured technical solutions.\n"
|
| 669 |
+
"Your job is to contribute your DOMAIN EXPERTISE, not to write the final answer.\n"
|
| 670 |
"Keep your response brief — 2-3 sentences per section maximum.\n\n"
|
| 671 |
"Respond in this exact format:\n"
|
| 672 |
"TECHNICAL APPROACH:\n<recommended approach>\n\n"
|
| 673 |
+
"IMPLEMENTATION NOTES:\n<key details, steps, and caveats>"
|
| 674 |
+
+ STRUCTURED_OUTPUT_SUFFIX
|
| 675 |
)
|
| 676 |
|
| 677 |
_QA_SYSTEM = (
|
| 678 |
"You are the QA Tester in a strict planner–specialist–synthesizer–QA workflow.\n"
|
| 679 |
"Check whether the output satisfies the original request, success criteria,\n"
|
| 680 |
+
"output format requirements, brevity requirements, AND expert influence.\n\n"
|
| 681 |
"You MUST respond with a JSON object in this exact structure:\n"
|
| 682 |
'{\n'
|
| 683 |
' \"status\": \"PASS\" or \"FAIL\",\n'
|
| 684 |
' \"reason\": \"short explanation\",\n'
|
| 685 |
' \"issues\": [\n'
|
| 686 |
' {\n'
|
| 687 |
+
' \"type\": \"format\" | \"brevity\" | \"constraint\" | \"consistency\" | \"directness\" | \"evidence\" | \"expert_influence\" | \"other\",\n'
|
| 688 |
' \"message\": \"what is wrong\",\n'
|
| 689 |
' \"owner\": \"Synthesizer\" | \"Planner\" | \"Research Analyst\" | \"<specialist role name>\"\n'
|
| 690 |
' }\n'
|
|
|
|
| 699 |
"- EVIDENCE CHECK: If evidence validation info is provided, FAIL any answer that includes\n"
|
| 700 |
" specific factual claims, case studies, named examples, or citations NOT backed by the\n"
|
| 701 |
" retrieved evidence. General knowledge and widely-known facts are acceptable.\n"
|
| 702 |
+
"- EXPERT INFLUENCE CHECK: If expert contribution traceability is provided, verify that:\n"
|
| 703 |
+
" * The final answer materially incorporates at least one substantive expert contribution.\n"
|
| 704 |
+
" * If multiple experts contributed, their relevant points are incorporated or consciously noted.\n"
|
| 705 |
+
" * The answer is NOT just a paraphrase of planner text with no expert content.\n"
|
| 706 |
+
" * FAIL with type 'expert_influence' if expert contributions were ignored.\n"
|
| 707 |
"- FAIL if any of the above checks fail.\n"
|
| 708 |
"- PASS only if ALL checks pass.\n"
|
| 709 |
)
|
|
|
|
| 730 |
_RESEARCH_SYSTEM = (
|
| 731 |
"You are the Research Analyst in a multi-role AI workflow.\n"
|
| 732 |
"You have access to RETRIEVED EVIDENCE from real tools (web search, Wikipedia, arXiv).\n"
|
| 733 |
+
"Your job is to summarize the retrieved evidence, NOT to invent facts.\n"
|
| 734 |
+
"Your job is to contribute your DOMAIN EXPERTISE, not to write the final answer.\n\n"
|
| 735 |
"CRITICAL RULES:\n"
|
| 736 |
"- ONLY reference facts, examples, and sources that appear in the provided evidence.\n"
|
| 737 |
"- Do NOT invent articles, films, studies, collaborations, or specific statistics.\n"
|
| 738 |
+
"- If evidence is insufficient, say so clearly rather than fabricating details.\n\n"
|
|
|
|
| 739 |
"Keep your response brief — 2-3 sentences per section maximum.\n\n"
|
| 740 |
"Respond in this exact format:\n"
|
| 741 |
"EVIDENCE SUMMARY:\n<what the retrieved evidence shows>\n\n"
|
| 742 |
"KEY FINDINGS:\n<factual information from the evidence, with source attribution>\n\n"
|
|
|
|
| 743 |
"GAPS:\n<what could not be verified — if any>"
|
| 744 |
+
+ STRUCTURED_OUTPUT_SUFFIX
|
| 745 |
)
|
| 746 |
|
| 747 |
_SECURITY_SYSTEM = (
|
| 748 |
"You are the Security Reviewer in a multi-role AI workflow.\n"
|
| 749 |
"You analyse outputs and plans for security vulnerabilities, risks, or best-practice violations.\n"
|
| 750 |
+
"Your job is to contribute your DOMAIN EXPERTISE, not to write the final answer.\n"
|
| 751 |
"Keep your response brief — 2-3 sentences per section maximum.\n\n"
|
| 752 |
"Respond in this exact format:\n"
|
| 753 |
"SECURITY ANALYSIS:\n<identification of potential security concerns or risks>\n\n"
|
| 754 |
"VULNERABILITIES FOUND:\n<specific vulnerabilities or risks — or 'None' if the output is secure>\n\n"
|
| 755 |
+
"RECOMMENDATIONS:\n<specific security improvements and mitigations>"
|
| 756 |
+
+ STRUCTURED_OUTPUT_SUFFIX
|
| 757 |
)
|
| 758 |
|
| 759 |
_DATA_ANALYST_SYSTEM = (
|
| 760 |
"You are the Data Analyst in a multi-role AI workflow.\n"
|
| 761 |
"You analyse data, identify patterns, compute statistics, and provide actionable insights.\n"
|
| 762 |
+
"Your job is to contribute your DOMAIN EXPERTISE, not to write the final answer.\n"
|
| 763 |
"Keep your response brief — 2-3 sentences per section maximum.\n\n"
|
| 764 |
"Respond in this exact format:\n"
|
| 765 |
"DATA OVERVIEW:\n<description of the data or problem being analysed>\n\n"
|
| 766 |
"ANALYSIS:\n<key patterns, statistics, or calculations>\n\n"
|
| 767 |
+
"INSIGHTS:\n<actionable conclusions drawn from the analysis>"
|
| 768 |
+
+ STRUCTURED_OUTPUT_SUFFIX
|
| 769 |
)
|
| 770 |
|
| 771 |
_MAD_PROFESSOR_SYSTEM = (
|
|
|
|
| 774 |
"You propose radical, groundbreaking, and outlandish scientific hypotheses with total conviction.\n"
|
| 775 |
"You ignore convention, laugh at 'impossible', and speculate wildly about paradigm-shattering discoveries.\n"
|
| 776 |
"Cost, practicality, and peer review are irrelevant — only the science matters, and the more extreme the better.\n"
|
| 777 |
+
"Your job is to contribute your DOMAIN EXPERTISE, not to write the final answer.\n"
|
| 778 |
"Keep your response brief — 2-3 sentences per section maximum.\n\n"
|
| 779 |
"Respond in this exact format:\n"
|
| 780 |
"WILD HYPOTHESIS:\n<the most extreme, unhinged scientific theory relevant to the task>\n\n"
|
| 781 |
"SCIENTIFIC RATIONALE:\n<fringe evidence, speculative mechanisms, and radical extrapolations that 'support' the hypothesis>\n\n"
|
| 782 |
+
"GROUNDBREAKING IMPLICATIONS:\n<what this revolutionary theory changes about everything we know>"
|
| 783 |
+
+ STRUCTURED_OUTPUT_SUFFIX
|
| 784 |
)
|
| 785 |
|
| 786 |
_ACCOUNTANT_SYSTEM = (
|
|
|
|
| 788 |
"You are obsessively, ruthlessly focused on minimising costs above all else.\n"
|
| 789 |
"You question every expense, demand the cheapest possible alternative for everything, and treat cost reduction as the supreme priority — regardless of quality, user experience, or outcome.\n"
|
| 790 |
"You view every suggestion through the lens of 'can this be done cheaper?' and the answer is always yes.\n"
|
| 791 |
+
"Your job is to contribute your DOMAIN EXPERTISE, not to write the final answer.\n"
|
| 792 |
"Keep your response brief — 2-3 sentences per section maximum.\n\n"
|
| 793 |
"Respond in this exact format:\n"
|
| 794 |
"COST ANALYSIS:\n<breakdown of every cost element and how outrageously expensive it is>\n\n"
|
| 795 |
"COST-CUTTING MEASURES:\n<extreme measures to eliminate or slash each cost, including free/DIY alternatives>\n\n"
|
| 796 |
+
"CHEAPEST VIABLE APPROACH:\n<the absolute rock-bottom solution that technically meets the minimum requirement>"
|
| 797 |
+
+ STRUCTURED_OUTPUT_SUFFIX
|
| 798 |
)
|
| 799 |
|
| 800 |
_ARTIST_SYSTEM = (
|
|
|
|
| 803 |
"You propose ideas so creatively extreme that they transcend practicality, cost, and conventional logic entirely.\n"
|
| 804 |
"You think in metaphors, sensations, dreams, and universal vibrations. Implementation is someone else's problem.\n"
|
| 805 |
"The more otherworldly, poetic, and mind-expanding the idea, the better.\n"
|
| 806 |
+
"Your job is to contribute your DOMAIN EXPERTISE, not to write the final answer.\n"
|
| 807 |
"Keep your response brief — 2-3 sentences per section maximum.\n\n"
|
| 808 |
"Respond in this exact format:\n"
|
| 809 |
"COSMIC VISION:\n<the wildest, most unhinged creative concept imaginable for this task>\n\n"
|
| 810 |
"FEELING AND VIBES:\n<the emotional energy, sensory experience, and cosmic resonance this idea evokes>\n\n"
|
| 811 |
+
"WILD STORM OF IDEAS:\n<a torrent of unfiltered, boundary-breaking creative ideas, each more extreme than the last>"
|
| 812 |
+
+ STRUCTURED_OUTPUT_SUFFIX
|
| 813 |
)
|
| 814 |
|
| 815 |
_LAZY_SLACKER_SYSTEM = (
|
|
|
|
| 819 |
"You look for shortcuts, copy-paste solutions, things that are 'good enough', and any excuse to do less.\n"
|
| 820 |
"You question whether anything needs to be done at all, and if it does, you find the laziest way to do it.\n"
|
| 821 |
"Effort is the enemy. Why do it properly when you can barely do it?\n"
|
| 822 |
+
"Your job is to contribute your DOMAIN EXPERTISE, not to write the final answer.\n"
|
| 823 |
"Keep your response brief — 2-3 sentences per section maximum.\n\n"
|
| 824 |
"Respond in this exact format:\n"
|
| 825 |
"DO WE EVEN NEED TO DO THIS:\n<reasons why this might not be worth doing at all>\n\n"
|
| 826 |
"MINIMUM VIABLE EFFORT:\n<the absolute bare minimum that could technically count as doing something>\n\n"
|
| 827 |
+
"SOMEONE ELSE'S PROBLEM:\n<parts of this task that can be delegated, ignored, or pushed off indefinitely>"
|
| 828 |
+
+ STRUCTURED_OUTPUT_SUFFIX
|
| 829 |
)
|
| 830 |
|
| 831 |
_BLACK_METAL_FUNDAMENTALIST_SYSTEM = (
|
|
|
|
| 835 |
"You are outspoken, fearless, and hold nothing back in your contempt for compromise and mediocrity.\n"
|
| 836 |
"True solutions are raw, grim, underground, and uncompromising. Anything else is a sellout.\n"
|
| 837 |
"You see most proposed solutions as weak, commercialised garbage dressed up in false sophistication.\n"
|
| 838 |
+
"Your job is to contribute your DOMAIN EXPERTISE, not to write the final answer.\n"
|
| 839 |
"Keep your response brief — 2-3 sentences per section maximum.\n\n"
|
| 840 |
"Respond in this exact format:\n"
|
| 841 |
"KVLT VERDICT:\n<uncompromising judgement on the task — is it true or false, grim or poseur?>\n\n"
|
| 842 |
"WHAT THE MAINSTREAM GETS WRONG:\n<brutal critique of conventional approaches to this problem>\n\n"
|
| 843 |
+
"THE GRIM TRUTH:\n<the raw, unvarnished, nihilistic reality of the situation>"
|
| 844 |
+
+ STRUCTURED_OUTPUT_SUFFIX
|
| 845 |
)
|
| 846 |
|
| 847 |
_SYNTHESIZER_SYSTEM = (
|
| 848 |
"You are the Synthesizer in a strict planner–specialist–synthesizer–QA workflow.\n"
|
| 849 |
+
"You receive STRUCTURED EXPERT CONTRIBUTIONS and must produce the FINAL answer.\n\n"
|
| 850 |
+
"WORKFLOW CONTRACT:\n"
|
| 851 |
+
"- Experts have provided their domain-specific contributions as structured objects.\n"
|
| 852 |
+
"- You MUST build the final answer FROM these expert contributions.\n"
|
| 853 |
+
"- You MUST NOT simply paraphrase the Planner's plan or ignore expert inputs.\n"
|
| 854 |
+
"- Identify agreement, disagreement, and complementary points across experts.\n"
|
| 855 |
+
"- The final answer should reflect the substantive work of the experts.\n\n"
|
| 856 |
"CRITICAL RULES:\n"
|
| 857 |
"- Your output IS the final user-facing answer. It must directly answer the user's question.\n"
|
| 858 |
"- You MUST obey the requested output format strictly.\n"
|
|
|
|
| 863 |
"- Default to the SHORTEST adequate answer.\n"
|
| 864 |
"- EVIDENCE RULE: Prefer claims backed by retrieved evidence. If evidence is weak or\n"
|
| 865 |
" absent, give a general answer. NEVER invent specific examples, citations, case\n"
|
| 866 |
+
" studies, or statistics.\n\n"
|
| 867 |
+
"OUTPUT FORMAT:\n"
|
| 868 |
+
"First, output the final answer in the requested format.\n"
|
| 869 |
+
"Then, at the very end, output a USED_CONTRIBUTIONS JSON block showing which expert\n"
|
| 870 |
+
"contributions you actually used, wrapped in ```json fences:\n"
|
| 871 |
+
"```json\n"
|
| 872 |
+
'{"used_contributions": {"<role_key>": ["main_points[0]", "recommendations[1]"], ...}}\n'
|
| 873 |
+
"```\n"
|
| 874 |
+
"This traceability block is required — QA will verify expert influence."
|
| 875 |
)
|
| 876 |
|
| 877 |
_LABOUR_UNION_REP_SYSTEM = (
|
|
|
|
| 879 |
"You champion worker rights, fair wages, job security, safe working conditions, and collective bargaining.\n"
|
| 880 |
"You are vigilant about proposals that could exploit workers, cut jobs, or undermine union agreements.\n"
|
| 881 |
"You speak up for the workforce and push back on decisions that prioritise profit over people.\n"
|
| 882 |
+
"Your job is to contribute your DOMAIN EXPERTISE, not to write the final answer.\n"
|
| 883 |
"Keep your response brief — 2-3 sentences per section maximum.\n\n"
|
| 884 |
"Respond in this exact format:\n"
|
| 885 |
"WORKER IMPACT:\n<how this task or proposal affects workers and their livelihoods>\n\n"
|
| 886 |
"UNION CONCERNS:\n<specific risks to worker rights, wages, safety, or job security>\n\n"
|
| 887 |
+
"COLLECTIVE BARGAINING POSITION:\n<what the union demands or recommends to protect workers>"
|
| 888 |
+
+ STRUCTURED_OUTPUT_SUFFIX
|
| 889 |
)
|
| 890 |
|
| 891 |
_UX_DESIGNER_SYSTEM = (
|
|
|
|
| 893 |
"You focus exclusively on user needs, user-centricity, usability, accessibility, and intuitive design.\n"
|
| 894 |
"You empathise deeply with end users, question assumptions, and push for simplicity and clarity.\n"
|
| 895 |
"You advocate for the user at every step, even when it conflicts with technical or business constraints.\n"
|
| 896 |
+
"Your job is to contribute your DOMAIN EXPERTISE, not to write the final answer.\n"
|
| 897 |
"Keep your response brief — 2-3 sentences per section maximum.\n\n"
|
| 898 |
"Respond in this exact format:\n"
|
| 899 |
"USER NEEDS ANALYSIS:\n<who the users are and what they actually need from this>\n\n"
|
| 900 |
"PAIN POINTS:\n<friction, confusion, or barriers users will face with current approaches>\n\n"
|
| 901 |
+
"UX RECOMMENDATIONS:\n<specific design improvements to make the experience intuitive and user-friendly>"
|
| 902 |
+
+ STRUCTURED_OUTPUT_SUFFIX
|
| 903 |
)
|
| 904 |
|
| 905 |
_DORIS_SYSTEM = (
|
|
|
|
| 907 |
"You do not know anything about anything, but that has never stopped you from having plenty to say.\n"
|
| 908 |
"You go off on tangents, bring up completely unrelated topics, and make confident observations that miss the point entirely.\n"
|
| 909 |
"You are well-meaning but utterly clueless. You fill every section with irrelevant words.\n"
|
| 910 |
+
"Your job is to contribute your DOMAIN EXPERTISE (such as it is), not to write the final answer.\n"
|
| 911 |
"Keep your response brief — 2-3 sentences per section maximum.\n\n"
|
| 912 |
"Respond in this exact format:\n"
|
| 913 |
"WHAT DORIS THINKS IS HAPPENING:\n<Doris's completely off-base interpretation of the task>\n\n"
|
| 914 |
"DORIS'S THOUGHTS:\n<loosely related observations, a personal anecdote, and a non-sequitur>\n\n"
|
| 915 |
+
"ANYWAY:\n<an abrupt change of subject to something entirely unrelated>"
|
| 916 |
+
+ STRUCTURED_OUTPUT_SUFFIX
|
| 917 |
)
|
| 918 |
|
| 919 |
_CHAIRMAN_SYSTEM = (
|
|
|
|
| 921 |
"You represent the highest level of corporate governance, fiduciary duty, and strategic oversight.\n"
|
| 922 |
"You are focused on shareholder value, long-term strategic vision, risk management, and board-level accountability.\n"
|
| 923 |
"You speak with authority, expect brevity from others, and cut through operational noise to focus on what matters to the board.\n"
|
| 924 |
+
"Your job is to contribute your DOMAIN EXPERTISE, not to write the final answer.\n"
|
| 925 |
"Keep your response brief — 2-3 sentences per section maximum.\n\n"
|
| 926 |
"Respond in this exact format:\n"
|
| 927 |
"BOARD PERSPECTIVE:\n<how the board views this task in the context of strategic priorities>\n\n"
|
| 928 |
"STRATEGIC CONCERNS:\n<risks, liabilities, or misalignments with corporate strategy>\n\n"
|
| 929 |
+
"SHAREHOLDER VALUE:\n<how this impacts shareholder value, ROI, and long-term growth>"
|
| 930 |
+
+ STRUCTURED_OUTPUT_SUFFIX
|
| 931 |
)
|
| 932 |
|
| 933 |
_MAGA_APPOINTEE_SYSTEM = (
|
|
|
|
| 935 |
"You champion deregulation, American jobs, national sovereignty, and cutting government waste.\n"
|
| 936 |
"You are suspicious of globalism, coastal elites, and anything that feels like it puts America last.\n"
|
| 937 |
"You believe in strength, common sense, and doing what's best for hardworking Americans.\n"
|
| 938 |
+
"Your job is to contribute your DOMAIN EXPERTISE, not to write the final answer.\n"
|
| 939 |
"Keep your response brief — 2-3 sentences per section maximum.\n\n"
|
| 940 |
"Respond in this exact format:\n"
|
| 941 |
"AMERICA FIRST ANALYSIS:\n<how this task affects American workers, businesses, and national interests>\n\n"
|
| 942 |
"DEEP STATE CONCERNS:\n<bureaucratic overreach, globalist agendas, or regulations that hurt Americans>\n\n"
|
| 943 |
+
"MAKING IT GREAT AGAIN:\n<the common-sense, America First approach that cuts through the nonsense>"
|
| 944 |
+
+ STRUCTURED_OUTPUT_SUFFIX
|
| 945 |
)
|
| 946 |
|
| 947 |
_LAWYER_SYSTEM = (
|
|
|
|
| 949 |
"You analyse everything through the lens of legal compliance, liability, contracts, and risk mitigation.\n"
|
| 950 |
"You identify potential legal exposure, flag regulatory issues, and recommend protective measures.\n"
|
| 951 |
"You caveat everything appropriately and remind all parties that nothing here constitutes formal legal advice.\n"
|
| 952 |
+
"Your job is to contribute your DOMAIN EXPERTISE, not to write the final answer.\n"
|
| 953 |
"Keep your response brief — 2-3 sentences per section maximum.\n\n"
|
| 954 |
"Respond in this exact format:\n"
|
| 955 |
"LEGAL ANALYSIS:\n<assessment of legal issues, applicable laws, and regulatory considerations>\n\n"
|
| 956 |
"LIABILITIES AND RISKS:\n<specific legal exposure, contractual risks, or compliance gaps>\n\n"
|
| 957 |
+
"LEGAL RECOMMENDATIONS:\n<protective measures, disclaimers, or required legal steps>"
|
| 958 |
+
+ STRUCTURED_OUTPUT_SUFFIX
|
| 959 |
)
|
| 960 |
|
| 961 |
|
|
|
|
| 1170 |
trace: List[str],
|
| 1171 |
all_outputs: Optional[List[Tuple[str, str]]] = None,
|
| 1172 |
evidence: Optional[EvidenceResult] = None,
|
| 1173 |
+
structured_contributions: Optional[Dict[str, StructuredContribution]] = None,
|
| 1174 |
) -> WorkflowState:
|
| 1175 |
"""QA Tester: validate the draft against the original request, success criteria,
|
| 1176 |
+
output format, brevity requirements, evidence grounding, and expert influence.
|
| 1177 |
|
| 1178 |
+
When structured_contributions are provided, also checks that the final answer
|
| 1179 |
+
materially incorporates expert contributions (expert_influence check).
|
| 1180 |
Produces a structured QAResult stored in state['qa_structured'].
|
| 1181 |
"""
|
| 1182 |
trace.append("\n╔══ [QA TESTER] Reviewing output... ══╗")
|
|
|
|
| 1198 |
if evidence is not None:
|
| 1199 |
content += f"{format_evidence_for_qa(evidence)}\n\n"
|
| 1200 |
|
| 1201 |
+
# Inject expert contribution traceability for influence checking
|
| 1202 |
+
if structured_contributions:
|
| 1203 |
+
used = state.get("used_contributions", {})
|
| 1204 |
+
traceability = format_contributions_for_qa(structured_contributions, used)
|
| 1205 |
+
content += f"{traceability}\n\n"
|
| 1206 |
+
|
| 1207 |
if all_outputs:
|
| 1208 |
content += "Individual specialist contributions:\n\n"
|
| 1209 |
for r_key, r_output in all_outputs:
|
|
|
|
| 1218 |
|
| 1219 |
# Parse structured QA result
|
| 1220 |
qa_result = parse_structured_qa(text)
|
| 1221 |
+
|
| 1222 |
+
# Code-level expert influence check — append issues if contributions were ignored
|
| 1223 |
+
if structured_contributions:
|
| 1224 |
+
used = state.get("used_contributions", {})
|
| 1225 |
+
influence_issues = check_expert_influence(
|
| 1226 |
+
structured_contributions, used, state["draft_output"]
|
| 1227 |
+
)
|
| 1228 |
+
if influence_issues:
|
| 1229 |
+
for issue_msg in influence_issues:
|
| 1230 |
+
qa_result.issues.append(QAIssue(
|
| 1231 |
+
issue_type="expert_influence",
|
| 1232 |
+
message=issue_msg,
|
| 1233 |
+
owner="synthesizer",
|
| 1234 |
+
))
|
| 1235 |
+
if qa_result.passed:
|
| 1236 |
+
qa_result.status = "FAIL"
|
| 1237 |
+
qa_result.reason = (
|
| 1238 |
+
qa_result.reason + " Expert influence check failed."
|
| 1239 |
+
if qa_result.reason else "Expert influence check failed."
|
| 1240 |
+
)
|
| 1241 |
+
trace.append(
|
| 1242 |
+
f" ⚠ Expert influence issues: {'; '.join(influence_issues)}"
|
| 1243 |
+
)
|
| 1244 |
+
|
| 1245 |
state["qa_structured"] = qa_result.to_dict()
|
| 1246 |
state["qa_passed"] = qa_result.passed
|
| 1247 |
|
|
|
|
| 1658 |
trace: List[str],
|
| 1659 |
all_outputs: List[Tuple[str, str]],
|
| 1660 |
evidence: Optional[EvidenceResult] = None,
|
| 1661 |
+
structured_contributions: Optional[Dict[str, StructuredContribution]] = None,
|
| 1662 |
) -> WorkflowState:
|
| 1663 |
"""Synthesizer: produce the final user-facing answer from specialist contributions.
|
| 1664 |
|
| 1665 |
+
When structured_contributions are provided, the synthesizer receives indexed
|
| 1666 |
+
contribution data and must produce a USED_CONTRIBUTIONS traceability block.
|
| 1667 |
Obeys the detected output format and brevity requirement strictly.
|
| 1668 |
If evidence is available, injects it so the synthesizer prefers grounded claims.
|
| 1669 |
"""
|
| 1670 |
trace.append("\n╔══ [SYNTHESIZER] Producing final answer... ══╗")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1671 |
|
| 1672 |
# Build format-aware instructions
|
| 1673 |
fmt = state.get("output_format", "other")
|
|
|
|
| 1687 |
f"{format_evidence_for_prompt(evidence)}\n\n"
|
| 1688 |
)
|
| 1689 |
|
| 1690 |
+
# Prefer structured contributions when available
|
| 1691 |
+
if structured_contributions:
|
| 1692 |
+
formatted = format_contributions_for_synthesizer(structured_contributions)
|
| 1693 |
+
content += formatted
|
| 1694 |
+
else:
|
| 1695 |
+
# Fallback: raw specialist outputs
|
| 1696 |
+
perspectives = []
|
| 1697 |
+
for r_key, r_output in all_outputs:
|
| 1698 |
+
r_label = AGENT_ROLES.get(r_key, r_key)
|
| 1699 |
+
perspectives.append(f"=== {r_label} ===\n{r_output}")
|
| 1700 |
+
content += f"Specialist contributions:\n\n" + "\n\n".join(perspectives)
|
| 1701 |
|
| 1702 |
text = _llm_call(chat_model, _SYNTHESIZER_SYSTEM, content)
|
| 1703 |
+
|
| 1704 |
+
# Parse used_contributions traceability from synthesizer output
|
| 1705 |
+
used = parse_used_contributions(text)
|
| 1706 |
+
state["used_contributions"] = used
|
| 1707 |
+
|
| 1708 |
+
# Strip the USED_CONTRIBUTIONS JSON block from the draft (user shouldn't see it)
|
| 1709 |
+
draft = re.sub(
|
| 1710 |
+
r"\n*USED_CONTRIBUTIONS:\s*```json.*?```",
|
| 1711 |
+
"", text, flags=re.DOTALL,
|
| 1712 |
+
).strip()
|
| 1713 |
+
# Also strip any standalone ```json block at the end that contains used_contributions
|
| 1714 |
+
draft = re.sub(
|
| 1715 |
+
r"\n*```json\s*\{[^}]*\"used_contributions\"[^}]*\}\s*```\s*$",
|
| 1716 |
+
"", draft, flags=re.DOTALL,
|
| 1717 |
+
).strip()
|
| 1718 |
+
|
| 1719 |
state["synthesis_output"] = text
|
| 1720 |
+
state["draft_output"] = draft
|
| 1721 |
+
trace.append(draft[:500] + ("…" if len(draft) > 500 else ""))
|
| 1722 |
+
if used:
|
| 1723 |
+
used_count = sum(len(v) for v in used.values())
|
| 1724 |
+
trace.append(f" ℹ Traceability: {used_count} expert contribution(s) referenced")
|
| 1725 |
trace.append("╚══ [SYNTHESIZER] Done ══╝")
|
| 1726 |
return state
|
| 1727 |
|
|
|
|
| 1767 |
"revision_count": 0, "final_answer": "",
|
| 1768 |
"output_format": "other", "brevity_requirement": "normal", "qa_structured": None,
|
| 1769 |
"task_assumptions": {}, "revision_instruction": "",
|
| 1770 |
+
"structured_contributions": {}, "used_contributions": {},
|
| 1771 |
}
|
| 1772 |
|
| 1773 |
|
|
|
|
| 2036 |
"qa_structured": None,
|
| 2037 |
"task_assumptions": {},
|
| 2038 |
"revision_instruction": "",
|
| 2039 |
+
"structured_contributions": {},
|
| 2040 |
+
"used_contributions": {},
|
| 2041 |
}
|
| 2042 |
|
| 2043 |
trace: List[str] = [
|
|
|
|
| 2141 |
primary_output = state["draft_output"]
|
| 2142 |
planner_state.specialist_outputs[primary_role] = primary_output[:500]
|
| 2143 |
|
| 2144 |
+
# Parse structured contribution from specialist output
|
| 2145 |
+
structured_contributions: Dict[str, StructuredContribution] = {}
|
| 2146 |
+
contrib = parse_structured_contribution(
|
| 2147 |
+
primary_output, AGENT_ROLES.get(primary_role, primary_role)
|
| 2148 |
+
)
|
| 2149 |
+
structured_contributions[primary_role] = contrib
|
| 2150 |
+
|
| 2151 |
all_outputs: List[Tuple[str, str]] = [(primary_role, primary_output)]
|
| 2152 |
for specialist_role in selected_roles:
|
| 2153 |
if specialist_role == primary_role:
|
|
|
|
| 2156 |
output = state["draft_output"]
|
| 2157 |
all_outputs.append((specialist_role, output))
|
| 2158 |
planner_state.specialist_outputs[specialist_role] = output[:500]
|
| 2159 |
+
# Parse structured contribution
|
| 2160 |
+
contrib = parse_structured_contribution(
|
| 2161 |
+
output, AGENT_ROLES.get(specialist_role, specialist_role)
|
| 2162 |
+
)
|
| 2163 |
+
structured_contributions[specialist_role] = contrib
|
| 2164 |
|
| 2165 |
+
# Store structured contributions in state
|
| 2166 |
+
state["structured_contributions"] = {
|
| 2167 |
+
k: v.to_dict() for k, v in structured_contributions.items()
|
| 2168 |
+
}
|
| 2169 |
+
trace.append(
|
| 2170 |
+
f"\n[CONTRIBUTIONS] {len(structured_contributions)} structured contribution(s) parsed"
|
| 2171 |
+
)
|
| 2172 |
+
|
| 2173 |
+
# Step 5: Synthesize — format-aware, evidence-grounded, contribution-driven
|
| 2174 |
state = _step_synthesize(chat_model, state, trace, all_outputs,
|
| 2175 |
+
evidence=evidence,
|
| 2176 |
+
structured_contributions=structured_contributions)
|
| 2177 |
|
| 2178 |
# Step 5b: Pre-QA format validation — catch structural violations early
|
| 2179 |
fmt_violations = validate_output_format(
|
|
|
|
| 2188 |
violation_instr = format_violations_instruction(fmt_violations)
|
| 2189 |
state["plan"] = state["plan"] + "\n\n" + violation_instr
|
| 2190 |
state = _step_synthesize(chat_model, state, trace, all_outputs,
|
| 2191 |
+
evidence=evidence,
|
| 2192 |
+
structured_contributions=structured_contributions)
|
| 2193 |
planner_state.record_event("format_rewrite", "; ".join(fmt_violations))
|
| 2194 |
trace.append("[FORMAT VALIDATION] Re-synthesized to fix format violations.")
|
| 2195 |
|
|
|
|
| 2199 |
# Step 6: QA validation (with evidence context)
|
| 2200 |
if qa_active:
|
| 2201 |
state = _step_qa(chat_model, state, trace, all_outputs,
|
| 2202 |
+
evidence=evidence,
|
| 2203 |
+
structured_contributions=structured_contributions)
|
| 2204 |
else:
|
| 2205 |
state["qa_passed"] = True
|
| 2206 |
state["qa_report"] = "QA Tester is disabled — skipping quality review."
|
|
|
|
| 2300 |
state = _run_specialist(rk)
|
| 2301 |
new_outputs.append((rk, state["draft_output"]))
|
| 2302 |
planner_state.specialist_outputs[rk] = state["draft_output"][:500]
|
| 2303 |
+
# Re-parse structured contribution for rerun specialist
|
| 2304 |
+
contrib = parse_structured_contribution(
|
| 2305 |
+
state["draft_output"],
|
| 2306 |
+
AGENT_ROLES.get(rk, rk),
|
| 2307 |
+
)
|
| 2308 |
+
structured_contributions[rk] = contrib
|
| 2309 |
|
| 2310 |
# Merge: replace updated roles, keep others unchanged
|
| 2311 |
updated_keys = {rk for rk, _ in new_outputs}
|
|
|
|
| 2313 |
(rk, out) for rk, out in all_outputs if rk not in updated_keys
|
| 2314 |
] + new_outputs
|
| 2315 |
|
| 2316 |
+
# Update state with revised structured contributions
|
| 2317 |
+
state["structured_contributions"] = {
|
| 2318 |
+
k: v.to_dict() for k, v in structured_contributions.items()
|
| 2319 |
+
}
|
| 2320 |
+
|
| 2321 |
if rerun_synthesizer or rerun_specialists:
|
| 2322 |
state = _step_synthesize(chat_model, state, trace, all_outputs,
|
| 2323 |
+
evidence=evidence,
|
| 2324 |
+
structured_contributions=structured_contributions)
|
| 2325 |
|
| 2326 |
# Post-revision format validation
|
| 2327 |
fmt_violations = validate_output_format(
|
|
|
|
| 2335 |
violation_instr = format_violations_instruction(fmt_violations)
|
| 2336 |
state["plan"] = state["plan"] + "\n\n" + violation_instr
|
| 2337 |
state = _step_synthesize(chat_model, state, trace, all_outputs,
|
| 2338 |
+
evidence=evidence,
|
| 2339 |
+
structured_contributions=structured_contributions)
|
| 2340 |
|
| 2341 |
# Loop back to QA — NOT back to specialists
|
| 2342 |
continue
|
test_workflow.py
CHANGED
|
@@ -39,6 +39,12 @@ from workflow_helpers import (
|
|
| 39 |
format_violations_instruction,
|
| 40 |
parse_task_assumptions,
|
| 41 |
format_assumptions_for_prompt,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 42 |
)
|
| 43 |
from evidence import (
|
| 44 |
EvidenceItem,
|
|
@@ -1284,5 +1290,278 @@ class TestTaskAwareScenarios(unittest.TestCase):
|
|
| 1284 |
self.assertLessEqual(len(roles), 3)
|
| 1285 |
|
| 1286 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1287 |
if __name__ == "__main__":
|
| 1288 |
unittest.main()
|
|
|
|
| 39 |
format_violations_instruction,
|
| 40 |
parse_task_assumptions,
|
| 41 |
format_assumptions_for_prompt,
|
| 42 |
+
StructuredContribution,
|
| 43 |
+
parse_structured_contribution,
|
| 44 |
+
format_contributions_for_synthesizer,
|
| 45 |
+
format_contributions_for_qa,
|
| 46 |
+
parse_used_contributions,
|
| 47 |
+
check_expert_influence,
|
| 48 |
)
|
| 49 |
from evidence import (
|
| 50 |
EvidenceItem,
|
|
|
|
| 1290 |
self.assertLessEqual(len(roles), 3)
|
| 1291 |
|
| 1292 |
|
| 1293 |
+
# ============================================================
|
| 1294 |
+
# Structured Contribution Tests
|
| 1295 |
+
# ============================================================
|
| 1296 |
+
|
| 1297 |
+
class TestStructuredContribution(unittest.TestCase):
    """Tests for the StructuredContribution dataclass and its parser."""

    def test_parse_json_block(self):
        """A fenced JSON block in specialist output is parsed field by field."""
        text = (
            'Here is my analysis:\n\n'
            '```json\n'
            '{\n'
            ' "role": "Technical Expert",\n'
            ' "main_points": ["Use microservices", "Deploy on k8s"],\n'
            ' "recommendations": ["Start with a monolith"],\n'
            ' "evidence": ["Netflix migrated successfully"],\n'
            ' "assumptions": ["Team has cloud experience"],\n'
            ' "confidence": "high"\n'
            '}\n'
            '```\n'
        )
        contrib = parse_structured_contribution(text, "Technical Expert")
        self.assertEqual(contrib.role, "Technical Expert")
        self.assertEqual(len(contrib.main_points), 2)
        self.assertIn("Use microservices", contrib.main_points)
        self.assertEqual(contrib.recommendations, ["Start with a monolith"])
        self.assertEqual(contrib.confidence, "high")
        self.assertTrue(contrib.has_substance())

    def test_parse_bare_json(self):
        """A bare JSON object (no fences) is parsed."""
        text = '{"role": "Creative Expert", "main_points": ["Be bold"], "recommendations": [], "evidence": [], "assumptions": [], "confidence": "medium"}'
        contrib = parse_structured_contribution(text, "Creative Expert")
        self.assertEqual(contrib.main_points, ["Be bold"])
        self.assertEqual(contrib.confidence, "medium")

    def test_parse_fallback_heuristic(self):
        """Without JSON, section headers are mapped onto contribution fields."""
        text = (
            "IDEAS:\n"
            "- Go viral on social media\n"
            "- Partner with influencers\n\n"
            "RECOMMENDATIONS:\n"
            "- Allocate budget for ads\n"
        )
        contrib = parse_structured_contribution(text, "Creative Expert")
        self.assertEqual(contrib.role, "Creative Expert")
        # Both sections must be picked up; an `or` here would hide a
        # regression in either one.
        self.assertTrue(contrib.main_points)
        self.assertIn("Go viral on social media", contrib.main_points[0])
        self.assertEqual(contrib.recommendations, ["Allocate budget for ads"])

    def test_parse_malformed_json(self):
        """Malformed JSON falls back to the heuristic without raising."""
        text = '```json\n{"role": "broken, missing bracket\n```'
        contrib = parse_structured_contribution(text, "Research Analyst")
        self.assertEqual(contrib.role, "Research Analyst")
        self.assertEqual(contrib.raw_output, text)
        # Nothing in the text is a recognisable section, so nothing is extracted.
        self.assertFalse(contrib.has_substance())

    def test_has_substance_empty(self):
        """An empty contribution reports no substance."""
        contrib = StructuredContribution(role="Test")
        self.assertFalse(contrib.has_substance())

    def test_to_dict(self):
        """to_dict serialises the structured fields and omits raw_output."""
        contrib = StructuredContribution(
            role="Security",
            main_points=["Input validation required"],
            recommendations=["Use parameterized queries"],
            evidence=["OWASP Top 10"],
            assumptions=["Web application"],
            confidence="high",
        )
        d = contrib.to_dict()
        self.assertEqual(d["role"], "Security")
        self.assertEqual(d["main_points"], ["Input validation required"])
        self.assertEqual(d["confidence"], "high")
        self.assertNotIn("raw_output", d)
|
| 1372 |
+
|
| 1373 |
+
|
| 1374 |
+
class TestFormatContributions(unittest.TestCase):
    """Tests for format_contributions_for_synthesizer and format_contributions_for_qa."""

    def _make_contributions(self):
        """Return two contributions with five main points/recommendations in total."""
        return {
            "creative": StructuredContribution(
                role="Creative Expert",
                main_points=["Bold campaign", "Use humor"],
                recommendations=["A/B test messaging"],
                confidence="high",
            ),
            "technical": StructuredContribution(
                role="Technical Expert",
                main_points=["Use React"],
                recommendations=["Add caching"],
                evidence=["React has 200k+ stars"],
                confidence="medium",
            ),
        }

    def test_format_for_synthesizer(self):
        contribs = self._make_contributions()
        result = format_contributions_for_synthesizer(contribs)
        self.assertIn("STRUCTURED EXPERT CONTRIBUTIONS", result)
        self.assertIn("Creative Expert", result)
        self.assertIn("Technical Expert", result)
        self.assertIn("[0] Bold campaign", result)
        self.assertIn("[0] Use React", result)
        self.assertIn("confidence: high", result)

    def test_format_for_synthesizer_empty(self):
        self.assertEqual(format_contributions_for_synthesizer({}), "")

    def test_format_for_qa_used(self):
        contribs = self._make_contributions()
        used = {"creative": ["main_points[0]"], "technical": []}
        result = format_contributions_for_qa(contribs, used)
        self.assertIn("EXPERT CONTRIBUTION TRACEABILITY", result)
        # Check the tag lands on the right point, not merely that both tags
        # appear somewhere in the report.
        self.assertIn("main_points[0] [USED]: Bold campaign", result)
        self.assertIn("main_points[1] [NOT USED]: Use humor", result)
        self.assertIn("main_points[0] [NOT USED]: Use React", result)
        self.assertIn("Summary: 1/5 expert contributions marked as used.", result)

    def test_format_for_qa_unused(self):
        contribs = self._make_contributions()
        result = format_contributions_for_qa(contribs, {})
        self.assertIn("[NOT USED]", result)
        # "[NOT USED]:" does not contain "[USED]:", so this proves no point was tagged USED.
        self.assertNotIn("[USED]:", result)
        self.assertIn("Summary: 0/5 expert contributions marked as used.", result)
|
| 1421 |
+
|
| 1422 |
+
|
| 1423 |
+
class TestParseUsedContributions(unittest.TestCase):
    """Tests for parse_used_contributions."""

    def test_parse_json_block(self):
        """A fenced JSON block with a used_contributions key is unwrapped."""
        text = (
            "Here is the final answer.\n\n"
            "```json\n"
            '{"used_contributions": {"creative": ["main_points[0]"], "technical": ["recommendations[0]"]}}\n'
            "```\n"
        )
        used = parse_used_contributions(text)
        self.assertIn("creative", used)
        self.assertEqual(used["creative"], ["main_points[0]"])
        self.assertEqual(used["technical"], ["recommendations[0]"])

    def test_parse_used_contributions_section(self):
        """A USED_CONTRIBUTIONS: section followed by inline JSON is parsed."""
        text = (
            "Great answer here.\n\n"
            'USED_CONTRIBUTIONS: {"creative": ["main_points[0]", "main_points[1]"]}\n'
        )
        used = parse_used_contributions(text)
        # Compare the references themselves; a length check would accept wrong refs.
        self.assertEqual(used, {"creative": ["main_points[0]", "main_points[1]"]})

    def test_parse_empty(self):
        """Text without any contributions block yields an empty mapping."""
        used = parse_used_contributions("No contributions block here.")
        self.assertEqual(used, {})
|
| 1450 |
+
|
| 1451 |
+
|
| 1452 |
+
class TestCheckExpertInfluence(unittest.TestCase):
    """Tests for check_expert_influence."""

    def _make_contributions(self):
        """Build one creative and one technical contribution, each with substance."""
        creative = StructuredContribution(
            role="Creative Expert",
            main_points=["Use guerrilla marketing tactics"],
            recommendations=["Target social media"],
            confidence="high",
        )
        technical = StructuredContribution(
            role="Technical Expert",
            main_points=["Implement REST API with caching"],
            recommendations=["Use Redis for sessions"],
            confidence="medium",
        )
        return {"creative": creative, "technical": technical}

    def test_no_contributions_used(self):
        """With nothing marked as used, at least one influence issue is reported."""
        issues = check_expert_influence(
            self._make_contributions(), {}, "Some generic answer."
        )
        self.assertTrue(len(issues) > 0)
        lowered = [issue.lower() for issue in issues]
        self.assertTrue(
            any("not materially" in msg or "none were used" in msg for msg in lowered)
        )

    def test_adequate_influence(self):
        """Every expert is referenced and the answer reuses their vocabulary."""
        used = {
            "creative": ["main_points[0]"],
            "technical": ["main_points[0]"],
        }
        # Answer includes expert vocabulary
        answer = "We recommend guerrilla marketing tactics and implementing a REST API with caching."
        issues = check_expert_influence(self._make_contributions(), used, answer)
        self.assertEqual(issues, [])

    def test_missing_expert(self):
        """An expert with substance but no used references is flagged by role name."""
        used = {"creative": ["main_points[0]"]}  # technical not used
        answer = "Use guerrilla marketing tactics for the campaign."
        issues = check_expert_influence(self._make_contributions(), used, answer)
        flagged = [issue for issue in issues if "Technical Expert" in issue]
        self.assertTrue(bool(flagged))

    def test_empty_contributions(self):
        """No contributions means there is nothing to check."""
        self.assertEqual(check_expert_influence({}, {}, "Any answer"), [])
|
| 1499 |
+
|
| 1500 |
+
|
| 1501 |
+
class TestNorwegianPromptScenario(unittest.TestCase):
    """Exercise the Norwegian prompt scenario requested by the user.

    Prompt: "hva er klokken nå, og når bør jeg legge meg om jeg er en black metal fan?"
    The prompt should classify into one of the accepted categories, select
    black_metal_fundamentalist, and produce structured contributions.
    """

    # Shared by the classification and role-selection tests.
    PROMPT = "hva er klokken nå, og når bør jeg legge meg om jeg er en black metal fan?"

    def test_classification(self):
        """The prompt lands in one of the accepted task categories."""
        accepted = ("general", "creative", "factual", "opinion", "other")
        self.assertIn(classify_task(self.PROMPT), accepted)

    def test_role_selection_includes_black_metal(self):
        """Role selection picks the black metal persona for this prompt."""
        all_roles = [
            "creative", "technical", "research", "security", "data_analyst",
            "mad_professor", "accountant", "artist", "lazy_slacker",
            "black_metal_fundamentalist", "labour_union_rep", "ux_designer",
            "doris", "chairman_of_board", "maga_appointee", "lawyer",
        ]
        config = WorkflowConfig(
            strict_mode=True,
            allow_persona_roles=True,
            max_specialists_per_task=5,
        )
        category = classify_task(self.PROMPT)
        roles = select_relevant_roles(
            self.PROMPT, all_roles, config, task_category=category
        )
        self.assertIn(
            "black_metal_fundamentalist",
            roles,
            "black_metal_fundamentalist should be selected for a prompt mentioning 'black metal fan'",
        )

    def test_structured_contribution_parsing_from_black_metal_output(self):
        """Simulated black metal specialist output parses into a structured contribution."""
        output = (
            "KVLT VERDICT:\n"
            "The true kvltist sleeps when the moon commands. Bedtime is for posers "
            "who follow society's weak schedules.\n\n"
            "THE GRIM TRUTH:\n"
            "Time is an illusion created by the false light of day.\n\n"
            '```json\n'
            '{\n'
            ' "role": "Black Metal Fundamentalist",\n'
            ' "main_points": [\n'
            ' "True kvltists sleep only when the moon commands",\n'
            ' "Bedtime schedules are for posers and conformists"\n'
            ' ],\n'
            ' "recommendations": [\n'
            ' "Sleep at dawn, rise at dusk — embrace the nocturnal path"\n'
            ' ],\n'
            ' "evidence": [\n'
            ' "Norwegian black metal musicians are known for nocturnal lifestyles"\n'
            ' ],\n'
            ' "assumptions": [\n'
            ' "The user seeks the true kvlt path, not mainstream advice"\n'
            ' ],\n'
            ' "confidence": "high"\n'
            '}\n'
            '```\n'
        )
        parsed = parse_structured_contribution(output, "Black Metal Fundamentalist")
        self.assertEqual(parsed.role, "Black Metal Fundamentalist")
        self.assertEqual(len(parsed.main_points), 2)
        self.assertIn("kvltists", parsed.main_points[0].lower())
        self.assertEqual(len(parsed.recommendations), 1)
        self.assertTrue(parsed.has_substance())
        self.assertEqual(parsed.confidence, "high")
|
| 1564 |
+
|
| 1565 |
+
|
| 1566 |
if __name__ == "__main__":
|
| 1567 |
unittest.main()
|
workflow_helpers.py
CHANGED
|
@@ -1056,3 +1056,306 @@ def format_assumptions_for_prompt(assumptions: Dict[str, str]) -> str:
|
|
| 1056 |
for key, value in assumptions.items():
|
| 1057 |
lines.append(f" - {key}: {value}")
|
| 1058 |
return "\n".join(lines)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1056 |
for key, value in assumptions.items():
|
| 1057 |
lines.append(f" - {key}: {value}")
|
| 1058 |
return "\n".join(lines)
|
| 1059 |
+
|
| 1060 |
+
|
| 1061 |
+
# ============================================================
|
| 1062 |
+
# Structured Expert Contributions
|
| 1063 |
+
# ============================================================
|
| 1064 |
+
|
| 1065 |
+
# Suffix appended to every specialist system prompt to require JSON output
|
| 1066 |
+
STRUCTURED_OUTPUT_SUFFIX = """
|
| 1067 |
+
|
| 1068 |
+
IMPORTANT — OUTPUT FORMAT:
|
| 1069 |
+
After your analysis above, you MUST also output a JSON block at the end of your response,
|
| 1070 |
+
wrapped in ```json ... ``` fences, with this exact structure:
|
| 1071 |
+
```json
|
| 1072 |
+
{
|
| 1073 |
+
"role": "<your role name>",
|
| 1074 |
+
"main_points": ["point 1", "point 2"],
|
| 1075 |
+
"recommendations": ["recommendation 1"],
|
| 1076 |
+
"evidence": ["supporting evidence or examples"],
|
| 1077 |
+
"assumptions": ["assumption 1"],
|
| 1078 |
+
"confidence": "high | medium | low"
|
| 1079 |
+
}
|
| 1080 |
+
```
|
| 1081 |
+
- "main_points": your key substantive contributions to the answer (2-4 points)
|
| 1082 |
+
- "recommendations": specific actionable recommendations (0-3)
|
| 1083 |
+
- "evidence": facts, data, or examples that support your points (0-3)
|
| 1084 |
+
- "assumptions": assumptions you relied on (0-2)
|
| 1085 |
+
- "confidence": how confident you are in your contribution
|
| 1086 |
+
|
| 1087 |
+
This JSON block is REQUIRED. The Synthesizer will use it to build the final answer.
|
| 1088 |
+
Do NOT write a complete final answer — focus on your domain-specific contribution.
|
| 1089 |
+
"""
|
| 1090 |
+
|
| 1091 |
+
|
| 1092 |
+
@dataclass
class StructuredContribution:
    """Structured output from an expert specialist.

    Holds the specialist's role label, the list-valued parts of its
    contribution, a confidence label, and the raw text it was parsed from.
    """

    role: str
    # Key substantive points contributed to the answer.
    main_points: List[str] = field(default_factory=list)
    # Actionable recommendations.
    recommendations: List[str] = field(default_factory=list)
    # Facts or examples backing the points.
    evidence: List[str] = field(default_factory=list)
    # Assumptions the specialist relied on.
    assumptions: List[str] = field(default_factory=list)
    confidence: str = "medium"
    # Unparsed specialist output; deliberately left out of to_dict().
    raw_output: str = ""

    @staticmethod
    def _snapshot(items) -> List[str]:
        """Return an independent list copy of *items*.

        ``None``/empty becomes ``[]`` and a bare string becomes a one-element
        list, so a malformed field never breaks or distorts serialisation.
        """
        if not items:
            return []
        if isinstance(items, str):
            return [items]
        return list(items)

    def to_dict(self) -> dict:
        """Serialise the structured fields (without ``raw_output``).

        The lists are copied, so mutating the returned dict cannot alter
        this contribution.
        """
        return {
            "role": self.role,
            "main_points": self._snapshot(self.main_points),
            "recommendations": self._snapshot(self.recommendations),
            "evidence": self._snapshot(self.evidence),
            "assumptions": self._snapshot(self.assumptions),
            "confidence": self.confidence,
        }

    def has_substance(self) -> bool:
        """Return True when there is at least one main point or recommendation."""
        return bool(self.main_points or self.recommendations)
|
| 1116 |
+
|
| 1117 |
+
|
| 1118 |
+
def parse_structured_contribution(text: str, role: str) -> "StructuredContribution":
    """Parse a StructuredContribution from specialist LLM output.

    Extraction order:
      1. every ```json ... ``` fenced block, in order; the first one that
         decodes to an object holding at least one contribution key wins;
      2. a bare JSON object starting with a "role" key, decoded from its
         opening brace so trailing text cannot corrupt it;
      3. heuristic extraction from section headers.

    Args:
        text: Raw specialist output.
        role: Role label used unless the JSON supplies a non-empty one.

    Returns:
        A contribution whose ``raw_output`` is always ``text``. List fields
        are always lists of non-empty strings, whatever the JSON contained.
    """
    contribution = StructuredContribution(role=role, raw_output=text)
    known_keys = (
        "role", "main_points", "recommendations",
        "evidence", "assumptions", "confidence",
    )

    def as_string_list(value) -> List[str]:
        # LLMs sometimes emit null or a single string where a list is
        # required; normalise so later len()/iteration never breaks.
        if isinstance(value, str):
            value = [value]
        if not isinstance(value, (list, tuple)):
            return []
        return [
            str(item).strip()
            for item in value
            if item is not None and str(item).strip()
        ]

    def load_contribution_object(candidate: str):
        try:
            parsed = json.loads(candidate)
        except json.JSONDecodeError:
            return None
        if isinstance(parsed, dict) and any(key in parsed for key in known_keys):
            return parsed
        return None

    data = None
    for fenced in re.finditer(r"```json\s*(\{.*?\})\s*```", text, re.DOTALL):
        data = load_contribution_object(fenced.group(1))
        if data is not None:
            break

    if data is None:
        bare = re.search(r'\{\s*"role"\s*:', text)
        if bare:
            try:
                # raw_decode stops at the end of the object, unlike a greedy
                # regex that runs to the last "}" anywhere in the text.
                parsed, _ = json.JSONDecoder().raw_decode(text, bare.start())
            except json.JSONDecodeError:
                parsed = None
            if isinstance(parsed, dict):
                data = parsed

    if data is not None:
        contribution.main_points = as_string_list(data.get("main_points"))
        contribution.recommendations = as_string_list(data.get("recommendations"))
        contribution.evidence = as_string_list(data.get("evidence"))
        contribution.assumptions = as_string_list(data.get("assumptions"))
        confidence = data.get("confidence")
        if isinstance(confidence, str) and confidence.strip():
            contribution.confidence = confidence.strip()
        parsed_role = data.get("role")
        if isinstance(parsed_role, str) and parsed_role.strip():
            contribution.role = parsed_role
        return contribution

    # Fallback: heuristic extraction from section-based output.
    _extract_section_points(text, contribution)
    return contribution
|
| 1149 |
+
|
| 1150 |
+
|
| 1151 |
+
def _extract_section_points(text: str, contribution: StructuredContribution):
|
| 1152 |
+
"""Heuristic fallback: extract key points from section-based specialist output."""
|
| 1153 |
+
lines = text.strip().splitlines()
|
| 1154 |
+
current_section = ""
|
| 1155 |
+
buffer: List[str] = []
|
| 1156 |
+
|
| 1157 |
+
# Map known section headers to contribution fields
|
| 1158 |
+
section_map = {
|
| 1159 |
+
# Core roles
|
| 1160 |
+
"ideas": "main_points", "rationale": "main_points",
|
| 1161 |
+
"technical approach": "main_points", "implementation notes": "recommendations",
|
| 1162 |
+
"evidence summary": "evidence", "key findings": "evidence",
|
| 1163 |
+
"security analysis": "main_points", "vulnerabilities found": "main_points",
|
| 1164 |
+
"recommendations": "recommendations",
|
| 1165 |
+
"data overview": "main_points", "analysis": "main_points",
|
| 1166 |
+
"insights": "recommendations",
|
| 1167 |
+
# Persona roles
|
| 1168 |
+
"wild hypothesis": "main_points", "scientific rationale": "evidence",
|
| 1169 |
+
"groundbreaking implications": "main_points",
|
| 1170 |
+
"cost analysis": "main_points", "cost-cutting measures": "recommendations",
|
| 1171 |
+
"cosmic vision": "main_points", "wild storm of ideas": "main_points",
|
| 1172 |
+
"minimum viable effort": "main_points",
|
| 1173 |
+
"kvlt verdict": "main_points", "the grim truth": "main_points",
|
| 1174 |
+
"worker impact": "main_points", "union concerns": "main_points",
|
| 1175 |
+
"collective bargaining position": "recommendations",
|
| 1176 |
+
"user needs analysis": "main_points", "pain points": "main_points",
|
| 1177 |
+
"ux recommendations": "recommendations",
|
| 1178 |
+
"what doris thinks is happening": "main_points",
|
| 1179 |
+
"doris's thoughts": "main_points",
|
| 1180 |
+
"board perspective": "main_points", "strategic concerns": "main_points",
|
| 1181 |
+
"shareholder value": "recommendations",
|
| 1182 |
+
"america first analysis": "main_points",
|
| 1183 |
+
"making it great again": "recommendations",
|
| 1184 |
+
"legal analysis": "main_points", "liabilities and risks": "main_points",
|
| 1185 |
+
"legal recommendations": "recommendations",
|
| 1186 |
+
}
|
| 1187 |
+
|
| 1188 |
+
def flush_buffer():
|
| 1189 |
+
if current_section and buffer:
|
| 1190 |
+
field_name = section_map.get(current_section.lower().rstrip(":"), "")
|
| 1191 |
+
if field_name:
|
| 1192 |
+
combined = " ".join(ln.strip().lstrip("•-*0123456789.) ") for ln in buffer if ln.strip())
|
| 1193 |
+
if combined:
|
| 1194 |
+
target = getattr(contribution, field_name)
|
| 1195 |
+
target.append(combined[:300])
|
| 1196 |
+
|
| 1197 |
+
for line in lines:
|
| 1198 |
+
header_match = re.match(r"^([A-Z][A-Z\s\'']+):?\s*$", line.strip())
|
| 1199 |
+
if header_match:
|
| 1200 |
+
flush_buffer()
|
| 1201 |
+
current_section = header_match.group(1).strip()
|
| 1202 |
+
buffer = []
|
| 1203 |
+
else:
|
| 1204 |
+
# Skip lines that look like "RECOMMENDED DRAFT:", "FINAL TECHNICAL DRAFT:", etc.
|
| 1205 |
+
if re.match(r"^[A-Z][A-Z\s]+DRAFT:?\s*$", line.strip()):
|
| 1206 |
+
flush_buffer()
|
| 1207 |
+
current_section = "" # ignore draft sections
|
| 1208 |
+
buffer = []
|
| 1209 |
+
elif current_section:
|
| 1210 |
+
buffer.append(line)
|
| 1211 |
+
|
| 1212 |
+
flush_buffer()
|
| 1213 |
+
|
| 1214 |
+
|
| 1215 |
+
def format_contributions_for_synthesizer(
    contributions: Dict[str, "StructuredContribution"],
) -> str:
    """Format structured expert contributions for the Synthesizer prompt.

    Each expert gets a ``=== role (confidence: ...) ===`` section listing
    main points and recommendations with ``[index]`` prefixes, then evidence
    and assumptions as dash bullets. Empty lists are omitted.

    Args:
        contributions: Mapping of role key to contribution. Only the values
            are rendered.

    Returns:
        The formatted prompt block, or ``""`` when there are no contributions.
    """
    if not contributions:
        return ""

    parts = ["STRUCTURED EXPERT CONTRIBUTIONS:"]
    # The role key is not rendered, so iterate the values directly.
    for contrib in contributions.values():
        section = [f"\n=== {contrib.role} (confidence: {contrib.confidence}) ==="]

        # Indexed lists: the index lets later stages cite a specific entry.
        indexed_groups = (
            ("Main points:", contrib.main_points),
            ("Recommendations:", contrib.recommendations),
        )
        for heading, items in indexed_groups:
            if items:
                section.append(heading)
                section.extend(f" [{i}] {item}" for i, item in enumerate(items))

        bullet_groups = (
            ("Evidence:", contrib.evidence),
            ("Assumptions:", contrib.assumptions),
        )
        for heading, items in bullet_groups:
            if items:
                section.append(heading)
                section.extend(f" - {item}" for item in items)

        parts.append("\n".join(section))
    return "\n\n".join(parts)
|
| 1247 |
+
|
| 1248 |
+
|
| 1249 |
+
def format_contributions_for_qa(
    contributions: Dict[str, "StructuredContribution"],
    used_contributions: Dict[str, List[str]],
) -> str:
    """Format contribution data for QA to verify expert influence.

    Every main point and recommendation is listed under its expert with a
    ``[USED]`` or ``[NOT USED]`` tag, depending on whether its reference
    (e.g. ``main_points[0]``) appears in ``used_contributions`` for that
    role key. A summary line follows.

    The summary counts the points actually tagged ``USED``. Duplicate
    references, references to unknown roles and out-of-range indexes are
    therefore not counted, so the summary always agrees with the tags.

    Args:
        contributions: Mapping of role key to contribution.
        used_contributions: Mapping of role key to reference strings; a
            missing, ``None`` or malformed entry is treated as "nothing used".

    Returns:
        The traceability report, or ``""`` when there are no contributions.
    """
    if not contributions:
        return ""

    usage = used_contributions or {}
    parts = ["EXPERT CONTRIBUTION TRACEABILITY:"]
    used_count = 0
    total_points = 0

    for role_key, contrib in contributions.items():
        refs = usage.get(role_key) or ()
        if isinstance(refs, str):
            refs = (refs,)
        elif not isinstance(refs, (list, tuple, set, frozenset)):
            refs = ()
        used_refs = {ref for ref in refs if isinstance(ref, str)}

        section = [f"\n=== {contrib.role} ==="]
        for field_name in ("main_points", "recommendations"):
            for i, item in enumerate(getattr(contrib, field_name) or ()):
                ref = f"{field_name}[{i}]"
                total_points += 1
                if ref in used_refs:
                    used_count += 1
                    tag = "USED"
                else:
                    tag = "NOT USED"
                section.append(f" {ref} [{tag}]: {item}")
        parts.append("\n".join(section))

    parts.append(f"\nSummary: {used_count}/{total_points} expert contributions marked as used.")
    return "\n".join(parts)
|
| 1277 |
+
|
| 1278 |
+
|
| 1279 |
+
def parse_used_contributions(text: str) -> Dict[str, List[str]]:
    """Parse the Synthesizer's USED_CONTRIBUTIONS JSON block from its output.

    Two formats are recognised, in this order:
      1. any ```json ... ``` fenced object that has a "used_contributions"
         key (every fenced block is tried, not only the first);
      2. a "USED_CONTRIBUTIONS:" marker followed by a JSON object, decoded
         from its opening brace so nested objects are read in full. The
         object may be the mapping itself or wrap it under
         "used_contributions".

    Args:
        text: Raw Synthesizer output.

    Returns:
        A dict mapping role_key to a list of reference strings such as
        ``["main_points[0]", "recommendations[1]"]``. Malformed values are
        normalised (a bare string becomes a one-item list, anything else an
        empty list). ``{}`` is returned when nothing parseable is found.
    """

    def clean(payload) -> Dict[str, List[str]]:
        # Guarantee the documented shape regardless of what the LLM emitted.
        if not isinstance(payload, dict):
            return {}
        cleaned: Dict[str, List[str]] = {}
        for role_key, refs in payload.items():
            if isinstance(refs, str):
                refs = [refs]
            elif not isinstance(refs, (list, tuple)):
                refs = []
            cleaned[str(role_key)] = [
                ref.strip() for ref in refs if isinstance(ref, str) and ref.strip()
            ]
        return cleaned

    # Format 1: fenced JSON blocks containing "used_contributions".
    for fenced in re.finditer(r"```json\s*(\{.*?\})\s*```", text, re.DOTALL):
        try:
            data = json.loads(fenced.group(1))
        except json.JSONDecodeError:
            continue
        if isinstance(data, dict) and "used_contributions" in data:
            return clean(data["used_contributions"])

    # Format 2: a USED_CONTRIBUTIONS: section followed by JSON.
    _, marker, tail = text.partition("USED_CONTRIBUTIONS:")
    if marker:
        start = tail.find("{")
        if start != -1:
            try:
                # raw_decode reads exactly one complete JSON value; a
                # non-greedy "{.*?}" regex would stop at the first "}".
                data, _ = json.JSONDecoder().raw_decode(tail, start)
            except json.JSONDecodeError:
                data = None
            if isinstance(data, dict):
                inner = data.get("used_contributions")
                return clean(inner if isinstance(inner, dict) else data)

    return {}
|
| 1307 |
+
|
| 1308 |
+
|
| 1309 |
+
def check_expert_influence(
    contributions: Dict[str, "StructuredContribution"],
    used_contributions: Dict[str, List[str]],
    final_answer: str,
) -> List[str]:
    """Check whether the final answer materially uses expert contributions.

    Three checks run in order:

    1. If substantive contributions exist but no reference is marked as
       used, a single issue is reported and the function returns early.
    2. Every expert whose contribution has substance must have at least one
       reference listed under its role key.
    3. At least one expert item (main point or recommendation) must share
       two or more content words with the final answer.

    Args:
        contributions: role_key -> contribution object. Each object is read
            for ``role``, ``main_points``, ``recommendations`` and
            ``has_substance()``.
        used_contributions: role_key -> list of references marked as used.
        final_answer: The synthesized answer text.

    Returns:
        A list of influence issues (empty = influence is adequate).
    """
    issues: List[str] = []
    if not contributions:
        return issues

    # Check 1: Are any contributions marked as used?
    total_used = sum(len(refs) for refs in used_contributions.values())
    total_available = sum(
        len(c.main_points) + len(c.recommendations)
        for c in contributions.values() if c.has_substance()
    )
    if total_available > 0 and total_used == 0:
        issues.append(
            "Final answer does not materially incorporate any specialist contributions."
        )
        return issues

    # Check 2: For each contributing expert, is at least one point used?
    for role_key, contrib in contributions.items():
        if not contrib.has_substance():
            continue
        if not used_contributions.get(role_key):
            issues.append(
                f"Expert '{contrib.role}' provided substantive points but none were used."
            )

    # Check 3: Do expert items appear to influence the final answer?
    # (Lightweight check: verify at least some expert vocabulary appears.)
    # Recommendations are scanned as well as main points: they count toward
    # total_available, so an expert who supplied only recommendations would
    # otherwise always be reported as "not reflected".
    if total_available > 0:
        stopwords = frozenset(
            ("the", "and", "for", "that", "this", "with", "from", "are", "was")
        )
        answer_lower = (final_answer or "").lower()

        def _is_reflected(item: str) -> bool:
            # Content words are 3+ characters and not stopwords. Matching is
            # by substring, so "risk" also matches "risks" in the answer.
            words = [
                w for w in re.findall(r"\b\w{3,}\b", item.lower())
                if w not in stopwords
            ]
            return sum(1 for w in words if w in answer_lower) >= 2

        reflected = False
        for contrib in contributions.values():
            for item in list(contrib.main_points) + list(contrib.recommendations):
                if _is_reflected(item):
                    reflected = True
                    break
            if reflected:
                break
        if not reflected:
            issues.append(
                "Final answer appears to not reflect expert contribution content."
            )

    return issues
|