CatoG commited on
Commit
bec7c31
·
1 Parent(s): 61b5f58

revision 3

Browse files
Files changed (3) hide show
  1. app.py +104 -55
  2. test_workflow.py +196 -0
  3. workflow_helpers.py +125 -0
app.py CHANGED
@@ -20,6 +20,8 @@ from workflow_helpers import (
20
  select_relevant_roles, identify_revision_targets,
21
  compress_final_answer, strip_internal_noise,
22
  get_synthesizer_format_instruction, get_qa_format_instruction,
 
 
23
  ROLE_RELEVANCE,
24
  )
25
  from evidence import (
@@ -601,6 +603,8 @@ class WorkflowState(TypedDict):
601
  output_format: str # detected output format (single_choice, short_answer, etc.)
602
  brevity_requirement: str # minimal, short, normal, verbose
603
  qa_structured: Optional[dict] # serialised QAResult for structured QA
 
 
604
 
605
 
606
  # --- Role system prompts ---
@@ -628,6 +632,8 @@ _PLANNER_SYSTEM = (
628
  "- QA results are BINDING — if QA says FAIL, you MUST revise, never approve.\n\n"
629
  "Respond in this exact format:\n"
630
  "TASK BREAKDOWN:\n<subtask list>\n\n"
 
 
631
  "ROLE TO CALL: <specialist name>\n\n"
632
  "SUCCESS CRITERIA:\n<what a correct, complete answer looks like>\n\n"
633
  "GUIDANCE FOR SPECIALIST:\n<any constraints or focus areas>"
@@ -1655,6 +1661,7 @@ _EMPTY_STATE_BASE: WorkflowState = {
1655
  "draft_output": "", "qa_report": "", "qa_role_feedback": {}, "qa_passed": False,
1656
  "revision_count": 0, "final_answer": "",
1657
  "output_format": "other", "brevity_requirement": "normal", "qa_structured": None,
 
1658
  }
1659
 
1660
 
@@ -1921,6 +1928,8 @@ def run_multi_role_workflow(
1921
  "output_format": output_format,
1922
  "brevity_requirement": brevity,
1923
  "qa_structured": None,
 
 
1924
  }
1925
 
1926
  trace: List[str] = [
@@ -1958,6 +1967,14 @@ def run_multi_role_workflow(
1958
  try:
1959
  if planner_active:
1960
  state = _step_plan(chat_model, state, trace)
 
 
 
 
 
 
 
 
1961
  else:
1962
  state["current_role"] = active_specialist_keys[0]
1963
  state["plan"] = message
@@ -1993,39 +2010,62 @@ def run_multi_role_workflow(
1993
  + ", ".join(AGENT_ROLES.get(k, k) for k in selected_roles)
1994
  )
1995
 
1996
- # Main orchestration loop
1997
- while True:
1998
- # Step 4: Run selected specialists
1999
- if primary_role not in selected_roles:
2000
- primary_role = selected_roles[0]
2001
- state["current_role"] = primary_role
2002
-
2003
- # Run primary specialist (research gets evidence injected)
2004
- primary_fn = _SPECIALIST_STEPS.get(primary_role, _step_technical)
2005
- if primary_role == "research" and evidence:
2006
- state = _step_research(chat_model, state, trace, evidence=evidence)
2007
- else:
2008
- state = primary_fn(chat_model, state, trace)
2009
- primary_output = state["draft_output"]
2010
- planner_state.specialist_outputs[primary_role] = primary_output[:500]
2011
-
2012
- all_outputs: List[Tuple[str, str]] = [(primary_role, primary_output)]
2013
- for specialist_role in selected_roles:
2014
- if specialist_role == primary_role:
2015
- continue
2016
- if specialist_role == "research" and evidence:
2017
- state = _step_research(chat_model, state, trace, evidence=evidence)
2018
- else:
2019
- step_fn = _SPECIALIST_STEPS[specialist_role]
2020
- state = step_fn(chat_model, state, trace)
2021
- output = state["draft_output"]
2022
- all_outputs.append((specialist_role, output))
2023
- planner_state.specialist_outputs[specialist_role] = output[:500]
2024
-
2025
- # Step 5: Synthesize — format-aware, evidence-grounded
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2026
  state = _step_synthesize(chat_model, state, trace, all_outputs,
2027
  evidence=evidence)
 
 
2028
 
 
 
 
2029
  # Step 6: QA validation (with evidence context)
2030
  if qa_active:
2031
  state = _step_qa(chat_model, state, trace, all_outputs,
@@ -2056,7 +2096,18 @@ def run_multi_role_workflow(
2056
  trace.append("\n═══ WORKFLOW COMPLETE — APPROVED ═══")
2057
  break
2058
 
2059
- # QA failed and planner was forced to revise
 
 
 
 
 
 
 
 
 
 
 
2060
  state["revision_count"] += 1
2061
  planner_state.revision_count = state["revision_count"]
2062
 
@@ -2079,7 +2130,6 @@ def run_multi_role_workflow(
2079
  planner_state.record_event("escalation", escalation)
2080
 
2081
  if escalation == "suppress_role":
2082
- # Suppress roles that keep producing unsupported content
2083
  suppress = planner_state.get_roles_to_suppress()
2084
  for role_label in suppress:
2085
  role_key = _ROLE_LABEL_TO_KEY.get(role_label)
@@ -2090,17 +2140,15 @@ def run_multi_role_workflow(
2090
  selected_roles = [primary_role]
2091
 
2092
  elif escalation == "rewrite_from_state":
2093
- # Synthesizer should rewrite from state, not reuse bloated draft
2094
  trace.append(" ⚠ Synthesizer will rewrite from state instead of reusing draft")
2095
- state["draft_output"] = "" # Force synthesizer to rebuild
2096
 
2097
  elif escalation == "narrow_scope":
2098
- # Reduce to a single specialist
2099
  if len(selected_roles) > 1:
2100
  selected_roles = [selected_roles[0]]
2101
  trace.append(f" ⚠ Narrowed to single specialist: {selected_roles[0]}")
2102
 
2103
- # Step 9: TARGETED REVISIONS — only rerun failing role(s)
2104
  revision_targets = identify_revision_targets(qa_result, _ROLE_LABEL_TO_KEY)
2105
  trace.append(
2106
  f"\n═══ REVISION {state['revision_count']} / {MAX_REVISIONS} ═══\n"
@@ -2108,45 +2156,46 @@ def run_multi_role_workflow(
2108
  )
2109
  planner_state.record_event("revision", f"targets={revision_targets}")
2110
 
2111
- # Determine what to rerun
2112
  rerun_specialists = [
2113
  t for t in revision_targets
2114
  if t in _SPECIALIST_STEPS and t in selected_roles
2115
  ]
2116
- rerun_synthesizer = "synthesizer" in revision_targets or rerun_specialists
2117
 
2118
  if rerun_specialists:
2119
- # Only rerun the targeted specialists
2120
  new_outputs = []
2121
  for rk in rerun_specialists:
2122
- if rk == "research" and evidence:
2123
- state = _step_research(chat_model, state, trace, evidence=evidence)
2124
- else:
2125
- step_fn = _SPECIALIST_STEPS[rk]
2126
- state = step_fn(chat_model, state, trace)
2127
  new_outputs.append((rk, state["draft_output"]))
2128
  planner_state.specialist_outputs[rk] = state["draft_output"][:500]
2129
 
2130
- # Merge with previous outputs (replace updated roles)
2131
  updated_keys = {rk for rk, _ in new_outputs}
2132
- merged_outputs = [
2133
  (rk, out) for rk, out in all_outputs if rk not in updated_keys
2134
  ] + new_outputs
2135
- all_outputs = merged_outputs
2136
 
2137
  if rerun_synthesizer or rerun_specialists:
2138
  state = _step_synthesize(chat_model, state, trace, all_outputs,
2139
  evidence=evidence)
2140
 
2141
- # Update selected_roles based on planner's new routing
2142
- primary_role = state["current_role"]
2143
- if primary_role in selected_roles:
2144
- pass # keep existing selection
2145
- elif primary_role in active_specialist_keys:
2146
- selected_roles = [primary_role] + [r for r in selected_roles if r != primary_role]
2147
- selected_roles = selected_roles[:config.max_specialists_per_task]
 
 
 
 
 
 
2148
 
2149
- continue # Loop back to QA
 
2150
 
2151
  else:
2152
  # No Planner review loop — accept the draft
 
20
  select_relevant_roles, identify_revision_targets,
21
  compress_final_answer, strip_internal_noise,
22
  get_synthesizer_format_instruction, get_qa_format_instruction,
23
+ validate_output_format, format_violations_instruction,
24
+ parse_task_assumptions, format_assumptions_for_prompt,
25
  ROLE_RELEVANCE,
26
  )
27
  from evidence import (
 
603
  output_format: str # detected output format (single_choice, short_answer, etc.)
604
  brevity_requirement: str # minimal, short, normal, verbose
605
  qa_structured: Optional[dict] # serialised QAResult for structured QA
606
+ task_assumptions: Dict[str, str] # shared assumptions all specialists must use
607
+ revision_instruction: str # latest revision instruction from planner
608
 
609
 
610
  # --- Role system prompts ---
 
632
  "- QA results are BINDING — if QA says FAIL, you MUST revise, never approve.\n\n"
633
  "Respond in this exact format:\n"
634
  "TASK BREAKDOWN:\n<subtask list>\n\n"
635
+ "TASK ASSUMPTIONS:\n<shared assumptions all specialists must use, e.g. cost model, "
636
+ "coverage rate, units, scope, time frame — one per line as 'key: value'>\n\n"
637
  "ROLE TO CALL: <specialist name>\n\n"
638
  "SUCCESS CRITERIA:\n<what a correct, complete answer looks like>\n\n"
639
  "GUIDANCE FOR SPECIALIST:\n<any constraints or focus areas>"
 
1661
  "draft_output": "", "qa_report": "", "qa_role_feedback": {}, "qa_passed": False,
1662
  "revision_count": 0, "final_answer": "",
1663
  "output_format": "other", "brevity_requirement": "normal", "qa_structured": None,
1664
+ "task_assumptions": {}, "revision_instruction": "",
1665
  }
1666
 
1667
 
 
1928
  "output_format": output_format,
1929
  "brevity_requirement": brevity,
1930
  "qa_structured": None,
1931
+ "task_assumptions": {},
1932
+ "revision_instruction": "",
1933
  }
1934
 
1935
  trace: List[str] = [
 
1967
  try:
1968
  if planner_active:
1969
  state = _step_plan(chat_model, state, trace)
1970
+
1971
+ # Parse shared task assumptions from planner output
1972
+ assumptions = parse_task_assumptions(state["plan"])
1973
+ if assumptions:
1974
+ state["task_assumptions"] = assumptions
1975
+ planner_state.task_assumptions = assumptions
1976
+ trace.append(f"[ASSUMPTIONS] {len(assumptions)} shared assumption(s) set: "
1977
+ + ", ".join(f"{k}={v}" for k, v in assumptions.items()))
1978
  else:
1979
  state["current_role"] = active_specialist_keys[0]
1980
  state["plan"] = message
 
2010
  + ", ".join(AGENT_ROLES.get(k, k) for k in selected_roles)
2011
  )
2012
 
2013
+ # Step 4: Run ALL selected specialists (initial run only)
2014
+ if primary_role not in selected_roles:
2015
+ primary_role = selected_roles[0]
2016
+ state["current_role"] = primary_role
2017
+
2018
+ # Build assumptions context for specialist prompts
2019
+ assumptions_ctx = format_assumptions_for_prompt(state.get("task_assumptions", {}))
2020
+
2021
+ def _run_specialist(role_key):
2022
+ """Run a single specialist, injecting evidence and assumptions as needed."""
2023
+ if role_key == "research" and evidence:
2024
+ return _step_research(chat_model, state, trace, evidence=evidence)
2025
+ step_fn = _SPECIALIST_STEPS.get(role_key, _step_technical)
2026
+ # Inject shared assumptions into plan context for specialist
2027
+ if assumptions_ctx and assumptions_ctx not in state["plan"]:
2028
+ state["plan"] = state["plan"] + "\n\n" + assumptions_ctx
2029
+ return step_fn(chat_model, state, trace)
2030
+
2031
+ # Run primary specialist
2032
+ state = _run_specialist(primary_role)
2033
+ primary_output = state["draft_output"]
2034
+ planner_state.specialist_outputs[primary_role] = primary_output[:500]
2035
+
2036
+ all_outputs: List[Tuple[str, str]] = [(primary_role, primary_output)]
2037
+ for specialist_role in selected_roles:
2038
+ if specialist_role == primary_role:
2039
+ continue
2040
+ state = _run_specialist(specialist_role)
2041
+ output = state["draft_output"]
2042
+ all_outputs.append((specialist_role, output))
2043
+ planner_state.specialist_outputs[specialist_role] = output[:500]
2044
+
2045
+ # Step 5: Synthesize — format-aware, evidence-grounded
2046
+ state = _step_synthesize(chat_model, state, trace, all_outputs,
2047
+ evidence=evidence)
2048
+
2049
+ # Step 5b: Pre-QA format validation — catch structural violations early
2050
+ fmt_violations = validate_output_format(
2051
+ state["draft_output"], output_format, brevity
2052
+ )
2053
+ if fmt_violations:
2054
+ trace.append(
2055
+ "\n[FORMAT VALIDATION] Violations detected before QA:\n"
2056
+ + "\n".join(f" - {v}" for v in fmt_violations)
2057
+ )
2058
+ # Re-synthesize with explicit violation feedback
2059
+ violation_instr = format_violations_instruction(fmt_violations)
2060
+ state["plan"] = state["plan"] + "\n\n" + violation_instr
2061
  state = _step_synthesize(chat_model, state, trace, all_outputs,
2062
  evidence=evidence)
2063
+ planner_state.record_event("format_rewrite", "; ".join(fmt_violations))
2064
+ trace.append("[FORMAT VALIDATION] Re-synthesized to fix format violations.")
2065
 
2066
+ # === QA-REVISION LOOP ===
2067
+ # From here, only QA + planner review + targeted revision (no full specialist rerun)
2068
+ while True:
2069
  # Step 6: QA validation (with evidence context)
2070
  if qa_active:
2071
  state = _step_qa(chat_model, state, trace, all_outputs,
 
2096
  trace.append("\n═══ WORKFLOW COMPLETE — APPROVED ═══")
2097
  break
2098
 
2099
+ # QA failed and planner was forced to revise
2100
+ # store revision instruction reliably
2101
+ revision_instr = ""
2102
+ if "REVISED INSTRUCTIONS:" in state.get("plan", ""):
2103
+ revision_instr = state["plan"]
2104
+ elif qa_result.correction_instruction:
2105
+ revision_instr = qa_result.correction_instruction
2106
+ state["revision_instruction"] = revision_instr
2107
+ planner_state.revision_instruction = revision_instr
2108
+ planner_state.record_event("revision_instruction_stored",
2109
+ revision_instr[:200] if revision_instr else "MISSING")
2110
+
2111
  state["revision_count"] += 1
2112
  planner_state.revision_count = state["revision_count"]
2113
 
 
2130
  planner_state.record_event("escalation", escalation)
2131
 
2132
  if escalation == "suppress_role":
 
2133
  suppress = planner_state.get_roles_to_suppress()
2134
  for role_label in suppress:
2135
  role_key = _ROLE_LABEL_TO_KEY.get(role_label)
 
2140
  selected_roles = [primary_role]
2141
 
2142
  elif escalation == "rewrite_from_state":
 
2143
  trace.append(" ⚠ Synthesizer will rewrite from state instead of reusing draft")
2144
+ state["draft_output"] = ""
2145
 
2146
  elif escalation == "narrow_scope":
 
2147
  if len(selected_roles) > 1:
2148
  selected_roles = [selected_roles[0]]
2149
  trace.append(f" ⚠ Narrowed to single specialist: {selected_roles[0]}")
2150
 
2151
+ # Step 9: TARGETED REVISIONS — only rerun the failing role(s)
2152
  revision_targets = identify_revision_targets(qa_result, _ROLE_LABEL_TO_KEY)
2153
  trace.append(
2154
  f"\n═══ REVISION {state['revision_count']} / {MAX_REVISIONS} ═══\n"
 
2156
  )
2157
  planner_state.record_event("revision", f"targets={revision_targets}")
2158
 
2159
+ # Only rerun the targeted specialists — NOT all specialists
2160
  rerun_specialists = [
2161
  t for t in revision_targets
2162
  if t in _SPECIALIST_STEPS and t in selected_roles
2163
  ]
2164
+ rerun_synthesizer = "synthesizer" in revision_targets or bool(rerun_specialists)
2165
 
2166
  if rerun_specialists:
 
2167
  new_outputs = []
2168
  for rk in rerun_specialists:
2169
+ state = _run_specialist(rk)
 
 
 
 
2170
  new_outputs.append((rk, state["draft_output"]))
2171
  planner_state.specialist_outputs[rk] = state["draft_output"][:500]
2172
 
2173
+ # Merge: replace updated roles, keep others unchanged
2174
  updated_keys = {rk for rk, _ in new_outputs}
2175
+ all_outputs = [
2176
  (rk, out) for rk, out in all_outputs if rk not in updated_keys
2177
  ] + new_outputs
 
2178
 
2179
  if rerun_synthesizer or rerun_specialists:
2180
  state = _step_synthesize(chat_model, state, trace, all_outputs,
2181
  evidence=evidence)
2182
 
2183
+ # Post-revision format validation
2184
+ fmt_violations = validate_output_format(
2185
+ state["draft_output"], output_format, brevity
2186
+ )
2187
+ if fmt_violations:
2188
+ trace.append(
2189
+ "\n[FORMAT VALIDATION] Post-revision violations:\n"
2190
+ + "\n".join(f" - {v}" for v in fmt_violations)
2191
+ )
2192
+ violation_instr = format_violations_instruction(fmt_violations)
2193
+ state["plan"] = state["plan"] + "\n\n" + violation_instr
2194
+ state = _step_synthesize(chat_model, state, trace, all_outputs,
2195
+ evidence=evidence)
2196
 
2197
+ # Loop back to QA — NOT back to specialists
2198
+ continue
2199
 
2200
  else:
2201
  # No Planner review loop — accept the draft
test_workflow.py CHANGED
@@ -35,6 +35,10 @@ from workflow_helpers import (
35
  FailureRecord,
36
  get_synthesizer_format_instruction,
37
  get_qa_format_instruction,
 
 
 
 
38
  )
39
  from evidence import (
40
  EvidenceItem,
@@ -1023,6 +1027,198 @@ class TestPlannerStateExtended(unittest.TestCase):
1023
  # Test: Scenario - Role Selection with Task Categories
1024
  # ============================================================
1025
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1026
  class TestTaskAwareScenarios(unittest.TestCase):
1027
  """End-to-end scenario tests validating the 4 user-specified cases."""
1028
 
 
35
  FailureRecord,
36
  get_synthesizer_format_instruction,
37
  get_qa_format_instruction,
38
+ validate_output_format,
39
+ format_violations_instruction,
40
+ parse_task_assumptions,
41
+ format_assumptions_for_prompt,
42
  )
43
  from evidence import (
44
  EvidenceItem,
 
1027
  # Test: Scenario - Role Selection with Task Categories
1028
  # ============================================================
1029
 
1030
+ # ============================================================
1031
+ # Test: Output Format Validation
1032
+ # ============================================================
1033
+
1034
class TestFormatValidation(unittest.TestCase):
    """Unit tests for validate_output_format: structural (bullets/headings/
    tables/code) and brevity (line-count) checks performed before QA."""

    def test_paragraph_with_bullets_fails(self):
        text = "This is a paragraph.\n- bullet one\n- bullet two"
        violations = validate_output_format(text, "paragraph", "normal")
        self.assertTrue(any("bullet" in v.lower() for v in violations))

    def test_paragraph_with_headings_fails(self):
        text = "## Heading\nSome paragraph text."
        violations = validate_output_format(text, "paragraph", "normal")
        self.assertTrue(any("heading" in v.lower() for v in violations))

    def test_paragraph_with_table_fails(self):
        text = "Some text.\n| A | B |\n|---|---|\n| 1 | 2 |"
        violations = validate_output_format(text, "paragraph", "normal")
        self.assertTrue(any("table" in v.lower() for v in violations))

    def test_paragraph_clean_passes(self):
        text = "This is a clean paragraph without any lists or headings."
        violations = validate_output_format(text, "paragraph", "normal")
        self.assertEqual(violations, [])

    def test_code_without_code_fails(self):
        text = "Here is an explanation about coding but no actual code."
        violations = validate_output_format(text, "code", "normal")
        self.assertTrue(any("code" in v.lower() for v in violations))

    def test_code_with_block_passes(self):
        text = "```python\nprint('hello')\n```"
        violations = validate_output_format(text, "code", "normal")
        self.assertEqual(violations, [])

    def test_code_with_recognisable_code_passes(self):
        # No fenced block, but contains a recognisable code keyword ("def ").
        text = "def hello():\n return 'world'"
        violations = validate_output_format(text, "code", "normal")
        self.assertEqual(violations, [])

    def test_table_without_table_fails(self):
        text = "Just a paragraph about tables."
        violations = validate_output_format(text, "table", "normal")
        self.assertTrue(any("table" in v.lower() for v in violations))

    def test_table_with_table_passes(self):
        text = "| Name | Value |\n|------|-------|\n| A | 1 |"
        violations = validate_output_format(text, "table", "normal")
        self.assertEqual(violations, [])

    def test_single_choice_too_many_lines_fails(self):
        # 10 non-blank lines exceeds the single-choice limit of 5.
        text = "\n".join(f"Line {i}" for i in range(10))
        violations = validate_output_format(text, "single_choice", "normal")
        self.assertTrue(any("single choice" in v.lower() for v in violations))

    def test_single_choice_short_passes(self):
        text = "Vegan is the best choice."
        violations = validate_output_format(text, "single_choice", "normal")
        self.assertEqual(violations, [])

    def test_minimal_brevity_too_long(self):
        # 12 non-blank lines exceeds the "minimal" limit of 8.
        text = "\n".join(f"Line {i}" for i in range(12))
        violations = validate_output_format(text, "paragraph", "minimal")
        self.assertTrue(any("minimal" in v.lower() for v in violations))

    def test_short_brevity_too_long(self):
        # 25 non-blank lines exceeds the "short" limit of 20.
        text = "\n".join(f"Line {i}" for i in range(25))
        violations = validate_output_format(text, "paragraph", "short")
        self.assertTrue(any("short" in v.lower() for v in violations))

    def test_normal_brevity_no_length_check(self):
        # "normal" brevity imposes no line-count limit.
        text = "\n".join(f"Line {i}" for i in range(50))
        violations = validate_output_format(text, "paragraph", "normal")
        self.assertEqual(violations, [])

    def test_empty_output(self):
        violations = validate_output_format("", "paragraph", "normal")
        self.assertTrue(any("empty" in v.lower() for v in violations))
1109
+
1110
+
1111
class TestFormatViolationsInstruction(unittest.TestCase):
    """Tests that format_violations_instruction turns violation strings into
    a rewrite directive containing the header, each violation, and 'Rewrite'."""

    def test_produces_instruction(self):
        violations = ["Output has bullets.", "Too many lines."]
        result = format_violations_instruction(violations)
        self.assertIn("FORMAT VIOLATIONS", result)
        self.assertIn("Output has bullets.", result)
        self.assertIn("Too many lines.", result)
        self.assertIn("Rewrite", result)

    def test_empty_violations(self):
        # Even with no violations the header is still produced.
        result = format_violations_instruction([])
        self.assertIn("FORMAT VIOLATIONS", result)
1124
+
1125
+
1126
+ # ============================================================
1127
+ # Test: Task Assumptions Parsing
1128
+ # ============================================================
1129
+
1130
class TestTaskAssumptions(unittest.TestCase):
    """Tests for parse_task_assumptions (planner-output parsing) and
    format_assumptions_for_prompt (prompt-injection formatting)."""

    def test_parse_assumptions_basic(self):
        plan = (
            "TASK ASSUMPTIONS:\n"
            "- cost_model: per-unit pricing\n"
            "- coverage_rate: 95%\n"
            "- time_frame: 2024 Q4\n"
            "TASK BREAKDOWN:\n"
            "1. Do the thing"
        )
        result = parse_task_assumptions(plan)
        self.assertEqual(result["cost_model"], "per-unit pricing")
        self.assertEqual(result["coverage_rate"], "95%")
        self.assertEqual(result["time_frame"], "2024 Q4")

    def test_parse_assumptions_missing_section(self):
        # No TASK ASSUMPTIONS header at all -> empty dict.
        plan = "TASK BREAKDOWN:\n1. Do the thing"
        result = parse_task_assumptions(plan)
        self.assertEqual(result, {})

    def test_parse_assumptions_multiple_headers(self):
        # Parsing must stop at the next planner header (ROLE TO CALL:).
        plan = (
            "TASK ASSUMPTIONS:\n"
            "units: metric\n"
            "scope: global\n"
            "ROLE TO CALL:\n"
            "Technical Specialist"
        )
        result = parse_task_assumptions(plan)
        self.assertEqual(result["units"], "metric")
        self.assertEqual(result["scope"], "global")
        self.assertNotIn("technical_specialist", result)

    def test_parse_assumptions_normalises_keys(self):
        # Keys are lowercased and spaces become underscores.
        plan = "TASK ASSUMPTIONS:\nCost Model: expensive\n"
        result = parse_task_assumptions(plan)
        self.assertIn("cost_model", result)

    def test_format_assumptions_empty(self):
        result = format_assumptions_for_prompt({})
        self.assertEqual(result, "")

    def test_format_assumptions_nonempty(self):
        result = format_assumptions_for_prompt({"units": "metric", "scope": "global"})
        self.assertIn("SHARED TASK ASSUMPTIONS", result)
        self.assertIn("units: metric", result)
        self.assertIn("scope: global", result)
        self.assertIn("do NOT invent your own", result)
1179
+
1180
+
1181
+ # ============================================================
1182
+ # Test: PlannerState Assumptions & Revision Instruction
1183
+ # ============================================================
1184
+
1185
class TestPlannerStateNewFields(unittest.TestCase):
    """Tests the new PlannerState fields (task_assumptions,
    revision_instruction) in both to_state_dict and to_context_string."""

    def test_task_assumptions_in_state_dict(self):
        ps = PlannerState(user_request="test")
        ps.task_assumptions = {"units": "metric", "scope": "global"}
        d = ps.to_state_dict()
        self.assertEqual(d["task_assumptions"], {"units": "metric", "scope": "global"})

    def test_revision_instruction_in_state_dict(self):
        ps = PlannerState(user_request="test")
        ps.revision_instruction = "Fix the table format."
        d = ps.to_state_dict()
        self.assertEqual(d["revision_instruction"], "Fix the table format.")

    def test_task_assumptions_in_context_string(self):
        ps = PlannerState(user_request="test")
        ps.task_assumptions = {"rate": "5%"}
        ctx = ps.to_context_string()
        self.assertIn("rate: 5%", ctx)
        self.assertIn("Shared assumptions", ctx)

    def test_revision_instruction_in_context_string(self):
        ps = PlannerState(user_request="test")
        ps.revision_instruction = "Shorten the output."
        ctx = ps.to_context_string()
        self.assertIn("Shorten the output.", ctx)

    def test_empty_assumptions_not_in_context(self):
        # The "Shared assumptions" line is only emitted when non-empty.
        ps = PlannerState(user_request="test")
        ctx = ps.to_context_string()
        self.assertNotIn("Shared assumptions", ctx)
1216
+
1217
+
1218
+ # ============================================================
1219
+ # Test: Task-Aware Scenarios
1220
+ # ============================================================
1221
+
1222
  class TestTaskAwareScenarios(unittest.TestCase):
1223
  """End-to-end scenario tests validating the 4 user-specified cases."""
1224
 
workflow_helpers.py CHANGED
@@ -523,6 +523,11 @@ def select_relevant_roles(
523
  if kw.lower() in lower:
524
  score += 1
525
 
 
 
 
 
 
526
  # Task-category affinity bonus
527
  role_tasks = meta.get("task_types", [])
528
  if task_category in role_tasks:
@@ -751,10 +756,12 @@ class PlannerState:
751
  selected_roles: List[str] = field(default_factory=list)
752
  specialist_outputs: Dict[str, str] = field(default_factory=dict)
753
  evidence: Optional[Dict] = None # serialised EvidenceResult
 
754
  current_draft: str = ""
755
  qa_result: Optional[QAResult] = None
756
  revision_count: int = 0
757
  max_revisions: int = 3
 
758
  failure_history: List[FailureRecord] = field(default_factory=list)
759
  history: List[Dict[str, str]] = field(default_factory=list)
760
  final_answer: str = ""
@@ -829,10 +836,15 @@ class PlannerState:
829
  ]
830
  if self.success_criteria:
831
  lines.append(f"Success criteria: {'; '.join(self.success_criteria)}")
 
 
 
832
  if self.evidence:
833
  conf = self.evidence.get("confidence", "unknown")
834
  n_items = len(self.evidence.get("results", []))
835
  lines.append(f"Evidence: {n_items} items (confidence: {conf})")
 
 
836
  if self.qa_result and not self.qa_result.passed:
837
  lines.append(f"QA status: FAIL — {self.qa_result.reason}")
838
  if self.qa_result.correction_instruction:
@@ -856,9 +868,11 @@ class PlannerState:
856
  "selected_roles": self.selected_roles,
857
  "specialist_outputs": self.specialist_outputs,
858
  "evidence": self.evidence,
 
859
  "current_draft": self.current_draft[:500],
860
  "revision_count": self.revision_count,
861
  "max_revisions": self.max_revisions,
 
862
  "failure_history": [f.to_dict() for f in self.failure_history],
863
  "final_answer": self.final_answer[:500] if self.final_answer else "",
864
  }
@@ -931,3 +945,114 @@ def get_qa_format_instruction(output_format: str, brevity: str) -> str:
931
  if brevity in ("minimal", "short"):
932
  rules.append("FAIL if the output is excessively verbose for a brevity requirement.")
933
  return "\n".join(rules) if rules else ""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
523
  if kw.lower() in lower:
524
  score += 1
525
 
526
+ # Domain affinity — boost if the request touches a role's domain
527
+ for domain in meta.get("domains", []):
528
+ if domain.lower() in lower:
529
+ score += 1
530
+
531
  # Task-category affinity bonus
532
  role_tasks = meta.get("task_types", [])
533
  if task_category in role_tasks:
 
756
  selected_roles: List[str] = field(default_factory=list)
757
  specialist_outputs: Dict[str, str] = field(default_factory=dict)
758
  evidence: Optional[Dict] = None # serialised EvidenceResult
759
+ task_assumptions: Dict[str, str] = field(default_factory=dict)
760
  current_draft: str = ""
761
  qa_result: Optional[QAResult] = None
762
  revision_count: int = 0
763
  max_revisions: int = 3
764
+ revision_instruction: str = "" # latest revision instruction from planner
765
  failure_history: List[FailureRecord] = field(default_factory=list)
766
  history: List[Dict[str, str]] = field(default_factory=list)
767
  final_answer: str = ""
 
836
  ]
837
  if self.success_criteria:
838
  lines.append(f"Success criteria: {'; '.join(self.success_criteria)}")
839
+ if self.task_assumptions:
840
+ assumptions_str = "; ".join(f"{k}: {v}" for k, v in self.task_assumptions.items())
841
+ lines.append(f"Shared assumptions: {assumptions_str}")
842
  if self.evidence:
843
  conf = self.evidence.get("confidence", "unknown")
844
  n_items = len(self.evidence.get("results", []))
845
  lines.append(f"Evidence: {n_items} items (confidence: {conf})")
846
+ if self.revision_instruction:
847
+ lines.append(f"Revision instruction: {self.revision_instruction}")
848
  if self.qa_result and not self.qa_result.passed:
849
  lines.append(f"QA status: FAIL — {self.qa_result.reason}")
850
  if self.qa_result.correction_instruction:
 
868
  "selected_roles": self.selected_roles,
869
  "specialist_outputs": self.specialist_outputs,
870
  "evidence": self.evidence,
871
+ "task_assumptions": self.task_assumptions,
872
  "current_draft": self.current_draft[:500],
873
  "revision_count": self.revision_count,
874
  "max_revisions": self.max_revisions,
875
+ "revision_instruction": self.revision_instruction,
876
  "failure_history": [f.to_dict() for f in self.failure_history],
877
  "final_answer": self.final_answer[:500] if self.final_answer else "",
878
  }
 
945
  if brevity in ("minimal", "short"):
946
  rules.append("FAIL if the output is excessively verbose for a brevity requirement.")
947
  return "\n".join(rules) if rules else ""
948
+
949
+
950
+ # ============================================================
951
+ # Output Format Validation (pre-QA structural check)
952
+ # ============================================================
953
+
954
def validate_output_format(text: str, output_format: str, brevity: str) -> List[str]:
    """Check structural format constraints before QA.

    Returns a list of violation descriptions; an empty list means the output
    is structurally valid. This catches the common problems the synthesizer
    repeatedly ignores (e.g. bullet lists when paragraph-only was requested).
    """
    stripped = text.strip()
    if not stripped:
        return ["Output is empty."]

    problems: List[str] = []

    # Structural features of the draft.
    bullets = re.search(r"^[\s]*[-•*]\s", stripped, re.MULTILINE) is not None
    numbered = re.search(r"^[\s]*\d+[.)]\s", stripped, re.MULTILINE) is not None
    headings = re.search(r"^#{1,4}\s", stripped, re.MULTILINE) is not None
    table = re.search(r"\|.*\|.*\|", stripped) is not None
    fenced = "```" in stripped
    n_lines = sum(1 for ln in stripped.splitlines() if ln.strip())

    if output_format == "paragraph":
        # Prose-only: no list structure, no headings, no tables.
        checks = (
            (bullets or numbered,
             "Paragraph format requested but output contains bullet/numbered lists."),
            (headings,
             "Paragraph format requested but output contains markdown headings."),
            (table,
             "Paragraph format requested but output contains a table."),
        )
        for bad, message in checks:
            if bad:
                problems.append(message)

    elif output_format == "code":
        # Accept either a fenced block or text that looks like real code.
        looks_like_code = fenced or re.search(
            r"(?:def |class |import |function |const |let |var )", stripped
        )
        if not looks_like_code:
            problems.append("Code format requested but output contains no code block or recognisable code.")

    elif output_format == "table":
        if not table:
            problems.append("Table format requested but output contains no markdown table.")

    elif output_format == "single_choice":
        if n_lines > 5:
            problems.append("Single choice requested but output is multi-section (too many lines).")
        if bullets and n_lines > 3:
            problems.append("Single choice requested but output contains a bullet list.")

    # Brevity constraints apply regardless of output format.
    if brevity == "minimal" and n_lines > 8:
        problems.append(f"Minimal brevity requested but output has {n_lines} lines.")
    elif brevity == "short" and n_lines > 20:
        problems.append(f"Short brevity requested but output has {n_lines} lines.")

    return problems
1003
+
1004
+
1005
def format_violations_instruction(violations: List[str]) -> str:
    """Turn format violation descriptions into a synthesis rewrite instruction."""
    header = "FORMAT VIOLATIONS DETECTED — you MUST fix these before QA:\n"
    bullet_list = "\n".join(f"- {v}" for v in violations)
    footer = "\nRewrite the output to satisfy the required format strictly."
    return header + bullet_list + footer
1012
+
1013
+
1014
+ # ============================================================
1015
+ # Shared Assumptions Parsing
1016
+ # ============================================================
1017
+
1018
def parse_task_assumptions(plan_text: str) -> Dict[str, str]:
    """Extract TASK ASSUMPTIONS from planner output.

    Looks for lines like 'key: value' under a 'TASK ASSUMPTIONS:' header.
    Keys are normalised (lowercase, spaces -> underscores); leading bullet
    markers are stripped. Returns a dict of assumption key -> value; empty
    dict when the header is absent.

    Fix: the section must end at the EARLIEST following planner header in the
    text, not at the first header in declaration order. The old code split at
    whichever header came first in the tuple, so a plan laid out as
    'TASK ASSUMPTIONS ... ROLE TO CALL ... TASK BREAKDOWN' leaked the
    'ROLE TO CALL' content into the assumptions dict.
    """
    assumptions: Dict[str, str] = {}
    if "TASK ASSUMPTIONS:" not in plan_text:
        return assumptions

    section = plan_text.split("TASK ASSUMPTIONS:", 1)[1]

    headers = (
        "TASK BREAKDOWN:", "ROLE TO CALL:", "SUCCESS CRITERIA:",
        "GUIDANCE FOR SPECIALIST:", "REVISED INSTRUCTIONS:",
    )
    # Truncate at the earliest occurring header, if any.
    cut = min((section.index(h) for h in headers if h in section), default=None)
    if cut is not None:
        section = section[:cut]

    for line in section.strip().splitlines():
        line = line.strip().lstrip("•-* ")
        if ":" not in line:
            continue
        key, _, value = line.partition(":")
        key = key.strip().lower().replace(" ", "_")
        value = value.strip()
        # Skip blank keys/values (e.g. bare header-like lines).
        if key and value:
            assumptions[key] = value

    return assumptions
1049
+
1050
+
1051
def format_assumptions_for_prompt(assumptions: Dict[str, str]) -> str:
    """Format shared assumptions for injection into specialist prompts."""
    if not assumptions:
        return ""
    header = "SHARED TASK ASSUMPTIONS (use these — do NOT invent your own):"
    body = [f" - {key}: {value}" for key, value in assumptions.items()]
    return "\n".join([header, *body])