"""Session 9 - Cortex subagent tests (WorldModeler, Planner, Critic). Per Phase A docs/CORTEX_ARCHITECTURE.md Decisions 1-8 + 62 and the user's proposal acceptance with 11 tests total. RED-tests-first. """ from __future__ import annotations import json import re from typing import Any, Dict, Optional import pytest from cortex.schemas import ( BeliefState, CandidatePlan, CriticReport, EvidenceCitation, PerceptionReport, SubagentInput, ) from cortex.subagents import ( PROMPTS_DIR, CriticSubagent, PlannerSubagent, WorldModelerSubagent, ) from tests._helpers.llm_stub import StubLLMClient # ============================================================================ # Test fixtures # ============================================================================ _VALID_BELIEF_PAYLOAD: Dict[str, Any] = { "brain": "epidemiology", "latent_estimates": { "R1": { "estimated_infection_rate": 0.05, "estimated_r_effective": 1.2, "estimated_compliance": 0.85, "confidence_intervals": {}, }, }, "hypotheses": [{"label": "rising-r1", "weight": 0.6, "explanation": "telemetry trending up"}], "uncertainty": 0.4, "reducible_by_more_thought": 0.3, "evidence": [ {"source": "telemetry", "ref": "R1.reported_cases@t3", "excerpt": "rising"}, {"source": "policy", "ref": "R1.restriction", "excerpt": "moderate"}, ], } _VALID_PLAN_PAYLOAD: Dict[str, Any] = { "action_sketch": "Deploy 100 test_kits to R1", "expected_outer_action": { "kind": "deploy_resource", "region": "R1", "resource_type": "test_kits", "quantity": 100, }, "expected_value": 0.6, "cost": 200.0, "assumptions": ["kits inventory > 100"], "falsifiers": ["R1 cases drop without intervention"], "confidence": 0.75, } _VALID_CRITIC_PAYLOAD: Dict[str, Any] = { "brain": "logistics", "target_plan_id": "plan-1", "attacks": ["ignores R3 hospital saturation"], "missing_considerations": ["compliance decay over 4 ticks"], "would_change_mind_if": ["new R3 telemetry shows under-utilisation"], "severity": 0.6, } def _valid_json_for(role: str, brain: str = "epidemiology") -> str: """Return a valid JSON-schema response for ``role``, brain-substituted.""" if role == "world_modeler": payload = dict(_VALID_BELIEF_PAYLOAD) payload["brain"] = brain return json.dumps(payload) if role == "planner": return json.dumps(_VALID_PLAN_PAYLOAD) if role == "critic": payload = dict(_VALID_CRITIC_PAYLOAD) payload["brain"] = brain return json.dumps(payload) raise ValueError(f"unknown role: {role}") def _make_subagent_input( brain: str = "epidemiology", role: str = "world_modeler", tick: int = 3, round_: int = 1, target_plan_id: Optional[str] = None, ) -> SubagentInput: """Minimal valid SubagentInput. ``round_`` arg avoids shadowing builtin.""" return SubagentInput( brain=brain, role=role, tick=tick, round=round_, perception=PerceptionReport( brain=brain, salient_signals=["R1 cases rising"], anomalies=[], confidence=0.7, evidence=[EvidenceCitation(source="telemetry", ref="R1.cases", excerpt="rising")], ), prior_belief=None, prior_plans=[], target_plan_id=target_plan_id, last_reward=0.5, recent_action_log_excerpt=[], ) # ============================================================================ # T1 - WorldModeler emits BeliefState # ============================================================================ def test_world_modeler_emits_belief_state() -> None: stub = StubLLMClient(scripted_responses=[_valid_json_for("world_modeler", "epidemiology")]) agent = WorldModelerSubagent(llm_client=stub) input_data = _make_subagent_input(brain="epidemiology", role="world_modeler") result = agent.run(input_data, step_idx=0) assert isinstance(result, BeliefState) assert result.brain == "epidemiology" assert "R1" in result.latent_estimates assert len(result.evidence) >= 1 assert stub.call_count == 1 # ============================================================================ # T2 - Planner emits CandidatePlan # ============================================================================ def test_planner_emits_candidate_plan() -> None: stub = StubLLMClient(scripted_responses=[_valid_json_for("planner", "epidemiology")]) agent = PlannerSubagent(llm_client=stub) input_data = _make_subagent_input(brain="epidemiology", role="planner") result = agent.run(input_data, step_idx=1) assert isinstance(result, CandidatePlan) assert result.expected_outer_action.kind == "deploy_resource" assert result.confidence == 0.75 # ============================================================================ # T3 - Critic emits CriticReport # ============================================================================ def test_critic_emits_critic_report() -> None: stub = StubLLMClient(scripted_responses=[_valid_json_for("critic", "logistics")]) agent = CriticSubagent(llm_client=stub) input_data = _make_subagent_input(brain="logistics", role="critic", target_plan_id="plan-1") result = agent.run(input_data, step_idx=2) assert isinstance(result, CriticReport) assert result.brain == "logistics" assert result.target_plan_id == "plan-1" assert result.severity == 0.6 # ============================================================================ # T4 - Parse failure then retry succeeds (2 LLM calls) # ============================================================================ def test_subagent_parse_failure_then_retry_succeeds() -> None: stub = StubLLMClient( scripted_responses=["not-json-garbage", _valid_json_for("world_modeler", "epidemiology")] ) agent = WorldModelerSubagent(llm_client=stub) result = agent.run(_make_subagent_input(), step_idx=0) assert isinstance(result, BeliefState) assert result.brain == "epidemiology" assert stub.call_count == 2, "expected 1 initial call + 1 retry" # ============================================================================ # T5 - Parse failure twice -> empty fallback (no third LLM call) # ============================================================================ def test_subagent_parse_failure_then_retry_fails_returns_empty() -> None: stub = StubLLMClient(scripted_responses=["garbage-1", "garbage-2"]) agent = WorldModelerSubagent(llm_client=stub) result = agent.run(_make_subagent_input(), step_idx=0) assert isinstance(result, BeliefState) assert result.brain == "epidemiology" assert result.latent_estimates == {} assert result.hypotheses == [] assert result.evidence == [] assert result.uncertainty == 1.0 assert result.reducible_by_more_thought == 0.0 assert stub.call_count == 2, "must NOT make a 3rd call after retry failure" # ============================================================================ # T6 - caller_id format matches Phase A Decision 7 # ============================================================================ _CALLER_ID_RE = re.compile( r"^cortex:(epidemiology|logistics|governance):" r"(world_modeler|planner|critic):" r"t\d+:r[12]:s\d+$" ) @pytest.mark.parametrize( "role_cls,brain,role_name", [ (WorldModelerSubagent, "epidemiology", "world_modeler"), (PlannerSubagent, "logistics", "planner"), (CriticSubagent, "governance", "critic"), ], ) def test_subagent_caller_id_format( role_cls: type, brain: str, role_name: str, ) -> None: stub = StubLLMClient(scripted_responses=[_valid_json_for(role_name, brain)]) agent = role_cls(llm_client=stub) input_data = _make_subagent_input( brain=brain, role=role_name, tick=7, round_=2, target_plan_id="plan-X" if role_name == "critic" else None, ) agent.run(input_data, step_idx=4) caller_id = stub.calls[0].caller_id assert _CALLER_ID_RE.match(caller_id), ( f"caller_id={caller_id!r} does not match the locked format" ) assert caller_id == f"cortex:{brain}:{role_name}:t7:r2:s4" # ============================================================================ # T8 - SYS prompt loaded from prompts/.txt and brain-formatted # (folds in the prompt-formatting refinement: format() must not raise) # ============================================================================ @pytest.mark.parametrize( "role_cls,role_name", [ (WorldModelerSubagent, "world_modeler"), (PlannerSubagent, "planner"), (CriticSubagent, "critic"), ], ) def test_subagent_uses_loaded_prompt_from_file(role_cls: type, role_name: str) -> None: raw = (PROMPTS_DIR / f"{role_name}.txt").read_text(encoding="utf-8") # Refinement: format() must not raise even with extra kwargs ignored. # Catches {{/}}-escape regressions in JSON-schema sections of the prompts. formatted = raw.format(brain="epidemiology", target_plan_id="plan-X") assert isinstance(formatted, str) assert "epidemiology" in formatted stub = StubLLMClient(scripted_responses=[_valid_json_for(role_name, "epidemiology")]) agent = role_cls(llm_client=stub) input_data = _make_subagent_input( brain="epidemiology", role=role_name, target_plan_id="plan-X" if role_name == "critic" else None, ) agent.run(input_data, step_idx=0) sys_msg = stub.calls[0].messages[0] assert sys_msg.role == "system" assert sys_msg.content == formatted # ============================================================================ # T9 - Token counter is billed to the expected caller_id # ============================================================================ def test_subagent_token_counter_billed_correctly() -> None: stub = StubLLMClient(scripted_responses=[_valid_json_for("world_modeler", "epidemiology")]) agent = WorldModelerSubagent(llm_client=stub) input_data = _make_subagent_input(brain="epidemiology", role="world_modeler", tick=3, round_=1) agent.run(input_data, step_idx=0) expected_caller_id = "cortex:epidemiology:world_modeler:t3:r1:s0" assert stub.tokens_used_for(expected_caller_id) > 0, ( "tokens must be billed to the per-role caller_id, not silently lost" ) assert stub.tokens_used_for("never:called") == 0 # ============================================================================ # T10 - empty_fallback shape locked per Phase A Decisions 6 + 62 # ============================================================================ def test_subagent_empty_fallback_shape_locked() -> None: # WorldModeler: empty BeliefState bs = WorldModelerSubagent.empty_fallback("epidemiology") assert isinstance(bs, BeliefState) assert bs.brain == "epidemiology" assert bs.latent_estimates == {} assert bs.hypotheses == [] assert bs.uncertainty == 1.0 assert bs.reducible_by_more_thought == 0.0 assert bs.evidence == [] # Planner: empty CandidatePlan with NoOp + confidence=0 cp = PlannerSubagent.empty_fallback("epidemiology") assert isinstance(cp, CandidatePlan) assert cp.expected_outer_action.kind == "no_op" assert cp.expected_value == 0.0 assert cp.cost == 0.0 assert cp.assumptions == [] assert cp.falsifiers == [] assert cp.confidence == 0.0 # Critic: empty CriticReport with severity=0 cr = CriticSubagent.empty_fallback("epidemiology", target_plan_id="plan-X") assert isinstance(cr, CriticReport) assert cr.brain == "epidemiology" assert cr.target_plan_id == "plan-X" assert cr.attacks == [] assert cr.missing_considerations == [] assert cr.would_change_mind_if == [] assert cr.severity == 0.0 # ============================================================================ # T11 - retry call uses chat-history continuation (sys + usr + bad + retry) # ============================================================================ def test_subagent_run_uses_chat_history_on_retry() -> None: stub = StubLLMClient( scripted_responses=["bad-json", _valid_json_for("world_modeler", "epidemiology")] ) agent = WorldModelerSubagent(llm_client=stub) agent.run(_make_subagent_input(brain="epidemiology", role="world_modeler"), step_idx=0) assert stub.call_count == 2, "expected 2 LLM calls (initial + retry)" call1, call2 = stub.calls # call 1 has the original sys + user (2 messages). assert len(call1.messages) == 2 assert call1.messages[0].role == "system" assert call1.messages[1].role == "user" # call 2 must contain: original sys + original user + assistant(bad-json) + retry-user. # Without chat-history continuation the LLM loses schema context on retry. assert len(call2.messages) == 4, "retry must append to the chat history, not start fresh" assert call2.messages[0].role == "system" assert call2.messages[0].content == call1.messages[0].content assert call2.messages[1].role == "user" assert call2.messages[1].content == call1.messages[1].content assert call2.messages[2].role == "assistant" assert call2.messages[2].content == "bad-json" assert call2.messages[3].role == "user" assert "failed to parse" in call2.messages[3].content.lower()