from pluto.models import (
    Claim,
    ClaimStatus,
    Evidence,
    ExtractOutput,
    ExtractedContent,
    Importance,
    KeyClaim,
    MergeOutput,
    ModeName,
    ChunkType,
    Synthesis,
)
from pluto.bus import MessageBus
from pluto.stages import evidence_check as evidence_check_stage
from pluto.stages.evidence_check import _parse_evidence_check, run_evidence_check
from pluto.tracer import Tracer


def _forbid_dispatch(monkeypatch, message):
    """Replace the stage module's ``dispatch`` with a stub that fails the test.

    Any call to the patched attribute raises ``AssertionError(message)``.
    """

    def _raise(*args, **kwargs):
        raise AssertionError(message)

    monkeypatch.setattr(evidence_check_stage, "dispatch", _raise)


def _merge_output(*claim_texts):
    """Build a ``MergeOutput`` whose synthesis holds one ``KeyClaim`` per text."""
    key_claims = [KeyClaim(claim=text) for text in claim_texts]
    return MergeOutput(synthesis=Synthesis(key_claims=key_claims))


def _high_claim(claim_id, text, *, chunk_id, where, quote):
    """Build a HIGH-importance ``Claim`` with evidence located in ``paper_a``."""
    located = Evidence(
        doc_id="paper_a",
        chunk_id=chunk_id,
        where=where,
        quote=quote,
    )
    return Claim(
        claim_id=claim_id,
        text=text,
        importance=Importance.HIGH,
        evidence=located,
    )


def _paper_a_extraction(*claims):
    """Wrap *claims* in the single ``paper_a``/``C8`` text extraction used here."""
    return ExtractOutput(
        doc_id="paper_a",
        chunk_id="C8",
        chunk_type=ChunkType.TEXT,
        mode_used=ModeName.MODE_REASONING,
        extracted=ExtractedContent(claims=list(claims)),
    )


def test_parse_evidence_check_dump():
    """A reply with prose before the JSON object is parsed into checked claims."""
    # Adjacent literals concatenate into one single-line payload: a short
    # prose preamble followed by the JSON object.
    raw = (
        ' Here is the result: { "checked_claims": [ '
        '{ "claim": "The model reaches 91% accuracy.", '
        '"status": "supported", '
        '"evidence_doc_id": "paper_a", '
        '"evidence_chunk_id": "C3", '
        '"reason": "Directly stated in the results section." }, '
        '{ "claim": "The training set contains 2 million images.", '
        '"status": "unsupported", '
        '"reason": "No evidence in the supplied excerpts." } ], '
        '"unsupported_claims": ["The training set contains 2 million images."], '
        '"required_followups": ["Upload the appendix for dataset details."] } '
    )

    parsed = _parse_evidence_check(raw).evidence_check
    first = parsed.checked_claims[0]

    assert len(parsed.checked_claims) == 2
    assert first.status.value == "supported"
    assert first.evidence[0].doc_id == "paper_a"
    assert parsed.unsupported_claims == ["The training set contains 2 million images."]
    assert parsed.required_followups == ["Upload the appendix for dataset details."]


def test_evidence_check_directly_supports_matching_claim_without_dispatch(monkeypatch):
    """A key claim identical to an extracted claim is SUPPORTED with no dispatch call."""
    _forbid_dispatch(
        monkeypatch,
        "dispatch should not be called for an obvious direct evidence match",
    )

    merge_output = _merge_output(
        "The system achieved 0% ASR across defended scenarios.",
    )
    extractions = [
        _paper_a_extraction(
            _high_claim(
                "cl1",
                "The system achieved 0% ASR across defended scenarios.",
                chunk_id="C8",
                where="results",
                quote="0% ASR across defended scenarios",
            ),
        )
    ]

    report = run_evidence_check(merge_output, extractions, Tracer()).evidence_check
    checked = report.checked_claims

    assert len(checked) == 1
    assert checked[0].status == ClaimStatus.SUPPORTED
    assert checked[0].evidence[0].doc_id == "paper_a"
    assert report.unsupported_claims == []


def test_evidence_check_suppresses_followups_for_single_unsupported_outlier(monkeypatch):
    """One unsupported claim among matched ones yields no follow-ups and no gap report."""
    _forbid_dispatch(
        monkeypatch,
        "dispatch should not be called for direct matches or suppressed followups",
    )

    merge_output = _merge_output(
        "The system achieved 0% ASR across defended scenarios.",
        "The coordinator routes adversarial prompts through a defense worker.",
        "The appendix reports a 12% latency reduction on unseen workloads.",
    )
    # Only the first two key claims have a matching extracted claim.
    extractions = [
        _paper_a_extraction(
            _high_claim(
                "cl1",
                "The system achieved 0% ASR across defended scenarios.",
                chunk_id="C8",
                where="results",
                quote="0% ASR across defended scenarios",
            ),
            _high_claim(
                "cl2",
                "The coordinator routes adversarial prompts through a defense worker.",
                chunk_id="C4",
                where="method",
                quote="routes adversarial prompts through a defense worker",
            ),
        )
    ]
    bus = MessageBus()

    report = run_evidence_check(
        merge_output, extractions, Tracer(), bus=bus
    ).evidence_check

    assert report.unsupported_claims == [
        "The appendix reports a 12% latency reduction on unseen workloads."
    ]
    assert report.required_followups == []
    assert bus.read(msg_type="gap_report") == []


def test_evidence_check_generates_specific_followups_when_answer_is_unsupported(monkeypatch):
    """With no extractions, every claim is unsupported and follow-ups reach the bus."""
    _forbid_dispatch(
        monkeypatch,
        "dispatch should not be called when no evidence candidates exist",
    )

    merge_output = _merge_output(
        "The appendix reports a 12% latency reduction on unseen workloads.",
        "The architecture introduces a separate recovery worker for post-attack repair.",
    )
    bus = MessageBus()

    report = run_evidence_check(merge_output, [], Tracer(), bus=bus).evidence_check

    assert report.unsupported_claims == [
        "The appendix reports a 12% latency reduction on unseen workloads.",
        "The architecture introduces a separate recovery worker for post-attack repair.",
    ]
    assert report.required_followups == [
        "Which result or metric in the document directly supports: The appendix reports a 12% latency reduction on unseen workloads?",
        "Where does the document explicitly describe: The architecture introduces a separate recovery worker for post-attack repair?",
    ]

    # The newest gap_report message carries the same follow-up list.
    latest = bus.latest("gap_report")
    assert latest is not None
    assert latest.payload["gaps"] == report.required_followups