| from pluto.models import ( |
| Claim, |
| ClaimStatus, |
| Evidence, |
| ExtractOutput, |
| ExtractedContent, |
| Importance, |
| KeyClaim, |
| MergeOutput, |
| ModeName, |
| ChunkType, |
| Synthesis, |
| ) |
| from pluto.bus import MessageBus |
| from pluto.stages import evidence_check as evidence_check_stage |
| from pluto.stages.evidence_check import _parse_evidence_check, run_evidence_check |
| from pluto.tracer import Tracer |
|
|
|
|
def test_parse_evidence_check_dump():
    """Parse a reply that wraps a JSON object in prose and check its fields.

    The payload carries one supported claim (with document and chunk ids),
    one unsupported claim, and the unsupported-claim / follow-up lists.
    """
    llm_reply = """
    Here is the result:
    {
      "checked_claims": [
        {
          "claim": "The model reaches 91% accuracy.",
          "status": "supported",
          "evidence_doc_id": "paper_a",
          "evidence_chunk_id": "C3",
          "reason": "Directly stated in the results section."
        },
        {
          "claim": "The training set contains 2 million images.",
          "status": "unsupported",
          "reason": "No evidence in the supplied excerpts."
        }
      ],
      "unsupported_claims": ["The training set contains 2 million images."],
      "required_followups": ["Upload the appendix for dataset details."]
    }
    """

    parsed = _parse_evidence_check(llm_reply)
    report = parsed.evidence_check

    # Check the count before indexing so an empty list fails as an assertion.
    assert len(report.checked_claims) == 2

    first_claim = report.checked_claims[0]
    assert first_claim.status.value == "supported"
    assert first_claim.evidence[0].doc_id == "paper_a"

    expected_unsupported = ["The training set contains 2 million images."]
    expected_followups = ["Upload the appendix for dataset details."]
    assert report.unsupported_claims == expected_unsupported
    assert report.required_followups == expected_followups
|
|
|
|
def test_evidence_check_directly_supports_matching_claim_without_dispatch(monkeypatch):
    """A key claim whose text equals an extracted claim comes back supported.

    ``dispatch`` on the stage module is swapped for a stub that raises
    ``AssertionError``, so any call to it surfaces as an error.
    """
    claim_text = "The system achieved 0% ASR across defended scenarios."

    def _forbidden_dispatch(*_args, **_kwargs):
        raise AssertionError("dispatch should not be called for an obvious direct evidence match")

    monkeypatch.setattr(evidence_check_stage, "dispatch", _forbidden_dispatch)

    # Build the fixture bottom-up: evidence -> claim -> extraction.
    quoted_evidence = Evidence(
        doc_id="paper_a",
        chunk_id="C8",
        where="results",
        quote="0% ASR across defended scenarios",
    )
    extracted_claim = Claim(
        claim_id="cl1",
        text=claim_text,
        importance=Importance.HIGH,
        evidence=quoted_evidence,
    )
    extraction = ExtractOutput(
        doc_id="paper_a",
        chunk_id="C8",
        chunk_type=ChunkType.TEXT,
        mode_used=ModeName.MODE_REASONING,
        extracted=ExtractedContent(claims=[extracted_claim]),
    )
    synthesis = Synthesis(key_claims=[KeyClaim(claim=claim_text)])
    merged = MergeOutput(synthesis=synthesis)

    result = run_evidence_check(merged, [extraction], Tracer())
    report = result.evidence_check

    assert len(report.checked_claims) == 1

    only_claim = report.checked_claims[0]
    assert only_claim.status == ClaimStatus.SUPPORTED
    assert only_claim.evidence[0].doc_id == "paper_a"
    assert report.unsupported_claims == []
|
|
|
|
def test_evidence_check_suppresses_followups_for_single_unsupported_outlier(monkeypatch):
    """One unsupported claim among three yields no follow-ups and no gap report.

    Two of the three key claims have an extracted claim with identical text;
    the third has none. The test expects the third to be listed as
    unsupported, ``required_followups`` to be empty, and no ``gap_report``
    message on the bus. ``dispatch`` is stubbed to raise if called.
    """

    def _forbidden_dispatch(*_args, **_kwargs):
        raise AssertionError("dispatch should not be called for direct matches or suppressed followups")

    monkeypatch.setattr(evidence_check_stage, "dispatch", _forbidden_dispatch)

    asr_text = "The system achieved 0% ASR across defended scenarios."
    routing_text = "The coordinator routes adversarial prompts through a defense worker."
    outlier_text = "The appendix reports a 12% latency reduction on unseen workloads."

    merged = MergeOutput(
        synthesis=Synthesis(
            key_claims=[
                KeyClaim(claim=text)
                for text in (asr_text, routing_text, outlier_text)
            ]
        )
    )

    # (claim_id, text, evidence chunk, evidence location, quote)
    claim_specs = [
        ("cl1", asr_text, "C8", "results", "0% ASR across defended scenarios"),
        ("cl2", routing_text, "C4", "method", "routes adversarial prompts through a defense worker"),
    ]
    backed_claims = [
        Claim(
            claim_id=claim_id,
            text=text,
            importance=Importance.HIGH,
            evidence=Evidence(
                doc_id="paper_a",
                chunk_id=chunk_id,
                where=where,
                quote=quote,
            ),
        )
        for claim_id, text, chunk_id, where, quote in claim_specs
    ]
    extraction = ExtractOutput(
        doc_id="paper_a",
        chunk_id="C8",
        chunk_type=ChunkType.TEXT,
        mode_used=ModeName.MODE_REASONING,
        extracted=ExtractedContent(claims=backed_claims),
    )

    bus = MessageBus()
    result = run_evidence_check(merged, [extraction], Tracer(), bus=bus)
    report = result.evidence_check

    assert report.unsupported_claims == [outlier_text]
    assert report.required_followups == []
    assert bus.read(msg_type="gap_report") == []
|
|
|
|
def test_evidence_check_generates_specific_followups_when_answer_is_unsupported(monkeypatch):
    """With no extractions, every key claim is unsupported and gets a follow-up.

    The test expects one claim-specific follow-up question per key claim and
    expects the latest ``gap_report`` message on the bus to carry the same
    list under ``payload["gaps"]``. ``dispatch`` is stubbed to raise if called.
    """

    def _forbidden_dispatch(*_args, **_kwargs):
        raise AssertionError("dispatch should not be called when no evidence candidates exist")

    monkeypatch.setattr(evidence_check_stage, "dispatch", _forbidden_dispatch)

    latency_text = "The appendix reports a 12% latency reduction on unseen workloads."
    recovery_text = "The architecture introduces a separate recovery worker for post-attack repair."
    merged = MergeOutput(
        synthesis=Synthesis(
            key_claims=[KeyClaim(claim=latency_text), KeyClaim(claim=recovery_text)]
        )
    )

    bus = MessageBus()
    no_extractions = []
    result = run_evidence_check(merged, no_extractions, Tracer(), bus=bus)
    report = result.evidence_check

    expected_followups = [
        "Which result or metric in the document directly supports: The appendix reports a 12% latency reduction on unseen workloads?",
        "Where does the document explicitly describe: The architecture introduces a separate recovery worker for post-attack repair?",
    ]
    assert report.unsupported_claims == [latency_text, recovery_text]
    assert report.required_followups == expected_followups

    gap_message = bus.latest("gap_report")
    assert gap_message is not None
    assert gap_message.payload["gaps"] == report.required_followups
|
|