from pluto.models import (
Claim,
ClaimStatus,
Evidence,
ExtractOutput,
ExtractedContent,
Importance,
KeyClaim,
MergeOutput,
ModeName,
ChunkType,
Synthesis,
)
from pluto.bus import MessageBus
from pluto.stages import evidence_check as evidence_check_stage
from pluto.stages.evidence_check import _parse_evidence_check, run_evidence_check
from pluto.tracer import Tracer
def test_parse_evidence_check_dump():
    """Parse a reply whose JSON payload is preceded by a line of prose.

    The payload lists one supported claim (with document and chunk
    references) and one unsupported claim. The test checks the number of
    parsed claims, the first claim's status and evidence document, and the
    two summary lists.
    """
    reply_text = """
Here is the result:
{
"checked_claims": [
{
"claim": "The model reaches 91% accuracy.",
"status": "supported",
"evidence_doc_id": "paper_a",
"evidence_chunk_id": "C3",
"reason": "Directly stated in the results section."
},
{
"claim": "The training set contains 2 million images.",
"status": "unsupported",
"reason": "No evidence in the supplied excerpts."
}
],
"unsupported_claims": ["The training set contains 2 million images."],
"required_followups": ["Upload the appendix for dataset details."]
}
"""
    report = _parse_evidence_check(reply_text).evidence_check

    # Check the count first so an empty result fails on the assertion
    # rather than on the index lookup below.
    assert len(report.checked_claims) == 2

    first_claim = report.checked_claims[0]
    assert first_claim.status.value == "supported"
    assert first_claim.evidence[0].doc_id == "paper_a"

    assert report.unsupported_claims == ["The training set contains 2 million images."]
    assert report.required_followups == ["Upload the appendix for dataset details."]
def test_evidence_check_directly_supports_matching_claim_without_dispatch(monkeypatch):
    """A key claim identical to an extracted claim is reported as supported.

    ``dispatch`` on the evidence-check stage module is swapped for a stub
    that raises, so the test fails if the stage calls it.
    """
    claim_text = "The system achieved 0% ASR across defended scenarios."

    def _forbidden_dispatch(*args, **kwargs):
        raise AssertionError("dispatch should not be called for an obvious direct evidence match")

    monkeypatch.setattr(evidence_check_stage, "dispatch", _forbidden_dispatch)

    # Synthesis side: a single key claim.
    key_claim = KeyClaim(claim=claim_text)
    merge_output = MergeOutput(synthesis=Synthesis(key_claims=[key_claim]))

    # Extraction side: the same claim text, carrying its own evidence.
    supporting_evidence = Evidence(
        doc_id="paper_a",
        chunk_id="C8",
        where="results",
        quote="0% ASR across defended scenarios",
    )
    extracted_claim = Claim(
        claim_id="cl1",
        text=claim_text,
        importance=Importance.HIGH,
        evidence=supporting_evidence,
    )
    extraction = ExtractOutput(
        doc_id="paper_a",
        chunk_id="C8",
        chunk_type=ChunkType.TEXT,
        mode_used=ModeName.MODE_REASONING,
        extracted=ExtractedContent(claims=[extracted_claim]),
    )

    report = run_evidence_check(merge_output, [extraction], Tracer()).evidence_check

    assert len(report.checked_claims) == 1
    checked = report.checked_claims[0]
    assert checked.status == ClaimStatus.SUPPORTED
    assert checked.evidence[0].doc_id == "paper_a"
    assert report.unsupported_claims == []
def test_evidence_check_suppresses_followups_for_single_unsupported_outlier(monkeypatch):
    """One unmatched claim among three yields no follow-ups and no gap report.

    Two of the three key claims have an identical extracted claim; the third
    has none. The test expects only the third claim to be listed as
    unsupported, an empty follow-up list, and no ``gap_report`` messages on
    the bus. ``dispatch`` is replaced with a stub that raises if called.
    """
    asr_claim = "The system achieved 0% ASR across defended scenarios."
    routing_claim = "The coordinator routes adversarial prompts through a defense worker."
    outlier_claim = "The appendix reports a 12% latency reduction on unseen workloads."

    def _forbidden_dispatch(*args, **kwargs):
        raise AssertionError("dispatch should not be called for direct matches or suppressed followups")

    monkeypatch.setattr(evidence_check_stage, "dispatch", _forbidden_dispatch)

    key_claims = [
        KeyClaim(claim=text)
        for text in (asr_claim, routing_claim, outlier_claim)
    ]
    merge_output = MergeOutput(synthesis=Synthesis(key_claims=key_claims))

    # Only the first two key claims have a counterpart in the extraction.
    extracted_claims = [
        Claim(
            claim_id="cl1",
            text=asr_claim,
            importance=Importance.HIGH,
            evidence=Evidence(
                doc_id="paper_a",
                chunk_id="C8",
                where="results",
                quote="0% ASR across defended scenarios",
            ),
        ),
        Claim(
            claim_id="cl2",
            text=routing_claim,
            importance=Importance.HIGH,
            evidence=Evidence(
                doc_id="paper_a",
                chunk_id="C4",
                where="method",
                quote="routes adversarial prompts through a defense worker",
            ),
        ),
    ]
    extractions = [
        ExtractOutput(
            doc_id="paper_a",
            chunk_id="C8",
            chunk_type=ChunkType.TEXT,
            mode_used=ModeName.MODE_REASONING,
            extracted=ExtractedContent(claims=extracted_claims),
        )
    ]

    bus = MessageBus()
    report = run_evidence_check(merge_output, extractions, Tracer(), bus=bus).evidence_check

    assert report.unsupported_claims == [outlier_claim]
    assert report.required_followups == []
    assert bus.read(msg_type="gap_report") == []
def test_evidence_check_generates_specific_followups_when_answer_is_unsupported(monkeypatch):
    """With no extractions, every claim is unsupported and gets a follow-up.

    The stage runs with an empty extraction list. The test expects both key
    claims to be listed as unsupported, one claim-specific follow-up question
    per claim, and the latest ``gap_report`` message on the bus to carry those
    same follow-ups under ``payload["gaps"]``. ``dispatch`` is replaced with
    a stub that raises if called.
    """
    latency_claim = "The appendix reports a 12% latency reduction on unseen workloads."
    recovery_claim = "The architecture introduces a separate recovery worker for post-attack repair."

    def _forbidden_dispatch(*args, **kwargs):
        raise AssertionError("dispatch should not be called when no evidence candidates exist")

    monkeypatch.setattr(evidence_check_stage, "dispatch", _forbidden_dispatch)

    synthesis = Synthesis(
        key_claims=[
            KeyClaim(claim=latency_claim),
            KeyClaim(claim=recovery_claim),
        ]
    )
    merge_output = MergeOutput(synthesis=synthesis)

    bus = MessageBus()
    report = run_evidence_check(merge_output, [], Tracer(), bus=bus).evidence_check

    assert report.unsupported_claims == [latency_claim, recovery_claim]

    expected_followups = [
        "Which result or metric in the document directly supports: The appendix reports a 12% latency reduction on unseen workloads?",
        "Where does the document explicitly describe: The architecture introduces a separate recovery worker for post-attack repair?",
    ]
    assert report.required_followups == expected_followups

    latest = bus.latest("gap_report")
    assert latest is not None
    assert latest.payload["gaps"] == report.required_followups
|