File size: 7,069 Bytes
66ad25b
 
 
 
 
 
 
 
 
 
 
 
 
 
23cdeed
 
66ad25b
 
 
23cdeed
66ad25b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
23cdeed
66ad25b
23cdeed
 
 
 
 
66ad25b
 
23cdeed
66ad25b
 
 
23cdeed
66ad25b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
23cdeed
66ad25b
23cdeed
 
 
 
66ad25b
 
23cdeed
66ad25b
 
 
23cdeed
66ad25b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
23cdeed
66ad25b
23cdeed
 
66ad25b
 
 
23cdeed
66ad25b
 
 
23cdeed
66ad25b
 
 
 
 
 
 
 
 
 
 
23cdeed
66ad25b
23cdeed
66ad25b
 
 
23cdeed
66ad25b
 
 
 
 
23cdeed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
from pluto.models import (
    Claim,
    ClaimStatus,
    Evidence,
    ExtractOutput,
    ExtractedContent,
    Importance,
    KeyClaim,
    MergeOutput,
    ModeName,
    ChunkType,
    Synthesis,
)
from pluto.bus import MessageBus
from pluto.stages import evidence_check as evidence_check_stage
from pluto.stages.evidence_check import _parse_evidence_check, run_evidence_check
from pluto.tracer import Tracer


def test_parse_evidence_check_dump():
    """Check that ``_parse_evidence_check`` decodes a JSON object embedded in text.

    The input starts with a plain sentence before the JSON object and holds
    two checked claims: one marked "supported" with evidence ids, and one
    marked "unsupported" that carries no evidence fields.
    """
    payload_text = """
    Here is the result:
    {
      "checked_claims": [
        {
          "claim": "The model reaches 91% accuracy.",
          "status": "supported",
          "evidence_doc_id": "paper_a",
          "evidence_chunk_id": "C3",
          "reason": "Directly stated in the results section."
        },
        {
          "claim": "The training set contains 2 million images.",
          "status": "unsupported",
          "reason": "No evidence in the supplied excerpts."
        }
      ],
      "unsupported_claims": ["The training set contains 2 million images."],
      "required_followups": ["Upload the appendix for dataset details."]
    }
    """

    parsed = _parse_evidence_check(payload_text)
    check = parsed.evidence_check

    # Both entries from "checked_claims" must come through.
    assert len(check.checked_claims) == 2

    # Index only after the length assertion so an empty result fails there.
    supported = check.checked_claims[0]
    assert supported.status.value == "supported"
    assert supported.evidence[0].doc_id == "paper_a"

    # The top-level lists are expected to be passed through verbatim.
    assert check.unsupported_claims == ["The training set contains 2 million images."]
    assert check.required_followups == ["Upload the appendix for dataset details."]


def test_evidence_check_directly_supports_matching_claim_without_dispatch(monkeypatch):
    """A key claim identical to an extracted claim is supported without dispatch.

    ``dispatch`` on the evidence-check stage module is replaced with a
    function that raises, so the test fails if the stage calls it.
    """
    claim_text = "The system achieved 0% ASR across defended scenarios."

    def _forbid_dispatch(*args, **kwargs):
        raise AssertionError("dispatch should not be called for an obvious direct evidence match")

    monkeypatch.setattr(evidence_check_stage, "dispatch", _forbid_dispatch)

    merge_output = MergeOutput(
        synthesis=Synthesis(key_claims=[KeyClaim(claim=claim_text)]),
    )

    # One extraction holding one claim whose text equals the key claim.
    supporting_evidence = Evidence(
        doc_id="paper_a",
        chunk_id="C8",
        where="results",
        quote="0% ASR across defended scenarios",
    )
    extracted_claim = Claim(
        claim_id="cl1",
        text=claim_text,
        importance=Importance.HIGH,
        evidence=supporting_evidence,
    )
    extraction = ExtractOutput(
        doc_id="paper_a",
        chunk_id="C8",
        chunk_type=ChunkType.TEXT,
        mode_used=ModeName.MODE_REASONING,
        extracted=ExtractedContent(claims=[extracted_claim]),
    )

    result = run_evidence_check(merge_output, [extraction], Tracer())
    check = result.evidence_check

    assert len(check.checked_claims) == 1
    only_claim = check.checked_claims[0]
    assert only_claim.status == ClaimStatus.SUPPORTED
    assert only_claim.evidence[0].doc_id == "paper_a"
    assert check.unsupported_claims == []


def test_evidence_check_suppresses_followups_for_single_unsupported_outlier(monkeypatch):
    """One unmatched key claim among three yields no follow-ups and no gap report.

    Two key claims have an extracted claim with identical text; the third has
    none. The unmatched claim must be listed as unsupported, while
    ``required_followups`` stays empty and no "gap_report" message is
    readable from the bus. ``dispatch`` is patched to raise if called.
    """
    asr_claim = "The system achieved 0% ASR across defended scenarios."
    routing_claim = "The coordinator routes adversarial prompts through a defense worker."
    outlier_claim = "The appendix reports a 12% latency reduction on unseen workloads."

    def _forbid_dispatch(*args, **kwargs):
        raise AssertionError("dispatch should not be called for direct matches or suppressed followups")

    monkeypatch.setattr(evidence_check_stage, "dispatch", _forbid_dispatch)

    merge_output = MergeOutput(
        synthesis=Synthesis(
            key_claims=[
                KeyClaim(claim=text)
                for text in (asr_claim, routing_claim, outlier_claim)
            ]
        )
    )

    # (claim_id, text, evidence chunk_id, where, quote) for each backed claim.
    backed_specs = [
        ("cl1", asr_claim, "C8", "results", "0% ASR across defended scenarios"),
        ("cl2", routing_claim, "C4", "method", "routes adversarial prompts through a defense worker"),
    ]
    backed_claims = [
        Claim(
            claim_id=claim_id,
            text=text,
            importance=Importance.HIGH,
            evidence=Evidence(
                doc_id="paper_a",
                chunk_id=chunk_id,
                where=where,
                quote=quote,
            ),
        )
        for claim_id, text, chunk_id, where, quote in backed_specs
    ]
    extractions = [
        ExtractOutput(
            doc_id="paper_a",
            chunk_id="C8",
            chunk_type=ChunkType.TEXT,
            mode_used=ModeName.MODE_REASONING,
            extracted=ExtractedContent(claims=backed_claims),
        )
    ]

    bus = MessageBus()
    result = run_evidence_check(merge_output, extractions, Tracer(), bus=bus)
    check = result.evidence_check

    assert check.unsupported_claims == [outlier_claim]
    assert check.required_followups == []
    assert bus.read(msg_type="gap_report") == []


def test_evidence_check_generates_specific_followups_when_answer_is_unsupported(monkeypatch):
    """With no extractions, every key claim is unsupported and gets a follow-up.

    The expected follow-up questions are claim-specific, and the latest
    "gap_report" message on the bus must carry the same list under the
    "gaps" payload key. ``dispatch`` is patched to raise if called.
    """
    latency_claim = "The appendix reports a 12% latency reduction on unseen workloads."
    recovery_claim = "The architecture introduces a separate recovery worker for post-attack repair."
    expected_followups = [
        "Which result or metric in the document directly supports: The appendix reports a 12% latency reduction on unseen workloads?",
        "Where does the document explicitly describe: The architecture introduces a separate recovery worker for post-attack repair?",
    ]

    def _forbid_dispatch(*args, **kwargs):
        raise AssertionError("dispatch should not be called when no evidence candidates exist")

    monkeypatch.setattr(evidence_check_stage, "dispatch", _forbid_dispatch)

    merge_output = MergeOutput(
        synthesis=Synthesis(
            key_claims=[KeyClaim(claim=latency_claim), KeyClaim(claim=recovery_claim)],
        )
    )

    bus = MessageBus()
    # An empty extraction list leaves no evidence to match against.
    result = run_evidence_check(merge_output, [], Tracer(), bus=bus)
    check = result.evidence_check

    assert check.unsupported_claims == [latency_claim, recovery_claim]
    assert check.required_followups == expected_followups

    gap_message = bus.latest("gap_report")
    assert gap_message is not None
    assert gap_message.payload["gaps"] == check.required_followups