plutoV2_miniProject_3rd-yr / mp1 /test_verify.py
ayushKishor's picture
Add Pluto memory layer and pipeline fixes
23cdeed
from pluto.models import (
Claim,
ClaimStatus,
Evidence,
ExtractOutput,
ExtractedContent,
Importance,
KeyClaim,
MergeOutput,
ModeName,
ChunkType,
Synthesis,
)
from pluto.bus import MessageBus
from pluto.stages import evidence_check as evidence_check_stage
from pluto.stages.evidence_check import _parse_evidence_check, run_evidence_check
from pluto.tracer import Tracer
def test_parse_evidence_check_dump():
    """Check that `_parse_evidence_check` reads a JSON object wrapped in prose.

    The reply text carries a sentence before the JSON object and lists two
    claims: one marked supported (with document and chunk ids) and one marked
    unsupported (with no evidence ids at all).
    """
    # Kept flush-left: any indentation here would become part of the string.
    reply_text = """
Here is the result:
{
"checked_claims": [
{
"claim": "The model reaches 91% accuracy.",
"status": "supported",
"evidence_doc_id": "paper_a",
"evidence_chunk_id": "C3",
"reason": "Directly stated in the results section."
},
{
"claim": "The training set contains 2 million images.",
"status": "unsupported",
"reason": "No evidence in the supplied excerpts."
}
],
"unsupported_claims": ["The training set contains 2 million images."],
"required_followups": ["Upload the appendix for dataset details."]
}
"""
    parsed = _parse_evidence_check(reply_text)
    report = parsed.evidence_check
    claims = report.checked_claims

    # Both entries of "checked_claims" must survive parsing.
    assert len(claims) == 2

    # The flat evidence_doc_id field is expected to surface as evidence[0].
    supported = claims[0]
    assert supported.status.value == "supported"
    assert supported.evidence[0].doc_id == "paper_a"

    expected_unsupported = ["The training set contains 2 million images."]
    expected_followups = ["Upload the appendix for dataset details."]
    assert report.unsupported_claims == expected_unsupported
    assert report.required_followups == expected_followups
def test_evidence_check_directly_supports_matching_claim_without_dispatch(monkeypatch):
    """A key claim whose text equals an extracted claim is reported as supported.

    `dispatch` on the evidence-check stage module is replaced with a function
    that raises AssertionError, so a call made through that attribute would
    surface as an error instead of running the real dispatch.
    """

    def forbid_dispatch(*_args, **_kwargs):
        raise AssertionError("dispatch should not be called for an obvious direct evidence match")

    monkeypatch.setattr(evidence_check_stage, "dispatch", forbid_dispatch)

    # The same sentence is used for the synthesized key claim and the
    # extracted claim, which is what makes this a direct match.
    claim_text = "The system achieved 0% ASR across defended scenarios."

    synthesis = Synthesis(key_claims=[KeyClaim(claim=claim_text)])
    merged = MergeOutput(synthesis=synthesis)

    supporting_evidence = Evidence(
        doc_id="paper_a",
        chunk_id="C8",
        where="results",
        quote="0% ASR across defended scenarios",
    )
    extracted_claim = Claim(
        claim_id="cl1",
        text=claim_text,
        importance=Importance.HIGH,
        evidence=supporting_evidence,
    )
    chunk_extraction = ExtractOutput(
        doc_id="paper_a",
        chunk_id="C8",
        chunk_type=ChunkType.TEXT,
        mode_used=ModeName.MODE_REASONING,
        extracted=ExtractedContent(claims=[extracted_claim]),
    )

    outcome = run_evidence_check(merged, [chunk_extraction], Tracer())
    report = outcome.evidence_check
    checked = report.checked_claims

    assert len(checked) == 1
    only_claim = checked[0]
    assert only_claim.status == ClaimStatus.SUPPORTED
    assert only_claim.evidence[0].doc_id == "paper_a"
    assert report.unsupported_claims == []
def test_evidence_check_suppresses_followups_for_single_unsupported_outlier(monkeypatch):
    """One unsupported claim among three yields no follow-ups and no gap report.

    Two of the three key claims have identically worded extracted claims; the
    third has none. The test expects that third claim to be listed as
    unsupported while `required_followups` stays empty and nothing of type
    "gap_report" can be read back from the bus.
    """

    def forbid_dispatch(*_args, **_kwargs):
        raise AssertionError("dispatch should not be called for direct matches or suppressed followups")

    monkeypatch.setattr(evidence_check_stage, "dispatch", forbid_dispatch)

    asr_text = "The system achieved 0% ASR across defended scenarios."
    routing_text = "The coordinator routes adversarial prompts through a defense worker."
    latency_text = "The appendix reports a 12% latency reduction on unseen workloads."

    key_claims = [KeyClaim(claim=text) for text in (asr_text, routing_text, latency_text)]
    merged = MergeOutput(synthesis=Synthesis(key_claims=key_claims))

    asr_claim = Claim(
        claim_id="cl1",
        text=asr_text,
        importance=Importance.HIGH,
        evidence=Evidence(
            doc_id="paper_a",
            chunk_id="C8",
            where="results",
            quote="0% ASR across defended scenarios",
        ),
    )
    routing_claim = Claim(
        claim_id="cl2",
        text=routing_text,
        importance=Importance.HIGH,
        evidence=Evidence(
            doc_id="paper_a",
            chunk_id="C4",
            where="method",
            quote="routes adversarial prompts through a defense worker",
        ),
    )
    # No extracted claim is provided for latency_text: it is the outlier.
    chunk_extraction = ExtractOutput(
        doc_id="paper_a",
        chunk_id="C8",
        chunk_type=ChunkType.TEXT,
        mode_used=ModeName.MODE_REASONING,
        extracted=ExtractedContent(claims=[asr_claim, routing_claim]),
    )

    message_bus = MessageBus()
    outcome = run_evidence_check(merged, [chunk_extraction], Tracer(), bus=message_bus)
    report = outcome.evidence_check

    assert report.unsupported_claims == [latency_text]
    assert report.required_followups == []
    assert message_bus.read(msg_type="gap_report") == []
def test_evidence_check_generates_specific_followups_when_answer_is_unsupported(monkeypatch):
    """With no extractions at all, every key claim gets a tailored follow-up.

    Both key claims are expected to be listed as unsupported, each with its
    own follow-up question, and the latest "gap_report" message on the bus is
    expected to carry those same questions under the "gaps" payload key.
    """

    def forbid_dispatch(*_args, **_kwargs):
        raise AssertionError("dispatch should not be called when no evidence candidates exist")

    monkeypatch.setattr(evidence_check_stage, "dispatch", forbid_dispatch)

    latency_text = "The appendix reports a 12% latency reduction on unseen workloads."
    recovery_text = "The architecture introduces a separate recovery worker for post-attack repair."

    synthesis = Synthesis(
        key_claims=[
            KeyClaim(claim=latency_text),
            KeyClaim(claim=recovery_text),
        ]
    )
    merged = MergeOutput(synthesis=synthesis)

    message_bus = MessageBus()
    # An empty extraction list means there is nothing to match claims against.
    outcome = run_evidence_check(merged, [], Tracer(), bus=message_bus)
    report = outcome.evidence_check

    assert report.unsupported_claims == [latency_text, recovery_text]

    expected_followups = [
        "Which result or metric in the document directly supports: The appendix reports a 12% latency reduction on unseen workloads?",
        "Where does the document explicitly describe: The architecture introduces a separate recovery worker for post-attack repair?",
    ]
    assert report.required_followups == expected_followups

    gap_message = message_bus.latest("gap_report")
    assert gap_message is not None
    assert gap_message.payload["gaps"] == report.required_followups