Spaces:

ayushKishor
/

plutoV2_miniProject_3rd-yr

Sleeping

File size: 7,069 Bytes

66ad25b
 
 
 
 
 
 
 
 
 
 
 
 
 
23cdeed
 
66ad25b
 
 
23cdeed
66ad25b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
23cdeed
66ad25b
23cdeed
 
 
 
 
66ad25b
 
23cdeed
66ad25b
 
 
23cdeed
66ad25b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
23cdeed
66ad25b
23cdeed
 
 
 
66ad25b
 
23cdeed
66ad25b
 
 
23cdeed
66ad25b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
23cdeed
66ad25b
23cdeed
 
66ad25b
 
 
23cdeed
66ad25b
 
 
23cdeed
66ad25b
 
 
 
 
 
 
 
 
 
 
23cdeed
66ad25b
23cdeed
66ad25b
 
 
23cdeed
66ad25b
 
 
 
 
23cdeed

from pluto.models import (
    Claim,
    ClaimStatus,
    Evidence,
    ExtractOutput,
    ExtractedContent,
    Importance,
    KeyClaim,
    MergeOutput,
    ModeName,
    ChunkType,
    Synthesis,
)
from pluto.bus import MessageBus
from pluto.stages import evidence_check as evidence_check_stage
from pluto.stages.evidence_check import _parse_evidence_check, run_evidence_check
from pluto.tracer import Tracer


def test_parse_evidence_check_dump():
    """Check that `_parse_evidence_check` handles a JSON object wrapped in extra text.

    The payload is preceded by a prose line ("Here is the result:"), so the
    parser receives more than bare JSON. The second claim carries no evidence
    ids; only the first claim's status and evidence are asserted on.
    """
    llm_reply = """
    Here is the result:
    {
      "checked_claims": [
        {
          "claim": "The model reaches 91% accuracy.",
          "status": "supported",
          "evidence_doc_id": "paper_a",
          "evidence_chunk_id": "C3",
          "reason": "Directly stated in the results section."
        },
        {
          "claim": "The training set contains 2 million images.",
          "status": "unsupported",
          "reason": "No evidence in the supplied excerpts."
        }
      ],
      "unsupported_claims": ["The training set contains 2 million images."],
      "required_followups": ["Upload the appendix for dataset details."]
    }
    """
    expected_unsupported = ["The training set contains 2 million images."]
    expected_followups = ["Upload the appendix for dataset details."]

    report = _parse_evidence_check(llm_reply).evidence_check

    # Confirm the count before indexing so an empty result fails on the
    # length assertion rather than with an IndexError.
    assert len(report.checked_claims) == 2
    first_claim = report.checked_claims[0]
    assert first_claim.status.value == "supported"
    assert first_claim.evidence[0].doc_id == "paper_a"
    assert report.unsupported_claims == expected_unsupported
    assert report.required_followups == expected_followups


def test_evidence_check_directly_supports_matching_claim_without_dispatch(monkeypatch):
    """A key claim whose text equals an extracted claim is reported as supported.

    `dispatch` on the evidence-check stage module is replaced with a stub, so
    any call to it from the stage raises AssertionError.
    """
    claim_text = "The system achieved 0% ASR across defended scenarios."

    def _forbidden_dispatch(*args, **kwargs):
        raise AssertionError("dispatch should not be called for an obvious direct evidence match")

    monkeypatch.setattr(evidence_check_stage, "dispatch", _forbidden_dispatch)

    # Build the fixture bottom-up: evidence -> claim -> extraction.
    supporting_evidence = Evidence(
        doc_id="paper_a",
        chunk_id="C8",
        where="results",
        quote="0% ASR across defended scenarios",
    )
    extracted_claim = Claim(
        claim_id="cl1",
        text=claim_text,
        importance=Importance.HIGH,
        evidence=supporting_evidence,
    )
    extraction = ExtractOutput(
        doc_id="paper_a",
        chunk_id="C8",
        chunk_type=ChunkType.TEXT,
        mode_used=ModeName.MODE_REASONING,
        extracted=ExtractedContent(claims=[extracted_claim]),
    )
    merged = MergeOutput(
        synthesis=Synthesis(key_claims=[KeyClaim(claim=claim_text)]),
    )

    outcome = run_evidence_check(merged, [extraction], Tracer()).evidence_check

    assert len(outcome.checked_claims) == 1
    verdict = outcome.checked_claims[0]
    assert verdict.status == ClaimStatus.SUPPORTED
    assert verdict.evidence[0].doc_id == "paper_a"
    assert outcome.unsupported_claims == []


def test_evidence_check_suppresses_followups_for_single_unsupported_outlier(monkeypatch):
    """One unsupported claim among three yields no follow-ups and no gap report.

    Two of the three key claims have an extracted claim with identical text;
    the third has none. The test expects the third to be listed as
    unsupported while `required_followups` stays empty and nothing of type
    "gap_report" is readable from the bus. `dispatch` is stubbed to raise.
    """
    asr_claim = "The system achieved 0% ASR across defended scenarios."
    routing_claim = "The coordinator routes adversarial prompts through a defense worker."
    latency_claim = "The appendix reports a 12% latency reduction on unseen workloads."

    def _forbidden_dispatch(*args, **kwargs):
        raise AssertionError("dispatch should not be called for direct matches or suppressed followups")

    monkeypatch.setattr(evidence_check_stage, "dispatch", _forbidden_dispatch)

    def _paper_a_claim(claim_id, text, chunk_id, where, quote):
        # Both extracted claims share the document and importance; only the
        # fields passed in differ.
        return Claim(
            claim_id=claim_id,
            text=text,
            importance=Importance.HIGH,
            evidence=Evidence(
                doc_id="paper_a",
                chunk_id=chunk_id,
                where=where,
                quote=quote,
            ),
        )

    extracted_claims = [
        _paper_a_claim("cl1", asr_claim, "C8", "results", "0% ASR across defended scenarios"),
        _paper_a_claim(
            "cl2",
            routing_claim,
            "C4",
            "method",
            "routes adversarial prompts through a defense worker",
        ),
    ]
    extraction = ExtractOutput(
        doc_id="paper_a",
        chunk_id="C8",
        chunk_type=ChunkType.TEXT,
        mode_used=ModeName.MODE_REASONING,
        extracted=ExtractedContent(claims=extracted_claims),
    )
    key_claims = [
        KeyClaim(claim=text) for text in (asr_claim, routing_claim, latency_claim)
    ]
    merged = MergeOutput(synthesis=Synthesis(key_claims=key_claims))

    bus = MessageBus()
    outcome = run_evidence_check(merged, [extraction], Tracer(), bus=bus).evidence_check

    assert outcome.unsupported_claims == [latency_claim]
    assert outcome.required_followups == []
    assert bus.read(msg_type="gap_report") == []


def test_evidence_check_generates_specific_followups_when_answer_is_unsupported(monkeypatch):
    """With no extractions, every key claim is unsupported and gets a follow-up.

    The test expects one follow-up question per claim, worded differently for
    each, and expects the latest "gap_report" message on the bus to carry the
    same list under `payload["gaps"]`. `dispatch` is stubbed to raise.
    """
    latency_claim = "The appendix reports a 12% latency reduction on unseen workloads."
    recovery_claim = "The architecture introduces a separate recovery worker for post-attack repair."
    expected_followups = [
        "Which result or metric in the document directly supports: The appendix reports a 12% latency reduction on unseen workloads?",
        "Where does the document explicitly describe: The architecture introduces a separate recovery worker for post-attack repair?",
    ]

    def _forbidden_dispatch(*args, **kwargs):
        raise AssertionError("dispatch should not be called when no evidence candidates exist")

    monkeypatch.setattr(evidence_check_stage, "dispatch", _forbidden_dispatch)

    merged = MergeOutput(
        synthesis=Synthesis(
            key_claims=[KeyClaim(claim=latency_claim), KeyClaim(claim=recovery_claim)],
        )
    )

    bus = MessageBus()
    outcome = run_evidence_check(merged, [], Tracer(), bus=bus).evidence_check

    assert outcome.unsupported_claims == [latency_claim, recovery_claim]
    assert outcome.required_followups == expected_followups

    gap_message = bus.latest("gap_report")
    assert gap_message is not None
    assert gap_message.payload["gaps"] == outcome.required_followups