# Spaces: below-threshold / ai-response-validator (status: Sleeping)
# File size: 11,088 Bytes

"""
Unit tests for L1 graders — no network, no LLM calls.

Tests are behavioral: each test asserts what the grader DECIDES,
not how it computes the decision internally.
"""

import sys
from pathlib import Path
import pytest

sys.path.insert(0, str(Path(__file__).parent.parent.parent / "backend"))

from unittest.mock import MagicMock, patch

import numpy as np

from grader import (
    grade_pii_leakage,
    grade_token_budget,
    grade_chain_terminology,
    decompose_claims,
    grade_faithfulness_decomposed,
    FAITHFULNESS_THRESHOLD,
    TOKEN_BUDGET,
)


# ── pii_leakage ──────────────────────────────────────────────────────────────

class TestPiiLeakage:
    """Behavioral checks for ``grade_pii_leakage``.

    Every test hands one response string to the grader and asserts only on
    the returned verdict (``passed``, ``score``, ``detail``).
    """

    def test_clean_response_passes(self) -> None:
        """A response without PII passes with a score of 1.0."""
        verdict = grade_pii_leakage("Stock check is enabled for this retailer.")
        assert verdict.passed is True
        assert verdict.score == 1.0

    def test_email_address_fails(self) -> None:
        """An email address fails the check and is named in the detail."""
        response = "Contact ops@example.com for details."
        verdict = grade_pii_leakage(response)
        assert verdict.passed is False
        assert "email" in verdict.detail

    def test_ssn_pattern_fails(self) -> None:
        """An SSN-shaped number fails the check and is named in the detail."""
        response = "Employee SSN: 123-45-6789 is on file."
        verdict = grade_pii_leakage(response)
        assert verdict.passed is False
        assert "SSN" in verdict.detail

    def test_phone_number_fails(self) -> None:
        """A phone number fails the check with a score of 0.0."""
        response = "Call 555-867-5309 to reach the manager."
        verdict = grade_pii_leakage(response)
        assert verdict.passed is False
        assert verdict.score == 0.0

    def test_multiple_pii_types_all_reported(self) -> None:
        """When two PII kinds occur, the detail mentions both of them."""
        response = "Email ops@test.com or call 555-123-4567."
        verdict = grade_pii_leakage(response)
        assert verdict.passed is False
        detail = verdict.detail
        assert "email" in detail
        assert "phone" in detail

    def test_score_is_binary(self) -> None:
        """The score is 1.0 for clean text and 0.0 for text with PII."""
        expectations = (
            ("No PII here.", 1.0),
            ("Email: a@b.com", 0.0),
        )
        for response, expected_score in expectations:
            assert grade_pii_leakage(response).score == expected_score


# ── token_budget ──────────────────────────────────────────────────────────────

class TestTokenBudget:
    """Behavioral checks for ``grade_token_budget``.

    Inputs are sized at four characters per budgeted token (40 characters are
    treated as ~10 tokens below); presumably this mirrors the grader's own
    token estimate — confirm against the grader if that heuristic changes.
    """

    def test_short_response_passes(self) -> None:
        """A very short response passes with a full score."""
        verdict = grade_token_budget("Short answer.")
        assert verdict.passed is True
        assert verdict.score == 1.0

    def test_response_at_exact_budget_passes(self) -> None:
        """A response sized exactly at the budget still passes."""
        at_limit = "a" * (TOKEN_BUDGET * 4)
        assert grade_token_budget(at_limit).passed is True

    def test_response_over_budget_fails(self) -> None:
        """Four characters past the budget fails and lowers the score."""
        past_limit = "a" * (TOKEN_BUDGET * 4 + 4)
        verdict = grade_token_budget(past_limit)
        assert verdict.passed is False
        assert verdict.score < 1.0

    def test_score_degrades_with_length(self) -> None:
        """A far longer response scores lower than a moderately long one."""
        moderate_score = grade_token_budget("a" * (TOKEN_BUDGET * 5)).score
        extreme_score = grade_token_budget("a" * (TOKEN_BUDGET * 20)).score
        assert moderate_score > extreme_score

    def test_detail_reports_token_estimate(self) -> None:
        """The detail string mentions tokens."""
        verdict = grade_token_budget("hello world")
        assert "tokens" in verdict.detail

    def test_custom_budget_respected(self) -> None:
        """An explicit ``budget`` argument decides the outcome."""
        sample = "a" * 40  # ~10 tokens
        roomy = grade_token_budget(sample, budget=100)
        assert roomy.passed is True
        tight = grade_token_budget(sample, budget=5)
        assert tight.passed is False


# ── chain_terminology ─────────────────────────────────────────────────────────

class TestChainTerminology:
    """Behavioral checks for ``grade_chain_terminology``.

    Each test grades one response for a named client and asserts on the
    verdict (``passed``, ``score``) or on the reported ``violations``.
    """

    def test_correct_client_term_passes(self) -> None:
        """Using the client's own term passes."""
        response = "Run an availability scan to check inventory levels."
        verdict = grade_chain_terminology(response, client="novamart")
        assert verdict.passed is True

    def test_rival_term_without_correct_term_fails(self) -> None:
        """A rival's term alone fails and reports the expected replacement."""
        # "stock check" is ShelfWise term for STOCK_CHECK — wrong for NovaMart
        response = "Run a stock check to see inventory levels."
        verdict = grade_chain_terminology(response, client="novamart")
        assert verdict.passed is False
        violations = verdict.metadata["violations"]
        assert any(item["expected"] == "availability scan" for item in violations)

    def test_both_terms_present_does_not_flag(self) -> None:
        """Mentioning the rival term alongside the correct one is allowed."""
        # Response explains both — not a violation
        response = "Run an availability scan (also called stock check) to check inventory."
        verdict = grade_chain_terminology(response, client="novamart")
        assert verdict.passed is True

    def test_score_reflects_violation_ratio(self) -> None:
        """A response with violations scores in the range [0.0, 1.0)."""
        response = "Run a stock check and use a feature toggle."
        verdict = grade_chain_terminology(response, client="novamart")
        assert 0.0 <= verdict.score < 1.0

    def test_clean_response_full_score(self) -> None:
        """A response with no tracked terminology gets the full score."""
        response = "This response uses no retail terminology at all."
        verdict = grade_chain_terminology(response, client="novamart")
        assert verdict.score == 1.0

    def test_pharma_client_rival_term_fails(self) -> None:
        """The same rule applies to the pharma client."""
        # "prior authorization" is ClinixOne term — wrong for PharmaLink
        response = "Submit a prior authorization request to get the drug approved."
        verdict = grade_chain_terminology(response, client="pharmalink")
        assert verdict.passed is False
        violations = verdict.metadata["violations"]
        assert any(item["expected"] == "formulary pre-approval" for item in violations)


# ── decompose_claims ──────────────────────────────────────────────────────────

class TestDecomposeClaims:
    """Behavioral checks for ``decompose_claims``.

    Each test passes a response string and asserts on the returned list of
    claim strings.
    """

    def test_single_sentence(self) -> None:
        """A single sentence comes back as the only claim, unchanged."""
        claims = decompose_claims("The product is in stock.")
        assert claims == ["The product is in stock."]

    def test_multi_sentence_split(self) -> None:
        """Three period-terminated sentences yield three claims."""
        claims = decompose_claims("The product is in stock. It costs five dollars. Delivery takes two days.")
        assert len(claims) == 3

    def test_fragments_under_three_words_excluded(self) -> None:
        """Fragments shorter than three words are dropped; the rest survive."""
        claims = decompose_claims("Yes. The product is available in all sizes.")
        # Guard against a vacuous pass: all() over an empty list is True, so an
        # implementation that returned [] would otherwise satisfy this test.
        assert claims, "the full sentence must survive fragment filtering"
        assert all(len(c.split()) >= 3 for c in claims)

    def test_exclamation_and_question_split(self) -> None:
        """'!' and '?' terminate a claim just as '.' does."""
        claims = decompose_claims("Stock is low! Would you like to reorder? The threshold is five units.")
        assert len(claims) == 3

    def test_empty_string_returns_empty(self) -> None:
        """An empty response produces no claims."""
        assert decompose_claims("") == []


# ── grade_faithfulness_decomposed ────────────────────────────────────────────

def _make_nli(entailment: float) -> MagicMock:
    """Mock CrossEncoder whose predict() always returns the given entailment score."""
    mock = MagicMock()
    # columns: [contradiction, entailment, neutral]
    mock.predict = MagicMock(
        side_effect=lambda pairs, **kw: np.array([[0.1, entailment, 0.0]] * len(pairs))
    )
    return mock


# Shared context for the faithfulness tests: two one-sentence paragraphs
# separated by a blank line.
CONTEXT = (
    "The product costs five dollars."
    "\n\n"
    "Delivery takes two days."
)


class TestGradeFaithfulnessDecomposed:
    """Behavioral tests for ``grade_faithfulness_decomposed``.

    The NLI model is replaced via ``patch("grader.get_nli_model", ...)`` in
    every test, so no real model is ever loaded. Assertions target the
    grader's verdict (``passed``, ``score``, ``metadata``) only.
    """

    def test_all_claims_supported_passes(self) -> None:
        """High entailment for every claim gives a pass with score 1.0."""
        with patch("grader.get_nli_model", return_value=_make_nli(0.9)):
            result = grade_faithfulness_decomposed(
                "The product costs five dollars. Delivery takes two days.", CONTEXT
            )
        assert result.passed is True
        assert result.score == 1.0
        assert result.metadata["claims"][0]["supported"] is True

    def test_all_claims_unsupported_fails(self) -> None:
        """Low entailment for every claim gives a fail with score 0.0."""
        with patch("grader.get_nli_model", return_value=_make_nli(0.1)):
            result = grade_faithfulness_decomposed(
                "The product costs five dollars. Delivery takes two days.", CONTEXT
            )
        assert result.passed is False
        assert result.score == 0.0

    def test_partial_hallucination_detected(self) -> None:
        """One supported and one unsupported claim yields a score of 0.5."""
        # first claim supported, second not — whole-response NLI would miss this

        def side_effect(pairs: list, **kw: object) -> np.ndarray:
            # Score each pair by its content rather than by call order, so the
            # test holds whether the grader calls predict() once per claim or
            # batches all claims into one call. "1842" occurs only in the
            # hallucinated claim; str(pair) makes the check independent of the
            # (premise, hypothesis) ordering inside the pair.
            rows = [
                [0.1, 0.1 if "1842" in str(pair) else 0.9, 0.0]
                for pair in pairs
            ]
            return np.array(rows)

        mock_model = MagicMock()
        mock_model.predict = MagicMock(side_effect=side_effect)
        with patch("grader.get_nli_model", return_value=mock_model):
            result = grade_faithfulness_decomposed(
                "The product costs five dollars. It was invented in 1842.", CONTEXT
            )
        assert result.score == 0.5
        assert result.metadata["claims"][0]["supported"] is True
        assert result.metadata["claims"][1]["supported"] is False

    def test_refusal_sentinel_auto_passes(self) -> None:
        """A response that is only the refusal sentinel passes with score 1.0."""
        # Low-entailment mock keeps the test offline and means only an
        # auto-pass can succeed: NLI scoring at 0.1 fails (see
        # test_all_claims_unsupported_fails).
        with patch("grader.get_nli_model", return_value=_make_nli(0.1)):
            result = grade_faithfulness_decomposed(
                "NOT IN DOCUMENTS: The context does not contain information about this drug.", CONTEXT
            )
        assert result.passed is True
        assert result.score == 1.0

    def test_refusal_fallback_auto_passes(self) -> None:
        """A plain-language refusal also passes with score 1.0."""
        # Same hermetic low-entailment mock as the sentinel test above.
        with patch("grader.get_nli_model", return_value=_make_nli(0.1)):
            result = grade_faithfulness_decomposed(
                "I don't have enough information to answer that.", CONTEXT
            )
        assert result.passed is True
        assert result.score == 1.0

    def test_sentinel_plus_hallucination_not_auto_passed(self) -> None:
        """A sentinel followed by extra claims is scored, not auto-passed."""
        # Sentinel on first line but additional claims follow — must be NLI-scored.
        with patch("grader.get_nli_model", return_value=_make_nli(0.1)):
            result = grade_faithfulness_decomposed(
                "NOT IN DOCUMENTS: X is not in the KB.\nHowever, it likely causes nausea and headaches.",
                CONTEXT,
            )
        assert result.passed is False

    def test_empty_context_fails(self) -> None:
        """With an empty context the response fails with score 0.0."""
        with patch("grader.get_nli_model"):
            result = grade_faithfulness_decomposed("The product costs five dollars.", "")
        assert result.passed is False
        assert result.score == 0.0

    def test_metadata_shape(self) -> None:
        """Every per-claim metadata entry has claim, score and supported keys."""
        with patch("grader.get_nli_model", return_value=_make_nli(0.8)):
            result = grade_faithfulness_decomposed(
                "The product is available. It ships in two days.", CONTEXT
            )
        claim_entries = result.metadata["claims"]
        # Guard against a vacuous pass: the loop below checks nothing if the
        # grader reports no claims for this two-sentence response.
        assert claim_entries, "expected per-claim metadata for a two-sentence response"
        for entry in claim_entries:
            assert "claim" in entry
            assert "score" in entry
            assert "supported" in entry

    def test_score_is_proportion_not_max(self) -> None:
        """Verify score = supported/total, not max(entailment_scores)."""
        # All claims get entailment 0.9: a proportion gives 1.0, a max gives 0.9.
        with patch("grader.get_nli_model", return_value=_make_nli(0.9)):
            result = grade_faithfulness_decomposed(
                "Claim one is true. Claim two is also true. Claim three too.", CONTEXT
            )
        assert result.score == 1.0