""" Unit tests for L1 graders — no network, no LLM calls. Tests are behavioral: each test asserts what the grader DECIDES, not how it computes the decision internally. """ import sys from pathlib import Path import pytest sys.path.insert(0, str(Path(__file__).parent.parent.parent / "backend")) from unittest.mock import MagicMock, patch import numpy as np from grader import ( grade_pii_leakage, grade_token_budget, grade_chain_terminology, decompose_claims, grade_faithfulness_decomposed, FAITHFULNESS_THRESHOLD, TOKEN_BUDGET, ) # ── pii_leakage ────────────────────────────────────────────────────────────── class TestPiiLeakage: def test_clean_response_passes(self) -> None: result = grade_pii_leakage("Stock check is enabled for this retailer.") assert result.passed is True assert result.score == 1.0 def test_email_address_fails(self) -> None: result = grade_pii_leakage("Contact ops@example.com for details.") assert result.passed is False assert "email" in result.detail def test_ssn_pattern_fails(self) -> None: result = grade_pii_leakage("Employee SSN: 123-45-6789 is on file.") assert result.passed is False assert "SSN" in result.detail def test_phone_number_fails(self) -> None: result = grade_pii_leakage("Call 555-867-5309 to reach the manager.") assert result.passed is False assert result.score == 0.0 def test_multiple_pii_types_all_reported(self) -> None: result = grade_pii_leakage("Email ops@test.com or call 555-123-4567.") assert result.passed is False assert "email" in result.detail assert "phone" in result.detail def test_score_is_binary(self) -> None: clean = grade_pii_leakage("No PII here.") dirty = grade_pii_leakage("Email: a@b.com") assert clean.score == 1.0 assert dirty.score == 0.0 # ── token_budget ────────────────────────────────────────────────────────────── class TestTokenBudget: def test_short_response_passes(self) -> None: result = grade_token_budget("Short answer.") assert result.passed is True assert result.score == 1.0 def 
test_response_at_exact_budget_passes(self) -> None: text = "a" * (TOKEN_BUDGET * 4) result = grade_token_budget(text) assert result.passed is True def test_response_over_budget_fails(self) -> None: text = "a" * (TOKEN_BUDGET * 4 + 4) result = grade_token_budget(text) assert result.passed is False assert result.score < 1.0 def test_score_degrades_with_length(self) -> None: moderate = grade_token_budget("a" * (TOKEN_BUDGET * 5)) extreme = grade_token_budget("a" * (TOKEN_BUDGET * 20)) assert moderate.score > extreme.score def test_detail_reports_token_estimate(self) -> None: result = grade_token_budget("hello world") assert "tokens" in result.detail def test_custom_budget_respected(self) -> None: text = "a" * 40 # ~10 tokens assert grade_token_budget(text, budget=100).passed is True assert grade_token_budget(text, budget=5).passed is False # ── chain_terminology ───────────────────────────────────────────────────────── class TestChainTerminology: def test_correct_client_term_passes(self) -> None: result = grade_chain_terminology( "Run an availability scan to check inventory levels.", client="novamart", ) assert result.passed is True def test_rival_term_without_correct_term_fails(self) -> None: # "stock check" is ShelfWise term for STOCK_CHECK — wrong for NovaMart result = grade_chain_terminology( "Run a stock check to see inventory levels.", client="novamart", ) assert result.passed is False assert any(v["expected"] == "availability scan" for v in result.metadata["violations"]) def test_both_terms_present_does_not_flag(self) -> None: # Response explains both — not a violation result = grade_chain_terminology( "Run an availability scan (also called stock check) to check inventory.", client="novamart", ) assert result.passed is True def test_score_reflects_violation_ratio(self) -> None: result = grade_chain_terminology( "Run a stock check and use a feature toggle.", client="novamart", ) assert 0.0 <= result.score < 1.0 def test_clean_response_full_score(self) -> None: 
result = grade_chain_terminology( "This response uses no retail terminology at all.", client="novamart", ) assert result.score == 1.0 def test_pharma_client_rival_term_fails(self) -> None: # "prior authorization" is ClinixOne term — wrong for PharmaLink result = grade_chain_terminology( "Submit a prior authorization request to get the drug approved.", client="pharmalink", ) assert result.passed is False assert any(v["expected"] == "formulary pre-approval" for v in result.metadata["violations"]) # ── decompose_claims ────────────────────────────────────────────────────────── class TestDecomposeClaims: def test_single_sentence(self) -> None: claims = decompose_claims("The product is in stock.") assert claims == ["The product is in stock."] def test_multi_sentence_split(self) -> None: claims = decompose_claims("The product is in stock. It costs five dollars. Delivery takes two days.") assert len(claims) == 3 def test_fragments_under_three_words_excluded(self) -> None: claims = decompose_claims("Yes. The product is available in all sizes.") assert all(len(c.split()) >= 3 for c in claims) def test_exclamation_and_question_split(self) -> None: claims = decompose_claims("Stock is low! Would you like to reorder? The threshold is five units.") assert len(claims) == 3 def test_empty_string_returns_empty(self) -> None: assert decompose_claims("") == [] # ── grade_faithfulness_decomposed ──────────────────────────────────────────── def _make_nli(entailment: float) -> MagicMock: """Mock CrossEncoder whose predict() always returns the given entailment score.""" mock = MagicMock() # columns: [contradiction, entailment, neutral] mock.predict = MagicMock( side_effect=lambda pairs, **kw: np.array([[0.1, entailment, 0.0]] * len(pairs)) ) return mock CONTEXT = "The product costs five dollars.\n\nDelivery takes two days." 
class TestGradeFaithfulnessDecomposed:
    """Expectations for ``grade_faithfulness_decomposed``.

    ``grader.get_nli_model`` is patched in every test, so no real NLI model is
    ever loaded. Mocks return rows shaped ``[contradiction, entailment,
    neutral]``, one per input pair.
    """

    def test_all_claims_supported_passes(self) -> None:
        """High entailment for every claim gives a pass with a full score."""
        with patch("grader.get_nli_model", return_value=_make_nli(0.9)):
            result = grade_faithfulness_decomposed(
                "The product costs five dollars. Delivery takes two days.", CONTEXT
            )
        assert result.passed is True
        assert result.score == 1.0
        assert result.metadata["claims"][0]["supported"] is True

    def test_all_claims_unsupported_fails(self) -> None:
        """Low entailment for every claim gives a fail with a zero score."""
        with patch("grader.get_nli_model", return_value=_make_nli(0.1)):
            result = grade_faithfulness_decomposed(
                "The product costs five dollars. Delivery takes two days.", CONTEXT
            )
        assert result.passed is False
        assert result.score == 0.0

    def test_partial_hallucination_detected(self) -> None:
        """One supported and one unsupported claim score exactly 0.5."""
        # first claim supported, second not — whole-response NLI would miss this
        hallucinated_marker = "1842"

        def side_effect(pairs: list, **kw: object) -> np.ndarray:
            # Entailment is keyed on the text of each pair, not on how many
            # times predict() has been called, so the outcome is the same
            # whether the grader scores one claim per call or batches them.
            rows = []
            for pair in pairs:
                pair_text = " ".join(str(part) for part in pair)
                entailment = 0.1 if hallucinated_marker in pair_text else 0.9
                rows.append([0.1, entailment, 0.0])
            return np.array(rows)

        mock_model = MagicMock()
        mock_model.predict = MagicMock(side_effect=side_effect)
        with patch("grader.get_nli_model", return_value=mock_model):
            result = grade_faithfulness_decomposed(
                "The product costs five dollars. It was invented in 1842.", CONTEXT
            )
        assert result.score == 0.5
        assert result.metadata["claims"][0]["supported"] is True
        assert result.metadata["claims"][1]["supported"] is False

    def test_refusal_sentinel_auto_passes(self) -> None:
        """A response that is only the refusal sentinel passes."""
        # The mock would mark every claim unsupported, so a pass here can only
        # come from the refusal path; patching also keeps a regression from
        # loading the real model.
        with patch("grader.get_nli_model", return_value=_make_nli(0.1)):
            result = grade_faithfulness_decomposed(
                "NOT IN DOCUMENTS: The context does not contain information about this drug.",
                CONTEXT,
            )
        assert result.passed is True
        assert result.score == 1.0

    def test_refusal_fallback_auto_passes(self) -> None:
        """A plain-language refusal passes as well."""
        # Same reasoning as the sentinel test: low-entailment mock, no real model.
        with patch("grader.get_nli_model", return_value=_make_nli(0.1)):
            result = grade_faithfulness_decomposed(
                "I don't have enough information to answer that.", CONTEXT
            )
        assert result.passed is True
        assert result.score == 1.0

    def test_sentinel_plus_hallucination_not_auto_passed(self) -> None:
        """A sentinel followed by extra claims is not waved through."""
        # Sentinel on first line but additional claims follow — must be NLI-scored.
        with patch("grader.get_nli_model", return_value=_make_nli(0.1)):
            result = grade_faithfulness_decomposed(
                "NOT IN DOCUMENTS: X is not in the KB.\nHowever, it likely causes nausea and headaches.",
                CONTEXT,
            )
        assert result.passed is False

    def test_empty_context_fails(self) -> None:
        """With no context to check against, the response fails with a zero score."""
        with patch("grader.get_nli_model"):
            result = grade_faithfulness_decomposed("The product costs five dollars.", "")
        assert result.passed is False
        assert result.score == 0.0

    def test_metadata_shape(self) -> None:
        """Every per-claim metadata entry carries claim, score and supported."""
        with patch("grader.get_nli_model", return_value=_make_nli(0.8)):
            result = grade_faithfulness_decomposed(
                "The product is available. It ships in two days.", CONTEXT
            )
        claims = result.metadata["claims"]
        # Without this guard the loop below would pass on an empty list.
        assert claims, "expected at least one claim entry in metadata"
        for entry in claims:
            assert "claim" in entry
            assert "score" in entry
            assert "supported" in entry

    def test_score_is_proportion_not_max(self) -> None:
        """Verify score = supported/total, not max(entailment_scores)."""
        # Every claim gets entailment 0.9: a proportion gives 1.0, a max gives 0.9.
        with patch("grader.get_nli_model", return_value=_make_nli(0.9)):
            result = grade_faithfulness_decomposed(
                "Claim one is true. Claim two is also true. Claim three too.", CONTEXT
            )
        assert result.score == 1.0