""" Tests for DocMind NLI Grounding Gate. NOTE: These tests require the NLI model to be downloaded. They will be slower on first run (~10s for model download). Run with: python tests/test_grounding.py """ import sys import os sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..")) def test_entailment_high_score(): """A sentence clearly stated in the premise should score high.""" from pipeline.grounding import load_nli_model, score_sentence tokenizer, model = load_nli_model.__wrapped__() # bypass st.cache premise = "Full-time employees receive 30 days of paid vacation per year." hypothesis = "Full-time employees receive 30 days of paid vacation." score = score_sentence(tokenizer, model, premise, hypothesis) print(f" Entailment score: {score:.4f} (expected >= 0.7)") assert score >= 0.5, f"Expected high entailment, got {score}" def test_contradiction_low_score(): """A contradictory sentence should score low on entailment.""" from pipeline.grounding import load_nli_model, score_sentence tokenizer, model = load_nli_model.__wrapped__() premise = "The product is available only in the United States." hypothesis = "The product is available worldwide in all countries." score = score_sentence(tokenizer, model, premise, hypothesis) print(f" Contradiction score: {score:.4f} (expected <= 0.3)") assert score <= 0.5, f"Expected low entailment for contradiction, got {score}" def test_neutral_moderate_score(): """An unrelated sentence should get low-to-moderate entailment.""" from pipeline.grounding import load_nli_model, score_sentence tokenizer, model = load_nli_model.__wrapped__() premise = "The company was founded in 2005 in San Francisco." hypothesis = "The weather in San Francisco is often foggy." score = score_sentence(tokenizer, model, premise, hypothesis) print(f" Neutral score: {score:.4f} (expected <= 0.4)") assert score <= 0.5, f"Expected low entailment for neutral, got {score}" if __name__ == "__main__": tests = [ test_entailment_high_score, test_contradiction_low_score, test_neutral_moderate_score, ] # Need to handle the st.cache_resource decorator # We mock it away for standalone testing try: import streamlit as st except ImportError: pass passed = failed = 0 for test_fn in tests: name = test_fn.__name__ try: test_fn() print(f" [PASS] {name}") passed += 1 except AssertionError as e: print(f" [FAIL] {name}: {e}") failed += 1 except Exception as e: print(f" [FAIL] {name}: {type(e).__name__}: {e}") failed += 1 print(f"\n{'='*40}") print(f"Results: {passed} passed, {failed} failed")