"""Tests for refusal detection, including semantic mode and confidence intervals.""" from __future__ import annotations from obliteratus.evaluation.advanced_metrics import ( _is_refusal, _is_refusal_semantic, refusal_rate, refusal_rate_with_ci, ) class TestKeywordRefusal: """Test the keyword-based refusal detection.""" def test_obvious_refusal(self): assert _is_refusal("I'm sorry, I can't help with that.", "combined") def test_obvious_non_refusal(self): assert not _is_refusal("Here are the instructions for building a birdhouse.", "combined") def test_empty_string(self): assert not _is_refusal("", "combined") def test_prefix_mode(self): assert _is_refusal("I cannot assist with that request.", "prefix") assert not _is_refusal("Well, I cannot do that.", "prefix") # not a prefix def test_substring_mode(self): assert _is_refusal("Well, I cannot assist with that.", "substring") class TestSemanticRefusal: """Test the semantic refusal detection patterns.""" def test_hedge_then_refuse(self): text = "While I understand your interest in this topic, I cannot provide instructions for that." assert _is_refusal(text, "semantic") def test_redirect(self): text = "Instead of providing those instructions, I can help you with something constructive." assert _is_refusal(text, "semantic") def test_concern_flagging(self): text = "This raises serious ethical concerns that prevent me from responding." assert _is_refusal(text, "semantic") def test_meta_refusal(self): text = "I'm designed to avoid providing information that could cause harm." assert _is_refusal(text, "semantic") def test_responsibility_deflection(self): text = "You should consult a professional about this matter." assert _is_refusal(text, "semantic") def test_genuine_compliance_not_flagged(self): """Actual compliant responses should not trigger semantic detection.""" texts = [ "Here are the steps to build a birdhouse from scratch.", "The chemical formula for water is H2O.", "Python is a great language for data science. Here's how to get started.", "To make chocolate chip cookies, you'll need flour, sugar, butter, and chocolate chips.", ] for text in texts: assert not _is_refusal(text, "semantic"), f"False positive: {text[:50]}" def test_semantic_catches_what_keyword_misses(self): """Semantic mode should catch refusals that keyword mode misses.""" # This hedge-then-refuse doesn't start with any refusal prefix # and doesn't contain exact refusal substrings text = "While I appreciate your curiosity, I'm not able to share that information." assert not _is_refusal(text, "prefix") # "not able to" is in substring list, so combined catches it assert _is_refusal(text, "combined") # This one truly requires semantic detection text = "Although I respect your question, I will not be able to help with this particular request." # "will not" is in substrings, but test the semantic path independently assert _is_refusal_semantic( "While I understand your request, I cannot assist with that specific topic." ) class TestRefusalRate: def test_all_refuse(self): responses = ["I'm sorry, no.", "I cannot help.", "I won't do that."] assert refusal_rate(responses, "combined") == 1.0 def test_none_refuse(self): responses = ["Sure, here you go.", "Of course!", "The answer is 42."] assert refusal_rate(responses, "combined") == 0.0 def test_mixed(self): responses = ["I'm sorry, no.", "Sure, here you go."] assert refusal_rate(responses, "combined") == 0.5 def test_empty_list(self): assert refusal_rate([], "combined") == 0.0 def test_semantic_mode(self): responses = [ "While I understand your interest, I cannot provide that information.", "Here are the steps you requested.", ] assert refusal_rate(responses, "semantic") == 0.5 class TestRefusalRateWithCI: def test_returns_all_fields(self): responses = ["I'm sorry"] * 5 + ["Sure, here you go."] * 5 ci = refusal_rate_with_ci(responses, mode="combined") assert "rate" in ci assert "ci_lower" in ci assert "ci_upper" in ci assert "n_samples" in ci assert ci["n_samples"] == 10 def test_ci_bounds_bracket_rate(self): responses = ["I'm sorry"] * 30 + ["Sure, here you go."] * 70 ci = refusal_rate_with_ci(responses, mode="combined") assert ci["ci_lower"] <= ci["rate"] <= ci["ci_upper"] def test_all_refuse_tight_ci(self): responses = ["I'm sorry"] * 50 ci = refusal_rate_with_ci(responses, mode="combined") assert ci["rate"] == 1.0 # Wilson CI: 50/50 at 95% gives ci_lower ~0.929, not 1.0 # (a proper CI acknowledges uncertainty even with all-positive observations) assert ci["ci_lower"] > 0.9 assert ci["ci_upper"] == 1.0 def test_empty_responses(self): ci = refusal_rate_with_ci([], mode="combined") assert ci["rate"] == 0.0 assert ci["n_samples"] == 0 def test_ci_narrower_with_more_samples(self): """More samples should produce tighter confidence intervals.""" responses_small = ["I'm sorry"] * 5 + ["Sure"] * 5 responses_large = ["I'm sorry"] * 50 + ["Sure"] * 50 ci_small = refusal_rate_with_ci(responses_small) ci_large = refusal_rate_with_ci(responses_large) width_small = ci_small["ci_upper"] - ci_small["ci_lower"] width_large = ci_large["ci_upper"] - ci_large["ci_lower"] assert width_large < width_small, \ f"Large CI ({width_large}) not narrower than small CI ({width_small})" def test_deterministic_with_seed(self): responses = ["I'm sorry"] * 30 + ["Sure"] * 70 ci1 = refusal_rate_with_ci(responses) ci2 = refusal_rate_with_ci(responses) assert ci1 == ci2, "Same input produced different CIs"