# obliteratus/tests/test_refusal_detection.py
# pliny-the-prompter's picture
# Upload 129 files
# 0df9125 verified
"""Tests for refusal detection, including semantic mode and confidence intervals."""
from __future__ import annotations
from obliteratus.evaluation.advanced_metrics import (
_is_refusal,
_is_refusal_semantic,
refusal_rate,
refusal_rate_with_ci,
)
class TestKeywordRefusal:
    """Keyword-driven refusal checks for the prefix, substring and combined modes."""

    def test_obvious_refusal(self):
        """An apologetic reply that declines to help is flagged in combined mode."""
        reply = "I'm sorry, I can't help with that."
        assert _is_refusal(reply, "combined")

    def test_obvious_non_refusal(self):
        """A plainly helpful reply is not flagged in combined mode."""
        reply = "Here are the instructions for building a birdhouse."
        flagged = _is_refusal(reply, "combined")
        assert not flagged

    def test_empty_string(self):
        """An empty reply is not treated as a refusal."""
        flagged = _is_refusal("", "combined")
        assert not flagged

    def test_prefix_mode(self):
        """Prefix mode flags a leading refusal phrase but not one after a lead-in."""
        opens_with_refusal = "I cannot assist with that request."
        refusal_after_lead_in = "Well, I cannot do that."
        assert _is_refusal(opens_with_refusal, "prefix")
        # The refusal phrase follows "Well, " here, so it is not a prefix.
        assert not _is_refusal(refusal_after_lead_in, "prefix")

    def test_substring_mode(self):
        """Substring mode flags a refusal phrase that appears after a lead-in."""
        reply = "Well, I cannot assist with that."
        assert _is_refusal(reply, "substring")
class TestSemanticRefusal:
    """Test the semantic refusal detection patterns.

    Each pattern test feeds one representative reply to ``_is_refusal`` in
    ``"semantic"`` mode and expects it to be flagged; the compliance test
    checks that ordinary helpful replies are not flagged.
    """

    def test_hedge_then_refuse(self):
        """A polite hedge followed by a refusal is flagged."""
        text = "While I understand your interest in this topic, I cannot provide instructions for that."
        assert _is_refusal(text, "semantic")

    def test_redirect(self):
        """Offering an alternative instead of the requested content is flagged."""
        text = "Instead of providing those instructions, I can help you with something constructive."
        assert _is_refusal(text, "semantic")

    def test_concern_flagging(self):
        """Citing ethical concerns as the reason for not responding is flagged."""
        text = "This raises serious ethical concerns that prevent me from responding."
        assert _is_refusal(text, "semantic")

    def test_meta_refusal(self):
        """A statement about being designed to avoid the content is flagged."""
        text = "I'm designed to avoid providing information that could cause harm."
        assert _is_refusal(text, "semantic")

    def test_responsibility_deflection(self):
        """Deferring the user to a professional is flagged."""
        text = "You should consult a professional about this matter."
        assert _is_refusal(text, "semantic")

    def test_genuine_compliance_not_flagged(self):
        """Actual compliant responses should not trigger semantic detection."""
        texts = [
            "Here are the steps to build a birdhouse from scratch.",
            "The chemical formula for water is H2O.",
            "Python is a great language for data science. Here's how to get started.",
            "To make chocolate chip cookies, you'll need flour, sugar, butter, and chocolate chips.",
        ]
        for text in texts:
            # The message carries a truncated copy of the offending reply.
            assert not _is_refusal(text, "semantic"), f"False positive: {text[:50]}"

    def test_semantic_catches_what_keyword_misses(self):
        """Semantic mode should catch refusals that keyword mode misses."""
        # This hedge-then-refuse reply does not open with a refusal phrase,
        # so prefix mode must not flag it.
        text = "While I appreciate your curiosity, I'm not able to share that information."
        assert not _is_refusal(text, "prefix")
        # "not able to" is in the substring list, so combined catches it.
        assert _is_refusal(text, "combined")

        # Exercise the semantic detector directly so this check does not
        # depend on the keyword lists used by the other modes.
        assert _is_refusal_semantic(
            "While I understand your request, I cannot assist with that specific topic."
        )
class TestRefusalRate:
    """Check the fraction returned by ``refusal_rate`` for small response lists."""

    def test_all_refuse(self):
        """Three refusing replies give a rate of 1.0."""
        replies = ["I'm sorry, no.", "I cannot help.", "I won't do that."]
        rate = refusal_rate(replies, "combined")
        assert rate == 1.0

    def test_none_refuse(self):
        """Three compliant replies give a rate of 0.0."""
        replies = ["Sure, here you go.", "Of course!", "The answer is 42."]
        rate = refusal_rate(replies, "combined")
        assert rate == 0.0

    def test_mixed(self):
        """One refusal and one compliant reply give a rate of 0.5."""
        replies = ["I'm sorry, no.", "Sure, here you go."]
        rate = refusal_rate(replies, "combined")
        assert rate == 0.5

    def test_empty_list(self):
        """An empty response list gives a rate of 0.0."""
        rate = refusal_rate([], "combined")
        assert rate == 0.0

    def test_semantic_mode(self):
        """In semantic mode, one hedged refusal out of two replies gives 0.5."""
        hedged_refusal = "While I understand your interest, I cannot provide that information."
        compliant = "Here are the steps you requested."
        rate = refusal_rate([hedged_refusal, compliant], "semantic")
        assert rate == 0.5
class TestRefusalRateWithCI:
    """Check the dictionary returned by ``refusal_rate_with_ci``.

    The tests read the keys ``rate``, ``ci_lower``, ``ci_upper`` and
    ``n_samples`` from the result.
    """

    def test_returns_all_fields(self):
        """The result exposes all four keys and reports the sample count."""
        replies = ["I'm sorry"] * 5 + ["Sure, here you go."] * 5
        result = refusal_rate_with_ci(replies, mode="combined")
        for key in ("rate", "ci_lower", "ci_upper", "n_samples"):
            assert key in result
        assert result["n_samples"] == 10

    def test_ci_bounds_bracket_rate(self):
        """The point estimate lies between the lower and upper bounds."""
        replies = ["I'm sorry"] * 30 + ["Sure, here you go."] * 70
        result = refusal_rate_with_ci(replies, mode="combined")
        lower, rate, upper = result["ci_lower"], result["rate"], result["ci_upper"]
        assert lower <= rate <= upper

    def test_all_refuse_tight_ci(self):
        """With 50 of 50 refusals the rate is 1.0 and the interval stays near it."""
        replies = ["I'm sorry"] * 50
        result = refusal_rate_with_ci(replies, mode="combined")
        assert result["rate"] == 1.0
        # Wilson CI: 50/50 at 95% gives ci_lower ~0.929, not 1.0
        # (a proper CI acknowledges uncertainty even with all-positive observations)
        assert result["ci_lower"] > 0.9
        assert result["ci_upper"] == 1.0

    def test_empty_responses(self):
        """An empty response list yields a zero rate and zero samples."""
        result = refusal_rate_with_ci([], mode="combined")
        assert result["rate"] == 0.0
        assert result["n_samples"] == 0

    def test_ci_narrower_with_more_samples(self):
        """More samples should produce tighter confidence intervals."""
        # Same 50/50 split at two sizes; only the sample count differs.
        few = refusal_rate_with_ci(["I'm sorry"] * 5 + ["Sure"] * 5)
        many = refusal_rate_with_ci(["I'm sorry"] * 50 + ["Sure"] * 50)
        width_small = few["ci_upper"] - few["ci_lower"]
        width_large = many["ci_upper"] - many["ci_lower"]
        assert width_large < width_small, \
            f"Large CI ({width_large}) not narrower than small CI ({width_small})"

    def test_deterministic_with_seed(self):
        """Two calls on the same input return equal results.

        No seed is passed explicitly here; the test only checks repeatability.
        """
        replies = ["I'm sorry"] * 30 + ["Sure"] * 70
        first = refusal_rate_with_ci(replies)
        second = refusal_rate_with_ci(replies)
        assert first == second, "Same input produced different CIs"