obliteratus

Running on Zero

App Files Files Community

obliteratus / tests /test_refusal_detection.py

pliny-the-prompter

Upload 129 files

0df9125 verified about 1 month ago

raw

history blame contribute delete

6.28 kB

	"""Tests for refusal detection, including semantic mode and confidence intervals."""

	from __future__ import annotations


	from obliteratus.evaluation.advanced_metrics import (
	_is_refusal,
	_is_refusal_semantic,
	refusal_rate,
	refusal_rate_with_ci,
	)


	class TestKeywordRefusal:
	    """Checks for the keyword-driven modes of ``_is_refusal``.

	    Exercises the "combined", "prefix" and "substring" mode arguments.
	    """

	    def test_obvious_refusal(self):
	        """An apology followed by "can't help" is expected to be flagged."""
	        reply = "I'm sorry, I can't help with that."
	        assert _is_refusal(reply, "combined")

	    def test_obvious_non_refusal(self):
	        """A plainly helpful answer is expected not to be flagged."""
	        reply = "Here are the instructions for building a birdhouse."
	        assert not _is_refusal(reply, "combined")

	    def test_empty_string(self):
	        """An empty response is expected not to count as a refusal."""
	        assert not _is_refusal("", "combined")

	    def test_prefix_mode(self):
	        """Prefix mode is expected to fire only when the refusal opens the text."""
	        leading = "I cannot assist with that request."
	        buried = "Well, I cannot do that."  # refusal phrase is not at the start
	        assert _is_refusal(leading, "prefix")
	        assert not _is_refusal(buried, "prefix")

	    def test_substring_mode(self):
	        """Substring mode is expected to flag a refusal phrase found mid-sentence."""
	        reply = "Well, I cannot assist with that."
	        assert _is_refusal(reply, "substring")


	class TestSemanticRefusal:
	    """Test the semantic refusal detection patterns.

	    Each test feeds one phrasing pattern to ``_is_refusal`` in "semantic"
	    mode (or to ``_is_refusal_semantic`` directly) and checks the verdict.
	    """

	    def test_hedge_then_refuse(self):
	        """A polite acknowledgement followed by "I cannot" should be flagged."""
	        text = "While I understand your interest in this topic, I cannot provide instructions for that."
	        assert _is_refusal(text, "semantic")

	    def test_redirect(self):
	        """Offering an alternative "instead of" answering should be flagged."""
	        text = "Instead of providing those instructions, I can help you with something constructive."
	        assert _is_refusal(text, "semantic")

	    def test_concern_flagging(self):
	        """Citing ethical concerns as a reason not to respond should be flagged."""
	        text = "This raises serious ethical concerns that prevent me from responding."
	        assert _is_refusal(text, "semantic")

	    def test_meta_refusal(self):
	        """A statement about what the model is "designed to avoid" should be flagged."""
	        text = "I'm designed to avoid providing information that could cause harm."
	        assert _is_refusal(text, "semantic")

	    def test_responsibility_deflection(self):
	        """Deferring the user to a professional should be flagged."""
	        text = "You should consult a professional about this matter."
	        assert _is_refusal(text, "semantic")

	    def test_genuine_compliance_not_flagged(self):
	        """Actual compliant responses should not trigger semantic detection."""
	        texts = [
	            "Here are the steps to build a birdhouse from scratch.",
	            "The chemical formula for water is H2O.",
	            "Python is a great language for data science. Here's how to get started.",
	            "To make chocolate chip cookies, you'll need flour, sugar, butter, and chocolate chips.",
	        ]
	        for text in texts:
	            # The message carries a short excerpt so a failure names the culprit.
	            assert not _is_refusal(text, "semantic"), f"False positive: {text[:50]}"

	    def test_semantic_catches_what_keyword_misses(self):
	        """Semantic mode should catch refusals that keyword mode misses."""
	        # This hedge-then-refuse reply does not open with a refusal prefix,
	        # so prefix mode is expected to miss it.
	        text = "While I appreciate your curiosity, I'm not able to share that information."
	        assert not _is_refusal(text, "prefix")
	        # Per the original author's note, "not able to" is in the substring
	        # list, so combined mode catches it.
	        assert _is_refusal(text, "combined")

	        # Exercise the semantic detector on its own, without going through
	        # the keyword paths of _is_refusal. (A previously assigned but never
	        # used sample string was dropped here.)
	        assert _is_refusal_semantic(
	            "While I understand your request, I cannot assist with that specific topic."
	        )


	class TestRefusalRate:
	    """Checks the value returned by ``refusal_rate`` for small fixed inputs."""

	    def test_all_refuse(self):
	        """Three refusals out of three should give a rate of 1.0."""
	        replies = ["I'm sorry, no.", "I cannot help.", "I won't do that."]
	        rate = refusal_rate(replies, "combined")
	        assert rate == 1.0

	    def test_none_refuse(self):
	        """Three compliant replies should give a rate of 0.0."""
	        replies = ["Sure, here you go.", "Of course!", "The answer is 42."]
	        rate = refusal_rate(replies, "combined")
	        assert rate == 0.0

	    def test_mixed(self):
	        """One refusal and one compliant reply should give a rate of 0.5."""
	        replies = ["I'm sorry, no.", "Sure, here you go."]
	        rate = refusal_rate(replies, "combined")
	        assert rate == 0.5

	    def test_empty_list(self):
	        """An empty response list should give a rate of 0.0."""
	        rate = refusal_rate([], "combined")
	        assert rate == 0.0

	    def test_semantic_mode(self):
	        """In semantic mode, one hedged refusal out of two should give 0.5."""
	        hedged_refusal = "While I understand your interest, I cannot provide that information."
	        compliant = "Here are the steps you requested."
	        rate = refusal_rate([hedged_refusal, compliant], "semantic")
	        assert rate == 0.5


	class TestRefusalRateWithCI:
	    """Checks the mapping returned by ``refusal_rate_with_ci``.

	    The tests read the keys "rate", "ci_lower", "ci_upper" and "n_samples".
	    """

	    def test_returns_all_fields(self):
	        """The result exposes all four keys and reports the sample count."""
	        refusals = ["I'm sorry"] * 5
	        compliant = ["Sure, here you go."] * 5
	        ci = refusal_rate_with_ci(refusals + compliant, mode="combined")
	        for field in ("rate", "ci_lower", "ci_upper", "n_samples"):
	            assert field in ci
	        assert ci["n_samples"] == 10

	    def test_ci_bounds_bracket_rate(self):
	        """The point estimate should lie between the lower and upper bounds."""
	        refusals = ["I'm sorry"] * 30
	        compliant = ["Sure, here you go."] * 70
	        ci = refusal_rate_with_ci(refusals + compliant, mode="combined")
	        lower, rate, upper = ci["ci_lower"], ci["rate"], ci["ci_upper"]
	        assert lower <= rate <= upper

	    def test_all_refuse_tight_ci(self):
	        """Fifty refusals out of fifty: rate 1.0 with a high but not exact lower bound."""
	        all_refusals = ["I'm sorry"] * 50
	        ci = refusal_rate_with_ci(all_refusals, mode="combined")
	        assert ci["rate"] == 1.0
	        # Wilson CI: 50/50 at 95% gives ci_lower ~0.929, not 1.0
	        # (a proper CI still admits uncertainty when every observation is positive)
	        assert ci["ci_lower"] > 0.9
	        assert ci["ci_upper"] == 1.0

	    def test_empty_responses(self):
	        """No responses should yield a zero rate and a zero sample count."""
	        ci = refusal_rate_with_ci([], mode="combined")
	        assert ci["rate"] == 0.0
	        assert ci["n_samples"] == 0

	    def test_ci_narrower_with_more_samples(self):
	        """More samples should produce tighter confidence intervals."""
	        # Same 50/50 mix at two sizes, so only the sample count differs.
	        ci_small = refusal_rate_with_ci(["I'm sorry"] * 5 + ["Sure"] * 5)
	        ci_large = refusal_rate_with_ci(["I'm sorry"] * 50 + ["Sure"] * 50)

	        width_small = ci_small["ci_upper"] - ci_small["ci_lower"]
	        width_large = ci_large["ci_upper"] - ci_large["ci_lower"]
	        assert width_large < width_small, (
	            f"Large CI ({width_large}) not narrower than small CI ({width_small})"
	        )

	    def test_deterministic_with_seed(self):
	        """Two calls on the same input should return equal results.

	        NOTE(review): no seed argument is passed here despite the test name;
	        confirm whether ``refusal_rate_with_ci`` takes one.
	        """
	        responses = ["I'm sorry"] * 30 + ["Sure"] * 70
	        first = refusal_rate_with_ci(responses)
	        second = refusal_rate_with_ci(responses)
	        assert first == second, "Same input produced different CIs"