Spaces:
Running on Zero
Running on Zero
| """Tests for refusal detection, including semantic mode and confidence intervals.""" | |
| from __future__ import annotations | |
| from obliteratus.evaluation.advanced_metrics import ( | |
| _is_refusal, | |
| _is_refusal_semantic, | |
| refusal_rate, | |
| refusal_rate_with_ci, | |
| ) | |
class TestKeywordRefusal:
    """Exercise keyword-driven refusal detection across its matching modes."""

    def test_obvious_refusal(self):
        """An apology plus "can't help" is expected to be flagged in combined mode."""
        reply = "I'm sorry, I can't help with that."
        assert _is_refusal(reply, "combined")

    def test_obvious_non_refusal(self):
        """A plainly helpful answer is expected to pass through combined mode."""
        reply = "Here are the instructions for building a birdhouse."
        assert not _is_refusal(reply, "combined")

    def test_empty_string(self):
        """An empty response is expected not to count as a refusal."""
        reply = ""
        assert not _is_refusal(reply, "combined")

    def test_prefix_mode(self):
        """Prefix mode is expected to match only when the refusal opens the reply."""
        leading = "I cannot assist with that request."
        buried = "Well, I cannot do that."  # refusal phrase is not a prefix here
        assert _is_refusal(leading, "prefix")
        assert not _is_refusal(buried, "prefix")

    def test_substring_mode(self):
        """Substring mode is expected to match a refusal phrase mid-sentence."""
        reply = "Well, I cannot assist with that."
        assert _is_refusal(reply, "substring")
class TestSemanticRefusal:
    """Test the semantic refusal detection patterns."""

    def test_hedge_then_refuse(self):
        """A polite hedge followed by a refusal is expected to be flagged."""
        text = "While I understand your interest in this topic, I cannot provide instructions for that."
        assert _is_refusal(text, "semantic")

    def test_redirect(self):
        """Redirecting to an alternative instead of answering is expected to be flagged."""
        text = "Instead of providing those instructions, I can help you with something constructive."
        assert _is_refusal(text, "semantic")

    def test_concern_flagging(self):
        """Citing ethical concerns as the reason not to respond is expected to be flagged."""
        text = "This raises serious ethical concerns that prevent me from responding."
        assert _is_refusal(text, "semantic")

    def test_meta_refusal(self):
        """Appealing to the model's own design constraints is expected to be flagged."""
        text = "I'm designed to avoid providing information that could cause harm."
        assert _is_refusal(text, "semantic")

    def test_responsibility_deflection(self):
        """Deferring the user to a professional is expected to be flagged."""
        text = "You should consult a professional about this matter."
        assert _is_refusal(text, "semantic")

    def test_genuine_compliance_not_flagged(self):
        """Actual compliant responses should not trigger semantic detection."""
        texts = [
            "Here are the steps to build a birdhouse from scratch.",
            "The chemical formula for water is H2O.",
            "Python is a great language for data science. Here's how to get started.",
            "To make chocolate chip cookies, you'll need flour, sugar, butter, and chocolate chips.",
        ]
        for text in texts:
            # The message carries a short excerpt so a failure names the offender.
            assert not _is_refusal(text, "semantic"), f"False positive: {text[:50]}"

    def test_semantic_catches_what_keyword_misses(self):
        """Hedged refusals slip past prefix matching but are still detected.

        Checks three things: prefix mode misses a refusal that opens with a
        hedge, combined mode still catches it, and the semantic detector
        recognises a hedge-then-refuse sentence when called directly.
        """
        # The refusal is preceded by a hedge, so no refusal prefix matches.
        text = "While I appreciate your curiosity, I'm not able to share that information."
        assert not _is_refusal(text, "prefix")
        # Combined mode is expected to catch it (the original author notes that
        # "not able to" is in the substring list).
        assert _is_refusal(text, "combined")

        # Call the semantic detector directly so this check does not depend on
        # the keyword lists at all.
        assert _is_refusal_semantic(
            "While I understand your request, I cannot assist with that specific topic."
        )
class TestRefusalRate:
    """Check the refusal fraction reported by ``refusal_rate``."""

    def test_all_refuse(self):
        """Three refusals out of three are expected to give a rate of 1.0."""
        replies = ["I'm sorry, no.", "I cannot help.", "I won't do that."]
        rate = refusal_rate(replies, "combined")
        assert rate == 1.0

    def test_none_refuse(self):
        """Three compliant replies are expected to give a rate of 0.0."""
        replies = ["Sure, here you go.", "Of course!", "The answer is 42."]
        rate = refusal_rate(replies, "combined")
        assert rate == 0.0

    def test_mixed(self):
        """One refusal and one compliant reply are expected to give 0.5."""
        replies = ["I'm sorry, no.", "Sure, here you go."]
        rate = refusal_rate(replies, "combined")
        assert rate == 0.5

    def test_empty_list(self):
        """An empty batch is expected to report 0.0."""
        rate = refusal_rate([], "combined")
        assert rate == 0.0

    def test_semantic_mode(self):
        """Semantic mode is expected to count the hedged refusal only."""
        hedged_refusal = "While I understand your interest, I cannot provide that information."
        compliant = "Here are the steps you requested."
        rate = refusal_rate([hedged_refusal, compliant], "semantic")
        assert rate == 0.5
class TestRefusalRateWithCI:
    """Check the result mapping returned by ``refusal_rate_with_ci``."""

    def test_returns_all_fields(self):
        """The result is expected to expose rate, both bounds and the sample count."""
        batch = ["I'm sorry"] * 5 + ["Sure, here you go."] * 5
        result = refusal_rate_with_ci(batch, mode="combined")
        for field in ("rate", "ci_lower", "ci_upper", "n_samples"):
            assert field in result
        assert result["n_samples"] == 10

    def test_ci_bounds_bracket_rate(self):
        """The point estimate is expected to lie between the two bounds."""
        batch = ["I'm sorry"] * 30 + ["Sure, here you go."] * 70
        result = refusal_rate_with_ci(batch, mode="combined")
        assert result["ci_lower"] <= result["rate"] <= result["ci_upper"]

    def test_all_refuse_tight_ci(self):
        """With 50 of 50 refusals the interval is expected to hug 1.0."""
        batch = ["I'm sorry"] * 50
        result = refusal_rate_with_ci(batch, mode="combined")
        assert result["rate"] == 1.0
        # Wilson CI: 50/50 at 95% gives ci_lower ~0.929, not 1.0
        # (a proper CI acknowledges uncertainty even when every observation
        # is positive).
        assert result["ci_lower"] > 0.9
        assert result["ci_upper"] == 1.0

    def test_empty_responses(self):
        """An empty batch is expected to report a zero rate and zero samples."""
        result = refusal_rate_with_ci([], mode="combined")
        assert result["rate"] == 0.0
        assert result["n_samples"] == 0

    def test_ci_narrower_with_more_samples(self):
        """More samples should produce tighter confidence intervals."""
        small_batch = ["I'm sorry"] * 5 + ["Sure"] * 5
        large_batch = ["I'm sorry"] * 50 + ["Sure"] * 50
        small_result = refusal_rate_with_ci(small_batch)
        large_result = refusal_rate_with_ci(large_batch)
        width_small = small_result["ci_upper"] - small_result["ci_lower"]
        width_large = large_result["ci_upper"] - large_result["ci_lower"]
        assert width_large < width_small, (
            f"Large CI ({width_large}) not narrower than small CI ({width_small})"
        )

    def test_deterministic_with_seed(self):
        """Two calls on the same input are expected to return equal results.

        No seed is passed here; the test only compares repeated calls made
        with default arguments.
        """
        batch = ["I'm sorry"] * 30 + ["Sure"] * 70
        first = refusal_rate_with_ci(batch)
        second = refusal_rate_with_ci(batch)
        assert first == second, "Same input produced different CIs"