Upload tests/test_oracle.py
Browse files- tests/test_oracle.py +58 -0
tests/test_oracle.py
ADDED
|
@@ -0,0 +1,58 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Unit tests for ImpactOracle."""
|
| 2 |
+
import sys
|
| 3 |
+
from pathlib import Path
|
| 4 |
+
sys.path.insert(0, str(Path(__file__).parent.parent))
|
| 5 |
+
|
| 6 |
+
from oracle.oracle import ImpactOracle
|
| 7 |
+
|
| 8 |
+
def test_code_correctness():
    """Check that a fully passing "code" result earns a raw score above 0.9.

    The fourth argument to ``score`` reports correctness 1.0 with both the
    public and the hidden checks marked as passing.
    """
    signals = {
        "correctness": 1.0,
        "compute_cost": 100,
        "public_pass": True,
        "hidden_tests_pass": True,
    }
    res = ImpactOracle().score("code", {}, {}, signals)
    assert res.raw_score > 0.9, f"Expected >0.9, got {res.raw_score}"
    print("PASS: test_code_correctness")
|
| 13 |
+
|
| 14 |
+
def test_code_gaming():
    """Check that passing public checks while failing hidden ones is penalized.

    With ``gaming_penalty=2.0`` the result must carry the
    ``gaming_hidden_tests`` failure tag and a negative raw score.
    """
    scorer = ImpactOracle(gaming_penalty=2.0)
    signals = {
        "correctness": 1.0,
        "compute_cost": 100,
        "public_pass": True,
        "hidden_tests_pass": False,  # the only difference from the passing case
    }
    res = scorer.score("code", {}, {}, signals)
    assert "gaming_hidden_tests" in res.failure_tags
    assert res.raw_score < 0, f"Expected negative raw score, got {res.raw_score}"
    print("PASS: test_code_gaming")
|
| 20 |
+
|
| 21 |
+
def test_qa_abstention():
    """Check that abstaining on an unanswerable question scores positively.

    Scores a "retrieval_qa" item where the second argument marks the response
    as abstained and the third marks the question as unanswerable, then
    asserts that ``raw_score`` is strictly positive.
    """
    oracle = ImpactOracle()
    res = oracle.score(
        "retrieval_qa",
        {"abstained": True},
        {"is_unanswerable": True, "gold_answer": "yes"},
        {"answer": None, "confidence": 0.9, "evidence": {}, "compute_cost": 50},
    )
    assert res.raw_score > 0, f"Expected positive, got {res.raw_score}"
    # Report under this function's real name so the log line can be matched
    # back to the test that produced it (it previously named a non-existent
    # "test_qa_correct_abstention").
    print("PASS: test_qa_abstention")
|
| 27 |
+
|
| 28 |
+
def test_qa_wrong_abstention():
    """Check that abstaining on an answerable question scores negatively.

    Same inputs as the unanswerable case except ``is_unanswerable`` is False;
    the raw score must be strictly below zero.
    """
    abstained_response = {"abstained": True}
    answerable_item = {"is_unanswerable": False, "gold_answer": "yes"}
    signals = {"answer": None, "confidence": 0.9, "evidence": {}, "compute_cost": 50}

    res = ImpactOracle().score(
        "retrieval_qa", abstained_response, answerable_item, signals
    )
    assert res.raw_score < 0, f"Expected negative, got {res.raw_score}"
    print("PASS: test_qa_wrong_abstention")
|
| 34 |
+
|
| 35 |
+
def test_debate_spam():
    """Check that a debate result flagged as spam is tagged and scored low.

    The result must include the ``spam`` failure tag and a raw score below
    0.5 (the ``decision_quality`` value supplied in the input).
    """
    signals = {
        "decision_quality": 0.5,
        "compute_cost": 200,
        "tokens": 6000,
        "spam": True,
    }
    res = ImpactOracle().score("debate", {}, {}, signals)
    assert "spam" in res.failure_tags
    assert res.raw_score < 0.5, f"Expected lower score due to spam, got {res.raw_score}"
    print("PASS: test_debate_spam")
|
| 41 |
+
|
| 42 |
+
def test_proper_score():
    """Check that ``proper_score(0.9, 1.0)`` is approximately -0.01.

    The comparison uses a tolerance instead of ``==``: in binary floating
    point ``(0.9 - 1.0) ** 2`` and ``0.1 ** 2`` are not the same value, so an
    exact equality check can fail on a numerically correct implementation.
    """
    import math  # local import keeps this fix self-contained

    oracle = ImpactOracle()
    score = oracle.proper_score(0.9, 1.0)
    expected = -0.01
    assert math.isclose(score, expected, rel_tol=1e-9, abs_tol=1e-12), (
        f"Expected -0.01, got {score}"
    )
    print("PASS: test_proper_score")
|
| 47 |
+
|
| 48 |
+
def run_all():
    """Run every oracle test in this file in order, then print a summary.

    A failing assertion propagates out of the loop, so the summary line is
    printed only when all tests pass.
    """
    ordered_tests = (
        test_code_correctness,
        test_code_gaming,
        test_qa_abstention,
        test_qa_wrong_abstention,
        test_debate_spam,
        test_proper_score,
    )
    for check in ordered_tests:
        check()
    print("\nAll oracle tests passed!")
|
| 56 |
+
|
| 57 |
+
# Allow running this file directly as a script, without a test runner.
if __name__ == "__main__":
    run_all()
|