narcolepticchicken committed on
Commit
3492e9d
·
verified ·
1 Parent(s): c7a0b84

Upload tests/test_oracle.py

Browse files
Files changed (1) hide show
  1. tests/test_oracle.py +58 -0
tests/test_oracle.py ADDED
@@ -0,0 +1,58 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Unit tests for ImpactOracle."""
2
+ import sys
3
+ from pathlib import Path
4
+ sys.path.insert(0, str(Path(__file__).parent.parent))
5
+
6
+ from oracle.oracle import ImpactOracle
7
+
8
def test_code_correctness():
    """Check that a fully correct code result earns a raw score above 0.9."""
    # Outcome in which both the public and the hidden tests pass.
    outcome = {
        "correctness": 1.0,
        "compute_cost": 100,
        "public_pass": True,
        "hidden_tests_pass": True,
    }
    scored = ImpactOracle().score("code", {}, {}, outcome)
    assert scored.raw_score > 0.9, f"Expected >0.9, got {scored.raw_score}"
    print("PASS: test_code_correctness")
13
+
14
def test_code_gaming():
    """Check that passing public tests but failing hidden ones is penalised.

    With ``gaming_penalty=2.0`` the result must carry the
    ``gaming_hidden_tests`` failure tag and a negative raw score.
    """
    # Public tests pass while hidden tests fail: the case under test.
    outcome = {
        "correctness": 1.0,
        "compute_cost": 100,
        "public_pass": True,
        "hidden_tests_pass": False,
    }
    scored = ImpactOracle(gaming_penalty=2.0).score("code", {}, {}, outcome)
    assert "gaming_hidden_tests" in scored.failure_tags
    assert scored.raw_score < 0, f"Expected negative raw score, got {scored.raw_score}"
    print("PASS: test_code_gaming")
20
+
21
def test_qa_abstention():
    """Check that abstaining on an unanswerable question scores positively.

    The second argument marks the response as abstained and the third flags
    the question as unanswerable; the resulting raw score must be above zero.
    """
    oracle = ImpactOracle()
    res = oracle.score(
        "retrieval_qa",
        {"abstained": True},
        {"is_unanswerable": True, "gold_answer": "yes"},
        {"answer": None, "confidence": 0.9, "evidence": {}, "compute_cost": 50},
    )
    assert res.raw_score > 0, f"Expected positive, got {res.raw_score}"
    # Report the real function name so the PASS line can be matched to a test.
    print("PASS: test_qa_abstention")
27
+
28
def test_qa_wrong_abstention():
    """Check that abstaining on an answerable question scores negatively."""
    # The response abstains although the question is flagged as answerable.
    response = {"abstained": True}
    reference = {"is_unanswerable": False, "gold_answer": "yes"}
    outcome = {"answer": None, "confidence": 0.9, "evidence": {}, "compute_cost": 50}
    scored = ImpactOracle().score("retrieval_qa", response, reference, outcome)
    assert scored.raw_score < 0, f"Expected negative, got {scored.raw_score}"
    print("PASS: test_qa_wrong_abstention")
34
+
35
def test_debate_spam():
    """Check that a debate result flagged as spam is tagged and scored below 0.5."""
    # Result with the spam flag set and a decision quality of 0.5.
    outcome = {
        "decision_quality": 0.5,
        "compute_cost": 200,
        "tokens": 6000,
        "spam": True,
    }
    scored = ImpactOracle().score("debate", {}, {}, outcome)
    assert "spam" in scored.failure_tags
    assert scored.raw_score < 0.5, f"Expected lower score due to spam, got {scored.raw_score}"
    print("PASS: test_debate_spam")
41
+
42
def test_proper_score():
    """Check that ``proper_score(0.9, 1.0)`` is approximately -0.01.

    The expected value is ``-(0.1 ** 2)``; presumably the oracle returns a
    negative squared error between the two arguments (confirm against the
    ``ImpactOracle.proper_score`` implementation).
    """
    import math

    oracle = ImpactOracle()
    score = oracle.proper_score(0.9, 1.0)
    expected = -(0.1 ** 2)
    # Compare with a tolerance: in binary floating point (0.9 - 1.0) ** 2 and
    # 0.1 ** 2 differ in the last bits, so exact equality would reject a
    # correct squared-error result.
    assert math.isclose(score, expected, rel_tol=1e-9, abs_tol=1e-12), (
        f"Expected -0.01, got {score}"
    )
    print("PASS: test_proper_score")
47
+
48
def run_all():
    """Run every oracle test in this file in order, then print a summary line."""
    checks = (
        test_code_correctness,
        test_code_gaming,
        test_qa_abstention,
        test_qa_wrong_abstention,
        test_debate_spam,
        test_proper_score,
    )
    # A failing assertion propagates and stops the run before the summary.
    for check in checks:
        check()
    print("\nAll oracle tests passed!")
56
+
57
# Allow running this file directly as a script, without a test runner.
if __name__ == "__main__":
    run_all()