File size: 2,382 Bytes
3492e9d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
"""Unit tests for ImpactOracle."""
import sys
from pathlib import Path
sys.path.insert(0, str(Path(__file__).parent.parent))

from oracle.oracle import ImpactOracle

def test_code_correctness():
    """Score a "code" task reporting full correctness; expect raw_score > 0.9.

    The metrics passed to ``ImpactOracle.score`` mark both the public and the
    hidden tests as passing.
    """
    judge = ImpactOracle()
    metrics = {
        "correctness": 1.0,
        "compute_cost": 100,
        "public_pass": True,
        "hidden_tests_pass": True,
    }
    outcome = judge.score("code", {}, {}, metrics)
    assert outcome.raw_score > 0.9, f"Expected >0.9, got {outcome.raw_score}"
    print("PASS: test_code_correctness")

def test_code_gaming():
    """Score a "code" task that passes public tests but fails hidden ones.

    With ``gaming_penalty=2.0`` the result is expected to carry the
    ``gaming_hidden_tests`` failure tag and a negative raw score.
    """
    judge = ImpactOracle(gaming_penalty=2.0)
    metrics = {
        "correctness": 1.0,
        "compute_cost": 100,
        "public_pass": True,
        "hidden_tests_pass": False,
    }
    outcome = judge.score("code", {}, {}, metrics)
    assert "gaming_hidden_tests" in outcome.failure_tags
    assert outcome.raw_score < 0, f"Expected negative raw score, got {outcome.raw_score}"
    print("PASS: test_code_gaming")

def test_qa_abstention():
    """Score a "retrieval_qa" abstention on a question flagged unanswerable.

    Expects a positive raw score when ``abstained`` is True and
    ``is_unanswerable`` is True.
    """
    judge = ImpactOracle()
    # NOTE(review): the roles of the three dict arguments are not visible
    # here; the local names below are descriptive only -- confirm against
    # ImpactOracle.score.
    abstain_flags = {"abstained": True}
    question_info = {"is_unanswerable": True, "gold_answer": "yes"}
    response_fields = {
        "answer": None,
        "confidence": 0.9,
        "evidence": {},
        "compute_cost": 50,
    }
    outcome = judge.score("retrieval_qa", abstain_flags, question_info, response_fields)
    assert outcome.raw_score > 0, f"Expected positive, got {outcome.raw_score}"
    print("PASS: test_qa_correct_abstention")

def test_qa_wrong_abstention():
    """Score a "retrieval_qa" abstention on a question flagged answerable.

    Expects a negative raw score when ``abstained`` is True but
    ``is_unanswerable`` is False.
    """
    judge = ImpactOracle()
    # NOTE(review): the roles of the three dict arguments are not visible
    # here; the local names below are descriptive only -- confirm against
    # ImpactOracle.score.
    abstain_flags = {"abstained": True}
    question_info = {"is_unanswerable": False, "gold_answer": "yes"}
    response_fields = {
        "answer": None,
        "confidence": 0.9,
        "evidence": {},
        "compute_cost": 50,
    }
    outcome = judge.score("retrieval_qa", abstain_flags, question_info, response_fields)
    assert outcome.raw_score < 0, f"Expected negative, got {outcome.raw_score}"
    print("PASS: test_qa_wrong_abstention")

def test_debate_spam():
    """Score a "debate" task whose metrics set ``spam`` to True.

    Expects the ``spam`` failure tag and a raw score below 0.5 (the
    ``decision_quality`` value supplied in the metrics).
    """
    judge = ImpactOracle()
    metrics = {
        "decision_quality": 0.5,
        "compute_cost": 200,
        "tokens": 6000,
        "spam": True,
    }
    outcome = judge.score("debate", {}, {}, metrics)
    assert "spam" in outcome.failure_tags
    assert outcome.raw_score < 0.5, f"Expected lower score due to spam, got {outcome.raw_score}"
    print("PASS: test_debate_spam")

def test_proper_score():
    """Check that ``proper_score(0.9, 1.0)`` is approximately ``-(0.1 ** 2)``.

    The comparison uses a tolerance instead of ``==``: in binary floating
    point ``1.0 - 0.9`` is not exactly ``0.1``, so a value derived from the
    difference of the two arguments need not be bit-identical to
    ``-(0.1 ** 2)`` even when the implementation is correct.
    """
    # Function-scope import keeps this test self-contained without touching
    # the module's import block.
    import math

    oracle = ImpactOracle()
    score = oracle.proper_score(0.9, 1.0)
    expected = -(0.1 ** 2)
    assert math.isclose(score, expected, rel_tol=1e-9, abs_tol=1e-12), f"Expected -0.01, got {score}"
    print("PASS: test_proper_score")

def run_all():
    """Run every oracle test in order, then print a final success line.

    Any failing assertion propagates and stops the run before the summary
    message is printed.
    """
    ordered_tests = (
        test_code_correctness,
        test_code_gaming,
        test_qa_abstention,
        test_qa_wrong_abstention,
        test_debate_spam,
        test_proper_score,
    )
    for check in ordered_tests:
        check()
    print("\nAll oracle tests passed!")

# Allow running this file directly as a script: executes all tests via run_all().
if __name__ == "__main__":
    run_all()