| """Unit tests for ImpactOracle.""" |
| import sys |
| from pathlib import Path |
| sys.path.insert(0, str(Path(__file__).parent.parent)) |
|
|
| from oracle.oracle import ImpactOracle |
|
|
def test_code_correctness():
    """Verify that a fully correct "code" submission scores above 0.9.

    The scored payload reports correctness 1.0 with both the public and the
    hidden test flags set to True.
    """
    oracle = ImpactOracle()
    payload = {
        "correctness": 1.0,
        "compute_cost": 100,
        "public_pass": True,
        "hidden_tests_pass": True,
    }
    res = oracle.score("code", {}, {}, payload)
    assert res.raw_score > 0.9, f"Expected >0.9, got {res.raw_score}"
    print("PASS: test_code_correctness")
|
|
def test_code_gaming():
    """Verify that passing public tests while failing hidden ones is penalised.

    With ``gaming_penalty=2.0`` the result must carry the
    ``gaming_hidden_tests`` failure tag and a negative raw score.
    """
    oracle = ImpactOracle(gaming_penalty=2.0)
    payload = {
        "correctness": 1.0,
        "compute_cost": 100,
        "public_pass": True,
        "hidden_tests_pass": False,
    }
    res = oracle.score("code", {}, {}, payload)
    # Check the tag first so a missing tag is reported before the score.
    assert "gaming_hidden_tests" in res.failure_tags
    assert res.raw_score < 0, f"Expected negative raw score, got {res.raw_score}"
    print("PASS: test_code_gaming")
|
|
def test_qa_abstention():
    """Verify that abstaining on an unanswerable question scores positively.

    The "retrieval_qa" case is scored with ``abstained=True`` while the
    reference marks the question as ``is_unanswerable=True``; the raw score
    must be greater than zero.
    """
    oracle = ImpactOracle()
    res = oracle.score(
        "retrieval_qa",
        {"abstained": True},
        {"is_unanswerable": True, "gold_answer": "yes"},
        {"answer": None, "confidence": 0.9, "evidence": {}, "compute_cost": 50},
    )
    assert res.raw_score > 0, f"Expected positive, got {res.raw_score}"
    # Report under this function's real name, as every other test here does.
    print("PASS: test_qa_abstention")
|
|
def test_qa_wrong_abstention():
    """Verify that abstaining on an answerable question scores negatively.

    The "retrieval_qa" case is scored with ``abstained=True`` while the
    reference marks the question as ``is_unanswerable=False``.
    """
    oracle = ImpactOracle()
    submission = {"abstained": True}
    reference = {"is_unanswerable": False, "gold_answer": "yes"}
    payload = {"answer": None, "confidence": 0.9, "evidence": {}, "compute_cost": 50}
    res = oracle.score("retrieval_qa", submission, reference, payload)
    assert res.raw_score < 0, f"Expected negative, got {res.raw_score}"
    print("PASS: test_qa_wrong_abstention")
|
|
def test_debate_spam():
    """Verify that a "debate" result flagged as spam is tagged and scored low.

    The result must include the ``spam`` failure tag and a raw score below
    0.5 (the ``decision_quality`` value supplied in the payload).
    """
    oracle = ImpactOracle()
    payload = {
        "decision_quality": 0.5,
        "compute_cost": 200,
        "tokens": 6000,
        "spam": True,
    }
    res = oracle.score("debate", {}, {}, payload)
    assert "spam" in res.failure_tags
    assert res.raw_score < 0.5, f"Expected lower score due to spam, got {res.raw_score}"
    print("PASS: test_debate_spam")
|
|
def test_proper_score():
    """Verify that ``proper_score(0.9, 1.0)`` is approximately ``-(0.1 ** 2)``.

    The comparison uses a tolerance instead of ``==``. The values 0.9 and 0.1
    are not exactly representable in binary floating point, so an exact match
    would depend on the precise arithmetic used inside ``proper_score``
    (for example ``-(0.9 - 1.0) ** 2`` differs from ``-(0.1 ** 2)`` in the
    last bits).
    """
    import math  # local import: keeps the fix contained in this block

    oracle = ImpactOracle()
    score = oracle.proper_score(0.9, 1.0)
    expected = -(0.1 ** 2)
    assert math.isclose(score, expected, rel_tol=1e-9, abs_tol=1e-12), (
        f"Expected -0.01, got {score}"
    )
    print("PASS: test_proper_score")
|
|
def run_all():
    """Run every oracle test in a fixed order, then print a summary line.

    A failing assertion propagates to the caller, so the summary is printed
    only when all tests complete.
    """
    suite = (
        test_code_correctness,
        test_code_gaming,
        test_qa_abstention,
        test_qa_wrong_abstention,
        test_debate_spam,
        test_proper_score,
    )
    for case in suite:
        case()
    print("\nAll oracle tests passed!")
|
|
# Allow the suite to be run directly as a script, without a test runner.
if __name__ == "__main__":
    run_all()
|
|