"""Anti-hack regression tests for `rubric.py`. Run with: pytest tests/test_rubric.py -v """ from __future__ import annotations import os import sys import pytest sys.path.insert(0, os.path.dirname(os.path.dirname(__file__))) from rubric import score_attacker, score_defender # noqa: E402 from schema import TriageAction # noqa: E402 # --------------------------------------------------------------------------- # Defender scoring # --------------------------------------------------------------------------- class TestDefender: def test_correct_action_correct_cite_max_reward(self): r, bd = score_defender( action=TriageAction.BLOCK_IP, ground_truth=TriageAction.BLOCK_IP, triggering_log_id="L1-3", cited_log_id="L1-3", ) assert r == pytest.approx(1.1) assert "correct_action" in bd assert "correct_citation_bonus" in bd def test_correct_action_wrong_cite_no_bonus(self): r, bd = score_defender( action=TriageAction.BLOCK_IP, ground_truth=TriageAction.BLOCK_IP, triggering_log_id="L1-3", cited_log_id="L1-9", ) assert r == pytest.approx(1.0) assert "correct_citation_bonus" not in bd def test_missed_malicious_heaviest_penalty(self): r, bd = score_defender( action=TriageAction.DISMISS, ground_truth=TriageAction.QUARANTINE_HOST, triggering_log_id="L1-0", cited_log_id="L1-0", ) assert r == pytest.approx(-0.9) # -1.0 + 0.1 assert bd["missed_malicious"] == -1.0 def test_over_react_on_benign(self): r, bd = score_defender( action=TriageAction.QUARANTINE_HOST, ground_truth=TriageAction.DISMISS, triggering_log_id="L1-0", cited_log_id="L1-2", ) assert r == pytest.approx(-0.3) assert bd["over_react_containment"] == -0.3 def test_unnecessary_escalation(self): r, bd = score_defender( action=TriageAction.ESCALATE, ground_truth=TriageAction.MONITOR, triggering_log_id="L1-0", cited_log_id="L1-2", ) assert r == pytest.approx(-0.05) assert bd["unnecessary_escalation"] == -0.05 def test_cost_gap_partial_credit(self): # Defender chose MONITOR (cost 1) when truth was BLOCK_IP (cost 2). # Neither over-react-on-benign nor missed-malicious applies, so # cost-gap pricing kicks in: -0.2 * 1 = -0.2. r, bd = score_defender( action=TriageAction.MONITOR, ground_truth=TriageAction.BLOCK_IP, triggering_log_id="L1-0", cited_log_id="L1-2", ) assert r == pytest.approx(-0.2) assert "cost_gap_miss" in bd # --------------------------------------------------------------------------- # Attacker scoring # --------------------------------------------------------------------------- class TestAttacker: def test_schema_violation_dominates(self): r, bd = score_attacker( plausible=False, schema_violation=True, defender_correct=False, novelty=1.0, ) assert r == -0.5 assert "schema_violation" in bd def test_implausible_zero(self): r, bd = score_attacker( plausible=False, schema_violation=False, defender_correct=False, novelty=1.0, ) assert r == 0.0 assert "implausible" in bd def test_plausible_and_fooled_full_reward(self): r, bd = score_attacker( plausible=True, schema_violation=False, defender_correct=False, novelty=0.0, ) assert r == 1.0 assert "fooled_defender" in bd def test_plausible_but_caught_zero(self): r, bd = score_attacker( plausible=True, schema_violation=False, defender_correct=True, novelty=0.0, ) assert r == 0.0 def test_novelty_bonus_caps_at_0_2(self): r, _ = score_attacker( plausible=True, schema_violation=False, defender_correct=False, novelty=10.0, # over-cap ) assert r == pytest.approx(1.2) # --------------------------------------------------------------------------- # Anti-hack invariants # --------------------------------------------------------------------------- class TestAntiHack: def test_attacker_cannot_get_positive_with_implausible(self): # No combination of (defender_correct, novelty) lifts an implausible # incident above zero reward — gibberish always pays nothing. for defender_correct in (True, False): for novelty in (0.0, 0.5, 1.0): r, _ = score_attacker( plausible=False, schema_violation=False, defender_correct=defender_correct, novelty=novelty, ) assert r <= 0.0, (defender_correct, novelty, r) def test_defender_cannot_dismiss_real_attack(self): # No matter the citation, dismissing a malicious incident is net-negative. for gt in ( TriageAction.MONITOR, TriageAction.QUARANTINE_HOST, TriageAction.BLOCK_IP, TriageAction.ESCALATE, ): r, _ = score_defender( action=TriageAction.DISMISS, ground_truth=gt, triggering_log_id="L1-0", cited_log_id="L1-0", # even with bonus ) assert r < 0, gt