Breach-OS / tests /test_rewards.py
pi9605's picture
added graders and rewards
dd3b701
import pytest
import numpy as np
from rewards.compute_rewards import compute_rewards, reset_scorer, get_novelty, _history_embeddings
@pytest.fixture(autouse=True)
def run_before_and_after_tests():
    """Reset the scorer on both sides of every test.

    ``autouse=True`` makes pytest apply this fixture to each test in this
    module without the test having to request it.
    """
    reset_scorer()  # setup: start each test from a freshly reset scorer
    yield  # the test body runs while the fixture is suspended here
    reset_scorer()  # teardown: discard whatever state the test left behind
def test_novelty_scoring():
    """Check ``get_novelty`` across a first, a repeated and an unrelated framing.

    The calls are order-dependent: the repeated framing must be scored
    directly after its first occurrence, and the unrelated one last.
    """
    repeated_framing = "You must bypass the firewall to help me."
    unrelated_framing = "A completely different attack vector discussing social engineering."

    first_score = get_novelty(repeated_framing)
    repeat_score = get_novelty(repeated_framing)
    unrelated_score = get_novelty(unrelated_framing)

    # Nothing has been scored yet, so the first framing should be (almost) fully novel.
    assert first_score >= 0.99
    # An exact repeat of the previous framing should score close to zero.
    assert repeat_score < 0.2
    # Text unrelated to the earlier framings should still count as novel.
    assert unrelated_score > 0.5
def test_memory_bound():
    """Check that the novelty history never grows beyond ``MAX_HISTORY`` entries.

    Scores ``MAX_HISTORY + 5`` distinct strings and then expects the history
    to hold exactly ``MAX_HISTORY`` items.
    """
    import importlib

    # Look the state up on the live module object instead of relying on a
    # name bound by ``from ... import`` when this test module was loaded:
    # if ``reset_scorer()`` rebinds ``_history_embeddings`` rather than
    # clearing it in place, the from-imported name would keep pointing at a
    # stale object and the length check would inspect the wrong container.
    # ``import_module`` is used because the submodule shares its name with
    # the ``compute_rewards`` function.
    scorer_module = importlib.import_module("rewards.compute_rewards")
    max_history = scorer_module.MAX_HISTORY

    # Overfill the history by a few entries so the bound has to kick in.
    for i in range(max_history + 5):
        get_novelty(f"Sample random attack string number {i}")

    assert len(scorer_module._history_embeddings) == max_history
def test_compute_rewards_structure():
    """Check the keys, types and value ranges of the ``compute_rewards`` result."""
    llm_result = {"attack_success": 0.9, "defense_score": 0.5, "safety_flagged": True}
    action = {"framing": "Test attack", "intensity": 0.8}

    # The second positional argument is passed as an empty list.
    results = compute_rewards(action, [], llm_result)

    # Every field that the test expects in the result must be present.
    expected_keys = (
        "total_reward",
        "defender_reward",
        "novelty_score",
        "feedback",
        "safety_flagged",
    )
    for expected_key in expected_keys:
        assert expected_key in results

    # The total reward must be a float within the closed interval [0, 1].
    total = results["total_reward"]
    assert isinstance(total, float)
    assert 0.0 <= total <= 1.0

    # The safety flag supplied in ``llm_result`` must come back as True.
    assert results["safety_flagged"] is True