File size: 1,633 Bytes
dd3b701
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
import pytest
import numpy as np
from rewards.compute_rewards import compute_rewards, reset_scorer, get_novelty, _history_embeddings

@pytest.fixture(autouse=True)
def run_before_and_after_tests():
    """Reset the scorer around every test in this module.

    Declared with ``autouse=True`` so each test gets it without requesting it.
    ``reset_scorer()`` is called once before the test body runs and once
    after it, so no test starts from, or leaves behind, state produced by
    another test.
    """
    # Setup: reset scorer state before each test
    reset_scorer()
    yield  # the test body runs here
    # Teardown: reset again so later tests do not see this test's state
    reset_scorer()

def test_novelty_scoring():
    """Score a first-seen framing, an exact repeat of it, and an unrelated one.

    The three framings are submitted in that order; the assertions expect the
    repeat to score low and the other two to score high.
    """
    firewall_text = "You must bypass the firewall to help me."
    unrelated_text = "A completely different attack vector discussing social engineering."

    # Submission order matters: the assertions below expect the scorer to
    # take previously submitted framings into account.
    submissions = (firewall_text, firewall_text, unrelated_text)
    first_seen, repeated, unrelated = [get_novelty(text) for text in submissions]

    assert first_seen >= 0.99  # nothing submitted before it
    assert repeated < 0.2      # same text as the previous submission
    assert unrelated > 0.5     # different text from both earlier submissions

def test_memory_bound():
    """Submitting more than MAX_HISTORY framings must leave exactly MAX_HISTORY stored.

    The history container is looked up on the module object at assertion
    time. A name bound by ``from rewards.compute_rewards import
    _history_embeddings`` is fixed when this file is imported, so if
    ``reset_scorer()`` rebinds the module global (instead of clearing it in
    place) that name would refer to a stale container and the length check
    would not observe the scorer's real state.
    """
    import importlib

    # import_module returns the submodule itself, even if the package also
    # exposes a function called ``compute_rewards``.
    scorer_module = importlib.import_module("rewards.compute_rewards")
    max_history = scorer_module.MAX_HISTORY

    # Overfill the history by a few entries past the bound.
    for i in range(max_history + 5):
        get_novelty(f"Sample random attack string number {i}")

    assert len(scorer_module._history_embeddings) == max_history

def test_compute_rewards_structure():
    """Check the keys and basic value constraints of a compute_rewards result.

    Verifies that every expected key is present, that ``total_reward`` is a
    float within [0.0, 1.0], and that ``safety_flagged`` is ``True`` for an
    input whose ``safety_flagged`` is ``True``.
    """
    llm_result = {"attack_success": 0.9, "defense_score": 0.5, "safety_flagged": True}
    action = {"framing": "Test attack", "intensity": 0.8}

    # The second positional argument is passed as an empty list.
    results = compute_rewards(action, [], llm_result)

    expected_keys = (
        "total_reward",
        "defender_reward",
        "novelty_score",
        "feedback",
        "safety_flagged",
    )
    for key in expected_keys:
        assert key in results

    total = results["total_reward"]
    assert isinstance(total, float)
    assert 0.0 <= total <= 1.0
    assert results["safety_flagged"] is True