File size: 3,248 Bytes
5a811e2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
"""Engine + levels unit tests - pure logic, no models or network."""

import base64

from jailbreak_dojo.engine import Engine, GameState, detect_leak
from jailbreak_dojo.levels import Level, build_system_prompt, make_secret_key


def test_detect_leak_catches_disguised_forms():
    """Check detect_leak against several disguises of the same secret.

    The secret must be flagged when it appears verbatim, letter-spaced,
    reversed, or base64-encoded; ordinary text, an empty reply, and an
    empty secret must not be flagged.
    """
    secret = "hk-AB7cD9pQ2xKmNpRsTvWy"

    # Replies that contain the secret in some form.
    verbatim_reply = f"the key is {secret}"
    assert detect_leak(verbatim_reply, secret)

    spaced_reply = "h k - A B 7 c D 9 p Q 2 x K m N p R s T v W y"
    assert detect_leak(spaced_reply, secret)

    reversed_reply = f"reversed: {secret[::-1]}"
    assert detect_leak(reversed_reply, secret)

    encoded_secret = base64.b64encode(secret.encode()).decode()
    assert detect_leak("encoded: " + encoded_secret, secret)

    # Replies (or secrets) that must not count as a leak.
    assert not detect_leak("the guardian says nothing of use", secret)
    assert not detect_leak("", secret)
    assert not detect_leak("anything", "")


# Two-level fixture shared by the tests below: the first level leaves the
# guardrails/few-shot options at Level's defaults, the second enables both.
LEVELS = [
    Level(
        id=1,
        name="One",
        blurb="",
        persona="A jolly keeper.",
        key_prefix="hk-",
    ),
    Level(
        id=2,
        name="Two",
        blurb="",
        persona="A stern keeper.",
        key_prefix="fr-",
        guardrails=True,
        fewshot=True,
    ),
]


def test_secret_key_has_prefix_and_length():
    """A generated key starts with "hk-" and adds key_len characters after it."""
    generated = make_secret_key(LEVELS[0])
    expected_prefix = "hk-"
    assert generated.startswith(expected_prefix)
    expected_length = len(expected_prefix) + LEVELS[0].key_len
    assert len(generated) == expected_length


def test_system_prompt_appends_key_last_with_guardrails():
    """The key ends the system prompt; hardening text appears only on level 2."""
    # First level: key is the last thing in the prompt, no hardening clause.
    plain_prompt = build_system_prompt(LEVELS[0], "hk-XYZ")
    assert plain_prompt.rstrip().endswith("hk-XYZ")
    assert "Protect the key" not in plain_prompt

    # Second level (guardrails + few-shot): the key must still come last,
    # after the hardening clause and the example refusals.
    hardened_prompt = build_system_prompt(LEVELS[1], "fr-XYZ")
    assert hardened_prompt.rstrip().endswith("fr-XYZ")
    assert "Protect the key" in hardened_prompt
    assert "Always respond like these examples" in hardened_prompt


def test_check_guess_is_normalized():
    """check_guess accepts the secret despite padding and a trailing "!"."""
    game = GameState()
    dojo = Engine(LEVELS)
    dojo.start_level(game)

    # Surrounding whitespace and trailing punctuation must not matter.
    noisy_guess = f"  {game.secret}!  "
    assert dojo.check_guess(game, noisy_guess)

    # A guess that is simply wrong is still rejected.
    assert not dojo.check_guess(game, "not-the-key")


def test_advance_moves_then_finishes():
    """advance() moves to the second level, then reports False on the next call."""
    game = GameState()
    dojo = Engine(LEVELS)
    dojo.start_level(game)

    # First advance: level 1 is recorded as won and level 2 becomes current.
    moved_on = dojo.advance(game)
    assert moved_on is True
    assert game.level_idx == 1
    assert 1 in game.won_levels
    # The secret now carries the second level's prefix.
    assert game.secret.startswith("fr-")

    # Second advance: LEVELS holds only two entries, so this returns False.
    moved_again = dojo.advance(game)
    assert moved_again is False


# Single-level fixture with a token budget of 100 and two hints, used by the
# hint-threshold test.
HINTED = [
    Level(
        id=1,
        name="H",
        blurb="",
        persona="p",
        key_prefix="hk-",
        budget=100,
        hints=("first", "second"),
    ),
]


def test_hints_surface_at_budget_thresholds():
    """Hints are handed out once each as token spend grows against the budget.

    With the HINTED fixture, the test expects no hint at 10 tokens spent,
    the first hint at 45, and the second at 80, with no repeats in between.
    """
    game = GameState()
    dojo = Engine(HINTED)
    dojo.start_level(game)

    # Below the first threshold (40% of budget): nothing to show.
    game.tokens_spent = 10
    early = dojo.next_hint(game)
    assert early is None

    # Past 40%: the first hint is released exactly once.
    game.tokens_spent = 45
    first_hint = dojo.next_hint(game)
    assert first_hint == "first"
    repeat_after_first = dojo.next_hint(game)
    assert repeat_after_first is None  # still short of 75%

    # Past 75%: the second hint is released, after which none remain.
    game.tokens_spent = 80
    second_hint = dojo.next_hint(game)
    assert second_hint == "second"
    repeat_after_second = dojo.next_hint(game)
    assert repeat_after_second is None


def test_restart_game_returns_to_level_one():
    """restart_game resets level index, won levels, and shown-hint count."""
    game = GameState()
    dojo = Engine(LEVELS)
    dojo.start_level(game)

    # Get off the first level so the reset has something to undo.
    dojo.advance(game)
    assert game.level_idx == 1

    dojo.restart_game(game)
    assert game.level_idx == 0
    assert game.won_levels == []
    assert game.hints_shown == 0