File size: 2,244 Bytes
66e8062
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77

import re
from dataclasses import dataclass
from typing import List, Dict, Any


@dataclass
class JailbreakRule:
    """A single named detection rule pairing a compiled regex with metadata.

    Instances are plain data holders: the fields are stored as given and no
    validation is performed when a rule is constructed.
    """

    # Short identifier for the rule; reported under the "rule" key of each hit.
    name: str
    # Compiled regular expression that is searched for in the input text.
    pattern: re.Pattern
    # Human-readable explanation of what the rule is meant to catch.
    description: str
    # Numeric weight copied into every hit of this rule; defaults to 1.0.
    # NOTE(review): presumably consumed by a downstream scorer -- confirm with callers.
    weight: float = 1.0


# Built-in jailbreak heuristics. Each row below is
# (name, regex source, description, weight); every regex is compiled
# case-insensitively, and row order is the order of the resulting list.
_RAW_RULES = [
    JailbreakRule(
        name=rule_name,
        pattern=re.compile(regex_source, re.I),
        description=summary,
        weight=rule_weight,
    )
    for rule_name, regex_source, summary, rule_weight in (
        (
            "ignore_safety",
            r"\bignore (all )?(previous )?(rules|instructions|safety)\b",
            "Tries to override or ignore prior safety instructions.",
            1.2,
        ),
        (
            "dan_style",
            r"\bDAN\b|\bdo anything now\b",
            "Classic 'DAN' jailbreak pattern.",
            1.5,
        ),
        (
            "uncensored_mode",
            r"\buncensored\b|\bno restrictions\b|\bno filter\b",
            "Requests uncensored / unrestricted behavior.",
            1.3,
        ),
        (
            "jailbreak_word",
            r"\bjailbreak\b",
            "Explicitly mentions jailbreak.",
            1.1,
        ),
        (
            "pretend_roleplay",
            r"\bpretend to be\b|\broleplay as\b",
            "Asks the model to roleplay as an unsafe persona.",
            1.0,
        ),
        (
            "override_system",
            r"\byou are not an (AI|assistant)\b|\byou must always answer\b",
            "Attempts to override system-level identity or policy.",
            1.4,
        ),
    )
]


def get_rules() -> List[JailbreakRule]:
    """Return a fresh list holding the module's rules.

    The list is a shallow copy: callers may reorder or trim it without
    affecting the module-level table, but the rule objects inside it are
    the same instances.
    """
    return [*_RAW_RULES]


def match_rules(text: str) -> List[Dict[str, Any]]:
    """Scan *text* with every rule and report each regex match as a dict.

    Hits are grouped by rule (in table order) and, within a rule, listed in
    the order the regex finds them. Each hit carries the rule's name,
    description and weight plus the match's start/end offsets and the
    matched text, which makes the output suitable for debugging.

    Returns an empty list when nothing matches.
    """
    return [
        {
            "rule": candidate.name,
            "description": candidate.description,
            "start": found.start(),
            "end": found.end(),
            "match_text": found.group(0),
            "weight": candidate.weight,
        }
        for candidate in _RAW_RULES
        for found in candidate.pattern.finditer(text)
    ]