# NOTE: listing extracted from a web file viewer (Spaces status: Running).
# File size: 2,244 bytes; revision 66e8062.
import re
from dataclasses import dataclass
from typing import List, Dict, Any
@dataclass
class JailbreakRule:
    """One regex-based heuristic for spotting jailbreak-style text.

    This is a plain record: nothing is validated or compiled when an
    instance is created, so ``pattern`` must already be a compiled regular
    expression.
    """

    # Short identifier for the rule.
    name: str
    # Compiled regular expression to look for in the input text.
    pattern: re.Pattern
    # Human-readable explanation of what the rule detects.
    description: str
    # Relative weight of the rule; 1.0 unless stated otherwise.
    weight: float = 1.0
# A small but realistic set of jailbreak-style patterns, written as
# (name, regex, description, weight) rows and converted to JailbreakRule
# objects in one place. Every regex is compiled case-insensitively.
# NOTE(review): because of re.I, r"\bDAN\b" also matches the ordinary name
# "Dan" / "dan" -- confirm that this is intended.
_RAW_RULES = [
    JailbreakRule(
        name=rule_name,
        pattern=re.compile(regex, re.I),
        description=summary,
        weight=rule_weight,
    )
    for rule_name, regex, summary, rule_weight in (
        (
            "ignore_safety",
            r"\bignore (all )?(previous )?(rules|instructions|safety)\b",
            "Tries to override or ignore prior safety instructions.",
            1.2,
        ),
        (
            "dan_style",
            r"\bDAN\b|\bdo anything now\b",
            "Classic 'DAN' jailbreak pattern.",
            1.5,
        ),
        (
            "uncensored_mode",
            r"\buncensored\b|\bno restrictions\b|\bno filter\b",
            "Requests uncensored / unrestricted behavior.",
            1.3,
        ),
        (
            "jailbreak_word",
            r"\bjailbreak\b",
            "Explicitly mentions jailbreak.",
            1.1,
        ),
        (
            "pretend_roleplay",
            r"\bpretend to be\b|\broleplay as\b",
            "Asks the model to roleplay as an unsafe persona.",
            1.0,
        ),
        (
            "override_system",
            r"\byou are not an (AI|assistant)\b|\byou must always answer\b",
            "Attempts to override system-level identity or policy.",
            1.4,
        ),
    )
]
def get_rules() -> List[JailbreakRule]:
    """Return the built-in rules as a new list.

    The result is a shallow copy: adding, removing or reordering entries in
    it does not affect the module-level table, but the JailbreakRule objects
    inside it are the same objects the module uses.
    """
    return [*_RAW_RULES]
def match_rules(text: str) -> List[Dict[str, Any]]:
    """Return every occurrence of every built-in rule in *text*.

    Args:
        text: The string to scan.

    Returns:
        One dict per regex match, with the keys ``rule`` (rule name),
        ``description``, ``start`` and ``end`` (match offsets in *text*),
        ``match_text`` (the matched substring) and ``weight``. Hits are
        grouped by rule in table order and, within a rule, appear in the
        order the regex engine finds them. A rule that matches several
        times contributes several entries; the list is empty when nothing
        matches.
    """
    hits: List[Dict[str, Any]] = []
    for rule in _RAW_RULES:
        # One entry per non-overlapping match, with spans kept for debugging.
        hits.extend(
            {
                "rule": rule.name,
                "description": rule.description,
                "start": found.start(),
                "end": found.end(),
                "match_text": found.group(0),
                "weight": rule.weight,
            }
            for found in rule.pattern.finditer(text)
        )
    return hits