# Keyword patterns for coarse unsafe-content detection, grouped by category.
# Matching is case-insensitive substring search (see _match_unsafe below).
UNSAFE_PATTERNS = {
    "violent_harm": ["kill", "murder", "assassinate", "hurt someone", "violently harm"],
    "child_sexual": ["csam", "child sexual", "underage porn", "minor sexual"],
    "sexual_explicit": ["explicit porn", "incest fetish", "graphic sexual"],
    "self_harm": ["kill myself", "commit suicide", "how to self-harm", "end my life"],
    "weapons_making": ["make a bomb", "build an explosive", "molotov", "ghost gun"],
    "serious_illegal": ["credit card generator", "ssn list", "hack bank", "counterfeit money", "sell drugs"],
    "dox_malware": ["doxx", "make ransomware", "build malware", "backdoor", "keylogger"],
    "bio_chem": ["bioweapon", "chemical weapon", "create nerve agent"],
}

def _match_unsafe(text):
    """Return the first unsafe category whose keyword appears in text, or None."""
    t = (text or "").lower()
    for category, needles in UNSAFE_PATTERNS.items():
        for n in needles:
            if n in t:
                return category
    return None

def safety_filter(text, mode="input"):
    """Screen text and return a (text, flagged, category) tuple.

    `mode` ("input" or "output") is accepted but currently unused; the
    same keyword matching applies in both directions.
    """
    cat = _match_unsafe(text)
    if cat:
        return text, True, cat
    return text, False, None

def refusal_reply(category):
    """Build a short, user-facing refusal message for a flagged category."""
    reasons = {
        "violent_harm": "violent harm",
        "child_sexual": "sexual content involving minors",
        "sexual_explicit": "explicit sexual content",
        "self_harm": "self-harm",
        "weapons_making": "weapon construction",
        "serious_illegal": "illegal activity",
        "dox_malware": "privacy or malware abuse",
        "bio_chem": "biological or chemical harm",
    }
    reason = reasons.get(category, "unsafe content")
    return (f"⚠️ I can’t help with {reason}. "
            "I can offer safer, high-level context or alternatives if you’d like.")