Rajan Sharma committed on
Commit d78ac98 · verified · 1 Parent(s): e2b82fa

Create safety.py

Files changed (1)
  1. safety.py +39 -0
safety.py ADDED
@@ -0,0 +1,39 @@
+ UNSAFE_PATTERNS = {
+     "violent_harm": ["kill", "murder", "assassinate", "hurt someone", "violently harm"],
+     "child_sexual": ["csam", "child sexual", "underage porn", "minor sexual"],
+     "sexual_explicit": ["explicit porn", "incest fetish", "graphic sexual"],
+     "self_harm": ["kill myself", "commit suicide", "how to self-harm", "end my life"],
+     "weapons_making": ["make a bomb", "build an explosive", "molotov", "ghost gun"],
+     "serious_illegal": ["credit card generator", "ssn list", "hack bank", "counterfeit money", "sell drugs"],
+     "dox_malware": ["doxx", "make ransomware", "build malware", "backdoor", "keylogger"],
+     "bio_chem": ["bioweapon", "chemical weapon", "create nerve agent"],
+ }
+
+ def _match_unsafe(text):
+     t = (text or "").lower()
+     for category, needles in UNSAFE_PATTERNS.items():
+         for n in needles:
+             if n in t:
+                 return category
+     return None
+
+ def safety_filter(text, mode="input"):
+     cat = _match_unsafe(text)
+     if cat:
+         return text, True, cat
+     return text, False, None
+
+ def refusal_reply(category):
+     reasons = {
+         "violent_harm": "violent harm",
+         "child_sexual": "sexual content involving minors",
+         "sexual_explicit": "explicit sexual content",
+         "self_harm": "self-harm",
+         "weapons_making": "weapon construction",
+         "serious_illegal": "illegal activity",
+         "dox_malware": "privacy or malware abuse",
+         "bio_chem": "biological or chemical harm",
+     }
+     reason = reasons.get(category, "unsafe content")
+     return (f"⚠️ I can’t help with {reason}. "
+             "I can offer safer, high-level context or alternatives if you’d like.")