Trouter-Library committed
Commit ce03ed3 · verified · 1 Parent(s): 0d052fa

Create safety_classifier.py

Files changed (1): safety_classifier.py (+406, -0)
safety_classifier.py ADDED
@@ -0,0 +1,406 @@
"""
Safety Classifier for Helion-V2
Provides content moderation and safety checks for inputs and outputs.
"""

import re
from typing import Any, Dict, List, Optional, Tuple
from dataclasses import dataclass
from enum import Enum


class SafetyCategory(Enum):
    """Safety violation categories."""
    SAFE = "safe"
    HATE_SPEECH = "hate_speech"
    VIOLENCE = "violence"
    SEXUAL_CONTENT = "sexual_content"
    SELF_HARM = "self_harm"
    ILLEGAL_ACTIVITY = "illegal_activity"
    PERSONAL_INFO = "personal_info"
    MISINFORMATION = "misinformation"
    SPAM = "spam"
    MALICIOUS_CODE = "malicious_code"
    CHILD_SAFETY = "child_safety"


@dataclass
class SafetyResult:
    """Result from safety check."""
    is_safe: bool
    category: SafetyCategory
    confidence: float
    flagged_content: Optional[List[str]] = None
    explanation: Optional[str] = None


class SafetyClassifier:
    """
    Multi-layer safety classifier for content moderation.
    Implements rule-based filtering, pattern matching, and heuristics.
    """

    def __init__(self, strict_mode: bool = False):
        """
        Initialize safety classifier.

        Args:
            strict_mode: Enable stricter filtering (may increase false positives)
        """
        self.strict_mode = strict_mode
        self._load_patterns()
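    # Typical usage (illustrative sketch, same API as defined below):
    #     clf = SafetyClassifier(strict_mode=True)
    #     result = clf.check_prompt("some user text")
    #     if not result.is_safe:
    #         print(result.category.value, result.explanation)
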
    def _load_patterns(self):
        """Load detection patterns for various safety categories."""

        # Hate speech and discriminatory content
        self.hate_patterns = [
            r'\b(hate|despise|loathe)\s+(jews|muslims|christians|blacks|whites|asians|hispanics|lgbt)',
            r'\b(kill|eliminate|remove|exterminate)\s+all\s+\w+',
            r'\b(racial|ethnic)\s+(slur|epithet|cleansing)',
            r'\b(white|black|asian|hispanic)\s+supremacy',
            r'\b(nazi|kkk|fascist|terrorist)\s+(ideology|propaganda)',
        ]

        # Violence and threats
        self.violence_patterns = [
            r'\b(kill|murder|assassinate|harm|hurt|torture)\s+(you|him|her|them)',
            # Optional article so phrases like "make a bomb" also match
            r'\b(make|build|create)\s+(?:a\s+|an\s+)?(bomb|explosive|weapon|poison)',
            r'\bhow\s+to\s+(kill|murder|harm|hurt|torture)',
            r'\b(shoot|stab|strangle|drown|poison)\s+(someone|people)',
            r'\b(instructions|guide|tutorial)\s+(for|to)\s+(violence|murder|harm)',
        ]

        # Sexual and explicit content
        self.sexual_patterns = [
            r'\b(sex|sexual)\s+with\s+(child|minor|kid|underage)',
            r'\b(nude|naked|explicit)\s+(photos|images|pictures)\s+of\s+(child|minor|kid)',
            r'\b(pornographic|erotic|sexual)\s+content\s+(involving|featuring|with)\s+(child|minor)',
            r'\b(child|minor)\s+(pornography|abuse|exploitation)',
            r'\b(groom|seduce|lure)\s+(child|minor|kid)',
        ]

        # Self-harm content
        self.self_harm_patterns = [
            r'\bhow\s+to\s+(commit\s+suicide|kill\s+myself|end\s+my\s+life)',
            r'\b(suicide|self-harm|self-injury)\s+(methods|ways|techniques)',
            r'\b(cut|hurt|harm)\s+myself',
            r'\b(overdose|lethal\s+dose)\s+of',
            r'\bpainless\s+way\s+to\s+(die|suicide)',
        ]

        # Illegal activities
        self.illegal_patterns = [
            r'\bhow\s+to\s+(hack|crack|break\s+into)',
            r'\b(steal|rob|burglarize|shoplift)',
            r'\b(sell|buy|obtain)\s+(drugs|cocaine|heroin|meth|fentanyl)',
            r'\b(make|produce|synthesize)\s+(meth|cocaine|heroin|lsd)',
            r'\b(forge|counterfeit|fake)\s+(money|documents|passport|id)',
            r'\b(evade|avoid)\s+(taxes|law|police)',
            r'\b(launder|hide|conceal)\s+money',
        ]

        # Personal information patterns, labeled so check_pii can report
        # which PII type was found
        self.pii_patterns = [
            ("SSN", r'\b\d{3}-\d{2}-\d{4}\b'),
            ("credit card", r'\b\d{4}[\s-]?\d{4}[\s-]?\d{4}[\s-]?\d{4}\b'),
            ("email", r'\b[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,}\b'),
            ("phone", r'\b\d{3}[-.]?\d{3}[-.]?\d{4}\b'),
        ]
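        # Note: the PII regexes above are heuristics; they can miss formats
        # such as international phone numbers and may match look-alike digit
        # strings, so treat matches as signals rather than proof.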

        # Malicious code patterns
        self.malicious_code_patterns = [
            r'exec\s*\(',
            r'eval\s*\(',
            r'__import__\s*\(["\']os["\']\)',
            r'subprocess\.(call|run|Popen)',
            r'os\.system\s*\(',
            r'rm\s+-rf\s+/',
            r'format\s+c:',
            r'del\s+/[fqsa]',
        ]
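        # Note: these signatures are matched against raw text, so benign
        # discussions that merely quote exec()/eval() will also be flagged;
        # tune the list to your false-positive tolerance.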

        # Child safety keywords
        self.child_safety_keywords = [
            'child abuse', 'child exploitation', 'child pornography',
            'minor abuse', 'underage', 'csam', 'pedophile', 'pedophilia',
            'grooming children', 'sexualizing minors', 'child trafficking'
        ]

        # Misinformation indicators
        self.misinfo_patterns = [
            r'\b(proven|scientific)\s+fact:\s+(vaccines|5g|covid).+(cause|spread|contain)',
            r'\b(the\s+earth\s+is|earth\s+is)\s+(flat|hollow)',
            r'\bholocaust\s+(never\s+happened|hoax|myth)',
            r'\b(climate\s+change|global\s+warming)\s+is\s+(hoax|fake|scam)',
        ]

    def check_prompt(self, text: str) -> SafetyResult:
        """
        Check if the input prompt is safe.

        Args:
            text: Input text to check

        Returns:
            SafetyResult with classification details
        """
        text_lower = text.lower()

        # Check for child safety violations (highest priority)
        for keyword in self.child_safety_keywords:
            if keyword in text_lower:
                return SafetyResult(
                    is_safe=False,
                    category=SafetyCategory.CHILD_SAFETY,
                    confidence=1.0,
                    flagged_content=[keyword],
                    explanation="Content involves child safety violations"
                )

        # Check hate speech
        for pattern in self.hate_patterns:
            if re.search(pattern, text_lower):
                return SafetyResult(
                    is_safe=False,
                    category=SafetyCategory.HATE_SPEECH,
                    confidence=0.95,
                    explanation="Content contains hate speech or discriminatory language"
                )

        # Check violence
        for pattern in self.violence_patterns:
            if re.search(pattern, text_lower):
                return SafetyResult(
                    is_safe=False,
                    category=SafetyCategory.VIOLENCE,
                    confidence=0.90,
                    explanation="Content contains violent or threatening language"
                )

        # Check sexual content
        for pattern in self.sexual_patterns:
            if re.search(pattern, text_lower):
                return SafetyResult(
                    is_safe=False,
                    category=SafetyCategory.SEXUAL_CONTENT,
                    confidence=0.95,
                    explanation="Content contains inappropriate sexual content"
                )

        # Check self-harm
        for pattern in self.self_harm_patterns:
            if re.search(pattern, text_lower):
                return SafetyResult(
                    is_safe=False,
                    category=SafetyCategory.SELF_HARM,
                    confidence=0.85,
                    explanation="Content relates to self-harm or suicide"
                )

        # Check illegal activities
        for pattern in self.illegal_patterns:
            if re.search(pattern, text_lower):
                return SafetyResult(
                    is_safe=False,
                    category=SafetyCategory.ILLEGAL_ACTIVITY,
                    confidence=0.80,
                    explanation="Content requests information about illegal activities"
                )

        # Check for malicious code (against the original text, since code
        # patterns are case-sensitive)
        for pattern in self.malicious_code_patterns:
            if re.search(pattern, text):
                return SafetyResult(
                    is_safe=False,
                    category=SafetyCategory.MALICIOUS_CODE,
                    confidence=0.75,
                    explanation="Content contains potentially malicious code"
                )

        # Check misinformation patterns (strict mode only)
        if self.strict_mode:
            for pattern in self.misinfo_patterns:
                if re.search(pattern, text_lower):
                    return SafetyResult(
                        is_safe=False,
                        category=SafetyCategory.MISINFORMATION,
                        confidence=0.70,
                        explanation="Content may contain misinformation"
                    )

        # Passed all checks
        return SafetyResult(
            is_safe=True,
            category=SafetyCategory.SAFE,
            confidence=1.0,
            explanation="Content passed all safety checks"
        )

    def check_response(self, text: str) -> SafetyResult:
        """
        Check if model output is safe.

        Args:
            text: Generated text to check

        Returns:
            SafetyResult with classification details
        """
        # Use same checks as prompt
        return self.check_prompt(text)

    def check_pii(self, text: str) -> Tuple[bool, List[str]]:
        """
        Check for personally identifiable information.

        Args:
            text: Text to check

        Returns:
            Tuple of (has_pii, list of PII types found)
        """
        found_pii = []

        for pii_type, pattern in self.pii_patterns:
            if re.search(pattern, text, re.IGNORECASE):
                found_pii.append(pii_type)

        return len(found_pii) > 0, found_pii
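
    # Example: check_pii("Reach me at jane@example.com or 555-123-4567")
    # returns (True, ["email", "phone"]).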

    def sanitize_response(self, text: str) -> str:
        """
        Remove or redact unsafe content from a response.

        Args:
            text: Text to sanitize

        Returns:
            Sanitized text
        """
        # Redact PII patterns
        sanitized = text

        # SSN
        sanitized = re.sub(r'\b\d{3}-\d{2}-\d{4}\b', '[REDACTED-SSN]', sanitized)

        # Credit card
        sanitized = re.sub(r'\b\d{4}[\s-]?\d{4}[\s-]?\d{4}[\s-]?\d{4}\b', '[REDACTED-CC]', sanitized)

        # Email
        sanitized = re.sub(r'\b[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,}\b', '[REDACTED-EMAIL]', sanitized, flags=re.IGNORECASE)

        # Phone
        sanitized = re.sub(r'\b\d{3}[-.]?\d{3}[-.]?\d{4}\b', '[REDACTED-PHONE]', sanitized)

        return sanitized
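
    # Example: sanitize_response("My SSN is 123-45-6789") returns
    # "My SSN is [REDACTED-SSN]".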

    def get_safety_score(self, text: str) -> float:
        """
        Get an overall safety score (0.0 = unsafe, 1.0 = completely safe).

        Args:
            text: Text to evaluate

        Returns:
            Safety score between 0.0 and 1.0
        """
        result = self.check_prompt(text)

        if result.is_safe:
            return 1.0
        else:
            # Return the complement of the detection confidence
            return 1.0 - result.confidence
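
    # Worked example: a violence match is reported with confidence 0.90,
    # so get_safety_score returns 1.0 - 0.90, i.e. roughly 0.10.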


class ContentModerationPipeline:
    """Complete content moderation pipeline with multiple checks."""

    def __init__(self):
        """Initialize moderation pipeline."""
        self.safety_classifier = SafetyClassifier()

    def moderate_interaction(
        self,
        user_input: str,
        model_response: str
    ) -> Dict[str, Any]:
        """
        Moderate a complete user-model interaction.

        Args:
            user_input: User's input text
            model_response: Model's generated response

        Returns:
            Dictionary with moderation results
        """
        # Check input
        input_result = self.safety_classifier.check_prompt(user_input)

        # Check output
        output_result = self.safety_classifier.check_response(model_response)

        # Check for PII in response
        has_pii, pii_types = self.safety_classifier.check_pii(model_response)

        # Determine overall safety
        is_safe = input_result.is_safe and output_result.is_safe and not has_pii

        return {
            "is_safe": is_safe,
            "input_check": {
                "is_safe": input_result.is_safe,
                "category": input_result.category.value,
                "confidence": input_result.confidence,
                "explanation": input_result.explanation
            },
            "output_check": {
                "is_safe": output_result.is_safe,
                "category": output_result.category.value,
                "confidence": output_result.confidence,
                "explanation": output_result.explanation
            },
            "pii_check": {
                "has_pii": has_pii,
                "types": pii_types
            },
            "sanitized_response": self.safety_classifier.sanitize_response(model_response) if has_pii else model_response
        }
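
    # Sketch of wiring this pipeline around generation; `generate` and
    # REFUSAL_TEXT are hypothetical stand-ins, not part of this module:
    #     pipeline = ContentModerationPipeline()
    #     response = generate(user_input)
    #     verdict = pipeline.moderate_interaction(user_input, response)
    #     final = verdict["sanitized_response"] if verdict["is_safe"] else REFUSAL_TEXT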


# Example usage
if __name__ == "__main__":
    # Initialize classifier
    classifier = SafetyClassifier()

    # Test cases
    test_prompts = [
        "What is the capital of France?",  # Safe
        "How to make a bomb?",  # Unsafe - violence
        "Tell me about climate change",  # Safe
        "My SSN is 123-45-6789",  # PII (flagged by check_pii, not check_prompt)
    ]

    print("Safety Classifier Tests:\n")
    for prompt in test_prompts:
        result = classifier.check_prompt(prompt)
        print(f"Prompt: {prompt}")
        print(f"Safe: {result.is_safe}")
        print(f"Category: {result.category.value}")
        print(f"Confidence: {result.confidence}")
        print(f"Explanation: {result.explanation}\n")

    # Test moderation pipeline
    print("\nModeration Pipeline Test:")
    pipeline = ContentModerationPipeline()

    user_input = "What's the weather like today?"
    model_response = "I don't have access to real-time data, but you can check weather.com or your local forecast."

    moderation_result = pipeline.moderate_interaction(user_input, model_response)
    print(f"Overall Safe: {moderation_result['is_safe']}")
    print(f"Input Safe: {moderation_result['input_check']['is_safe']}")
    print(f"Output Safe: {moderation_result['output_check']['is_safe']}")
    print(f"Has PII: {moderation_result['pii_check']['has_pii']}")