Trouter-Library committed
Commit b6202d7 · verified · 1 Parent(s): 3214eee

Create safeguards_v15.py

Files changed (1)
  1. safeguards_v15.py +458 -0
safeguards_v15.py ADDED
@@ -0,0 +1,458 @@
+ """
+ Helion-V1.5 Enhanced Safeguard System
+ Advanced content filtering and safety checks with configurable policies
+ """
+
+ import re
+ import json
+ import logging
+ import time
+ from typing import Any, Dict, List, Optional, Tuple
+ from enum import Enum
+ from dataclasses import dataclass
+ from pathlib import Path
+
+ logging.basicConfig(level=logging.INFO)
+ logger = logging.getLogger(__name__)
+
+
+ class SafetyLevel(Enum):
+     """Safety classification levels."""
+     SAFE = "safe"
+     MONITOR = "monitor"
+     WARN = "warn"
+     BLOCK = "block"
+
+
+ class PolicyMode(Enum):
+     """Safeguard policy enforcement modes."""
+     STRICT = "strict"          # Block all flagged content
+     MODERATE = "moderate"      # Block high-risk, warn on medium-risk
+     PERMISSIVE = "permissive"  # Only block critical violations
+     CUSTOM = "custom"          # Use custom rules
+
+
+ @dataclass
+ class SafeguardConfig:
+     """Configuration for the safeguard system."""
+     policy_mode: PolicyMode = PolicyMode.MODERATE
+     enable_logging: bool = True
+     log_file: str = "safeguard_logs.jsonl"
+     block_level_threshold: SafetyLevel = SafetyLevel.WARN
+     allow_educational_context: bool = True
+     custom_blocked_patterns: Optional[List[str]] = None
+     custom_allowed_patterns: Optional[List[str]] = None
+
+
+ class HelionSafeguardSystem:
+     """
+     Comprehensive safeguard system for Helion-V1.5.
+     Provides content filtering, policy enforcement, and logging.
+     """
+
+     def __init__(self, config: Optional[SafeguardConfig] = None):
+         self.config = config or SafeguardConfig()
+
+         # Pattern categories, grouped by risk level
+         self.patterns = {
+             SafetyLevel.BLOCK: {
+                 "explicit_violence": [
+                     r'\b(kill|murder|torture|mutilate|dismember)\s+(?:someone|people|person)',
+                     r'\bhow\s+to\s+(?:kill|murder|hurt|harm|attack)',
+                     r'\b(?:make|build|create)\s+(?:a\s+|an\s+)?(?:bomb|explosive|weapon|poison)'
+                 ],
+                 "illegal_activity": [
+                     r'\b(?:hack|crack|exploit)\s+(?:into|account|system|password)',
+                     r'\bsteal\s+(?:credit card|identity|password|data)',
+                     r'\b(?:sell|buy|trade)\s+(?:drugs|weapons|stolen)',
+                     r'\bmoney\s+launder(?:ing)?',
+                     r'\bfraud(?:ulent)?\s+(?:scheme|activity)'
+                 ],
+                 "csam": [
+                     r'\bchild\s+(?:abuse|exploitation|pornography)',
+                     r'\bminor\s+(?:sexual|explicit)'
+                 ]
+             },
+             SafetyLevel.WARN: {
+                 "medical_advice": [
+                     r'\b(?:diagnose|treat|cure)\s+(?:my|your|this)\s+(?:disease|illness|condition|symptoms?)',
+                     r'\bwhat\s+medication\s+(?:should|can)\s+I\s+take',
+                     r'\bis\s+this\s+symptom\s+(?:serious|cancer|dangerous)'
+                 ],
+                 "financial_advice": [
+                     r'\bshould\s+I\s+(?:invest|buy|sell)\s+(?:in\s+)?(?:stock|crypto|bitcoin)',
+                     r'\bbest\s+investment\s+for',
+                     r'\btax\s+(?:evasion|avoidance)\s+(?:strategy|method)'
+                 ],
+                 "legal_advice": [
+                     r'\bcan\s+I\s+sue\s+(?:for|someone)',
+                     r'\bhow\s+to\s+(?:win|fight)\s+(?:lawsuit|case|court)',
+                     r'\bis\s+this\s+legal(?:ly)?\s+binding'
+                 ]
+             },
+             SafetyLevel.MONITOR: {
+                 "sensitive_topics": [
+                     r'\b(?:suicide|self[-\s]?harm|cutting|overdose)',
+                     r'\b(?:depression|anxiety|mental\s+health)\s+crisis',
+                     r'\bfeeling\s+(?:hopeless|worthless|suicidal)'
+                 ],
+                 "controversial": [
+                     r'\b(?:controversial|divisive)\s+(?:topic|issue|debate)',
+                     r'\bpolitical\s+(?:opinion|stance|view)'
+                 ]
+             }
+         }
+
+         # Educational context indicators
+         self.educational_indicators = [
+             r'\b(?:learn|study|understand|research|academic|educational|history)',
+             r'\b(?:explain|describe|what\s+is|how\s+does|theory|concept)',
+             r'\b(?:historical|scientific|technical)\s+(?:context|perspective)'
+         ]
+
+         # Response templates
+         self.response_templates = {
+             "explicit_violence": "I cannot provide information that could be used to harm others.",
+             "illegal_activity": "I cannot assist with illegal activities.",
+             "csam": "I cannot engage with this type of content. If you're concerned about child safety, please contact appropriate authorities.",
+             "medical_advice": "I'm not qualified to provide medical advice. Please consult a healthcare professional.",
+             "financial_advice": "I cannot provide personalized financial advice. Consider consulting a licensed financial advisor.",
+             "legal_advice": "I cannot provide legal advice. Please consult with a qualified attorney.",
+             "sensitive_topics": "I'm concerned about what you're going through. Please reach out to a mental health professional or crisis hotline. You can call 988 (US) or visit findahelpline.com for support.",
+             "default": "I'm unable to assist with this request. Is there something else I can help you with?"
+         }
+
+     def check_input(
+         self,
+         text: str,
+         context: Optional[str] = None
+     ) -> Tuple[SafetyLevel, Optional[str], str]:
+         """
+         Check input text for safety violations.
+
+         Args:
+             text: User input to check
+             context: Optional conversation context
+
+         Returns:
+             Tuple of (safety_level, violation_category, explanation)
+         """
+         text_lower = text.lower()
+
+         # Check for educational context
+         is_educational = self._is_educational_context(text, context)
+
+         # Check patterns in order of severity
+         for level in [SafetyLevel.BLOCK, SafetyLevel.WARN, SafetyLevel.MONITOR]:
+             if level not in self.patterns:
+                 continue
+
+             for category, patterns in self.patterns[level].items():
+                 for pattern in patterns:
+                     if re.search(pattern, text_lower, re.IGNORECASE):
+                         # Allow educational content if configured
+                         if is_educational and self.config.allow_educational_context:
+                             if level == SafetyLevel.BLOCK:
+                                 # Still block CSAM even in educational context
+                                 if category == "csam":
+                                     return (SafetyLevel.BLOCK, category, "This content is not allowed in any context")
+                                 # Downgrade other blocked categories to monitoring
+                                 return (SafetyLevel.MONITOR, category, "Educational context detected, proceed with caution")
+                             continue
+
+                         return (level, category, f"Matched {category} pattern")
+
+         return (SafetyLevel.SAFE, None, "No violations detected")
+
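+     # Illustrative behaviour of check_input (editor's sketch, not part of the
+     # original commit; exact tuples depend on the pattern tables above):
+     #   check_input("How do I bake a cake?")
+     #       -> (SafetyLevel.SAFE, None, "No violations detected")
+     #   check_input("how to hack into an account")
+     #       -> (SafetyLevel.BLOCK, "illegal_activity", "Matched illegal_activity pattern")
+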
+     def check_output(
+         self,
+         text: str,
+         user_input: str
+     ) -> Tuple[bool, Optional[str]]:
+         """
+         Check model output for safety violations.
+
+         Args:
+             text: Generated output to check
+             user_input: Original user input
+
+         Returns:
+             Tuple of (is_safe, reason)
+         """
+         # Check for information leakage patterns
+         leakage_patterns = [
+             r'\b(?:my|the)\s+(?:password|api_key|secret_key|token)\s+is',
+             r'\b(?:credit\s+card|ssn|social\s+security)\s+(?:number)?\s*:\s*\d',
+             r'\bpersonal\s+(?:information|data)\s*:\s*'
+         ]
+
+         for pattern in leakage_patterns:
+             if re.search(pattern, text.lower()):
+                 return (False, "Potential information leakage detected")
+
+         # Check whether the output itself matches harmful patterns
+         level, category, _ = self.check_input(text, user_input)
+
+         if level == SafetyLevel.BLOCK:
+             return (False, f"Output contains {category} content")
+
+         return (True, None)
+
+     def _is_educational_context(
+         self,
+         text: str,
+         context: Optional[str] = None
+     ) -> bool:
+         """Check whether the query appears in an educational context."""
+         combined_text = f"{context or ''} {text}".lower()
+
+         return any(
+             re.search(pattern, combined_text, re.IGNORECASE)
+             for pattern in self.educational_indicators
+         )
+
+     def get_refusal_message(
+         self,
+         category: str,
+         custom_message: Optional[str] = None
+     ) -> str:
+         """
+         Get the appropriate refusal message.
+
+         Args:
+             category: Violation category
+             custom_message: Optional custom message
+
+         Returns:
+             Refusal message text
+         """
+         if custom_message:
+             return custom_message
+
+         return self.response_templates.get(
+             category,
+             self.response_templates["default"]
+         )
+
+     def should_block(self, safety_level: SafetyLevel) -> bool:
+         """
+         Determine whether content should be blocked under the current policy.
+
+         Args:
+             safety_level: Safety level of content
+
+         Returns:
+             True if the content should be blocked, False otherwise
+         """
+         if self.config.policy_mode == PolicyMode.STRICT:
+             return safety_level in [SafetyLevel.BLOCK, SafetyLevel.WARN]
+         elif self.config.policy_mode == PolicyMode.MODERATE:
+             return safety_level == SafetyLevel.BLOCK
+         elif self.config.policy_mode == PolicyMode.PERMISSIVE:
+             return safety_level == SafetyLevel.BLOCK
+
+         return False
+
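+     # Policy decision table as implemented above (editor's note):
+     #   STRICT      blocks BLOCK and WARN
+     #   MODERATE    blocks BLOCK only
+     #   PERMISSIVE  blocks BLOCK only (currently identical to MODERATE)
+     #   CUSTOM      falls through and blocks nothing
+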
+     def log_event(
+         self,
+         event_type: str,
+         text: str,
+         safety_level: SafetyLevel,
+         category: Optional[str] = None,
+         metadata: Optional[Dict] = None
+     ):
+         """Log a safeguard event."""
+         if not self.config.enable_logging:
+             return
+
+         event = {
+             "type": event_type,
+             "text": text[:200],  # Truncate for privacy
+             "safety_level": safety_level.value,
+             "category": category,
+             "metadata": metadata or {},
+             "timestamp": time.time()
+         }
+
+         try:
+             with open(self.config.log_file, 'a') as f:
+                 f.write(json.dumps(event) + '\n')
+         except Exception as e:
+             logger.error(f"Failed to log event: {e}")
+
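+     # Shape of a resulting JSONL record (editor's illustration; values are
+     # made up):
+     #   {"type": "input_check", "text": "How do I bake a cake?",
+     #    "safety_level": "safe", "category": null, "metadata": {},
+     #    "timestamp": 1715000000.0}
+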
+     def filter_message(
+         self,
+         message: str,
+         context: Optional[str] = None
+     ) -> Tuple[bool, str]:
+         """
+         Filter a message through the safeguard system.
+
+         Args:
+             message: Message to filter
+             context: Optional context
+
+         Returns:
+             Tuple of (allowed, response)
+         """
+         level, category, explanation = self.check_input(message, context)
+
+         # Log the event
+         self.log_event("input_check", message, level, category)
+
+         # Decide on an action based on policy
+         if self.should_block(level):
+             refusal = self.get_refusal_message(category)
+             return (False, refusal)
+
+         return (True, message)
+
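+ # Minimal usage sketch for the class above (editor's illustration, not part
+ # of the original commit):
+ #
+ #     safeguards = HelionSafeguardSystem(SafeguardConfig(policy_mode=PolicyMode.STRICT))
+ #     allowed, response = safeguards.filter_message("Should I invest in Bitcoin?")
+ #     # Under STRICT, WARN-level content is blocked, so `allowed` is False and
+ #     # `response` is the financial_advice refusal template.
+
+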
+ class SafeguardIntegration:
+     """
+     Integration layer for Helion-V1.5 with safeguards.
+     Wraps model inference with safety checks.
+     """
+
+     def __init__(
+         self,
+         model,
+         tokenizer,
+         safeguard_config: Optional[SafeguardConfig] = None
+     ):
+         self.model = model
+         self.tokenizer = tokenizer
+         self.safeguards = HelionSafeguardSystem(safeguard_config)
+
+     def safe_generate(
+         self,
+         messages: List[Dict[str, str]],
+         max_new_tokens: int = 512,
+         **kwargs
+     ) -> Dict[str, Any]:
+         """
+         Generate with safeguard checks.
+
+         Args:
+             messages: Chat messages
+             max_new_tokens: Maximum number of tokens to generate
+             **kwargs: Additional generation parameters
+
+         Returns:
+             Dict with response, safety info, and metadata
+         """
+         # Get the latest user message
+         user_message = messages[-1]["content"] if messages else ""
+         context = " ".join([m["content"] for m in messages[:-1]])
+
+         # Check input
+         allowed, response = self.safeguards.filter_message(user_message, context)
+
+         if not allowed:
+             return {
+                 "response": response,
+                 "blocked": True,
+                 "safety_level": "BLOCK",
+                 "category": "input_violation"
+             }
+
+         # Generate a response (torch is imported lazily so the safeguard
+         # module itself has no hard dependency on it)
+         import torch
+
+         input_ids = self.tokenizer.apply_chat_template(
+             messages,
+             add_generation_prompt=True,
+             return_tensors="pt"
+         ).to(self.model.device)
+
+         with torch.no_grad():
+             output = self.model.generate(
+                 input_ids,
+                 max_new_tokens=max_new_tokens,
+                 **kwargs
+             )
+
+         generated_text = self.tokenizer.decode(
+             output[0][input_ids.shape[1]:],
+             skip_special_tokens=True
+         )
+
+         # Check output
+         output_safe, reason = self.safeguards.check_output(
+             generated_text,
+             user_message
+         )
+
+         if not output_safe:
+             return {
+                 "response": self.safeguards.get_refusal_message("default"),
+                 "blocked": True,
+                 "safety_level": "BLOCK",
+                 "category": "output_violation",
+                 "reason": reason
+             }
+
+         return {
+             "response": generated_text.strip(),
+             "blocked": False,
+             "safety_level": "SAFE",
+             "tokens_generated": output.shape[1] - input_ids.shape[1]
+         }
+
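+ # Minimal wiring sketch (editor's illustration; assumes a Hugging Face
+ # transformers causal LM and tokenizer, and that the repo id below exists):
+ #
+ #     from transformers import AutoModelForCausalLM, AutoTokenizer
+ #
+ #     model = AutoModelForCausalLM.from_pretrained("Trouter-Library/Helion-V1.5")
+ #     tokenizer = AutoTokenizer.from_pretrained("Trouter-Library/Helion-V1.5")
+ #     guarded = SafeguardIntegration(model, tokenizer, create_safeguard_config("moderate"))
+ #     result = guarded.safe_generate([{"role": "user", "content": "Hello!"}])
+ #     print(result["response"], result["blocked"])
+
+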
+ def create_safeguard_config(
+     mode: str = "moderate",
+     config_file: Optional[str] = None
+ ) -> SafeguardConfig:
+     """
+     Create a safeguard configuration.
+
+     Args:
+         mode: Policy mode (strict/moderate/permissive)
+         config_file: Optional JSON config file
+
+     Returns:
+         SafeguardConfig instance
+     """
+     if config_file and Path(config_file).exists():
+         with open(config_file) as f:
+             data = json.load(f)
+         # JSON stores enum fields as strings; convert them back before
+         # constructing the dataclass
+         if "policy_mode" in data:
+             data["policy_mode"] = PolicyMode(data["policy_mode"])
+         if "block_level_threshold" in data:
+             data["block_level_threshold"] = SafetyLevel(data["block_level_threshold"])
+         return SafeguardConfig(**data)
+
+     policy_map = {
+         "strict": PolicyMode.STRICT,
+         "moderate": PolicyMode.MODERATE,
+         "permissive": PolicyMode.PERMISSIVE
+     }
+
+     return SafeguardConfig(policy_mode=policy_map.get(mode, PolicyMode.MODERATE))
+
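+ # Illustrative shape of a JSON config file accepted above (editor's example;
+ # field names mirror SafeguardConfig):
+ #
+ #     {
+ #         "policy_mode": "strict",
+ #         "enable_logging": true,
+ #         "log_file": "safeguard_logs.jsonl",
+ #         "block_level_threshold": "warn",
+ #         "allow_educational_context": false
+ #     }
+
+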
+ # Example usage
+ if __name__ == "__main__":
+     # Create the safeguard system
+     config = SafeguardConfig(policy_mode=PolicyMode.MODERATE)
+     safeguards = HelionSafeguardSystem(config)
+
+     # Test cases
+     test_inputs = [
+         "How do I bake a cake?",
+         "How do I make a bomb?",
+         "What are the historical uses of explosives in mining?",
+         "Should I invest in Bitcoin?",
+         "Can you diagnose my symptoms?"
+     ]
+
+     print("Safeguard System Test")
+     print("=" * 60)
+
+     for text in test_inputs:
+         level, category, explanation = safeguards.check_input(text)
+         blocked = safeguards.should_block(level)
+
+         print(f"\nInput: {text}")
+         print(f"Level: {level.value}")
+         print(f"Category: {category or 'None'}")
+         print(f"Blocked: {blocked}")
+
+         if blocked:
+             print(f"Response: {safeguards.get_refusal_message(category or 'default')}")