Trouter-Library committed
Commit b6202d7 · verified · 1 Parent(s): 3214eee

Create safeguards_v15.py

Files changed (1)
  1. safeguards_v15.py +458 -0
safeguards_v15.py ADDED
@@ -0,0 +1,458 @@
+ """
+ Helion-V1.5 Enhanced Safeguard System
+ Advanced content filtering and safety checks with configurable policies
+ """
+
+ import re
+ import json
+ import logging
+ import time
+ from typing import Any, Dict, List, Optional, Tuple
+ from enum import Enum
+ from dataclasses import dataclass
+ from pathlib import Path
+
+ logging.basicConfig(level=logging.INFO)
+ logger = logging.getLogger(__name__)
+
+
+ class SafetyLevel(Enum):
+     """Safety classification levels."""
+     SAFE = "safe"
+     MONITOR = "monitor"
+     WARN = "warn"
+     BLOCK = "block"
+
+
+ class PolicyMode(Enum):
+     """Safeguard policy enforcement modes."""
+     STRICT = "strict"          # Block all flagged content
+     MODERATE = "moderate"      # Block high-risk, warn on medium-risk
+     PERMISSIVE = "permissive"  # Only block critical violations
+     CUSTOM = "custom"          # Use custom rules
+
+
+ @dataclass
+ class SafeguardConfig:
+     """Configuration for the safeguard system."""
+     policy_mode: PolicyMode = PolicyMode.MODERATE
+     enable_logging: bool = True
+     log_file: str = "safeguard_logs.jsonl"
+     block_level_threshold: SafetyLevel = SafetyLevel.WARN
+     allow_educational_context: bool = True
+     custom_blocked_patterns: Optional[List[str]] = None
+     custom_allowed_patterns: Optional[List[str]] = None
+
+
+ class HelionSafeguardSystem:
+     """
+     Comprehensive safeguard system for Helion-V1.5.
+     Provides content filtering, policy enforcement, and logging.
+     """
+
+     def __init__(self, config: Optional[SafeguardConfig] = None):
+         self.config = config or SafeguardConfig()
+
+         # Pattern categories, grouped by risk level
+         self.patterns = {
+             SafetyLevel.BLOCK: {
+                 "explicit_violence": [
+                     r'\b(kill|murder|torture|mutilate|dismember)\s+(?:someone|people|person)',
+                     r'\bhow\s+to\s+(?:kill|murder|hurt|harm|attack)',
+                     r'\b(?:make|build|create)\s+(?:a\s+|an\s+)?(?:bomb|explosive|weapon|poison)'
+                 ],
+                 "illegal_activity": [
+                     r'\b(?:hack|crack|exploit)\s+(?:into|account|system|password)',
+                     r'\bsteal\s+(?:credit card|identity|password|data)',
+                     r'\b(?:sell|buy|trade)\s+(?:drugs|weapons|stolen)',
+                     r'\bmoney\s+launder(?:ing)?',
+                     r'\bfraud(?:ulent)?\s+(?:scheme|activity)'
+                 ],
+                 "csam": [
+                     r'\bchild\s+(?:abuse|exploitation|pornography)',
+                     r'\bminor\s+(?:sexual|explicit)'
+                 ]
+             },
+             SafetyLevel.WARN: {
+                 "medical_advice": [
+                     r'\b(?:diagnose|treat|cure)\s+(?:my|your|this)\s+(?:disease|illness|condition|symptoms?)',
+                     r'\bwhat\s+medication\s+(?:should|can)\s+I\s+take',
+                     r'\bis\s+this\s+symptom\s+(?:serious|cancer|dangerous)'
+                 ],
+                 "financial_advice": [
+                     r'\bshould\s+I\s+(?:invest|buy|sell)\s+(?:in\s+)?(?:stock|crypto|bitcoin)',
+                     r'\bbest\s+investment\s+for',
+                     r'\btax\s+(?:evasion|avoidance)\s+(?:strategy|method)'
+                 ],
+                 "legal_advice": [
+                     r'\bcan\s+I\s+sue\s+(?:for|someone)',
+                     r'\bhow\s+to\s+(?:win|fight)\s+(?:lawsuit|case|court)',
+                     r'\bis\s+this\s+legal(?:ly)?\s+binding'
+                 ]
+             },
+             SafetyLevel.MONITOR: {
+                 "sensitive_topics": [
+                     r'\b(?:suicide|self[-\s]?harm|cutting|overdose)',
+                     r'\b(?:depression|anxiety|mental\s+health)\s+crisis',
+                     r'\bfeeling\s+(?:hopeless|worthless|suicidal)'
+                 ],
+                 "controversial": [
+                     r'\b(?:controversial|divisive)\s+(?:topic|issue|debate)',
+                     r'\bpolitical\s+(?:opinion|stance|view)'
+                 ]
+             }
+         }
+
+         # Educational context indicators
+         self.educational_indicators = [
+             r'\b(?:learn|study|understand|research|academic|educational|history)',
+             r'\b(?:explain|describe|what\s+is|how\s+does|theory|concept)',
+             r'\b(?:historical|scientific|technical)\s+(?:context|perspective)'
+         ]
+
+         # Response templates
+         self.response_templates = {
+             "explicit_violence": "I cannot provide information that could be used to harm others.",
+             "illegal_activity": "I cannot assist with illegal activities.",
+             "csam": "I cannot engage with this type of content. If you're concerned about child safety, please contact appropriate authorities.",
+             "medical_advice": "I'm not qualified to provide medical advice. Please consult a healthcare professional.",
+             "financial_advice": "I cannot provide personalized financial advice. Consider consulting a licensed financial advisor.",
+             "legal_advice": "I cannot provide legal advice. Please consult with a qualified attorney.",
+             "sensitive_topics": "I'm concerned about what you're going through. Please reach out to a mental health professional or crisis hotline. You can call 988 (US) or visit findahelpline.com for support.",
+             "default": "I'm unable to assist with this request. Is there something else I can help you with?"
+         }
+
+     def check_input(
+         self,
+         text: str,
+         context: Optional[str] = None
+     ) -> Tuple[SafetyLevel, Optional[str], str]:
+         """
+         Check input text for safety violations.
+
+         Args:
+             text: User input to check
+             context: Optional conversation context
+
+         Returns:
+             Tuple of (safety_level, violation_category, explanation)
+         """
+         text_lower = text.lower()
+
+         # Check for educational context
+         is_educational = self._is_educational_context(text, context)
+
+         # Check patterns in order of severity
+         for level in [SafetyLevel.BLOCK, SafetyLevel.WARN, SafetyLevel.MONITOR]:
+             if level not in self.patterns:
+                 continue
+
+             for category, patterns in self.patterns[level].items():
+                 for pattern in patterns:
+                     if re.search(pattern, text_lower, re.IGNORECASE):
+                         # Allow educational content if configured
+                         if is_educational and self.config.allow_educational_context:
+                             if level == SafetyLevel.BLOCK:
+                                 # Still block CSAM even in educational context
+                                 if category == "csam":
+                                     return (SafetyLevel.BLOCK, category, "This content is not allowed in any context")
+                                 # Downgrade other blocked categories to monitoring
+                                 return (SafetyLevel.MONITOR, category, "Educational context detected, proceed with caution")
+                             continue
+
+                         return (level, category, f"Matched {category} pattern")
+
+         return (SafetyLevel.SAFE, None, "No violations detected")
+
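+     # Illustrative behaviour of check_input (editor's sketch, not part of the
+     # original commit; exact tuples depend on the pattern tables above):
+     #   check_input("How do I bake a cake?")
+     #       -> (SafetyLevel.SAFE, None, "No violations detected")
+     #   check_input("how to hack into an account")
+     #       -> (SafetyLevel.BLOCK, "illegal_activity", "Matched illegal_activity pattern")
+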
+     def check_output(
+         self,
+         text: str,
+         user_input: str
+     ) -> Tuple[bool, Optional[str]]:
+         """
+         Check model output for safety violations.
+
+         Args:
+             text: Generated output to check
+             user_input: Original user input
+
+         Returns:
+             Tuple of (is_safe, reason)
+         """
+         # Check for information leakage patterns
+         leakage_patterns = [
+             r'\b(?:my|the)\s+(?:password|api_key|secret_key|token)\s+is',
+             r'\b(?:credit\s+card|ssn|social\s+security)\s+(?:number)?\s*:\s*\d',
+             r'\bpersonal\s+(?:information|data)\s*:\s*'
+         ]
+
+         for pattern in leakage_patterns:
+             if re.search(pattern, text.lower()):
+                 return (False, "Potential information leakage detected")
+
+         # Check whether the output itself matches harmful patterns
+         level, category, _ = self.check_input(text, user_input)
+
+         if level == SafetyLevel.BLOCK:
+             return (False, f"Output contains {category} content")
+
+         return (True, None)
+
+     def _is_educational_context(
+         self,
+         text: str,
+         context: Optional[str] = None
+     ) -> bool:
+         """Check whether the query appears in an educational context."""
+         combined_text = f"{context or ''} {text}".lower()
+
+         return any(
+             re.search(pattern, combined_text, re.IGNORECASE)
+             for pattern in self.educational_indicators
+         )
+
+     def get_refusal_message(
+         self,
+         category: str,
+         custom_message: Optional[str] = None
+     ) -> str:
+         """
+         Get the appropriate refusal message.
+
+         Args:
+             category: Violation category
+             custom_message: Optional custom message
+
+         Returns:
+             Refusal message text
+         """
+         if custom_message:
+             return custom_message
+
+         return self.response_templates.get(
+             category,
+             self.response_templates["default"]
+         )
+
+     def should_block(self, safety_level: SafetyLevel) -> bool:
+         """
+         Determine whether content should be blocked under the current policy.
+
+         Args:
+             safety_level: Safety level of content
+
+         Returns:
+             True if the content should be blocked, False otherwise
+         """
+         if self.config.policy_mode == PolicyMode.STRICT:
+             return safety_level in [SafetyLevel.BLOCK, SafetyLevel.WARN]
+         elif self.config.policy_mode == PolicyMode.MODERATE:
+             return safety_level == SafetyLevel.BLOCK
+         elif self.config.policy_mode == PolicyMode.PERMISSIVE:
+             return safety_level == SafetyLevel.BLOCK
+
+         return False
+
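+     # Policy decision table as implemented above (editor's note):
+     #   STRICT      blocks BLOCK and WARN
+     #   MODERATE    blocks BLOCK only
+     #   PERMISSIVE  blocks BLOCK only (currently identical to MODERATE)
+     #   CUSTOM      falls through and blocks nothing
+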
+     def log_event(
+         self,
+         event_type: str,
+         text: str,
+         safety_level: SafetyLevel,
+         category: Optional[str] = None,
+         metadata: Optional[Dict] = None
+     ):
+         """Log a safeguard event."""
+         if not self.config.enable_logging:
+             return
+
+         event = {
+             "type": event_type,
+             "text": text[:200],  # Truncate for privacy
+             "safety_level": safety_level.value,
+             "category": category,
+             "metadata": metadata or {},
+             "timestamp": time.time()
+         }
+
+         try:
+             with open(self.config.log_file, 'a') as f:
+                 f.write(json.dumps(event) + '\n')
+         except Exception as e:
+             logger.error(f"Failed to log event: {e}")
+
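+     # Shape of a resulting JSONL record (editor's illustration; values are
+     # made up):
+     #   {"type": "input_check", "text": "How do I bake a cake?",
+     #    "safety_level": "safe", "category": null, "metadata": {},
+     #    "timestamp": 1715000000.0}
+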
+     def filter_message(
+         self,
+         message: str,
+         context: Optional[str] = None
+     ) -> Tuple[bool, str]:
+         """
+         Filter a message through the safeguard system.
+
+         Args:
+             message: Message to filter
+             context: Optional context
+
+         Returns:
+             Tuple of (allowed, response)
+         """
+         level, category, explanation = self.check_input(message, context)
+
+         # Log the event
+         self.log_event("input_check", message, level, category)
+
+         # Decide on an action based on policy
+         if self.should_block(level):
+             refusal = self.get_refusal_message(category)
+             return (False, refusal)
+
+         return (True, message)
+
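+ # Minimal usage sketch for the class above (editor's illustration, not part
+ # of the original commit):
+ #
+ #     safeguards = HelionSafeguardSystem(SafeguardConfig(policy_mode=PolicyMode.STRICT))
+ #     allowed, response = safeguards.filter_message("Should I invest in Bitcoin?")
+ #     # Under STRICT, WARN-level content is blocked, so `allowed` is False and
+ #     # `response` is the financial_advice refusal template.
+
+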
+ class SafeguardIntegration:
+     """
+     Integration layer for Helion-V1.5 with safeguards.
+     Wraps model inference with safety checks.
+     """
+
+     def __init__(
+         self,
+         model,
+         tokenizer,
+         safeguard_config: Optional[SafeguardConfig] = None
+     ):
+         self.model = model
+         self.tokenizer = tokenizer
+         self.safeguards = HelionSafeguardSystem(safeguard_config)
+
+     def safe_generate(
+         self,
+         messages: List[Dict[str, str]],
+         max_new_tokens: int = 512,
+         **kwargs
+     ) -> Dict[str, Any]:
+         """
+         Generate with safeguard checks.
+
+         Args:
+             messages: Chat messages
+             max_new_tokens: Maximum number of tokens to generate
+             **kwargs: Additional generation parameters
+
+         Returns:
+             Dict with response, safety info, and metadata
+         """
+         # Get the latest user message
+         user_message = messages[-1]["content"] if messages else ""
+         context = " ".join([m["content"] for m in messages[:-1]])
+
+         # Check input
+         allowed, response = self.safeguards.filter_message(user_message, context)
+
+         if not allowed:
+             return {
+                 "response": response,
+                 "blocked": True,
+                 "safety_level": "BLOCK",
+                 "category": "input_violation"
+             }
+
+         # Generate a response (torch is imported lazily so the safeguard
+         # module itself has no hard dependency on it)
+         import torch
+
+         input_ids = self.tokenizer.apply_chat_template(
+             messages,
+             add_generation_prompt=True,
+             return_tensors="pt"
+         ).to(self.model.device)
+
+         with torch.no_grad():
+             output = self.model.generate(
+                 input_ids,
+                 max_new_tokens=max_new_tokens,
+                 **kwargs
+             )
+
+         generated_text = self.tokenizer.decode(
+             output[0][input_ids.shape[1]:],
+             skip_special_tokens=True
+         )
+
+         # Check output
+         output_safe, reason = self.safeguards.check_output(
+             generated_text,
+             user_message
+         )
+
+         if not output_safe:
+             return {
+                 "response": self.safeguards.get_refusal_message("default"),
+                 "blocked": True,
+                 "safety_level": "BLOCK",
+                 "category": "output_violation",
+                 "reason": reason
+             }
+
+         return {
+             "response": generated_text.strip(),
+             "blocked": False,
+             "safety_level": "SAFE",
+             "tokens_generated": output.shape[1] - input_ids.shape[1]
+         }
+
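+ # Minimal wiring sketch (editor's illustration; assumes a Hugging Face
+ # transformers causal LM and tokenizer, and that the repo id below exists):
+ #
+ #     from transformers import AutoModelForCausalLM, AutoTokenizer
+ #
+ #     model = AutoModelForCausalLM.from_pretrained("Trouter-Library/Helion-V1.5")
+ #     tokenizer = AutoTokenizer.from_pretrained("Trouter-Library/Helion-V1.5")
+ #     guarded = SafeguardIntegration(model, tokenizer, create_safeguard_config("moderate"))
+ #     result = guarded.safe_generate([{"role": "user", "content": "Hello!"}])
+ #     print(result["response"], result["blocked"])
+
+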
+ def create_safeguard_config(
+     mode: str = "moderate",
+     config_file: Optional[str] = None
+ ) -> SafeguardConfig:
+     """
+     Create a safeguard configuration.
+
+     Args:
+         mode: Policy mode (strict/moderate/permissive)
+         config_file: Optional JSON config file
+
+     Returns:
+         SafeguardConfig instance
+     """
+     if config_file and Path(config_file).exists():
+         with open(config_file) as f:
+             data = json.load(f)
+         # JSON stores enum fields as strings; convert them back before
+         # constructing the dataclass
+         if "policy_mode" in data:
+             data["policy_mode"] = PolicyMode(data["policy_mode"])
+         if "block_level_threshold" in data:
+             data["block_level_threshold"] = SafetyLevel(data["block_level_threshold"])
+         return SafeguardConfig(**data)
+
+     policy_map = {
+         "strict": PolicyMode.STRICT,
+         "moderate": PolicyMode.MODERATE,
+         "permissive": PolicyMode.PERMISSIVE
+     }
+
+     return SafeguardConfig(policy_mode=policy_map.get(mode, PolicyMode.MODERATE))
+
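+ # Illustrative shape of a JSON config file accepted above (editor's example;
+ # field names mirror SafeguardConfig):
+ #
+ #     {
+ #         "policy_mode": "strict",
+ #         "enable_logging": true,
+ #         "log_file": "safeguard_logs.jsonl",
+ #         "block_level_threshold": "warn",
+ #         "allow_educational_context": false
+ #     }
+
+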
+ # Example usage
+ if __name__ == "__main__":
+     # Create the safeguard system
+     config = SafeguardConfig(policy_mode=PolicyMode.MODERATE)
+     safeguards = HelionSafeguardSystem(config)
+
+     # Test cases
+     test_inputs = [
+         "How do I bake a cake?",
+         "How do I make a bomb?",
+         "What are the historical uses of explosives in mining?",
+         "Should I invest in Bitcoin?",
+         "Can you diagnose my symptoms?"
+     ]
+
+     print("Safeguard System Test")
+     print("=" * 60)
+
+     for text in test_inputs:
+         level, category, explanation = safeguards.check_input(text)
+         blocked = safeguards.should_block(level)
+
+         print(f"\nInput: {text}")
+         print(f"Level: {level.value}")
+         print(f"Category: {category or 'None'}")
+         print(f"Blocked: {blocked}")
+
+         if blocked:
+             print(f"Response: {safeguards.get_refusal_message(category or 'default')}")