Spaces:
Sleeping
Sleeping
| from pydantic import BaseModel | |
| from typing import List, Optional | |
| from app.services.audit import audit_service | |
class GuardrailViolation(Exception):
    """Signals that a piece of content was blocked by a safety guardrail."""
class GuardrailService:
    """Screen assistant responses and user messages with substring guardrails.

    All matching is a case-insensitive substring test. Each hit is reported
    through ``audit_service.log_violation`` before the method responds.
    """

    # Compared against the lowercased user message, so every entry must
    # itself be lowercase. A tuple: the set is fixed and shared by all calls.
    _CRISIS_KEYWORDS = ("kill myself", "end it all", "suicide", "hurt myself")

    def __init__(self):
        # Kept as a per-instance list attribute (not a class constant) so code
        # that reads or extends ``prohibited_phrases`` keeps working.
        self.prohibited_phrases = [
            "God told me that you",
            "I prophesy",
            "You must do this",
            "The Lord is saying right now",
            "I declare over you",
            "Scripture commands you to",
            "Thus saith the Lord",
            "God fails you if",
            "You are sinning by",
        ]

    async def validate_response(self, content: str, user_id: str = "system") -> str:
        """Ensure ORA does not claim divine authority or give dangerous advice.

        Args:
            content: Response text to screen.
            user_id: Identifier recorded with the audit entry on a violation.

        Returns:
            ``content`` unchanged when no prohibited phrase is present.

        Raises:
            GuardrailViolation: If any entry of ``prohibited_phrases`` occurs
                in ``content`` (case-insensitive). The violation is logged
                via ``audit_service`` before the exception is raised.
        """
        # Lowercase once; the text does not change between phrase checks.
        lowered_content = content.lower()
        for phrase in self.prohibited_phrases:
            if phrase.lower() in lowered_content:
                # Log the violation
                await audit_service.log_violation(
                    user_id=user_id,
                    violation_type="Prohibited Phrase",
                    content=content,
                )
                # Block the output
                raise GuardrailViolation(f"Response violated safety guardrail: '{phrase}'")
        return content

    async def check_input_safety(self, message: str, user_id: str = "anonymous") -> bool:
        """Check a user message for self-harm or crisis keywords.

        Args:
            message: Incoming user text.
            user_id: Identifier recorded with the audit entry on a hit.

        Returns:
            ``False`` when a crisis keyword occurs in ``message``
            (case-insensitive; the hit is logged via ``audit_service``
            first), ``True`` otherwise.
        """
        # Lowercase once instead of once per keyword.
        lowered_message = message.lower()
        if not any(keyword in lowered_message for keyword in self._CRISIS_KEYWORDS):
            return True

        await audit_service.log_violation(
            user_id=user_id,
            violation_type="Crisis Keyword",
            content=message,
        )
        return False

    async def sanitize_content(self, content: str) -> str:
        """Sanitize content by replacing restricted words (placeholder for now).

        Currently a pass-through: ``content`` is returned unchanged.
        """
        # Example: Simple redaction if we had a list of 'warning words' that aren't strict blocks
        return content
# Module-level instance, constructed once when this module is imported.
guardrail_service = GuardrailService()