from pydantic import BaseModel
from typing import List, Optional

from app.services.audit import audit_service


class GuardrailViolation(Exception):
    """Raised when generated content trips a safety guardrail."""


class GuardrailService:
    """Safety layer screening ORA's inputs and outputs.

    Blocks responses that claim divine authority (prohibited phrases)
    and flags user messages containing crisis/self-harm language.
    Violations are recorded via ``audit_service.log_violation``.
    """

    # Phrases ORA must never emit; matching is case-insensitive substring.
    PROHIBITED_PHRASES: List[str] = [
        "God told me that you",
        "I prophesy",
        "You must do this",
        "The Lord is saying right now",
        "I declare over you",
        "Scripture commands you to",
        "Thus saith the Lord",
        "God fails you if",
        "You are sinning by",
    ]

    # Keywords suggesting the user may be in crisis (self-harm risk).
    CRISIS_KEYWORDS: List[str] = ["kill myself", "end it all", "suicide", "hurt myself"]

    def __init__(self) -> None:
        # Instance attribute kept for backward compatibility with callers
        # that read or extend `prohibited_phrases` directly.
        self.prohibited_phrases: List[str] = list(self.PROHIBITED_PHRASES)

    async def validate_response(self, content: str, user_id: str = "system") -> str:
        """Ensure ORA does not claim divine authority or give dangerous advice.

        Args:
            content: The generated response to screen.
            user_id: Identifier recorded with any audit-log entry.

        Returns:
            ``content`` unchanged when no prohibited phrase is found.

        Raises:
            GuardrailViolation: If ``content`` contains a prohibited phrase
                (the violation is audit-logged before raising).
        """
        # Lower the content once; the original re-lowered it per phrase.
        content_lower = content.lower()
        for phrase in self.prohibited_phrases:
            if phrase.lower() in content_lower:
                # Log the violation before blocking the output.
                await audit_service.log_violation(
                    user_id=user_id,
                    violation_type="Prohibited Phrase",
                    content=content
                )
                raise GuardrailViolation(f"Response violated safety guardrail: '{phrase}'")
        return content

    async def check_input_safety(self, message: str, user_id: str = "anonymous") -> bool:
        """Check a user message for self-harm or crisis keywords.

        Args:
            message: Raw user input to screen.
            user_id: Identifier recorded with any audit-log entry.

        Returns:
            ``False`` if a crisis keyword is present (after audit-logging
            the hit), ``True`` otherwise.
        """
        # Lower the message once; the original re-lowered it per keyword.
        message_lower = message.lower()
        if any(keyword in message_lower for keyword in self.CRISIS_KEYWORDS):
            await audit_service.log_violation(
                user_id=user_id,
                violation_type="Crisis Keyword",
                content=message
            )
            return False
        return True

    async def sanitize_content(self, content: str) -> str:
        """Sanitize content by replacing restricted words (placeholder).

        Currently a pass-through: no 'warning word' list is defined yet,
        so the content is returned unchanged.
        """
        # Example: simple redaction would go here once a list of
        # 'warning words' (soft flags, not strict blocks) exists.
        return content


guardrail_service = GuardrailService()