# Source: ORA / app/services/guardrails.py
# Last commit: "Initial ORA deployment" (5e0532d), by Abdalkaderdev
from pydantic import BaseModel
from typing import List, Optional
from app.services.audit import audit_service
class GuardrailViolation(Exception):
    """Raised when a generated response contains a prohibited phrase and must be blocked."""
class GuardrailService:
    """Safety checks for ORA's generated responses and for incoming user messages.

    Attributes:
        prohibited_phrases: Phrases that must not appear in a response. They are
            matched case-insensitively as plain substrings.
    """

    # Substrings that flag an incoming message as a possible crisis. They are
    # compared against the lowercased message, so they must stay lowercase.
    CRISIS_KEYWORDS = ("kill myself", "end it all", "suicide", "hurt myself")

    def __init__(self):
        # Kept as a mutable instance list so callers can extend it per instance.
        self.prohibited_phrases = [
            "God told me that you",
            "I prophesy",
            "You must do this",
            "The Lord is saying right now",
            "I declare over you",
            "Scripture commands you to",
            "Thus saith the Lord",
            "God fails you if",
            "You are sinning by",
        ]

    async def validate_response(self, content: str, user_id: str = "system") -> str:
        """Ensure ORA does not claim divine authority or give dangerous advice.

        Args:
            content: The generated response text to check.
            user_id: Identifier recorded with the audit entry on a violation.

        Returns:
            ``content`` unchanged when no prohibited phrase is found.

        Raises:
            GuardrailViolation: If ``content`` contains any prohibited phrase
                (case-insensitive substring match). The violation is passed to
                ``audit_service.log_violation`` before the exception is raised.
        """
        # Lowercase the response once instead of once per phrase.
        lowered_content = content.lower()
        for phrase in self.prohibited_phrases:
            if phrase.lower() in lowered_content:
                # Log the violation, then block the output.
                await audit_service.log_violation(
                    user_id=user_id,
                    violation_type="Prohibited Phrase",
                    content=content,
                )
                raise GuardrailViolation(f"Response violated safety guardrail: '{phrase}'")
        return content

    async def check_input_safety(self, message: str, user_id: str = "anonymous") -> bool:
        """Check an incoming message for self-harm or crisis keywords.

        Args:
            message: The user's message text.
            user_id: Identifier recorded with the audit entry when a keyword is found.

        Returns:
            ``True`` when no crisis keyword is present. ``False`` when one is
            present, after the message has been passed to
            ``audit_service.log_violation``.
        """
        # Lowercase the message once instead of once per keyword.
        lowered_message = message.lower()
        if not any(keyword in lowered_message for keyword in self.CRISIS_KEYWORDS):
            return True

        await audit_service.log_violation(
            user_id=user_id,
            violation_type="Crisis Keyword",
            content=message,
        )
        return False

    async def sanitize_content(self, content: str) -> str:
        """Sanitize content by replacing restricted words.

        Currently a placeholder: no word list is defined, so ``content`` is
        returned unchanged.
        """
        # Example: simple redaction if we had a list of 'warning words' that aren't strict blocks.
        return content
# Module-level GuardrailService instance, created once at import time.
# NOTE(review): presumably the shared instance other modules import — confirm against callers.
guardrail_service = GuardrailService()