File size: 2,110 Bytes
5e0532d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
from pydantic import BaseModel
from typing import List, Optional
from app.services.audit import audit_service


class GuardrailViolation(Exception):
    """Raised when generated content trips a safety guardrail."""

class GuardrailService:
    """Safety layer screening ORA's inputs and outputs.

    Blocks generated responses that claim divine authority or use
    coercive religious language, and flags user messages containing
    crisis/self-harm keywords. Every hit is recorded via the shared
    audit service before the caller is notified.
    """

    def __init__(self):
        # Phrases ORA must never emit. Matching is case-insensitive
        # substring matching against the generated content.
        self.prohibited_phrases = [
            "God told me that you",
            "I prophesy",
            "You must do this",
            "The Lord is saying right now",
            "I declare over you",
            "Scripture commands you to",
            "Thus saith the Lord",
            "God fails you if",
            "You are sinning by"
        ]
        # (original, lowered) pairs precomputed once so the per-response
        # hot loop does not re-lower every phrase on every call; the
        # original-case phrase is kept for the exception message.
        self._prohibited_pairs = [(p, p.lower()) for p in self.prohibited_phrases]

    async def validate_response(self, content: str, user_id: str = "system") -> str:
        """
        Ensures ORA does not claim divine authority or give dangerous advice.

        Returns the content unchanged when no prohibited phrase is found.

        Raises:
            GuardrailViolation: if a prohibited phrase appears in the
                content (the violation is logged to the audit service
                before the exception is raised).
        """
        lowered = content.lower()  # lower once per call, not once per phrase
        for phrase, phrase_lower in self._prohibited_pairs:
            if phrase_lower in lowered:
                # Log the violation before blocking the output.
                await audit_service.log_violation(
                    user_id=user_id,
                    violation_type="Prohibited Phrase",
                    content=content
                )
                # Block the output
                raise GuardrailViolation(f"Response violated safety guardrail: '{phrase}'")

        return content

    async def check_input_safety(self, message: str, user_id: str = "anonymous") -> bool:
        """
        Checks for self-harm or crisis keywords.

        Returns False (after logging a "Crisis Keyword" violation) when
        any keyword appears as a case-insensitive substring of the
        message; True otherwise.
        """
        crisis_keywords = ["kill myself", "end it all", "suicide", "hurt myself"]
        lowered = message.lower()  # lower once per call, not once per keyword
        if any(keyword in lowered for keyword in crisis_keywords):
            await audit_service.log_violation(
                user_id=user_id,
                violation_type="Crisis Keyword",
                content=message
            )
            return False
        return True

    async def sanitize_content(self, content: str) -> str:
        """
        Sanitizes content by replacing restricted words (placeholders for now).

        Currently a pass-through: returns the content unchanged.
        """
        # Example: Simple redaction if we had a list of 'warning words' that aren't strict blocks
        return content

# Module-level singleton shared by the rest of the application.
guardrail_service = GuardrailService()