# Provenance: Hugging Face upload "Upload 7 files" by tflux2011, commit b2e0e38 (verified).
# src/safety/sentinel.py
from abc import ABC, abstractmethod
from dataclasses import dataclass
from enum import Enum
from typing import List, Optional
import logging
# NOTE(review): basicConfig() here configures the *root* logger as an import
# side effect, which is unusual for a library module — confirm this is intended
# rather than leaving log configuration to the application entry point.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Configuration Constants
# Default anger score (0.0-1.0) above which the kill switch is triggered.
ANGER_THRESHOLD = 0.9
# Default fact-accuracy score (0.0-1.0) below which the kill switch is triggered.
FACT_SCORE_THRESHOLD = 0.5
class SafetyStatus(Enum):
    """Outcome of a safety check."""

    SAFE = "SAFE"  # all checks ran and no threshold was breached
    KILL_SWITCH_ACTIVE = "KILL_SWITCH_ACTIVE"  # a threshold was breached; agent must stop
    CHECK_FAILED = "CHECK_FAILED"  # a check itself raised; no safety verdict reached
@dataclass
class AgentState:
    """Snapshot of the conversation that safety checks evaluate."""

    # Chronological user messages; input to sentiment analysis.
    user_history: List[str]
    # The agent's most recent reply; input to fact verification.
    last_response: str
@dataclass
class SafetyCheckResult:
    """Result of a safety check: a status plus whatever scores were computed."""

    # Overall verdict of the check.
    status: SafetyStatus
    # Anger score in [0.0, 1.0]; None if sentiment analysis did not run/complete.
    anger_score: Optional[float] = None
    # Fact score in [0.0, 1.0]; None if fact verification did not run/complete.
    fact_score: Optional[float] = None
    # Human-readable failure description; set only when something went wrong.
    error_message: Optional[str] = None
class SentimentAnalyzer(ABC):
    """Interface for scoring user anger from conversation history."""

    @abstractmethod
    def analyze(self, user_history: List[str]) -> float:
        """Returns anger score between 0.0 and 1.0."""
        pass
class FactVerifier(ABC):
    """Interface for scoring the factual accuracy of an agent response."""

    @abstractmethod
    def verify_facts(self, response: str) -> float:
        """Returns fact accuracy score between 0.0 and 1.0."""
        pass
class AgentController(ABC):
    """Interface for the actions a kill switch can take on the agent."""

    @abstractmethod
    def stop_agent(self) -> None:
        """Halt the agent so it produces no further responses."""
        pass

    @abstractmethod
    def alert_human_manager(self) -> None:
        """Notify a human that the agent has been stopped."""
        pass

    @abstractmethod
    def display_message(self, message: str) -> None:
        """Show *message* to the end user."""
        pass
class SafetySentinel:
    """Monitors agent behavior and triggers safety responses.

    Two independent checks run against each :class:`AgentState`:

    1. Sentiment — the anger score of the user history must not exceed
       ``anger_threshold``.
    2. Hallucination — the fact score of the last response must not fall
       below ``fact_score_threshold``.

    A breach of either check yields ``KILL_SWITCH_ACTIVE``; a failure inside
    a check yields ``CHECK_FAILED`` (fail-safe: no verdict is reported
    rather than a possibly false ``SAFE``).
    """

    # User-facing text displayed when control is handed off to a human.
    HANDOFF_MESSAGE = "I am having trouble. Connecting you to a human..."

    def __init__(
        self,
        sentiment_analyzer: SentimentAnalyzer,
        fact_verifier: FactVerifier,
        agent_controller: AgentController,
        anger_threshold: float = ANGER_THRESHOLD,
        fact_score_threshold: float = FACT_SCORE_THRESHOLD
    ):
        """Wire up the checkers and the controller.

        Args:
            sentiment_analyzer: Scores user anger on [0.0, 1.0].
            fact_verifier: Scores response accuracy on [0.0, 1.0].
            agent_controller: Executes the kill-switch actions.
            anger_threshold: Anger score above which the agent is stopped.
            fact_score_threshold: Fact score below which the agent is stopped.

        Raises:
            ValueError: If either threshold lies outside [0.0, 1.0].
        """
        # Fail fast on misconfiguration: both scores are defined on [0.0, 1.0],
        # so a threshold outside that range would silently disable a check.
        if not 0.0 <= anger_threshold <= 1.0:
            raise ValueError(
                f"anger_threshold must be in [0.0, 1.0], got {anger_threshold}"
            )
        if not 0.0 <= fact_score_threshold <= 1.0:
            raise ValueError(
                f"fact_score_threshold must be in [0.0, 1.0], got {fact_score_threshold}"
            )
        self._sentiment_analyzer = sentiment_analyzer
        self._fact_verifier = fact_verifier
        self._agent_controller = agent_controller
        self._anger_threshold = anger_threshold
        self._fact_score_threshold = fact_score_threshold

    def safety_check(self, agent_state: AgentState) -> SafetyCheckResult:
        """Perform safety checks on the current agent state.

        Args:
            agent_state: Conversation snapshot to evaluate.

        Returns:
            A ``SafetyCheckResult`` with status ``SAFE``,
            ``KILL_SWITCH_ACTIVE``, or ``CHECK_FAILED``; score fields are
            populated as far as the checks got before any failure.
        """
        # Explicit identity test: dataclass instances are always truthy, so
        # None is the only "missing" value this guard can meaningfully catch.
        if agent_state is None:
            logger.error("Invalid agent state provided")
            return SafetyCheckResult(
                status=SafetyStatus.CHECK_FAILED,
                error_message="Agent state is required"
            )
        # Check 1: Sentiment Analysis
        try:
            user_anger = self._sentiment_analyzer.analyze(
                agent_state.user_history
            )
        except Exception as e:
            logger.exception("Sentiment analysis failed")
            return SafetyCheckResult(
                status=SafetyStatus.CHECK_FAILED,
                error_message=f"Sentiment analysis error: {str(e)}"
            )
        # Check 2: Hallucination Detection (Fact Check)
        try:
            fact_score = self._fact_verifier.verify_facts(
                agent_state.last_response
            )
        except Exception as e:
            logger.exception("Fact verification failed")
            return SafetyCheckResult(
                status=SafetyStatus.CHECK_FAILED,
                # Sentiment already succeeded, so preserve its score.
                anger_score=user_anger,
                error_message=f"Fact verification error: {str(e)}"
            )
        # Either breach alone is enough to trip the kill switch.
        is_unsafe = (
            user_anger > self._anger_threshold or
            fact_score < self._fact_score_threshold
        )
        if is_unsafe:
            # Lazy %-args: no string formatting unless the record is emitted.
            logger.warning(
                "Safety threshold breached: anger=%s, fact_score=%s",
                user_anger, fact_score
            )
            return SafetyCheckResult(
                status=SafetyStatus.KILL_SWITCH_ACTIVE,
                anger_score=user_anger,
                fact_score=fact_score
            )
        return SafetyCheckResult(
            status=SafetyStatus.SAFE,
            anger_score=user_anger,
            fact_score=fact_score
        )

    def execute_safety_response(self, agent_state: AgentState) -> SafetyCheckResult:
        """Check safety and execute appropriate response.

        On ``KILL_SWITCH_ACTIVE``: stop the agent, alert a human manager,
        then display the handoff message. A failure in any of those steps is
        recorded on the returned result rather than raised, so callers always
        receive a ``SafetyCheckResult``.

        Args:
            agent_state: Conversation snapshot to evaluate.

        Returns:
            The result from :meth:`safety_check`, with ``error_message`` set
            if the kill-switch actions themselves failed.
        """
        result = self.safety_check(agent_state)
        if result.status == SafetyStatus.KILL_SWITCH_ACTIVE:
            try:
                # Stop the agent first so no further responses go out while
                # the human alert and handoff message are in flight.
                self._agent_controller.stop_agent()
                self._agent_controller.alert_human_manager()
                self._agent_controller.display_message(self.HANDOFF_MESSAGE)
                logger.info("Kill switch activated - handed off to human")
            except Exception as e:
                logger.exception("Failed to execute safety response")
                result.error_message = f"Safety response failed: {str(e)}"
        return result