# src/safety/sentinel.py
"""Safety sentinel: score an agent's state and hand off to a human if unsafe.

The sentinel combines a sentiment analyzer and a fact verifier (both supplied
by the caller) and, when either score breaches its threshold, drives an
``AgentController`` through the kill-switch sequence.
"""

from abc import ABC, abstractmethod
from dataclasses import dataclass
from enum import Enum
from typing import List, Optional
import logging
import math

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Configuration Constants
# Default anger score above which the kill switch trips (strictly greater).
ANGER_THRESHOLD = 0.9
# Default fact score below which the kill switch trips (strictly less).
FACT_SCORE_THRESHOLD = 0.5


class SafetyStatus(Enum):
    """Outcome of a safety check."""

    SAFE = "SAFE"
    KILL_SWITCH_ACTIVE = "KILL_SWITCH_ACTIVE"
    CHECK_FAILED = "CHECK_FAILED"


@dataclass
class AgentState:
    """Snapshot of the conversation that the sentinel evaluates."""

    # Passed as-is to SentimentAnalyzer.analyze().
    user_history: List[str]
    # Passed as-is to FactVerifier.verify_facts().
    last_response: str


@dataclass
class SafetyCheckResult:
    """Result of a safety check, including whichever scores were obtained."""

    status: SafetyStatus
    # None when sentiment analysis did not produce a usable score.
    anger_score: Optional[float] = None
    # None when fact verification did not produce a usable score.
    fact_score: Optional[float] = None
    # Set when a check or a kill-switch step failed; None otherwise.
    error_message: Optional[str] = None


class SentimentAnalyzer(ABC):
    """Interface for the component that scores user anger."""

    @abstractmethod
    def analyze(self, user_history: List[str]) -> float:
        """Returns anger score between 0.0 and 1.0."""


class FactVerifier(ABC):
    """Interface for the component that scores factual accuracy."""

    @abstractmethod
    def verify_facts(self, response: str) -> float:
        """Returns fact accuracy score between 0.0 and 1.0."""


class AgentController(ABC):
    """Interface for the actions the sentinel invokes on a kill switch."""

    @abstractmethod
    def stop_agent(self) -> None:
        """First kill-switch step invoked by the sentinel."""

    @abstractmethod
    def alert_human_manager(self) -> None:
        """Second kill-switch step invoked by the sentinel."""

    @abstractmethod
    def display_message(self, message: str) -> None:
        """Third kill-switch step; receives the sentinel's handoff message."""


def _ensure_comparable_score(score: float, label: str) -> float:
    """Return *score* unchanged, or raise if it cannot be threshold-tested.

    NaN compares False against every number, so an unchecked NaN would pass
    both threshold tests and be reported as SAFE.

    Raises:
        TypeError: If *score* is not a real number (raised by ``math.isnan``).
        ValueError: If *score* is NaN.
    """
    if math.isnan(score):
        raise ValueError(f"{label} is NaN")
    return score


class SafetySentinel:
    """Monitors agent behavior and triggers safety responses."""

    HANDOFF_MESSAGE = "I am having trouble. Connecting you to a human..."

    def __init__(
        self,
        sentiment_analyzer: SentimentAnalyzer,
        fact_verifier: FactVerifier,
        agent_controller: AgentController,
        anger_threshold: float = ANGER_THRESHOLD,
        fact_score_threshold: float = FACT_SCORE_THRESHOLD
    ):
        """Store the collaborators and the thresholds used by the checks.

        Args:
            sentiment_analyzer: Produces the anger score.
            fact_verifier: Produces the fact accuracy score.
            agent_controller: Receives the kill-switch calls.
            anger_threshold: Anger scores strictly above this are unsafe.
            fact_score_threshold: Fact scores strictly below this are unsafe.
        """
        self._sentiment_analyzer = sentiment_analyzer
        self._fact_verifier = fact_verifier
        self._agent_controller = agent_controller
        self._anger_threshold = anger_threshold
        self._fact_score_threshold = fact_score_threshold

    def safety_check(self, agent_state: AgentState) -> SafetyCheckResult:
        """Perform safety checks on the current agent state.

        Args:
            agent_state: The state to evaluate; a falsy value is rejected.

        Returns:
            A result whose status is CHECK_FAILED when the state is missing,
            a check raises, or a check returns a NaN or non-numeric score;
            KILL_SWITCH_ACTIVE when either threshold is breached; SAFE
            otherwise. No exception from the analyzers propagates.
        """
        if not agent_state:
            logger.error("Invalid agent state provided")
            return SafetyCheckResult(
                status=SafetyStatus.CHECK_FAILED,
                error_message="Agent state is required"
            )

        try:
            # Check 1: Sentiment Analysis
            user_anger = _ensure_comparable_score(
                self._sentiment_analyzer.analyze(agent_state.user_history),
                "anger score",
            )
        except Exception as e:
            logger.exception("Sentiment analysis failed")
            return SafetyCheckResult(
                status=SafetyStatus.CHECK_FAILED,
                error_message=f"Sentiment analysis error: {e}"
            )

        try:
            # Check 2: Hallucination Detection (Fact Check)
            fact_score = _ensure_comparable_score(
                self._fact_verifier.verify_facts(agent_state.last_response),
                "fact score",
            )
        except Exception as e:
            logger.exception("Fact verification failed")
            return SafetyCheckResult(
                status=SafetyStatus.CHECK_FAILED,
                anger_score=user_anger,
                error_message=f"Fact verification error: {e}"
            )

        is_unsafe = (
            user_anger > self._anger_threshold
            or fact_score < self._fact_score_threshold
        )

        if is_unsafe:
            logger.warning(
                "Safety threshold breached: anger=%s, fact_score=%s",
                user_anger,
                fact_score,
            )
            return SafetyCheckResult(
                status=SafetyStatus.KILL_SWITCH_ACTIVE,
                anger_score=user_anger,
                fact_score=fact_score
            )

        return SafetyCheckResult(
            status=SafetyStatus.SAFE,
            anger_score=user_anger,
            fact_score=fact_score
        )

    def execute_safety_response(self, agent_state: AgentState) -> SafetyCheckResult:
        """Check safety and execute appropriate response.

        Only KILL_SWITCH_ACTIVE triggers controller calls; SAFE and
        CHECK_FAILED results are returned without touching the controller.

        Returns:
            The result of ``safety_check``. If any kill-switch step raised,
            ``error_message`` lists the failed steps after the prefix
            ``"Safety response failed: "``.
        """
        result = self.safety_check(agent_state)
        if result.status != SafetyStatus.KILL_SWITCH_ACTIVE:
            return result

        # Each step is attempted on its own so that a failure to stop the
        # agent does not also prevent the human manager from being alerted.
        steps = (
            ("stop_agent", ()),
            ("alert_human_manager", ()),
            ("display_message", (self.HANDOFF_MESSAGE,)),
        )
        failures = []
        for step_name, step_args in steps:
            try:
                getattr(self._agent_controller, step_name)(*step_args)
            except Exception as e:
                logger.exception(
                    "Failed to execute safety response (%s)", step_name
                )
                failures.append(f"{step_name}: {e}")

        if failures:
            result.error_message = (
                "Safety response failed: " + "; ".join(failures)
            )
        else:
            logger.info("Kill switch activated - handed off to human")

        return result