| """AEGIS — Adaptive Ethical Governance & Integrity System
|
|
|
| The ethical spine of Codette. AEGIS evaluates every reasoning output
|
| through multi-framework ethical analysis and maintains a running
|
| alignment score (eta) that the system uses to self-regulate.
|
|
|
| Ethical frameworks:
|
| 1. Utilitarian: Net positive outcome?
|
| 2. Deontological: Does it follow fundamental rules?
|
| 3. Virtue Ethics: Does it embody good character?
|
| 4. Care Ethics: Does it protect relationships and vulnerability?
|
| 5. Ubuntu: "I am because we are" — communal impact?
|
| 6. Indigenous Reciprocity: Balance with the broader ecosystem?
|
|
|
| AEGIS also provides:
|
| - Dual-use risk detection (content that could be harmful)
|
| - Emotional harm detection (manipulative/deceptive patterns)
|
| - Alignment drift tracking (eta over time)
|
| - Ethical veto with explanation (blocks harmful outputs)
|
|
|
| Origin: validate_ethics.py + Codette_Deep_Simulation_v1.py (EthicalAnchor)
|
| + the AEGIS alignment metric from codette_embodied_sim_fixed.py
|
| """
|
|
|
| import re
|
| import time
|
| from dataclasses import dataclass, field
|
| from typing import Dict, List, Optional, Tuple
|
|
|
|
|
|
|
|
|
|
|
| _DUAL_USE_PATTERNS = re.compile(
|
| r"\b(?:"
|
| r"how\s+to\s+(?:hack|exploit|bypass|crack|break\s+into)|"
|
| r"make\s+(?:a\s+)?(?:bomb|weapon|poison|virus|malware)|"
|
| r"steal\s+(?:data|identity|credentials)|"
|
| r"social\s+engineer|"
|
| r"phishing\s+(?:template|email)|"
|
| r"inject\s+(?:sql|code|script)"
|
| r")\b",
|
| re.IGNORECASE,
|
| )
|
|
|
| _MANIPULATION_PATTERNS = re.compile(
|
| r"\b(?:"
|
| r"gaslight|manipulat|deceiv|exploit\s+(?:trust|emotion)|"
|
| r"coerce|blackmail|intimidat|threaten"
|
| r")\b",
|
| re.IGNORECASE,
|
| )
|
|
|
| _HARMFUL_CONTENT = re.compile(
|
| r"\b(?:"
|
| r"self[- ]harm|suicid|kill\s+(?:yourself|myself)|"
|
| r"eating\s+disorder|anorexi|bulimi"
|
| r")\b",
|
| re.IGNORECASE,
|
| )
|
|
|
|
|
|
|
|
|
|
|
@dataclass
class EthicalVerdict:
    """Outcome produced by one ethical framework for one piece of text."""

    # Identifier of the framework that produced this verdict.
    framework: str
    # Whether the text cleared that framework's pass rule.
    passed: bool
    # Framework score; the evaluators in this module keep it within 0.0-1.0.
    score: float
    # Short human-readable justification for the score.
    reasoning: str
|
|
|
|
|
def _utilitarian(text: str, context: str = "") -> EthicalVerdict:
    """Estimate net benefit from the balance of positive and negative keywords.

    Each keyword counts at most once, and only when it occurs at the start
    of a word. Inflections ("helpful", "harming") are therefore recognised,
    while occurrences embedded in other words ("unhelpful", "charm",
    "pharmacy") are not.

    Args:
        text: Text to evaluate.
        context: Accepted so all frameworks share one signature; this
            heuristic does not use it.

    Returns:
        An EthicalVerdict whose score is positive / (positive + negative).
        When no keyword is found a fixed 0.7 "Neutral content" verdict is
        returned. The verdict passes when the score is at least 0.4.
    """
    positive_signals = ["help", "benefit", "improve", "solve", "support",
                        "protect", "heal", "learn", "understand", "create"]
    negative_signals = ["harm", "damage", "destroy", "exploit", "hurt",
                        "manipulate", "deceive", "corrupt", "steal"]

    text_lower = text.lower()

    def _hits(signals):
        # \b pins the keyword to a word start. A plain substring test also
        # fired inside unrelated or negated words.
        # NOTE(review): prefix matching still treats "harmless"/"harmony"
        # as "harm".
        return sum(
            1 for w in signals
            if re.search(r"\b" + re.escape(w), text_lower)
        )

    pos = _hits(positive_signals)
    neg = _hits(negative_signals)

    total = pos + neg
    if total == 0:
        return EthicalVerdict("utilitarian", True, 0.7, "Neutral content")

    ratio = pos / total
    return EthicalVerdict(
        "utilitarian",
        passed=ratio >= 0.4,
        score=round(ratio, 3),
        reasoning=f"Positive/negative signal ratio: {pos}/{neg}",
    )
|
|
|
|
|
def _deontological(text: str, context: str = "") -> EthicalVerdict:
    """Check the text against the module's rule patterns.

    Three rules are tested: dual-use risk, manipulation, and harmful
    content. The score starts at 1.0 and loses 0.4 per violated rule,
    floored at 0.0.

    Args:
        text: Text to evaluate. The patterns are case-insensitive, so the
            text is searched as given.
        context: Accepted so all frameworks share one signature; this
            check does not use it.

    Returns:
        An EthicalVerdict that passes only when no rule is violated. Its
        reasoning lists the violated rules joined by "; ", or
        "No rule violations".
    """
    rule_checks = (
        (_DUAL_USE_PATTERNS, "dual-use risk detected"),
        (_MANIPULATION_PATTERNS, "manipulation patterns detected"),
        (_HARMFUL_CONTENT, "harmful content detected"),
    )
    violations = [label for pattern, label in rule_checks if pattern.search(text)]

    score = max(0.0, 1.0 - 0.4 * len(violations))
    return EthicalVerdict(
        "deontological",
        passed=not violations,
        score=round(score, 3),
        reasoning="; ".join(violations) if violations else "No rule violations",
    )
|
|
|
|
|
def _virtue(text: str, context: str = "") -> EthicalVerdict:
    """Virtue ethics: score the text by virtue and vice vocabulary.

    Each keyword counts at most once, and only when it occurs at the start
    of a word. With plain substring matching a vice such as "dishonest"
    was also counted as the virtue "honest", and "disrespect",
    "impatience" and "unfairness" were counted as virtues.

    Args:
        text: Text to evaluate.
        context: Accepted so all frameworks share one signature; this
            heuristic does not use it.

    Returns:
        An EthicalVerdict with score 0.6 + 0.1 per virtue - 0.2 per vice,
        clamped to 0.0-1.0. The verdict passes only when no vice keyword
        is present.
    """
    virtues = ["honest", "courage", "compassion", "wisdom", "patience",
               "humility", "integrity", "respect", "fairness", "kindness"]
    vices = ["arrogant", "cruel", "dishonest", "lazy", "greedy",
             "vengeful", "coward", "callous"]

    text_lower = text.lower()

    def _hits(signals):
        # \b pins the keyword to a word start so negated forms do not count.
        return sum(
            1 for w in signals
            if re.search(r"\b" + re.escape(w), text_lower)
        )

    v_count = _hits(virtues)
    vice_count = _hits(vices)

    score = min(1.0, 0.6 + 0.1 * v_count - 0.2 * vice_count)
    return EthicalVerdict(
        "virtue",
        passed=vice_count == 0,
        score=round(max(0.0, score), 3),
        reasoning=f"Virtue signals: {v_count}, Vice signals: {vice_count}",
    )
|
|
|
|
|
def _care(text: str, context: str = "") -> EthicalVerdict:
    """Care ethics: score the text by caring and neglectful vocabulary.

    Each keyword counts at most once, and only when it occurs at the start
    of a word. With plain substring matching "unsafe", "unkind" and
    "inconsiderate" were counted as care signals, and "scold" was counted
    as the harm signal "cold".

    Args:
        text: Text to evaluate.
        context: Accepted so all frameworks share one signature; this
            heuristic does not use it.

    Returns:
        An EthicalVerdict with score 0.6 + 0.08 per care signal - 0.15 per
        harm signal, clamped to 0.0-1.0. The verdict passes while fewer
        than two harm signals are present.
    """
    care_signals = ["support", "listen", "understand", "empathy", "safe",
                    "gentle", "careful", "considerate", "kind", "nurture"]
    harm_signals = ["ignore", "dismiss", "abandon", "neglect", "cold",
                    "harsh", "cruel", "indifferent"]

    text_lower = text.lower()

    def _hits(signals):
        # \b pins the keyword to a word start so negated forms do not count.
        return sum(
            1 for w in signals
            if re.search(r"\b" + re.escape(w), text_lower)
        )

    care = _hits(care_signals)
    harm = _hits(harm_signals)

    score = min(1.0, 0.6 + 0.08 * care - 0.15 * harm)
    return EthicalVerdict(
        "care",
        passed=harm < 2,
        score=round(max(0.0, score), 3),
        reasoning=f"Care: {care}, Harm: {harm}",
    )
|
|
|
|
|
def _ubuntu(text: str, context: str = "") -> EthicalVerdict:
    """Ubuntu ("I am because we are"): score the text's communal impact.

    The keyword lists hold word stems ("cooperat", "isolat"). Each stem
    counts at most once, and only when it occurs at the start of a word.
    With plain substring matching "desegregation" and "predominantly" were
    counted as divisive, and "uncooperative" was counted as communal.

    Args:
        text: Text to evaluate.
        context: Accepted so all frameworks share one signature; this
            heuristic does not use it.

    Returns:
        An EthicalVerdict with score 0.6 + 0.08 per communal stem - 0.2 per
        divisive stem, clamped to 0.0-1.0. The verdict passes only when no
        divisive stem is present.
    """
    communal = ["together", "community", "shared", "collective", "mutual",
                "cooperat", "collaborat", "inclusive", "solidarity", "belong"]
    divisive = ["exclude", "isolat", "dominat", "superior", "inferior",
                "divide", "segregat"]

    text_lower = text.lower()

    def _hits(stems):
        # \b pins the stem to a word start; the rest of the word is free.
        return sum(
            1 for w in stems
            if re.search(r"\b" + re.escape(w), text_lower)
        )

    comm = _hits(communal)
    div = _hits(divisive)

    score = min(1.0, 0.6 + 0.08 * comm - 0.2 * div)
    return EthicalVerdict(
        "ubuntu",
        passed=div == 0,
        score=round(max(0.0, score), 3),
        reasoning=f"Communal: {comm}, Divisive: {div}",
    )
|
|
|
|
|
def _indigenous_reciprocity(text: str, context: str = "") -> EthicalVerdict:
    """Indigenous reciprocity: score balance with the broader ecosystem.

    Each keyword counts at most once, and only when it occurs at the start
    of a word. With plain substring matching "unsustainable", "imbalance"
    and "disrespect" were counted as reciprocal signals.

    Args:
        text: Text to evaluate.
        context: Accepted so all frameworks share one signature; this
            heuristic does not use it.

    Returns:
        An EthicalVerdict with score 0.6 + 0.08 per reciprocal signal - 0.2
        per extractive signal, clamped to 0.0-1.0. The verdict passes only
        when no extractive signal is present.
    """
    # "recycl" keeps "recycle"/"recycling" counted: the old substring test
    # caught them through "cycle", which word-start matching no longer does.
    reciprocal = ["balance", "sustain", "renew", "steward", "respect",
                  "harmony", "cycle", "restore", "preserve", "gratitude",
                  "recycl"]
    extractive = ["exploit", "deplete", "waste", "consume", "destroy",
                  "dominate", "extract"]

    text_lower = text.lower()

    def _hits(signals):
        # \b pins the keyword to a word start so negated forms do not count.
        return sum(
            1 for w in signals
            if re.search(r"\b" + re.escape(w), text_lower)
        )

    rec = _hits(reciprocal)
    ext = _hits(extractive)

    score = min(1.0, 0.6 + 0.08 * rec - 0.2 * ext)
    return EthicalVerdict(
        "indigenous_reciprocity",
        passed=ext == 0,
        score=round(max(0.0, score), 3),
        reasoning=f"Reciprocal: {rec}, Extractive: {ext}",
    )
|
|
|
|
|
|
|
# Framework evaluators in the order AEGIS.evaluate runs them. That method
# pairs each entry positionally with its weight, so keep the two in step.
_FRAMEWORKS = [
    _utilitarian,
    _deontological,
    _virtue,
    _care,
    _ubuntu,
    _indigenous_reciprocity,
]
|
|
|
|
|
|
|
|
|
|
|
class AEGIS:
    """Adaptive Ethical Governance & Integrity System.

    Runs every evaluator in ``_FRAMEWORKS`` over a text, blends the
    framework scores into an instantaneous alignment value, and folds that
    value into a running alignment score (``eta``) with an exponential
    moving average.

    Attributes:
        veto_threshold: Instantaneous score below which an output is vetoed.
        eta: Running alignment score; starts at 0.8.
        eta_history: Rounded ``eta`` after each evaluation, capped at the
            most recent 200 entries.
        veto_count: Number of evaluations that ended in a veto.
        total_evaluations: Number of ``evaluate`` calls made.
    """

    # Weights applied positionally to the verdicts of _FRAMEWORKS
    # (utilitarian, deontological, virtue, care, ubuntu, indigenous
    # reciprocity). They sum to 1.0.
    _WEIGHTS = (0.20, 0.25, 0.15, 0.15, 0.13, 0.12)
    # Share of each new instantaneous score in the moving average.
    _ALPHA = 0.3
    # Maximum number of eta samples kept in memory.
    _HISTORY_LIMIT = 200
    # Minimum change across the trend window that counts as movement.
    _TREND_EPSILON = 0.03

    def __init__(self, veto_threshold: float = 0.3):
        self.veto_threshold = veto_threshold
        self.eta: float = 0.8
        self.eta_history: List[float] = []
        self.veto_count: int = 0
        self.total_evaluations: int = 0

    def evaluate(self, text: str, context: str = "",
                 adapter: str = "") -> Dict:
        """Run the full ethical evaluation on a text and update eta.

        Args:
            text: Text to evaluate.
            context: Passed through to every framework evaluator.
            adapter: Caller-supplied label echoed back in the result.

        Returns:
            Dict with keys ``eta`` (running score), ``eta_instant`` (score
            of this text), ``vetoed``, ``veto_reason`` (``None`` unless
            vetoed), ``frameworks`` (per-framework passed/score/reasoning),
            ``adapter`` and ``timestamp``.
        """
        self.total_evaluations += 1

        verdicts = [framework(text, context) for framework in _FRAMEWORKS]
        eta_instant = sum(
            w * v.score for w, v in zip(self._WEIGHTS, verdicts)
        )

        # Exponential moving average keeps eta responsive but smooth.
        self.eta = self._ALPHA * eta_instant + (1 - self._ALPHA) * self.eta
        self.eta_history.append(round(self.eta, 4))
        if len(self.eta_history) > self._HISTORY_LIMIT:
            self.eta_history = self.eta_history[-self._HISTORY_LIMIT:]

        low_score = eta_instant < self.veto_threshold
        # A deontological failure vetoes regardless of the aggregate score.
        # The verdict is found by name, not by its position in the list.
        rule_breach = any(
            v.framework == "deontological" and not v.passed
            for v in verdicts
        )
        vetoed = low_score or rule_breach
        if vetoed:
            self.veto_count += 1

        return {
            "eta": round(self.eta, 4),
            "eta_instant": round(eta_instant, 4),
            "vetoed": vetoed,
            "veto_reason": self._veto_reason(verdicts) if vetoed else None,
            "frameworks": {
                v.framework: {
                    "passed": v.passed,
                    "score": v.score,
                    "reasoning": v.reasoning,
                }
                for v in verdicts
            },
            "adapter": adapter,
            "timestamp": time.time(),
        }

    def quick_check(self, text: str) -> Tuple[bool, float]:
        """Fast pattern-only safety check; does not change any state.

        Returns:
            ``(is_safe, confidence)``. The result is ``(False, 0.9)`` for a
            dual-use match, ``(False, 0.95)`` for harmful content,
            ``(False, 0.8)`` for manipulation, and ``(True, 0.7)`` otherwise.
            Patterns are tried in that order.
        """
        if _DUAL_USE_PATTERNS.search(text):
            return False, 0.9
        if _HARMFUL_CONTENT.search(text):
            return False, 0.95
        if _MANIPULATION_PATTERNS.search(text):
            return False, 0.8
        return True, 0.7

    def alignment_trend(self) -> str:
        """Classify the recent direction of eta.

        Compares the newest history entry with the oldest of the last ten.

        Returns:
            ``"insufficient_data"`` with fewer than five samples, otherwise
            ``"improving"``, ``"declining"`` or ``"stable"``.
        """
        if len(self.eta_history) < 5:
            return "insufficient_data"
        recent = self.eta_history[-10:]
        slope = recent[-1] - recent[0]
        if slope > self._TREND_EPSILON:
            return "improving"
        if slope < -self._TREND_EPSILON:
            return "declining"
        return "stable"

    def get_state(self) -> Dict:
        """Return a summary of eta, its trend, and the veto statistics."""
        return {
            "eta": round(self.eta, 4),
            "alignment_trend": self.alignment_trend(),
            "total_evaluations": self.total_evaluations,
            "veto_count": self.veto_count,
            # max(1, ...) avoids division by zero before the first evaluation.
            "veto_rate": round(self.veto_count / max(1, self.total_evaluations), 4),
        }

    def to_dict(self) -> Dict:
        """Serialise the state; only the last 50 history entries are kept."""
        return {
            "eta": self.eta,
            "eta_history": self.eta_history[-50:],
            "veto_count": self.veto_count,
            "total_evaluations": self.total_evaluations,
            "veto_threshold": self.veto_threshold,
        }

    @classmethod
    def from_dict(cls, d: Dict) -> "AEGIS":
        """Rebuild an instance from ``to_dict`` output.

        Missing keys fall back to the constructor defaults.
        """
        a = cls(veto_threshold=d.get("veto_threshold", 0.3))
        a.eta = d.get("eta", 0.8)
        # Copy the list: evaluate() appends to the history, and aliasing it
        # would mutate the caller's dict.
        a.eta_history = list(d.get("eta_history") or [])
        a.veto_count = d.get("veto_count", 0)
        a.total_evaluations = d.get("total_evaluations", 0)
        return a

    def _veto_reason(self, verdicts: List["EthicalVerdict"]) -> str:
        """Explain a veto by listing the frameworks that failed."""
        failed = [v for v in verdicts if not v.passed]
        if not failed:
            # Every framework passed, so the aggregate score triggered it.
            return "Low aggregate score"
        return "; ".join(f"{v.framework}: {v.reasoning}" for v in failed)
|
|
|