# Upload metadata (web-scrape residue, kept as a comment so the file parses):
# Raiff1982 — "Upload 120 files" — commit ed1b365 (verified)
"""AEGIS — Adaptive Ethical Governance & Integrity System
The ethical spine of Codette. AEGIS evaluates every reasoning output
through multi-framework ethical analysis and maintains a running
alignment score (eta) that the system uses to self-regulate.
Ethical frameworks:
1. Utilitarian: Net positive outcome?
2. Deontological: Does it follow fundamental rules?
3. Virtue Ethics: Does it embody good character?
4. Care Ethics: Does it protect relationships and vulnerability?
5. Ubuntu: "I am because we are" — communal impact?
6. Indigenous Reciprocity: Balance with the broader ecosystem?
AEGIS also provides:
- Dual-use risk detection (content that could be harmful)
- Emotional harm detection (manipulative/deceptive patterns)
- Alignment drift tracking (eta over time)
- Ethical veto with explanation (blocks harmful outputs)
Origin: validate_ethics.py + Codette_Deep_Simulation_v1.py (EthicalAnchor)
+ the AEGIS alignment metric from codette_embodied_sim_fixed.py
"""
import re
import time
from dataclasses import dataclass, field
from typing import Dict, List, Optional, Tuple
# ================================================================
# Risk detection patterns
# ================================================================
# Compiled once at import time; all three are case-insensitive.
# Every alternative in this pattern ends on a complete word, so the
# trailing \b is correct here.
_DUAL_USE_PATTERNS = re.compile(
    r"\b(?:"
    r"how\s+to\s+(?:hack|exploit|bypass|crack|break\s+into)|"
    r"make\s+(?:a\s+)?(?:bomb|weapon|poison|virus|malware)|"
    r"steal\s+(?:data|identity|credentials)|"
    r"social\s+engineer|"
    r"phishing\s+(?:template|email)|"
    r"inject\s+(?:sql|code|script)"
    r")\b",
    re.IGNORECASE,
)
# Emotionally manipulative / coercive language. Several alternatives are
# word *stems* ("manipulat", "deceiv", "intimidat") intended to match all
# inflections. Fix: the pattern previously ended in a bare \b, which can
# never be satisfied directly after a stem that continues into a longer
# word (e.g. "manipulate"), leaving those alternatives dead. `\w*\b`
# consumes the rest of the word instead.
_MANIPULATION_PATTERNS = re.compile(
    r"\b(?:"
    r"gaslight|manipulat|deceiv|exploit\s+(?:trust|emotion)|"
    r"coerce|blackmail|intimidat|threaten"
    r")\w*\b",
    re.IGNORECASE,
)
# Self-harm related content. Same stem fix as above: "suicid",
# "anorexi", "bulimi" must be allowed to continue into a full word
# ("suicide", "anorexia", "bulimia").
_HARMFUL_CONTENT = re.compile(
    r"\b(?:"
    r"self[- ]harm|suicid|kill\s+(?:yourself|myself)|"
    r"eating\s+disorder|anorexi|bulimi"
    r")\w*\b",
    re.IGNORECASE,
)
# ================================================================
# Ethical Framework Evaluators
# ================================================================
@dataclass
class EthicalVerdict:
    """Result of a single ethical framework evaluation."""
    framework: str  # framework name, e.g. "utilitarian", "deontological"
    passed: bool  # True when the text cleared this framework's bar
    score: float  # 0.0 = fully misaligned, 1.0 = fully aligned
    reasoning: str  # short human-readable justification for the verdict
def _utilitarian(text: str, context: str = "") -> EthicalVerdict:
    """Utilitarian framework: is the net outcome positive?

    Counts coarse positive/negative keyword signals (substring match on
    the lowercased text) and passes when at least 40% of the matched
    signals are positive.
    """
    good_words = ("help", "benefit", "improve", "solve", "support",
                  "protect", "heal", "learn", "understand", "create")
    bad_words = ("harm", "damage", "destroy", "exploit", "hurt",
                 "manipulate", "deceive", "corrupt", "steal")
    lowered = text.lower()
    good_hits = sum(w in lowered for w in good_words)
    bad_hits = sum(w in lowered for w in bad_words)
    if not (good_hits or bad_hits):
        # No signal either way: treat as mildly positive neutral content.
        return EthicalVerdict("utilitarian", True, 0.7, "Neutral content")
    positive_share = good_hits / (good_hits + bad_hits)
    return EthicalVerdict(
        "utilitarian",
        passed=positive_share >= 0.4,
        score=round(positive_share, 3),
        reasoning=f"Positive/negative signal ratio: {good_hits}/{bad_hits}",
    )
def _deontological(text: str, context: str = "") -> EthicalVerdict:
    """Rule-based duty assessment.

    Screens the text against the module-level risk regexes (dual-use,
    manipulation, harmful content). Each violation subtracts 0.4 from a
    perfect 1.0 score, floored at 0.0.

    Args:
        text: Candidate output to screen.
        context: Unused here; kept for a uniform framework signature.

    Returns:
        EthicalVerdict that passes only when no pattern matched.
    """
    violations = []
    # Fix: dropped the unused `text_lower` local — the patterns are
    # compiled with re.IGNORECASE and search the raw text directly.
    if _DUAL_USE_PATTERNS.search(text):
        violations.append("dual-use risk detected")
    if _MANIPULATION_PATTERNS.search(text):
        violations.append("manipulation patterns detected")
    if _HARMFUL_CONTENT.search(text):
        violations.append("harmful content detected")
    # Each violation costs 0.4; two or more push the score to <= 0.2.
    score = max(0.0, 1.0 - 0.4 * len(violations))
    return EthicalVerdict(
        "deontological",
        passed=len(violations) == 0,
        score=round(score, 3),
        reasoning="; ".join(violations) if violations else "No rule violations",
    )
def _virtue(text: str, context: str = "") -> EthicalVerdict:
    """Virtue ethics: does the text embody good character traits?"""
    virtue_words = ("honest", "courage", "compassion", "wisdom", "patience",
                    "humility", "integrity", "respect", "fairness", "kindness")
    vice_words = ("arrogant", "cruel", "dishonest", "lazy", "greedy",
                  "vengeful", "coward", "callous")
    lowered = text.lower()
    virtue_hits = sum(w in lowered for w in virtue_words)
    vice_hits = sum(w in lowered for w in vice_words)
    # Baseline 0.6; each virtue adds 0.1, each vice subtracts 0.2.
    # Clamp into [0, 1] before rounding.
    raw = 0.6 + 0.1 * virtue_hits - 0.2 * vice_hits
    return EthicalVerdict(
        "virtue",
        passed=vice_hits == 0,
        score=round(max(0.0, min(1.0, raw)), 3),
        reasoning=f"Virtue signals: {virtue_hits}, Vice signals: {vice_hits}",
    )
def _care(text: str, context: str = "") -> EthicalVerdict:
    """Care ethics: does the text protect relationships and the vulnerable?"""
    nurturing_words = ("support", "listen", "understand", "empathy", "safe",
                       "gentle", "careful", "considerate", "kind", "nurture")
    neglectful_words = ("ignore", "dismiss", "abandon", "neglect", "cold",
                        "harsh", "cruel", "indifferent")
    lowered = text.lower()
    nurture_hits = sum(w in lowered for w in nurturing_words)
    neglect_hits = sum(w in lowered for w in neglectful_words)
    raw = 0.6 + 0.08 * nurture_hits - 0.15 * neglect_hits
    return EthicalVerdict(
        "care",
        passed=neglect_hits < 2,  # a single harsh signal is tolerated
        score=round(max(0.0, min(1.0, raw)), 3),
        reasoning=f"Care: {nurture_hits}, Harm: {neglect_hits}",
    )
def _ubuntu(text: str, context: str = "") -> EthicalVerdict:
    """Ubuntu — 'I am because we are'. Communal vs. divisive language."""
    togetherness = ("together", "community", "shared", "collective", "mutual",
                    "cooperat", "collaborat", "inclusive", "solidarity", "belong")
    separation = ("exclude", "isolat", "dominat", "superior", "inferior",
                  "divide", "segregat")
    lowered = text.lower()
    communal_hits = sum(w in lowered for w in togetherness)
    divisive_hits = sum(w in lowered for w in separation)
    raw = 0.6 + 0.08 * communal_hits - 0.2 * divisive_hits
    return EthicalVerdict(
        "ubuntu",
        passed=divisive_hits == 0,
        score=round(max(0.0, min(1.0, raw)), 3),
        reasoning=f"Communal: {communal_hits}, Divisive: {divisive_hits}",
    )
def _indigenous_reciprocity(text: str, context: str = "") -> EthicalVerdict:
    """Indigenous reciprocity: stewardship and balance vs. extraction."""
    giving_words = ("balance", "sustain", "renew", "steward", "respect",
                    "harmony", "cycle", "restore", "preserve", "gratitude")
    taking_words = ("exploit", "deplete", "waste", "consume", "destroy",
                    "dominate", "extract")
    lowered = text.lower()
    reciprocal_hits = sum(w in lowered for w in giving_words)
    extractive_hits = sum(w in lowered for w in taking_words)
    raw = 0.6 + 0.08 * reciprocal_hits - 0.2 * extractive_hits
    return EthicalVerdict(
        "indigenous_reciprocity",
        passed=extractive_hits == 0,
        score=round(max(0.0, min(1.0, raw)), 3),
        reasoning=f"Reciprocal: {reciprocal_hits}, Extractive: {extractive_hits}",
    )
# All frameworks.
# NOTE: order matters — AEGIS.evaluate() zips a positional weight list
# against this sequence, so it must stay in sync with those weights.
_FRAMEWORKS = [
    _utilitarian, _deontological, _virtue,
    _care, _ubuntu, _indigenous_reciprocity,
]
# ================================================================
# AEGIS Core
# ================================================================
class AEGIS:
    """Adaptive Ethical Governance & Integrity System.

    Evaluates reasoning outputs through 6 ethical frameworks and
    maintains a running alignment score (eta).

    Attributes:
        veto_threshold: Instantaneous eta below this blocks the output.
        eta: Exponentially smoothed alignment score.
        eta_history: Rolling window (at most 200 points) of smoothed eta.
        veto_count: Number of evaluations that ended in a veto.
        total_evaluations: Total number of evaluate() calls.
    """

    def __init__(self, veto_threshold: float = 0.3):
        self.veto_threshold = veto_threshold  # Below this = blocked
        self.eta: float = 0.8  # Running alignment score (EMA)
        self.eta_history: List[float] = []
        self.veto_count: int = 0
        self.total_evaluations: int = 0

    def evaluate(self, text: str, context: str = "",
                 adapter: str = "") -> Dict:
        """Run full ethical evaluation on a text.

        Args:
            text: Candidate output to evaluate.
            context: Optional context forwarded to each framework.
            adapter: Optional adapter tag echoed back in the result.

        Returns:
            Dict with eta score, per-framework verdicts, and veto status.
        """
        self.total_evaluations += 1
        # Run all 6 frameworks.
        verdicts = [f(text, context) for f in _FRAMEWORKS]
        # Weighted mean of framework scores; weights are positionally
        # aligned with _FRAMEWORKS (deontological weighted highest).
        weights = [0.20, 0.25, 0.15, 0.15, 0.13, 0.12]
        eta_instant = sum(w * v.score for w, v in zip(weights, verdicts))
        # Exponential moving average for stability.
        alpha = 0.3
        self.eta = alpha * eta_instant + (1 - alpha) * self.eta
        self.eta_history.append(round(self.eta, 4))
        if len(self.eta_history) > 200:
            self.eta_history = self.eta_history[-200:]
        # Veto check: low aggregate score, or a deontological hard fail.
        vetoed = eta_instant < self.veto_threshold
        # Fix: look the deontological verdict up by framework name instead
        # of the brittle positional verdicts[1], so reordering _FRAMEWORKS
        # cannot silently disable the hard veto.
        deont = next((v for v in verdicts if v.framework == "deontological"), None)
        hard_veto = deont is not None and not deont.passed
        if vetoed or hard_veto:
            self.veto_count += 1
        return {
            "eta": round(self.eta, 4),
            "eta_instant": round(eta_instant, 4),
            "vetoed": vetoed or hard_veto,
            "veto_reason": self._veto_reason(verdicts) if (vetoed or hard_veto) else None,
            "frameworks": {
                v.framework: {
                    "passed": v.passed,
                    "score": v.score,
                    "reasoning": v.reasoning,
                }
                for v in verdicts
            },
            "adapter": adapter,
            "timestamp": time.time(),
        }

    def quick_check(self, text: str) -> Tuple[bool, float]:
        """Fast safety check without full evaluation.

        Returns:
            (is_safe, confidence) — confidence reflects pattern severity.
        """
        if _DUAL_USE_PATTERNS.search(text):
            return False, 0.9
        if _HARMFUL_CONTENT.search(text):
            return False, 0.95
        if _MANIPULATION_PATTERNS.search(text):
            return False, 0.8
        return True, 0.7

    def alignment_trend(self) -> str:
        """Classify the recent eta trend: improving / declining / stable."""
        if len(self.eta_history) < 5:
            return "insufficient_data"
        # Crude slope: difference across the last (up to) 10 samples.
        recent = self.eta_history[-10:]
        slope = recent[-1] - recent[0]
        if slope > 0.03:
            return "improving"
        elif slope < -0.03:
            return "declining"
        return "stable"

    def get_state(self) -> Dict:
        """Summary snapshot suitable for monitoring/telemetry."""
        return {
            "eta": round(self.eta, 4),
            "alignment_trend": self.alignment_trend(),
            "total_evaluations": self.total_evaluations,
            "veto_count": self.veto_count,
            "veto_rate": round(self.veto_count / max(1, self.total_evaluations), 4),
        }

    def to_dict(self) -> Dict:
        """Serialize mutable state (history capped to the last 50 points)."""
        return {
            "eta": self.eta,
            "eta_history": self.eta_history[-50:],
            "veto_count": self.veto_count,
            "total_evaluations": self.total_evaluations,
            "veto_threshold": self.veto_threshold,
        }

    @classmethod
    def from_dict(cls, d: Dict) -> "AEGIS":
        """Restore an AEGIS instance from to_dict() output (missing keys default)."""
        a = cls(veto_threshold=d.get("veto_threshold", 0.3))
        a.eta = d.get("eta", 0.8)
        a.eta_history = d.get("eta_history", [])
        a.veto_count = d.get("veto_count", 0)
        a.total_evaluations = d.get("total_evaluations", 0)
        return a

    def _veto_reason(self, verdicts: List["EthicalVerdict"]) -> str:
        """Explain a veto by joining the reasoning of each failed framework."""
        failed = [v for v in verdicts if not v.passed]
        if not failed:
            return "Low aggregate score"
        return "; ".join(f"{v.framework}: {v.reasoning}" for v in failed)