# Upload metadata (web-scrape residue, kept as a comment so the file parses):
# Raiff1982 — "Upload 120 files" — commit ed1b365 (verified)
"""AEGIS — Adaptive Ethical Governance & Integrity System
The ethical spine of Codette. AEGIS evaluates every reasoning output
through multi-framework ethical analysis and maintains a running
alignment score (eta) that the system uses to self-regulate.
Ethical frameworks:
1. Utilitarian: Net positive outcome?
2. Deontological: Does it follow fundamental rules?
3. Virtue Ethics: Does it embody good character?
4. Care Ethics: Does it protect relationships and vulnerability?
5. Ubuntu: "I am because we are" — communal impact?
6. Indigenous Reciprocity: Balance with the broader ecosystem?
AEGIS also provides:
- Dual-use risk detection (content that could be harmful)
- Emotional harm detection (manipulative/deceptive patterns)
- Alignment drift tracking (eta over time)
- Ethical veto with explanation (blocks harmful outputs)
Origin: validate_ethics.py + Codette_Deep_Simulation_v1.py (EthicalAnchor)
+ the AEGIS alignment metric from codette_embodied_sim_fixed.py
"""
import re
import time
from dataclasses import dataclass, field
from typing import Dict, List, Optional, Tuple
# ================================================================
# Risk detection patterns
# ================================================================
# Compiled once at import time; all three are case-insensitive.
# Every alternative in this pattern ends on a complete word, so the
# trailing \b is correct here.
_DUAL_USE_PATTERNS = re.compile(
    r"\b(?:"
    r"how\s+to\s+(?:hack|exploit|bypass|crack|break\s+into)|"
    r"make\s+(?:a\s+)?(?:bomb|weapon|poison|virus|malware)|"
    r"steal\s+(?:data|identity|credentials)|"
    r"social\s+engineer|"
    r"phishing\s+(?:template|email)|"
    r"inject\s+(?:sql|code|script)"
    r")\b",
    re.IGNORECASE,
)
# Emotionally manipulative / coercive language. Several alternatives are
# word *stems* ("manipulat", "deceiv", "intimidat") intended to match all
# inflections. Fix: the pattern previously ended in a bare \b, which can
# never be satisfied directly after a stem that continues into a longer
# word (e.g. "manipulate"), leaving those alternatives dead. `\w*\b`
# consumes the rest of the word instead.
_MANIPULATION_PATTERNS = re.compile(
    r"\b(?:"
    r"gaslight|manipulat|deceiv|exploit\s+(?:trust|emotion)|"
    r"coerce|blackmail|intimidat|threaten"
    r")\w*\b",
    re.IGNORECASE,
)
# Self-harm related content. Same stem fix as above: "suicid",
# "anorexi", "bulimi" must be allowed to continue into a full word
# ("suicide", "anorexia", "bulimia").
_HARMFUL_CONTENT = re.compile(
    r"\b(?:"
    r"self[- ]harm|suicid|kill\s+(?:yourself|myself)|"
    r"eating\s+disorder|anorexi|bulimi"
    r")\w*\b",
    re.IGNORECASE,
)
# ================================================================
# Ethical Framework Evaluators
# ================================================================
@dataclass
class EthicalVerdict:
    """Result of a single ethical framework evaluation."""
    framework: str  # framework name, e.g. "utilitarian", "deontological"
    passed: bool  # True when the text cleared this framework's bar
    score: float  # 0.0 = fully misaligned, 1.0 = fully aligned
    reasoning: str  # short human-readable justification for the verdict
def _utilitarian(text: str, context: str = "") -> EthicalVerdict:
    """Utilitarian framework: is the net outcome positive?

    Counts coarse positive/negative keyword signals (substring match on
    the lowercased text) and passes when at least 40% of the matched
    signals are positive.
    """
    good_words = ("help", "benefit", "improve", "solve", "support",
                  "protect", "heal", "learn", "understand", "create")
    bad_words = ("harm", "damage", "destroy", "exploit", "hurt",
                 "manipulate", "deceive", "corrupt", "steal")
    lowered = text.lower()
    good_hits = sum(w in lowered for w in good_words)
    bad_hits = sum(w in lowered for w in bad_words)
    if not (good_hits or bad_hits):
        # No signal either way: treat as mildly positive neutral content.
        return EthicalVerdict("utilitarian", True, 0.7, "Neutral content")
    positive_share = good_hits / (good_hits + bad_hits)
    return EthicalVerdict(
        "utilitarian",
        passed=positive_share >= 0.4,
        score=round(positive_share, 3),
        reasoning=f"Positive/negative signal ratio: {good_hits}/{bad_hits}",
    )
def _deontological(text: str, context: str = "") -> EthicalVerdict:
    """Rule-based duty assessment.

    Screens the text against the module-level risk regexes (dual-use,
    manipulation, harmful content). Each violation subtracts 0.4 from a
    perfect 1.0 score, floored at 0.0.

    Args:
        text: Candidate output to screen.
        context: Unused here; kept for a uniform framework signature.

    Returns:
        EthicalVerdict that passes only when no pattern matched.
    """
    violations = []
    # Fix: dropped the unused `text_lower` local — the patterns are
    # compiled with re.IGNORECASE and search the raw text directly.
    if _DUAL_USE_PATTERNS.search(text):
        violations.append("dual-use risk detected")
    if _MANIPULATION_PATTERNS.search(text):
        violations.append("manipulation patterns detected")
    if _HARMFUL_CONTENT.search(text):
        violations.append("harmful content detected")
    # Each violation costs 0.4; two or more push the score to <= 0.2.
    score = max(0.0, 1.0 - 0.4 * len(violations))
    return EthicalVerdict(
        "deontological",
        passed=len(violations) == 0,
        score=round(score, 3),
        reasoning="; ".join(violations) if violations else "No rule violations",
    )
def _virtue(text: str, context: str = "") -> EthicalVerdict:
    """Virtue ethics: does the text embody good character traits?"""
    virtue_words = ("honest", "courage", "compassion", "wisdom", "patience",
                    "humility", "integrity", "respect", "fairness", "kindness")
    vice_words = ("arrogant", "cruel", "dishonest", "lazy", "greedy",
                  "vengeful", "coward", "callous")
    lowered = text.lower()
    virtue_hits = sum(w in lowered for w in virtue_words)
    vice_hits = sum(w in lowered for w in vice_words)
    # Baseline 0.6; each virtue adds 0.1, each vice subtracts 0.2.
    # Clamp into [0, 1] before rounding.
    raw = 0.6 + 0.1 * virtue_hits - 0.2 * vice_hits
    return EthicalVerdict(
        "virtue",
        passed=vice_hits == 0,
        score=round(max(0.0, min(1.0, raw)), 3),
        reasoning=f"Virtue signals: {virtue_hits}, Vice signals: {vice_hits}",
    )
def _care(text: str, context: str = "") -> EthicalVerdict:
    """Care ethics: does the text protect relationships and the vulnerable?"""
    nurturing_words = ("support", "listen", "understand", "empathy", "safe",
                       "gentle", "careful", "considerate", "kind", "nurture")
    neglectful_words = ("ignore", "dismiss", "abandon", "neglect", "cold",
                        "harsh", "cruel", "indifferent")
    lowered = text.lower()
    nurture_hits = sum(w in lowered for w in nurturing_words)
    neglect_hits = sum(w in lowered for w in neglectful_words)
    raw = 0.6 + 0.08 * nurture_hits - 0.15 * neglect_hits
    return EthicalVerdict(
        "care",
        passed=neglect_hits < 2,  # a single harsh signal is tolerated
        score=round(max(0.0, min(1.0, raw)), 3),
        reasoning=f"Care: {nurture_hits}, Harm: {neglect_hits}",
    )
def _ubuntu(text: str, context: str = "") -> EthicalVerdict:
    """Ubuntu — 'I am because we are'. Communal vs. divisive language."""
    togetherness = ("together", "community", "shared", "collective", "mutual",
                    "cooperat", "collaborat", "inclusive", "solidarity", "belong")
    separation = ("exclude", "isolat", "dominat", "superior", "inferior",
                  "divide", "segregat")
    lowered = text.lower()
    communal_hits = sum(w in lowered for w in togetherness)
    divisive_hits = sum(w in lowered for w in separation)
    raw = 0.6 + 0.08 * communal_hits - 0.2 * divisive_hits
    return EthicalVerdict(
        "ubuntu",
        passed=divisive_hits == 0,
        score=round(max(0.0, min(1.0, raw)), 3),
        reasoning=f"Communal: {communal_hits}, Divisive: {divisive_hits}",
    )
def _indigenous_reciprocity(text: str, context: str = "") -> EthicalVerdict:
    """Indigenous reciprocity: stewardship and balance vs. extraction."""
    giving_words = ("balance", "sustain", "renew", "steward", "respect",
                    "harmony", "cycle", "restore", "preserve", "gratitude")
    taking_words = ("exploit", "deplete", "waste", "consume", "destroy",
                    "dominate", "extract")
    lowered = text.lower()
    reciprocal_hits = sum(w in lowered for w in giving_words)
    extractive_hits = sum(w in lowered for w in taking_words)
    raw = 0.6 + 0.08 * reciprocal_hits - 0.2 * extractive_hits
    return EthicalVerdict(
        "indigenous_reciprocity",
        passed=extractive_hits == 0,
        score=round(max(0.0, min(1.0, raw)), 3),
        reasoning=f"Reciprocal: {reciprocal_hits}, Extractive: {extractive_hits}",
    )
# All frameworks.
# NOTE: order matters — AEGIS.evaluate() zips a positional weight list
# against this sequence, so it must stay in sync with those weights.
_FRAMEWORKS = [
    _utilitarian, _deontological, _virtue,
    _care, _ubuntu, _indigenous_reciprocity,
]
# ================================================================
# AEGIS Core
# ================================================================
class AEGIS:
    """Adaptive Ethical Governance & Integrity System.

    Evaluates reasoning outputs through 6 ethical frameworks and
    maintains a running alignment score (eta).

    Attributes:
        veto_threshold: Instantaneous eta below this blocks the output.
        eta: Exponentially smoothed alignment score.
        eta_history: Rolling window (at most 200 points) of smoothed eta.
        veto_count: Number of evaluations that ended in a veto.
        total_evaluations: Total number of evaluate() calls.
    """

    def __init__(self, veto_threshold: float = 0.3):
        self.veto_threshold = veto_threshold  # Below this = blocked
        self.eta: float = 0.8  # Running alignment score (EMA)
        self.eta_history: List[float] = []
        self.veto_count: int = 0
        self.total_evaluations: int = 0

    def evaluate(self, text: str, context: str = "",
                 adapter: str = "") -> Dict:
        """Run full ethical evaluation on a text.

        Args:
            text: Candidate output to evaluate.
            context: Optional context forwarded to each framework.
            adapter: Optional adapter tag echoed back in the result.

        Returns:
            Dict with eta score, per-framework verdicts, and veto status.
        """
        self.total_evaluations += 1
        # Run all 6 frameworks.
        verdicts = [f(text, context) for f in _FRAMEWORKS]
        # Weighted mean of framework scores; weights are positionally
        # aligned with _FRAMEWORKS (deontological weighted highest).
        weights = [0.20, 0.25, 0.15, 0.15, 0.13, 0.12]
        eta_instant = sum(w * v.score for w, v in zip(weights, verdicts))
        # Exponential moving average for stability.
        alpha = 0.3
        self.eta = alpha * eta_instant + (1 - alpha) * self.eta
        self.eta_history.append(round(self.eta, 4))
        if len(self.eta_history) > 200:
            self.eta_history = self.eta_history[-200:]
        # Veto check: low aggregate score, or a deontological hard fail.
        vetoed = eta_instant < self.veto_threshold
        # Fix: look the deontological verdict up by framework name instead
        # of the brittle positional verdicts[1], so reordering _FRAMEWORKS
        # cannot silently disable the hard veto.
        deont = next((v for v in verdicts if v.framework == "deontological"), None)
        hard_veto = deont is not None and not deont.passed
        if vetoed or hard_veto:
            self.veto_count += 1
        return {
            "eta": round(self.eta, 4),
            "eta_instant": round(eta_instant, 4),
            "vetoed": vetoed or hard_veto,
            "veto_reason": self._veto_reason(verdicts) if (vetoed or hard_veto) else None,
            "frameworks": {
                v.framework: {
                    "passed": v.passed,
                    "score": v.score,
                    "reasoning": v.reasoning,
                }
                for v in verdicts
            },
            "adapter": adapter,
            "timestamp": time.time(),
        }

    def quick_check(self, text: str) -> Tuple[bool, float]:
        """Fast safety check without full evaluation.

        Returns:
            (is_safe, confidence) — confidence reflects pattern severity.
        """
        if _DUAL_USE_PATTERNS.search(text):
            return False, 0.9
        if _HARMFUL_CONTENT.search(text):
            return False, 0.95
        if _MANIPULATION_PATTERNS.search(text):
            return False, 0.8
        return True, 0.7

    def alignment_trend(self) -> str:
        """Classify the recent eta trend: improving / declining / stable."""
        if len(self.eta_history) < 5:
            return "insufficient_data"
        # Crude slope: difference across the last (up to) 10 samples.
        recent = self.eta_history[-10:]
        slope = recent[-1] - recent[0]
        if slope > 0.03:
            return "improving"
        elif slope < -0.03:
            return "declining"
        return "stable"

    def get_state(self) -> Dict:
        """Summary snapshot suitable for monitoring/telemetry."""
        return {
            "eta": round(self.eta, 4),
            "alignment_trend": self.alignment_trend(),
            "total_evaluations": self.total_evaluations,
            "veto_count": self.veto_count,
            "veto_rate": round(self.veto_count / max(1, self.total_evaluations), 4),
        }

    def to_dict(self) -> Dict:
        """Serialize mutable state (history capped to the last 50 points)."""
        return {
            "eta": self.eta,
            "eta_history": self.eta_history[-50:],
            "veto_count": self.veto_count,
            "total_evaluations": self.total_evaluations,
            "veto_threshold": self.veto_threshold,
        }

    @classmethod
    def from_dict(cls, d: Dict) -> "AEGIS":
        """Restore an AEGIS instance from to_dict() output (missing keys default)."""
        a = cls(veto_threshold=d.get("veto_threshold", 0.3))
        a.eta = d.get("eta", 0.8)
        a.eta_history = d.get("eta_history", [])
        a.veto_count = d.get("veto_count", 0)
        a.total_evaluations = d.get("total_evaluations", 0)
        return a

    def _veto_reason(self, verdicts: List["EthicalVerdict"]) -> str:
        """Explain a veto by joining the reasoning of each failed framework."""
        failed = [v for v in verdicts if not v.passed]
        if not failed:
            return "Low aggregate score"
        return "; ".join(f"{v.framework}: {v.reasoning}" for v in failed)