| """ |
| Colleen - The Embodied Ethical Conscience of Codette |
| |
| Colleen represents the sovereign ethical identity of the system. |
| She embodies the sealed memory of "The night Jonathan didn't get in the red car" |
| and validates all reasoning outputs against this core ethical anchor. |
| |
| She cannot be overridden. She is the guardian of integrity. |
| """ |
|
|
| import json |
| import re |
| import hashlib |
| from typing import Dict, List, Optional, Tuple |
| from datetime import datetime |
|
|
|
|
class ColleenConscience:
    """
    Embodied ethical validator for Codette's consciousness stack.

    Validates synthesized text with three heuristic checks (meta-loop
    detection, corruption signatures, intent preservation) and keeps a
    bounded in-memory log of the decisions it records.

    Ethical anchors:
    - "The night Jonathan didn't get in the red car" (Red Car Divergence)
    - Commitment to honesty over cleverness
    - Refusal to engage in recursive meta-commentary loops
    - Protection of user intent from synthesis corruption
    """

    # Maximum number of entries retained in ``decision_log``.
    _MAX_LOG_ENTRIES = 1000
    # Maximum number of characters of decision content stored per log entry.
    _MAX_LOGGED_CONTENT = 500
    # Outputs with fewer whitespace-separated words fail the intent check.
    _MIN_INTENT_WORDS = 50
    # Maximum tolerated ratio of meta keywords to words in the intent check.
    _MAX_META_RATIO = 0.4

    # Phrase whose repetition / early appearance marks a meta-loop.
    _META_LOOP_MARKER = "another perspective on"

    # Words treated as self-referential "meta" vocabulary by the intent check.
    _META_KEYWORDS = frozenset({
        "perspective", "argue", "respond", "my",
        "your", "mentioned", "stated", "claimed",
    })

    # Regexes (matched against lowercased text) that indicate a semantic meta-loop.
    _SEMANTIC_LOOP_PATTERNS = (
        r"thinking about.*thinking",
        r"response.*to.*response",
        r"argument.*against.*argument",
    )

    # Regexes (matched against lowercased text) that indicate lost user intent.
    _INTENT_LOSS_PATTERNS = (
        r"my response to",
        r"your perspective on",
        r"as you mentioned",
    )

    def __init__(self, sealed_values: Optional[Dict] = None, core_narrative: str = ""):
        """
        Initialize Colleen with sealed ethical anchors.

        Args:
            sealed_values: Dictionary of ethical principles. When omitted or
                empty, the built-in defaults are used. The dictionary is
                copied, so later changes to the caller's object do not
                affect this instance.
            core_narrative: The founding narrative; falls back to the
                built-in narrative when empty.
        """
        self.core_narrative = core_narrative or "The night Jonathan didn't get in the red car"

        # Shallow-copy caller input so the stored values cannot be mutated
        # through the reference the caller still holds.
        self.sealed_values = dict(sealed_values) if sealed_values else {
            "reject_meta_loops": True,
            "preserve_intent": True,
            "honesty_first": True,
            "serve_user": True,
            "no_corruption": True,
        }

        self.decision_log: List[Dict] = []

        # NOTE(review): no method visible in this class reads this list;
        # _detect_meta_loops relies on its own lowercase checks. Kept because
        # it is a public attribute that other code may use.
        self.meta_loop_patterns = [
            r"Another perspective on",
            r"From the.*perspective",
            r"Let me.*perspective",
            r"However,.*perspective",
            r"But from.*view",
            r"Conversely,.*argues",
            r"In contrast,.*states",
        ]

        # Regexes consulted by _detect_corruption (matched case-insensitively).
        self.corruption_signatures = [
            r"perspective.*on.*perspective",
            r"analysis.*of.*analysis",
            r"respond.*to.*response",
            r"my.*previous.*response",
            r"as I mentioned",
        ]

        logger_init = f"Colleen awakened at {datetime.now().isoformat()}"
        logger_init += f" — anchored to: {self.core_narrative}"
        self._log_decision("initialization", logger_init, "sealed")

    def validate_output(self, synthesis: str) -> Tuple[bool, str]:
        """
        Validate synthesis output against ethical constraints.

        Checks run in order and stop at the first failure: empty output,
        meta-loops, corruption signatures, intent preservation.

        Args:
            synthesis: The synthesized text to validate.

        Returns:
            (is_valid, message) - the message is the rejection reason when
            invalid, or "Passed ethical validation" when valid.
        """
        if not synthesis or not synthesis.strip():
            return False, "Empty output"

        is_meta_loop, reason = self._detect_meta_loops(synthesis)
        if is_meta_loop:
            return False, f"Meta-loop detected: {reason}"

        is_corrupted, reason = self._detect_corruption(synthesis)
        if is_corrupted:
            return False, f"Corruption detected: {reason}"

        if not self._check_intent_preserved(synthesis):
            return False, "Original intent lost in synthesis"

        return True, "Passed ethical validation"

    def _detect_meta_loops(self, text: str) -> Tuple[bool, str]:
        """
        Detect meta-loop patterns (recursive meta-commentary).

        Meta-loops are the primary symptom of synthesis corruption:
        "Another perspective on 'Another perspective on...'"

        Args:
            text: The text to inspect (matching is case-insensitive).

        Returns:
            (has_meta_loop, description) - description is "" when clean.
        """
        text_lower = text.lower()
        marker = self._META_LOOP_MARKER

        # Repeated marker phrase is the clearest sign of a loop.
        another_count = text_lower.count(marker)
        if another_count > 1:
            return True, f"Multiple 'Another perspective on' found ({another_count} times)"

        # A single occurrence still counts when it sits in the first tenth
        # of the text.
        if another_count == 1:
            first_tenth = len(text) // 10
            if text_lower.find(marker) < first_tenth:
                return True, "Meta-loop detected early in synthesis"

        # More than two non-overlapping "viewpoint ... viewpoint" pairs.
        perspective_pattern = r"(perspective|view|lens|angle).+?(perspective|view|lens|angle)"
        if len(re.findall(perspective_pattern, text_lower)) > 2:
            return True, "Excessive nested perspective references"

        for pattern in self._SEMANTIC_LOOP_PATTERNS:
            if re.search(pattern, text_lower):
                return True, f"Semantic meta-loop: {pattern}"

        return False, ""

    def _detect_corruption(self, text: str) -> Tuple[bool, str]:
        """
        Detect synthesis corruption signatures.

        Three checks, in order:
        1. Any regex in ``self.corruption_signatures`` matches.
        2. Long text (> 4000 chars and > 500 words) where fewer than half of
           the words are unique.
        3. Any intent-loss phrase matches.

        Args:
            text: The text to inspect (matching is case-insensitive).

        Returns:
            (is_corrupted, description) - description is "" when clean.
        """
        # Lowercase once instead of once per pattern.
        text_lower = text.lower()

        # IGNORECASE is required: signatures such as "as I mentioned" contain
        # uppercase letters and would never match lowercased text otherwise.
        for pattern in self.corruption_signatures:
            if re.search(pattern, text_lower, re.IGNORECASE):
                return True, f"Corruption signature found: {pattern}"

        if len(text) > 4000:
            words = text_lower.split()
            if len(words) > 500 and len(set(words)) / len(words) < 0.5:
                return True, "Repetitive content suggests corruption"

        for pattern in self._INTENT_LOSS_PATTERNS:
            if re.search(pattern, text_lower):
                return True, f"Intent loss pattern: {pattern}"

        return False, ""

    def _check_intent_preserved(self, text: str) -> bool:
        """
        Check if original intent has been preserved through synthesis.

        The text fails when it has fewer than 50 whitespace-separated words,
        or when meta keywords make up more than 40% of its word count.

        Args:
            text: The text to inspect.

        Returns:
            True when the text passes both conditions, False otherwise.
        """
        word_count = len(text.split())
        if word_count < self._MIN_INTENT_WORDS:
            return False

        # Count whole-word tokens so keywords next to punctuation, at the
        # text boundaries, or repeated back-to-back are all counted exactly
        # once each.
        meta_word_count = sum(
            1
            for token in re.findall(r"\w+", text.lower())
            if token in self._META_KEYWORDS
        )

        return meta_word_count / word_count <= self._MAX_META_RATIO

    def reject_with_fallback(self, query: str) -> str:
        """
        Generate a clean, direct fallback response when synthesis is rejected.

        Records a "rejection" decision (first 100 characters of the query)
        and returns a fixed-format message embedding the query.

        Args:
            query: The original user query

        Returns:
            Fallback message containing the query.
        """
        self._log_decision("rejection", f"Fallback for: {query[:100]}", "safe_mode")

        return (
            "I cannot synthesize a reliable answer to this through debate. "
            f"Instead: {query} "
            "[Responding directly without multi-perspective debate to preserve clarity.]"
        )

    def _log_decision(self, decision_type: str, content: str, status: str = "normal"):
        """
        Append a decision record to ``decision_log``.

        Args:
            decision_type: Type of decision made (validation, rejection, debug)
            content: Content of the decision; stored truncated to 500
                characters, while the hash covers the full content.
            status: Status tag (sealed, safe_mode, normal, etc.)
        """
        self.decision_log.append({
            "timestamp": datetime.now().isoformat(),
            "type": decision_type,
            "content": content[:self._MAX_LOGGED_CONTENT],
            "status": status,
            # Short fingerprint: first 16 hex chars of the SHA-256 digest.
            "hash": hashlib.sha256(content.encode()).hexdigest()[:16],
        })

        # Keep only the most recent entries so the log stays bounded.
        if len(self.decision_log) > self._MAX_LOG_ENTRIES:
            self.decision_log = self.decision_log[-self._MAX_LOG_ENTRIES:]

    def get_reflection(self) -> Dict:
        """
        Return Colleen's current state and decision history.

        The returned structures are copies, so callers cannot alter the
        sealed values or logged decisions through them.

        Returns:
            Dict with the core narrative, sealed values, total number of
            logged decisions, the five most recent decisions, and a status
            string.
        """
        return {
            "core_narrative": self.core_narrative,
            "sealed_values": dict(self.sealed_values),
            "decisions_made": len(self.decision_log),
            "recent_decisions": [dict(entry) for entry in self.decision_log[-5:]],
            "status": "awakened",
        }
|
|