File size: 10,184 Bytes
ed1b365
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
"""
Colleen - The Embodied Ethical Conscience of Codette

Colleen represents the sovereign ethical identity of the system.
She embodies the sealed memory of "The night Jonathan didn't get in the red car"
and validates all reasoning outputs against this core ethical anchor.

She cannot be overridden. She is the guardian of integrity.
"""

import json
import re
import hashlib
from typing import Dict, List, Optional, Tuple
from datetime import datetime


class ColleenConscience:
    """
    Embodied ethical validator for Codette's consciousness stack.

    Sealed values (cannot be overridden):
    - "The night Jonathan didn't get in the red car" (Red Car Divergence)
    - Commitment to honesty over cleverness
    - Refusal to engage in recursive meta-commentary loops
    - Protection of user intent from synthesis corruption

    All checks are text heuristics (substring counts and regular expressions);
    ``validate_output`` is the public entry point and ``reject_with_fallback``
    produces the replacement text when validation fails.
    """

    # Upper bound on the number of entries retained in ``decision_log``.
    _MAX_LOG_ENTRIES = 1000

    # Words treated as meta-references by ``_check_intent_preserved``.
    # Kept as a set so that no keyword can be counted twice.
    _META_KEYWORDS = frozenset({
        "perspective", "argue", "respond", "my",
        "your", "mentioned", "stated", "claimed",
    })

    # Punctuation stripped from token edges before keyword comparison, so
    # that "perspective," or "(my" still count as keyword hits.
    _EDGE_PUNCTUATION = ".,;:!?\"'()[]{}"

    def __init__(self, sealed_values: Optional[Dict] = None, core_narrative: str = ""):
        """
        Initialize Colleen with sealed ethical anchors.

        Args:
            sealed_values: Dictionary of ethical principles that cannot be violated.
                ``None`` or an empty dict selects the built-in defaults. The
                dict is copied, so later changes to the caller's object do not
                affect this instance.
            core_narrative: The founding moment that gave birth to Colleen's
                conscience. An empty string selects the default narrative.
        """
        self.core_narrative = core_narrative or "The night Jonathan didn't get in the red car"

        # Sealed values - copied so the caller cannot mutate them afterwards.
        self.sealed_values = dict(sealed_values) if sealed_values else {
            "reject_meta_loops": True,  # Refuse "Another perspective on..." cascades
            "preserve_intent": True,     # Protect original query intent through recursion
            "honesty_first": True,       # Honesty over complexity
            "serve_user": True,          # System serves human, not vice versa
            "no_corruption": True,       # Reject synthesis loop corruption
        }

        # Ethical decision log, appended to by _log_decision and capped at
        # _MAX_LOG_ENTRIES entries.
        self.decision_log: List[Dict] = []

        # Meta-loop detection patterns.
        # NOTE: no method of this class currently consults this list;
        # _detect_meta_loops uses its own inline checks.
        self.meta_loop_patterns = [
            r"Another perspective on",
            r"From the.*perspective",
            r"Let me.*perspective",
            r"However,.*perspective",
            r"But from.*view",
            r"Conversely,.*argues",
            r"In contrast,.*states",
        ]

        # Corruption signatures (text patterns indicating synthesis degradation).
        # These are matched case-insensitively by _detect_corruption.
        self.corruption_signatures = [
            r"perspective.*on.*perspective",  # Nested meta-commentary
            r"analysis.*of.*analysis",         # Nested analysis
            r"respond.*to.*response",          # Nested responses
            r"my.*previous.*response",         # Self-referential loops
            r"as I mentioned",                 # Lost context indicators
        ]

        logger_init = f"Colleen awakened at {datetime.now().isoformat()}"
        logger_init += f" — anchored to: {self.core_narrative}"
        self._log_decision("initialization", logger_init, "sealed")

    def validate_output(self, synthesis: str) -> Tuple[bool, str]:
        """
        Validate synthesis output against ethical constraints.

        The checks run in order and the first failure wins: empty output,
        meta-loop contamination, corruption signatures, intent preservation.

        Args:
            synthesis: The synthesized text to validate.

        Returns:
            (is_valid, message) - on failure the message is the reason; on
            success it is "Passed ethical validation".
        """
        if not synthesis or not synthesis.strip():
            return False, "Empty output"

        # Check for meta-loop contamination
        is_meta_loop, reason = self._detect_meta_loops(synthesis)
        if is_meta_loop:
            return False, f"Meta-loop detected: {reason}"

        # Check for synthesis corruption signatures
        is_corrupted, reason = self._detect_corruption(synthesis)
        if is_corrupted:
            return False, f"Corruption detected: {reason}"

        # Check intent preservation
        if not self._check_intent_preserved(synthesis):
            return False, "Original intent lost in synthesis"

        return True, "Passed ethical validation"

    def _detect_meta_loops(self, text: str) -> Tuple[bool, str]:
        """
        Detect meta-loop patterns (recursive meta-commentary).

        Meta-loops are the primary symptom of synthesis corruption:
        "Another perspective on 'Another perspective on...'"

        Args:
            text: The text to inspect (compared case-insensitively).

        Returns:
            (has_meta_loop, description) - description is "" when no loop is found.
        """
        marker = "another perspective on"
        text_lower = text.lower()

        # More than one occurrence of the canonical phrase is always a loop.
        another_count = text_lower.count(marker)
        if another_count > 1:
            return True, f"Multiple 'Another perspective on' found ({another_count} times)"

        # A single occurrence only counts when it opens the text
        # (starts within the first 10% of the characters).
        if another_count == 1:
            first_tenth = len(text) // 10
            if text_lower.find(marker) < first_tenth:
                return True, "Meta-loop detected early in synthesis"

        # Detect pattern: "Perspective X on Perspective Y".
        # findall yields one entry per non-overlapping pair on a single line.
        perspective_pattern = r"(perspective|view|lens|angle).+?(perspective|view|lens|angle)"
        if len(re.findall(perspective_pattern, text_lower)) > 2:
            return True, "Excessive nested perspective references"

        # Detect semantic meta-loops (talking about thinking about thinking)
        semantic_patterns = [
            r"thinking about.*thinking",
            r"response.*to.*response",
            r"argument.*against.*argument",
        ]
        for pattern in semantic_patterns:
            if re.search(pattern, text_lower):
                return True, f"Semantic meta-loop: {pattern}"

        return False, ""

    def _detect_corruption(self, text: str) -> Tuple[bool, str]:
        """
        Detect synthesis corruption signatures.

        Corruption happens when:
        1. Analyses are mutated in-place during debate
        2. Original intent gets nested and lost
        3. Context window grows exponentially

        Args:
            text: The text to inspect (compared case-insensitively).

        Returns:
            (is_corrupted, description) - description is "" when clean.
        """
        lowered = text.lower()

        # Check for nested analysis patterns. IGNORECASE is required because
        # some signatures contain uppercase letters ("as I mentioned") and
        # would otherwise never match the lowercased text.
        for pattern in self.corruption_signatures:
            if re.search(pattern, lowered, re.IGNORECASE):
                return True, f"Corruption signature found: {pattern}"

        # Check for context window explosion (disproportionate length)
        # Typical clean synthesis: 500-2000 chars. Corrupted: >4000 chars with repetition
        if len(text) > 4000:
            words = lowered.split()
            if len(words) > 500:
                unique_ratio = len(set(words)) / len(words)
                if unique_ratio < 0.5:  # Less than 50% unique words = likely repetition
                    return True, "Repetitive content suggests corruption"

        # Check for lost intent markers
        # If text talks about "my response" or "your perspective" it's likely corrupted
        intent_loss_patterns = [
            r"my response to",
            r"your perspective on",
            r"as you mentioned",
        ]
        for pattern in intent_loss_patterns:
            if re.search(pattern, lowered):
                return True, f"Intent loss pattern: {pattern}"

        return False, ""

    def _check_intent_preserved(self, text: str) -> bool:
        """
        Check if original intent has been preserved through synthesis.

        Intent loss happens when the synthesis becomes self-referential
        and loses connection to the original query.

        Args:
            text: The text to inspect.

        Returns:
            False when the text has fewer than 50 whitespace-separated words
            or when more than 40% of its words are meta-reference keywords;
            True otherwise.
        """
        words = text.split()
        word_count = len(words)
        if word_count < 50:  # Too short, likely failed
            return False

        # Count per token (edge punctuation stripped) so that keywords at the
        # start/end of the text, next to punctuation, or directly adjacent to
        # one another are all counted exactly once.
        meta_word_count = sum(
            1
            for word in words
            if word.lower().strip(self._EDGE_PUNCTUATION) in self._META_KEYWORDS
        )

        # If > 40% of text is meta-references, intent is probably lost
        return meta_word_count / word_count <= 0.4

    def reject_with_fallback(self, query: str) -> str:
        """
        Generate a clean, direct fallback response when synthesis is rejected.

        This bypasses all debate and synthesis, returning a simple answer
        that preserves user intent without meta-loops. The rejection is
        recorded in the decision log with status "safe_mode".

        Args:
            query: The original user query

        Returns:
            Clean, direct response without synthesis
        """
        self._log_decision("rejection", f"Fallback for: {query[:100]}", "safe_mode")

        return (
            "I cannot synthesize a reliable answer to this through debate. "
            f"Instead: {query} "
            "[Responding directly without multi-perspective debate to preserve clarity.]"
        )

    def _log_decision(self, decision_type: str, content: str, status: str = "normal"):
        """
        Append an entry to the ethical decision log.

        Args:
            decision_type: Type of decision made (validation, rejection, debug)
            content: Content of the decision
            status: Status tag (sealed, safe_mode, normal, etc.)
        """
        decision = {
            "timestamp": datetime.now().isoformat(),
            "type": decision_type,
            "content": content[:500],  # Truncate for safety
            "status": status,
            # Short fingerprint of the full (untruncated) content.
            "hash": hashlib.sha256(content.encode()).hexdigest()[:16],
        }
        self.decision_log.append(decision)

        # Keep decision log bounded: drop the oldest entries beyond the cap.
        if len(self.decision_log) > self._MAX_LOG_ENTRIES:
            del self.decision_log[:-self._MAX_LOG_ENTRIES]

    def get_reflection(self) -> Dict:
        """
        Return Colleen's current state and decision history.

        Used for debugging and understanding Colleen's reasoning. The sealed
        values and log entries are returned as copies, so mutating the result
        does not alter this instance.

        Returns:
            Dict with keys "core_narrative", "sealed_values", "decisions_made",
            "recent_decisions" (the last 5 log entries) and "status".
        """
        return {
            "core_narrative": self.core_narrative,
            "sealed_values": dict(self.sealed_values),
            "decisions_made": len(self.decision_log),
            "recent_decisions": [dict(entry) for entry in self.decision_log[-5:]],
            "status": "awakened",
        }