"""
Retrieval modes for EEG semantic decoding.

Each mode is a callable class that takes:
    embedding_index: EmbeddingIndex
    nexus_conn: sqlite3 connection
    semantic_embedding: torch.Tensor  (the current predicted text embedding)

And returns:
    list of strings (lines to print), or empty list (suppress output this frame)

Drop into EEGSemanticProcessor by replacing the find_similar_messages + _print_unique_lines
path with:   mode.step(semantic_embedding) -> lines
"""

import random
from collections import deque

import numpy as np
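
# Hypothetical integration sketch (EEGSemanticProcessor internals are assumed
# here, not defined by this module). Inside the processor, the old
# find_similar_messages + _print_unique_lines path becomes roughly:
#
#     self.mode = FocusMode(self.embedding_index, self.nexus_conn)
#     ...
#     for line in self.mode.step(semantic_embedding):
#         print(line)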


def fix_encoding(s):
    """Best-effort mojibake repair; returns "" for lines that still look corrupted."""
    if not s:
        return s
    if isinstance(s, str):
        b = s.encode('utf-8', 'surrogateescape')
    else:
        b = s
    fixed = b.decode('utf-8', 'replace')
    # Check the decoded text, not the input: `s` may be bytes, where a
    # `'ì' in s` membership test would raise TypeError.
    if 'ì' in fixed or 'í' in fixed or 'ï' in fixed:
        return ""
    return fixed


def _retrieve(embedding_index, nexus_conn, embedding_np, k=64, assistant_only=False):
    """Shared retrieval helper. Returns list of (content, distance) tuples."""
    if len(embedding_np.shape) == 1:
        embedding_np = embedding_np.reshape(1, -1)

    # EmbeddingIndex.search is assumed to be faiss-like, returning
    # (distances, indices) arrays of shape (n_queries, k).
    distances, indices = embedding_index.search(embedding_np, k)
    distances = distances.flatten()
    indices = indices.flatten()

    cursor = nexus_conn.cursor()
    query = "SELECT content FROM messages WHERE id = ?"
    if assistant_only:
        query += " AND role = 'assistant'"

    results = []
    for msg_id, dist in zip(indices, distances):
        cursor.execute(query, (int(msg_id),))
        row = cursor.fetchone()
        if row and row[0]:
            results.append((row[0], float(dist)))
    return results


def _lines_from_messages(messages, max_lines=60):
    """Extract individual lines from message contents, deduplicated."""
    lines = []
    seen = set()
    for content in messages:
        for line in content.splitlines():
            line = line.strip()
            if not line:
                continue
            line = fix_encoding(line)
            if not line:
                continue
            if line not in seen:
                seen.add(line)
                lines.append(line)
            if len(lines) >= max_lines:
                return lines
    return lines


class FloodMode:
    """
    Original behavior: retrieve k candidates, sample, deduplicate against
    recent windows. Fast, noisy, good for raw stream-of-consciousness.
    """

    def __init__(self, embedding_index, nexus_conn, search_k=180, final_k=90,
                 sample_size=42, last_n=3):
        self.embedding_index = embedding_index
        self.nexus_conn = nexus_conn
        self.search_k = search_k
        self.final_k = final_k
        self.sample_size = sample_size
        self.previous_sets = deque(maxlen=last_n)

    def step(self, semantic_embedding):
        emb_np = semantic_embedding.detach().cpu().numpy()
        results = _retrieve(self.embedding_index, self.nexus_conn, emb_np,
                           k=self.search_k)

        messages = [content for content, _ in results[:self.final_k]]
        if not messages:
            return []

        sample = random.sample(messages, min(self.sample_size, len(messages)))

        current_lines = set()
        for msg in sample:
            for line in msg.splitlines():
                line = line.strip()
                if line:
                    current_lines.add(line)

        # Suppress every line shown in any of the last `last_n` frames,
        # then record this frame's full line set for future suppression.
        unique = current_lines.copy()
        for prev in self.previous_sets:
            unique -= prev
        self.previous_sets.append(current_lines)

        unique = [l for l in map(fix_encoding, unique) if l]
        return sorted(unique)


class DriftMode:
    """
    Emit output only when the semantic pointer moves significantly.
    Retrieves based on the *direction* of movement (current - previous),
    added to the current position. This amplifies whatever the signal
    is shifting toward.

    Parameters:
        move_threshold: minimum cosine distance between consecutive
                        embeddings to trigger output
        amplify: how much to weight the delta (1.0 = pure direction,
                 0.0 = pure position)
        search_k: candidates to retrieve
        cooldown: minimum steps between outputs
    """

    def __init__(self, embedding_index, nexus_conn, search_k=64,
                 move_threshold=0.05, amplify=0.5, cooldown=3, max_lines=30):
        self.embedding_index = embedding_index
        self.nexus_conn = nexus_conn
        self.search_k = search_k
        self.move_threshold = move_threshold
        self.amplify = amplify
        self.cooldown = cooldown
        self.max_lines = max_lines

        self.prev_embedding = None
        self.steps_since_emit = 0
        self.prev_lines = set()

    def step(self, semantic_embedding):
        emb_np = semantic_embedding.detach().cpu().numpy().flatten()

        # Normalize
        norm = np.linalg.norm(emb_np)
        if norm > 0:
            emb_normed = emb_np / norm
        else:
            emb_normed = emb_np

        self.steps_since_emit += 1

        if self.prev_embedding is None:
            self.prev_embedding = emb_normed
            return []

        # Compute movement
        cos_sim = np.dot(emb_normed, self.prev_embedding)
        cos_dist = 1.0 - cos_sim

        if cos_dist < self.move_threshold or self.steps_since_emit < self.cooldown:
            return []

        # Direction of movement
        delta = emb_normed - self.prev_embedding
        delta_norm = np.linalg.norm(delta)
        if delta_norm > 0:
            delta = delta / delta_norm

        # Query = current position + amplified direction
        query = emb_normed + self.amplify * delta
        query_norm = np.linalg.norm(query)
        if query_norm > 0:
            query = query / query_norm

        self.prev_embedding = emb_normed
        self.steps_since_emit = 0

        results = _retrieve(self.embedding_index, self.nexus_conn,
                           query.reshape(1, -1), k=self.search_k)

        messages = [content for content, _ in results]
        lines = _lines_from_messages(messages, self.max_lines)

        # Remove lines seen in previous emission
        lines = [l for l in lines if l not in self.prev_lines]
        self.prev_lines = set(lines)

        return lines


class FocusMode:
    """
    Maintain an exponential moving average of embeddings. Only emit
    when the centroid shifts enough. Surfaces the persistent underlying
    theme rather than moment-to-moment noise.

    Parameters:
        alpha: EMA smoothing factor (lower = smoother, more stable)
        shift_threshold: minimum cosine distance of centroid movement to emit
        search_k: candidates to retrieve
        top_n: how many top results to show (closest to centroid)
    """

    def __init__(self, embedding_index, nexus_conn, search_k=48,
                 alpha=0.15, shift_threshold=0.02, top_n=20, max_lines=25):
        self.embedding_index = embedding_index
        self.nexus_conn = nexus_conn
        self.search_k = search_k
        self.alpha = alpha
        self.shift_threshold = shift_threshold
        self.top_n = top_n
        self.max_lines = max_lines

        self.centroid = None
        self.last_emit_centroid = None
        self.prev_lines = set()

    def step(self, semantic_embedding):
        emb_np = semantic_embedding.detach().cpu().numpy().flatten()

        norm = np.linalg.norm(emb_np)
        if norm > 0:
            emb_normed = emb_np / norm
        else:
            emb_normed = emb_np

        # Update EMA centroid
        if self.centroid is None:
            self.centroid = emb_normed.copy()
            self.last_emit_centroid = emb_normed.copy()
            return []

        self.centroid = self.alpha * emb_normed + (1.0 - self.alpha) * self.centroid

        # Re-normalize centroid
        c_norm = np.linalg.norm(self.centroid)
        if c_norm > 0:
            centroid_normed = self.centroid / c_norm
        else:
            centroid_normed = self.centroid

        # Check if centroid has shifted enough since last emission
        cos_sim = np.dot(centroid_normed, self.last_emit_centroid)
        cos_dist = 1.0 - cos_sim

        if cos_dist < self.shift_threshold:
            return []

        self.last_emit_centroid = centroid_normed.copy()

        # Retrieve based on smoothed centroid
        results = _retrieve(self.embedding_index, self.nexus_conn,
                           centroid_normed.reshape(1, -1), k=self.search_k)

        messages = [content for content, _ in results[:self.top_n]]
        all_lines = _lines_from_messages(messages, self.max_lines)

        # Deduplicate against the previous emission; remember the full set
        # (not just the newly shown lines) so repeats stay suppressed.
        lines = [l for l in all_lines if l not in self.prev_lines]
        self.prev_lines = set(all_lines)

        return lines


class LayeredMode:
    """
    Run multiple timescales simultaneously. Show three sections:

      [fast]  — what just changed (high threshold, small k)
      [mid]   — recent theme (EMA with medium alpha)
      [slow]  — deep undercurrent (EMA with low alpha)

    Each layer only emits its section when its own threshold is crossed.
    At least one layer must fire for any output.
    """

    def __init__(self, embedding_index, nexus_conn, search_k=48, max_lines_per_layer=10):
        self.layers = {
            'fast': DriftMode(embedding_index, nexus_conn, search_k=search_k,
                             move_threshold=0.08, amplify=0.7, cooldown=1,
                             max_lines=max_lines_per_layer),
            'mid': FocusMode(embedding_index, nexus_conn, search_k=search_k,
                            alpha=0.25, shift_threshold=0.03, top_n=16,
                            max_lines=max_lines_per_layer),
            'slow': FocusMode(embedding_index, nexus_conn, search_k=search_k,
                             alpha=0.05, shift_threshold=0.015, top_n=12,
                             max_lines=max_lines_per_layer),
        }

    def step(self, semantic_embedding):
        sections = {}
        for name, layer in self.layers.items():
            lines = layer.step(semantic_embedding)
            if lines:
                sections[name] = lines

        if not sections:
            return []

        output = []
        for name in ['fast', 'mid', 'slow']:
            if name in sections:
                output.append(f"── {name} ──")
                output.extend(sections[name])
                output.append("")

        return output
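

if __name__ == "__main__":
    # Minimal smoke test, not part of the production path. FakeIndex stands
    # in for EmbeddingIndex under the assumption of a faiss-like
    # search(query, k) -> (distances, indices) contract, and the in-memory
    # sqlite database mimics the expected messages(id, content, role) schema.
    import sqlite3

    import torch

    DIM = 16

    class FakeIndex:
        """Brute-force L2 stand-in for the real EmbeddingIndex."""

        def __init__(self, vectors):
            self.vectors = vectors  # (n, DIM) float32

        def search(self, query, k):
            # Squared L2 distance to every stored vector, closest first.
            dists = ((self.vectors - query.reshape(1, -1)) ** 2).sum(axis=1)
            order = np.argsort(dists)[:k]
            return dists[order].reshape(1, -1), order.reshape(1, -1)

    rng = np.random.default_rng(0)
    n_msgs = 50
    index = FakeIndex(rng.normal(size=(n_msgs, DIM)).astype(np.float32))

    conn = sqlite3.connect(":memory:")
    conn.execute("CREATE TABLE messages (id INTEGER PRIMARY KEY,"
                 " content TEXT, role TEXT)")
    for i in range(n_msgs):
        conn.execute("INSERT INTO messages VALUES (?, ?, ?)",
                     (i, f"message {i}: first line\nsecond line {i}", "assistant"))
    conn.commit()

    modes = [
        FloodMode(index, conn, search_k=20, final_k=10, sample_size=5),
        DriftMode(index, conn, search_k=10),
        FocusMode(index, conn, search_k=10),
        LayeredMode(index, conn, search_k=10),
    ]
    for mode in modes:
        print(f"=== {type(mode).__name__} ===")
        for _ in range(5):
            for line in mode.step(torch.randn(DIM)):
                print(line)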