"""
Retrieval modes for EEG semantic decoding.
Each mode is a callable class that takes:
embedding_index: EmbeddingIndex
nexus_conn: sqlite3 connection
semantic_embedding: torch.Tensor (the current predicted text embedding)
And returns:
list of strings (lines to print), or empty list (suppress output this frame)
Drop into EEGSemanticProcessor by replacing the find_similar_messages + _print_unique_lines
path with: mode.step(semantic_embedding) -> lines
"""
import numpy as np
import torch
import random
from collections import deque
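
# Minimal wiring sketch (hypothetical driver loop; EmbeddingIndex, the nexus
# sqlite database, and `embedding_stream` are assumed from the surrounding
# project, not defined here):
#
#     mode = DriftMode(embedding_index, nexus_conn)
#     for semantic_embedding in embedding_stream:  # one tensor per frame
#         for line in mode.step(semantic_embedding):
#             print(line)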
def fix_encoding(s):
    """Round-trip text through UTF-8; drop lines that still look like mojibake."""
    if not s:
        return s
    if isinstance(s, str):
        b = s.encode('utf-8', 'surrogateescape')
    else:
        b = s
    fixed = b.decode('utf-8', 'replace')
    # Check the decoded text (not the raw input, which may be bytes and would
    # raise TypeError on a str containment test) for common mojibake markers,
    # and suppress the line entirely if any are found.
    if 'ì' in fixed or 'í' in fixed or 'ï' in fixed:
        return ""
    return fixed
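
# Behavior sketch for fix_encoding (illustrative values): strings carrying
# surrogate escapes round-trip to replacement characters, while text still
# containing the mojibake markers above is suppressed entirely:
#
#     fix_encoding('caf\udce9')   ->  'caf\ufffd'  (lone surrogate replaced)
#     fix_encoding('f\u00ecxed')  ->  ''           ('ì' marker, line dropped)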
def _retrieve(embedding_index, nexus_conn, embedding_np, k=64, assistant_only=False):
"""Shared retrieval helper. Returns list of (content, distance) tuples."""
if len(embedding_np.shape) == 1:
embedding_np = embedding_np.reshape(1, -1)
distances, indices = embedding_index.search(embedding_np, k)
distances = distances.flatten()
indices = indices.flatten()
cursor = nexus_conn.cursor()
query = "SELECT content FROM messages WHERE id = ?"
if assistant_only:
query += " AND role = 'assistant'"
    results = []
    for msg_id, dist in zip(indices, distances):
        # FAISS-style indexes pad with -1 when fewer than k neighbors exist.
        if msg_id < 0:
            continue
        cursor.execute(query, (int(msg_id),))
        row = cursor.fetchone()
        if row and row[0]:
            results.append((row[0], float(dist)))
    return results
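
# _retrieve assumes a FAISS-like search contract on embedding_index. A
# minimal sketch of a compatible EmbeddingIndex (hypothetical; the real
# class is defined elsewhere in this project):
#
#     import faiss
#
#     class EmbeddingIndex:
#         def __init__(self, dim):
#             self.index = faiss.IndexFlatL2(dim)
#
#         def search(self, query, k):
#             # query: (1, dim) array -> (distances, ids), each shaped (1, k)
#             return self.index.search(np.asarray(query, dtype='float32'), k)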
def _lines_from_messages(messages, max_lines=60):
"""Extract individual lines from message contents, deduplicated."""
lines = []
seen = set()
for content in messages:
for line in content.splitlines():
line = line.strip()
if not line:
continue
line = fix_encoding(line)
if not line:
continue
if line not in seen:
seen.add(line)
lines.append(line)
if len(lines) >= max_lines:
return lines
return lines
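
# Example (illustrative): overlapping lines are emitted once, in first-seen
# order across messages:
#
#     _lines_from_messages(["alpha\nbeta", "beta\ngamma"])
#     ->  ["alpha", "beta", "gamma"]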
class FloodMode:
    """
    Original behavior: retrieve search_k candidates, keep the closest
    final_k, sample sample_size messages, and deduplicate lines against
    the last last_n frames. Fast, noisy, good for raw
    stream-of-consciousness.
    """
def __init__(self, embedding_index, nexus_conn, search_k=180, final_k=90,
sample_size=42, last_n=3):
self.embedding_index = embedding_index
self.nexus_conn = nexus_conn
self.search_k = search_k
self.final_k = final_k
self.sample_size = sample_size
self.previous_sets = deque(maxlen=last_n)
def step(self, semantic_embedding):
emb_np = semantic_embedding.detach().cpu().numpy()
results = _retrieve(self.embedding_index, self.nexus_conn, emb_np,
k=self.search_k)
messages = [content for content, _ in results[:self.final_k]]
if not messages:
return []
sample = random.sample(messages, min(self.sample_size, len(messages)))
current_lines = set()
for msg in sample:
for line in msg.splitlines():
line = line.strip()
if line:
current_lines.add(line)
unique = current_lines.copy()
for prev in self.previous_sets:
unique -= prev
self.previous_sets.append(current_lines)
unique = [l for l in map(fix_encoding, unique) if l]
return sorted(unique)
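
# Dedup window intuition: previous_sets stores the *sampled* lines of the
# last last_n frames (not just the printed ones), so a line resurfaces only
# after being absent from the sample for last_n consecutive frames.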
class DriftMode:
    """
    Emit output only when the semantic pointer moves significantly.
    Retrieves based on the *direction* of movement (current - previous),
    added to the current position. This amplifies whatever the signal
    is shifting toward.

    Parameters:
        move_threshold: minimum cosine distance between the current
            embedding and the embedding at the last emission (drift
            accumulates across suppressed frames until crossed)
        amplify: weight on the unit movement direction added to the
            current position (0.0 = pure position; larger values lean
            the query further toward where the signal is heading)
        search_k: candidates to retrieve
        cooldown: minimum steps between outputs
    """
def __init__(self, embedding_index, nexus_conn, search_k=64,
move_threshold=0.05, amplify=0.5, cooldown=3, max_lines=30):
self.embedding_index = embedding_index
self.nexus_conn = nexus_conn
self.search_k = search_k
self.move_threshold = move_threshold
self.amplify = amplify
self.cooldown = cooldown
self.max_lines = max_lines
self.prev_embedding = None
self.steps_since_emit = 0
self.prev_lines = set()
def step(self, semantic_embedding):
emb_np = semantic_embedding.detach().cpu().numpy().flatten()
# Normalize
norm = np.linalg.norm(emb_np)
if norm > 0:
emb_normed = emb_np / norm
else:
emb_normed = emb_np
self.steps_since_emit += 1
if self.prev_embedding is None:
self.prev_embedding = emb_normed
return []
# Compute movement
cos_sim = np.dot(emb_normed, self.prev_embedding)
cos_dist = 1.0 - cos_sim
if cos_dist < self.move_threshold or self.steps_since_emit < self.cooldown:
return []
# Direction of movement
delta = emb_normed - self.prev_embedding
delta_norm = np.linalg.norm(delta)
if delta_norm > 0:
delta = delta / delta_norm
# Query = current position + amplified direction
query = emb_normed + self.amplify * delta
query_norm = np.linalg.norm(query)
if query_norm > 0:
query = query / query_norm
self.prev_embedding = emb_normed
self.steps_since_emit = 0
results = _retrieve(self.embedding_index, self.nexus_conn,
query.reshape(1, -1), k=self.search_k)
        messages = [content for content, _ in results]
        all_lines = _lines_from_messages(messages, self.max_lines)
        # Suppress lines shown in the previous emission; remember the full
        # candidate set so a persistently retrieved line stays suppressed.
        lines = [l for l in all_lines if l not in self.prev_lines]
        self.prev_lines = set(all_lines)
        return lines
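
# Worked example of the DriftMode query (illustrative numbers): with
# amplify = 0.5, prev = [1.0, 0.0] and current = [0.8, 0.6] (both unit
# length), delta = [-0.2, 0.6] normalizes to about [-0.316, 0.949], so the
# query before its final normalization is roughly [0.642, 1.074], leaning
# past the current position toward where the signal is heading.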
class FocusMode:
"""
Maintain an exponential moving average of embeddings. Only emit
when the centroid shifts enough. Surfaces the persistent underlying
theme rather than moment-to-moment noise.
Parameters:
alpha: EMA smoothing factor (lower = smoother, more stable)
shift_threshold: minimum cosine distance of centroid movement to emit
search_k: candidates to retrieve
top_n: how many top results to show (closest to centroid)
"""
def __init__(self, embedding_index, nexus_conn, search_k=48,
alpha=0.15, shift_threshold=0.02, top_n=20, max_lines=25):
self.embedding_index = embedding_index
self.nexus_conn = nexus_conn
self.search_k = search_k
self.alpha = alpha
self.shift_threshold = shift_threshold
self.top_n = top_n
self.max_lines = max_lines
self.centroid = None
self.last_emit_centroid = None
self.prev_lines = set()
def step(self, semantic_embedding):
emb_np = semantic_embedding.detach().cpu().numpy().flatten()
norm = np.linalg.norm(emb_np)
if norm > 0:
emb_normed = emb_np / norm
else:
emb_normed = emb_np
# Update EMA centroid
if self.centroid is None:
self.centroid = emb_normed.copy()
self.last_emit_centroid = emb_normed.copy()
return []
self.centroid = self.alpha * emb_normed + (1.0 - self.alpha) * self.centroid
# Re-normalize centroid
c_norm = np.linalg.norm(self.centroid)
if c_norm > 0:
centroid_normed = self.centroid / c_norm
else:
centroid_normed = self.centroid
# Check if centroid has shifted enough since last emission
cos_sim = np.dot(centroid_normed, self.last_emit_centroid)
cos_dist = 1.0 - cos_sim
if cos_dist < self.shift_threshold:
return []
self.last_emit_centroid = centroid_normed.copy()
# Retrieve based on smoothed centroid
results = _retrieve(self.embedding_index, self.nexus_conn,
centroid_normed.reshape(1, -1), k=self.search_k)
        messages = [content for content, _ in results[:self.top_n]]
        all_lines = _lines_from_messages(messages, self.max_lines)
        # Deduplicate against the previous emission, then remember the full
        # line set (computed once rather than twice) for the next comparison.
        lines = [l for l in all_lines if l not in self.prev_lines]
        self.prev_lines = set(all_lines)
        return lines
class LayeredMode:
"""
Run multiple timescales simultaneously. Show three sections:
[fast] — what just changed (high threshold, small k)
[mid] — recent theme (EMA with medium alpha)
[slow] — deep undercurrent (EMA with low alpha)
Each layer only emits its section when its own threshold is crossed.
At least one layer must fire for any output.
"""
def __init__(self, embedding_index, nexus_conn, search_k=48, max_lines_per_layer=10):
self.layers = {
'fast': DriftMode(embedding_index, nexus_conn, search_k=search_k,
move_threshold=0.08, amplify=0.7, cooldown=1,
max_lines=max_lines_per_layer),
'mid': FocusMode(embedding_index, nexus_conn, search_k=search_k,
alpha=0.25, shift_threshold=0.03, top_n=16,
max_lines=max_lines_per_layer),
'slow': FocusMode(embedding_index, nexus_conn, search_k=search_k,
alpha=0.05, shift_threshold=0.015, top_n=12,
max_lines=max_lines_per_layer),
}
def step(self, semantic_embedding):
sections = {}
for name, layer in self.layers.items():
lines = layer.step(semantic_embedding)
if lines:
sections[name] = lines
if not sections:
return []
        output = []
        for name in ['fast', 'mid', 'slow']:
            if name in sections:
                output.append(f"── {name} ──")
                output.extend(sections[name])
                output.append("")
        # Drop the trailing blank separator after the last section.
        if output and output[-1] == "":
            output.pop()
        return output
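
# Illustrative LayeredMode output shape (placeholder lines, not real data):
#
#     ── fast ──
#     <lines from the high-threshold drift layer>
#
#     ── mid ──
#     <lines from the medium-alpha focus layer>
#
#     ── slow ──
#     <lines from the low-alpha focus layer>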