Spaces:

Executor-Tyrant-Framework
/

NuWave

Running

App Files Files Community

NuWave / nuwave /substrate /ng_embed.py

Executor-Tyrant-Framework

Initial commit

4c0cf4e about 1 month ago

raw

history blame contribute delete

22.3 kB

	"""
	ng_embed — Centralized embedding service for the E-T Systems ecosystem.

	Singleton embedding engine used by every module. Provides:
	1. Unified embedding via Snowflake/snowflake-arctic-embed-m-v1.5 (ONNX)
	2. Dual-pass embedding (forest + trees) via TID concept extraction
	3. Thread-safe singleton — one model instance per process
	4. Hash fallback when ONNX model unavailable

	This is a VENDORED file. Canonical source: ~/NeuroGraph/ng_embed.py
	Do NOT modify vendored copies. Make changes in the canonical source, then re-vendor everywhere.

	Model: Snowflake/snowflake-arctic-embed-m-v1.5
	- 768-dim, CLS pooling, standard BERT architecture
	- Query prefix: "Represent this sentence for searching relevant passages: "
	- Documents: no prefix
	- ONNX quantized (~110MB) via onnxruntime — no PyTorch dependency

	Dual-pass (Punchlist #81 — Josh's invention):
	Pass 1 (Forest): Gestalt embedding of whole content. One node.
	Pass 2 (Trees): LLM extracts concepts via TID. Each concept embedded
	separately. Each tree linked to its forest via synapses. Cross-document
	tree links form naturally through similarity association.

	# ---- Changelog ----
	# [2026-03-22] Claude (Opus 4.6) — Initial creation.
	# What: Centralized embedding + dual-pass for entire ecosystem.
	# Why: PRD §5 (Dual_Pass_Embedding_Implementation.md). Replaces 7+
	# identical _embed() functions. Prevents embedding dimension
	# mismatch incidents. Upgrades model from bge-base-en-v1.5 to
	# snowflake-arctic-embed-m-v1.5 (+1.89 retrieval MTEB).
	# How: ONNX Runtime + tokenizers for embedding. TID for concept
	# extraction. Substrate-learnable gate for Pass 2 value.
	# -------------------
	"""

	from __future__ import annotations

	import hashlib
	import json
	import logging
	import os
	import threading
	import time
	from pathlib import Path
	from typing import Any, Callable, Dict, List, Optional, TYPE_CHECKING

	import numpy as np

	if TYPE_CHECKING:
	from ng_ecosystem import NGEcosystem

	logger = logging.getLogger("ng_embed")

	# ---------------------------------------------------------------------------
	# Default settings. Every value is bootstrap scaffolding: NGEmbed copies
	# this dict and layers any caller-supplied overrides on top of it.
	# ---------------------------------------------------------------------------

	_DEFAULT_CONFIG = {
	    # --- Model ---------------------------------------------------------
	    "model_id": "Snowflake/snowflake-arctic-embed-m-v1.5",
	    "onnx_filename": "onnx/model_quantized.onnx",
	    "embedding_dim": 768,
	    "pooling": "cls",
	    # Prepended to queries only; documents are embedded without a prefix.
	    "query_prefix": "Represent this sentence for searching relevant passages: ",
	    "document_prefix": "",
	    "cache_dir": str(Path.home().joinpath(".cache", "ng_embed")),

	    # --- Dual-pass (Punchlist #81) -------------------------------------
	    "tid_endpoint": "http://127.0.0.1:7437/v1/chat/completions",
	    # Number of characters of content sent to TID.
	    "max_content_for_extraction": 2000,
	    # Upper bound on extracted concepts kept per call.
	    "max_concepts": 20,
	    # Bootstrap synapse weight.
	    "forest_to_tree_weight": 0.4,
	    # tree→forest = forest_weight * ratio
	    "tree_to_forest_ratio": 0.7,
	    # Seconds.
	    "tid_timeout": 30,
	    # "auto" lets TID route to the appropriate model.
	    "tid_model": "auto",
	    "tid_temperature": 0.2,
	    "tid_max_tokens": 500,
	}

	# Prompt template sent as the user message for concept extraction.
	# This is extraction, not classification or labeling: the LLM reads the
	# content and lists the distinct concepts it finds. It runs at the
	# ingestion boundary as a tool that helps the substrate receive richer
	# raw experience (Law 7).
	# The template is filled with str.format(content=...), so any literal
	# brace added to the text below must be doubled ({{ and }}).
	_EXTRACTION_PROMPT = """Extract the key concepts, terms, and specific references from this text. Return them as a JSON array of short strings, each one a distinct concept or term mentioned in the text.

	Focus on:
	- Specific technical terms
	- Named entities (people, tools, systems)
	- Domain-specific concepts
	- Action descriptions
	- Relationships between things

	Return ONLY a JSON array of strings. No explanation. Example: ["concept one", "concept two", "specific term"]

	Text:
	{content}"""


	# ---------------------------------------------------------------------------
	# NGEmbed — The singleton embedding service
	# ---------------------------------------------------------------------------

	class NGEmbed:
	    """Centralized embedding engine for the E-T Systems ecosystem.

	    One instance per process is shared by all modules via
	    :meth:`get_instance`. The ONNX session and tokenizer are loaded lazily
	    on first use; if loading fails, the instance remembers the failure and
	    serves a deterministic hash-based embedding instead.

	    Provides single-pass embedding (:meth:`embed`, :meth:`embed_batch`) and
	    dual-pass learning (:meth:`dual_record_outcome`): a forest embedding of
	    the whole content plus one tree embedding per concept extracted via TID.

	    Usage:
	        from ng_embed import embed, embed_batch

	        vec = embed("some text")                   # document embedding
	        vec = embed("query text", is_query=True)   # with query prefix
	        vec = embed("text", normalize=True)        # L2-normalized (Praxis)

	        vecs = embed_batch(["text1", "text2"])     # batch embedding
	    """

	    _instance: Optional["NGEmbed"] = None
	    _lock = threading.Lock()

	    def __init__(self, config: Optional[Dict[str, Any]] = None):
	        """Create an engine; the model itself is loaded lazily.

	        Args:
	            config: Optional overrides merged on top of ``_DEFAULT_CONFIG``.
	        """
	        self._config = dict(_DEFAULT_CONFIG)
	        if config:
	            self._config.update(config)

	        self._session = None  # ONNX InferenceSession (lazy)
	        self._tokenizer = None  # tokenizers.Tokenizer (lazy)
	        self._model_loaded = False
	        self._model_failed = False
	        self._model_lock = threading.Lock()

	        # Dual-pass stats
	        self._extractions = 0
	        self._concepts_total = 0
	        self._failures = 0

	    # -- Singleton -----------------------------------------------------------

	    @classmethod
	    def get_instance(cls, config: Optional[Dict[str, Any]] = None) -> "NGEmbed":
	        """Return the shared instance, creating it on the first call.

	        ``config`` is only used by the call that actually creates the
	        instance; once an instance exists it is returned unchanged and
	        ``config`` is ignored.
	        """
	        if cls._instance is not None:
	            return cls._instance
	        with cls._lock:
	            # Re-check under the lock: another thread may have created it.
	            if cls._instance is None:
	                cls._instance = cls(config)
	            return cls._instance

	    @classmethod
	    def reset_instance(cls) -> None:
	        """Destroy singleton (testing only)."""
	        with cls._lock:
	            if cls._instance is not None:
	                cls._instance._session = None
	                cls._instance._tokenizer = None
	            cls._instance = None

	    # -- Model loading -------------------------------------------------------

	    def _ensure_model(self) -> bool:
	        """Lazy-load the ONNX model and tokenizer on first use.

	        Returns:
	            True when the session and tokenizer are ready. False when loading
	            failed; the failure is remembered, so loading is attempted at
	            most once per instance and callers use the hash fallback.
	        """
	        if self._model_loaded:
	            return True
	        if self._model_failed:
	            return False

	        with self._model_lock:
	            # Re-check under the lock: another thread may have finished
	            # (or failed) loading while this one was waiting.
	            if self._model_loaded:
	                return True
	            if self._model_failed:
	                return False

	            try:
	                import onnxruntime as ort
	                from huggingface_hub import hf_hub_download
	                from tokenizers import Tokenizer

	                model_id = self._config["model_id"]
	                cache_dir = self._config["cache_dir"]
	                os.makedirs(cache_dir, exist_ok=True)

	                # Resolve the ONNX file through the Hugging Face hub.
	                onnx_path = hf_hub_download(
	                    repo_id=model_id,
	                    filename=self._config["onnx_filename"],
	                    cache_dir=cache_dir,
	                )

	                # Load ONNX session (CPU, optimized).
	                sess_opts = ort.SessionOptions()
	                sess_opts.graph_optimization_level = (
	                    ort.GraphOptimizationLevel.ORT_ENABLE_ALL
	                )
	                # os.cpu_count() returns None when the count cannot be
	                # determined; treat that as a single core instead of
	                # failing the whole model load with a TypeError.
	                sess_opts.intra_op_num_threads = max(
	                    1, (os.cpu_count() or 1) // 2
	                )
	                self._session = ort.InferenceSession(
	                    onnx_path,
	                    sess_options=sess_opts,
	                    providers=["CPUExecutionProvider"],
	                )

	                # Load tokenizer.
	                self._tokenizer = Tokenizer.from_pretrained(model_id)
	                self._tokenizer.enable_padding(
	                    pad_id=0, pad_token="[PAD]",
	                )
	                self._tokenizer.enable_truncation(max_length=512)

	                # Set the flag last so lock-free readers never observe a
	                # half-initialised session/tokenizer pair.
	                self._model_loaded = True
	                logger.info(
	                    "ng_embed: loaded %s (ONNX, %d-dim, CLS pooling)",
	                    model_id, self._config["embedding_dim"],
	                )
	                return True

	            except Exception as exc:
	                logger.warning("ng_embed: model load failed, using hash fallback: %s", exc)
	                self._model_failed = True
	                return False

	    # -- Embedding -----------------------------------------------------------

	    def embed(
	        self,
	        text: str,
	        normalize: bool = False,
	        is_query: bool = False,
	    ) -> np.ndarray:
	        """Embed text into a float32 vector (768-dim with the default config).

	        Args:
	            text: Raw text to embed.
	            normalize: L2-normalize output (True for Praxis compatibility).
	            is_query: Prepend query prefix (for recall/search operations).
	                Ignored by the hash fallback.

	        Returns:
	            One-dimensional float32 numpy array.
	        """
	        if self._ensure_model():
	            return self._onnx_embed(text, normalize=normalize, is_query=is_query)
	        return self._hash_embed(text, normalize=normalize)

	    def embed_batch(
	        self,
	        texts: List[str],
	        normalize: bool = False,
	        is_query: bool = False,
	    ) -> List[np.ndarray]:
	        """Embed several texts in one inference call.

	        Args:
	            texts: List of texts to embed.
	            normalize: L2-normalize outputs.
	            is_query: Prepend query prefix to all texts.

	        Returns:
	            One float32 numpy array per input text, in input order.
	            An empty input yields an empty list without loading the model.
	        """
	        if not texts:
	            return []
	        if self._ensure_model():
	            return self._onnx_embed_batch(texts, normalize=normalize, is_query=is_query)
	        return [self._hash_embed(t, normalize=normalize) for t in texts]

	    def _with_prefix(self, text: str, is_query: bool) -> str:
	        """Return ``text`` with the configured query or document prefix."""
	        if is_query:
	            return self._config["query_prefix"] + text
	        prefix = self._config["document_prefix"]
	        return prefix + text if prefix else text

	    @staticmethod
	    def _l2_normalize(vec: np.ndarray) -> np.ndarray:
	        """Return ``vec`` scaled to unit L2 norm; zero vectors pass through."""
	        norm = np.linalg.norm(vec)
	        return vec / norm if norm > 0 else vec

	    def _onnx_embed(
	        self,
	        text: str,
	        normalize: bool = False,
	        is_query: bool = False,
	    ) -> np.ndarray:
	        """Single text embedding via ONNX Runtime (a batch of one)."""
	        return self._onnx_embed_batch(
	            [text], normalize=normalize, is_query=is_query,
	        )[0]

	    def _onnx_embed_batch(
	        self,
	        texts: List[str],
	        normalize: bool = False,
	        is_query: bool = False,
	    ) -> List[np.ndarray]:
	        """Batch embedding via ONNX Runtime with padding.

	        Callers must have had :meth:`_ensure_model` return True first.
	        """
	        if not texts:
	            return []

	        prefixed = [self._with_prefix(t, is_query) for t in texts]

	        # Batch tokenize, then right-pad ids and mask with zeros up to the
	        # longest sequence in the batch.
	        encodings = self._tokenizer.encode_batch(prefixed)
	        max_len = max(len(enc.ids) for enc in encodings)

	        input_ids = np.zeros((len(encodings), max_len), dtype=np.int64)
	        attention_mask = np.zeros_like(input_ids)
	        for row, enc in enumerate(encodings):
	            length = len(enc.ids)
	            input_ids[row, :length] = enc.ids
	            attention_mask[row, :length] = enc.attention_mask

	        outputs = self._session.run(
	            None,
	            {
	                "input_ids": input_ids,
	                "attention_mask": attention_mask,
	            },
	        )

	        # Output index 1 is taken as the pre-pooled sentence embedding.
	        # NOTE(review): assumes the exported ONNX graph exposes the pooled
	        # embedding as its second output — confirm against the model file.
	        pooled = outputs[1]
	        vectors = [row.astype(np.float32) for row in pooled]
	        if normalize:
	            vectors = [self._l2_normalize(v) for v in vectors]
	        return vectors

	    def _hash_embed(
	        self,
	        text: str,
	        normalize: bool = False,
	    ) -> np.ndarray:
	        """Deterministic hash-based fallback embedding.

	        Produces a stable ``embedding_dim``-sized float32 vector from the
	        text. Not semantically meaningful — it only lets modules keep
	        operating when the ONNX model is unavailable.

	        Only the first 4 bytes of the SHA-384 digest seed the generator, so
	        distinct texts that share those 32 bits map to the same vector. The
	        seeding is kept as is so previously produced vectors stay
	        reproducible.
	        """
	        dim = self._config["embedding_dim"]
	        digest = hashlib.sha384(text.encode("utf-8")).digest()
	        # Expand the hash to fill dim via a seeded RNG.
	        rng = np.random.RandomState(int.from_bytes(digest[:4], "little"))
	        vec = rng.randn(dim).astype(np.float32)
	        return self._l2_normalize(vec) if normalize else vec

	    # -- Dual-pass (Punchlist #81) -------------------------------------------

	    def dual_record_outcome(
	        self,
	        ecosystem: "NGEcosystem",
	        content: str,
	        embedding: np.ndarray,
	        target_id: str,
	        success: bool,
	        strength: float = 1.0,
	        metadata: Optional[Dict[str, Any]] = None,
	    ) -> Dict[str, Any]:
	        """Dual-pass learning: forest embedding + tree concept extraction.

	        Pass 1: Record the forest (gestalt) embedding via
	            ``ecosystem.record_outcome()``. Errors here propagate.
	        Pass 2: Extract concepts via TID, embed each, call
	            ``record_outcome()`` per tree at ``strength * 0.8``, and
	            cross-link each tree with the forest.

	        Pass 2 failure is never fatal: if extraction, embedding or a tree
	        recording fails, the error is logged, counted in ``failures``, and
	        whatever was recorded so far is returned.

	        Args:
	            ecosystem: The module's NGEcosystem instance.
	            content: Raw text content (for concept extraction).
	            embedding: Pre-computed forest embedding (Pass 1).
	            target_id: Opaque string for what was decided.
	            success: Whether the outcome was successful.
	            strength: Caller-reported significance [0.0, 1.0].
	            metadata: Additional metadata dict; copied per tree, never
	                mutated.

	        Returns:
	            {
	                "forest_result": dict,   # record_outcome result for forest
	                "tree_ids": [str],       # Target IDs for tree nodes
	                "concepts": [str],       # Extracted concept strings
	                "pass2_attempted": bool,
	            }
	        """
	        # Pass 1: Forest — standard record_outcome with gestalt embedding.
	        forest_result = ecosystem.record_outcome(
	            embedding, target_id, success,
	            strength=strength, metadata=metadata,
	        )

	        result: Dict[str, Any] = {
	            "forest_result": forest_result,
	            "tree_ids": [],
	            "concepts": [],
	            "pass2_attempted": False,
	        }

	        # Pass 2: Trees — concept extraction via TID.
	        concepts = self._extract_concepts(content)
	        result["pass2_attempted"] = True
	        if not concepts:
	            return result
	        result["concepts"] = concepts

	        try:
	            tree_embeddings = self.embed_batch(concepts)
	            for concept, tree_emb in zip(concepts, tree_embeddings):
	                tree_meta = dict(metadata or {})
	                tree_meta["_tree_concept"] = True
	                tree_meta["_forest_target_id"] = target_id
	                tree_meta["_concept"] = concept

	                tree_target = f"{target_id}::tree::{concept[:64]}"
	                tree_result = ecosystem.record_outcome(
	                    tree_emb, tree_target, success,
	                    strength=strength * 0.8,  # Trees slightly softer than forest
	                    metadata=tree_meta,
	                )
	                if tree_result:
	                    result["tree_ids"].append(tree_target)

	                # Explicit cross-links reinforce, at bootstrap weight, the
	                # forest/tree connection that the substrate's own
	                # similarity-based association may also form.
	                self._create_substrate_link(
	                    ecosystem, embedding, tree_emb,
	                    target_id, tree_target,
	                )
	        except Exception as exc:
	            # Honour the "Pass 2 is never fatal" contract: the forest is
	            # already recorded, so report what succeeded and move on.
	            logger.warning(
	                "Dual-pass: Pass 2 aborted for forest=%.32s after %d trees: %s",
	                target_id, len(result["tree_ids"]), exc,
	            )
	            self._failures += 1
	            return result

	        self._extractions += 1
	        self._concepts_total += len(result["tree_ids"])

	        logger.debug(
	            "Dual-pass: forest=%s, %d trees from %d concepts",
	            target_id[:32], len(result["tree_ids"]), len(concepts),
	        )

	        return result

	    def _create_substrate_link(
	        self,
	        ecosystem: "NGEcosystem",
	        forest_emb: np.ndarray,
	        tree_emb: np.ndarray,
	        forest_target: str,
	        tree_target: str,
	    ) -> None:
	        """Create a bidirectional forest/tree link via cross-recording.

	        Records the tree embedding against the forest target and the forest
	        embedding against the tree target. Each recording is best-effort:
	        a failure is logged at debug level and does not stop the other.

	        NOTE(review): the tree-embedding → forest-target recording is tagged
	        "dual_pass_tree_to_forest" but uses ``forest_to_tree_weight``
	        unscaled, while the recording tagged "dual_pass_forest_to_tree" gets
	        the ``tree_to_forest_ratio`` scaling. The strengths are left as
	        found; confirm which direction each weight is meant for.
	        """
	        weight = self._config["forest_to_tree_weight"]
	        ratio = self._config["tree_to_forest_ratio"]

	        links = (
	            # "when I see this tree, recall the forest"
	            (tree_emb, forest_target, weight, "dual_pass_tree_to_forest"),
	            # "when I see this forest, recall the tree"
	            (forest_emb, tree_target, weight * ratio, "dual_pass_forest_to_tree"),
	        )
	        for emb, target, link_strength, label in links:
	            try:
	                ecosystem.record_outcome(
	                    emb, target, True,
	                    strength=link_strength,
	                    metadata={"_link": label},
	                )
	            except Exception as exc:
	                logger.debug(
	                    "Dual-pass link %s to %.64s failed: %s", label, target, exc,
	                )

	    def _extract_concepts(self, text: str) -> List[str]:
	        """Extract concepts from text via one TID LLM call.

	        Returns at most ``max_concepts`` concept strings. Returns an empty
	        list when the content is blank (no call is made) or when anything
	        fails — including ``requests`` not being installed — in which case
	        the failure counter is incremented.
	        """
	        content = (text or "")[:self._config["max_content_for_extraction"]]
	        if not content.strip():
	            return []

	        try:
	            # Imported inside the try so a missing dependency degrades to
	            # single-pass instead of raising out of the dual-pass path.
	            import requests

	            prompt = _EXTRACTION_PROMPT.format(content=content)
	            resp = requests.post(
	                self._config["tid_endpoint"],
	                json={
	                    "model": self._config["tid_model"],
	                    "messages": [
	                        {
	                            "role": "system",
	                            "content": "You extract concepts from text. "
	                                       "Return only a JSON array of strings.",
	                        },
	                        {"role": "user", "content": prompt},
	                    ],
	                    "temperature": self._config["tid_temperature"],
	                    "max_tokens": self._config["tid_max_tokens"],
	                },
	                timeout=self._config["tid_timeout"],
	            )
	            resp.raise_for_status()
	            response_text = (
	                resp.json()["choices"][0]["message"]["content"].strip()
	            )

	            concepts = self._parse_concepts(response_text)
	            return concepts[:self._config["max_concepts"]]

	        except Exception as exc:
	            logger.debug("Concept extraction failed: %s", exc)
	            self._failures += 1
	            return []

	    @staticmethod
	    def _parse_concepts(text: str) -> List[str]:
	        """Parse a JSON array of concepts out of an LLM response.

	        Handles markdown code fences, prose around the array, and a JSON
	        object that wraps the array. String and numeric items are kept (as
	        stripped strings); ``null``, nested containers and blank items are
	        dropped. Returns an empty list when no JSON array can be found.
	        """
	        text = text.strip()
	        if text.startswith("```"):
	            kept = [
	                line for line in text.split("\n")
	                if not line.strip().startswith("```")
	            ]
	            text = "\n".join(kept).strip()

	        # Try the whole response first, then the outermost [...] span.
	        candidates = [text]
	        start = text.find("[")
	        end = text.rfind("]") + 1
	        if start >= 0 and end > start:
	            candidates.append(text[start:end])

	        for candidate in candidates:
	            try:
	                parsed = json.loads(candidate)
	            except json.JSONDecodeError:
	                continue
	            if isinstance(parsed, list):
	                cleaned = (
	                    str(item).strip()
	                    for item in parsed
	                    if isinstance(item, (str, int, float))
	                )
	                return [concept for concept in cleaned if concept]

	        return []

	    # -- Stats ---------------------------------------------------------------

	    @property
	    def stats(self) -> Dict[str, Any]:
	        """Snapshot of model configuration/state and dual-pass counters."""
	        extractions = self._extractions
	        return {
	            "model_id": self._config["model_id"],
	            "model_loaded": self._model_loaded,
	            "embedding_dim": self._config["embedding_dim"],
	            "pooling": self._config["pooling"],
	            "dual_pass": {
	                "extractions": extractions,
	                "concepts_total": self._concepts_total,
	                "failures": self._failures,
	                "avg_concepts": (
	                    round(self._concepts_total / extractions, 1)
	                    if extractions > 0 else 0
	                ),
	            },
	        }


	# ---------------------------------------------------------------------------
	# Module-level convenience functions
	# ---------------------------------------------------------------------------

	def embed(
	    text: str,
	    normalize: bool = False,
	    is_query: bool = False,
	) -> np.ndarray:
	    """Embed a single text with the process-wide NGEmbed engine.

	    Thin module-level shortcut: fetches the shared instance and forwards
	    all arguments to its ``embed`` method.

	    Args:
	        text: Raw text to embed.
	        normalize: L2-normalize the resulting vector.
	        is_query: Prepend the configured query prefix.

	    Returns:
	        float32 numpy array (768-dim with the default configuration).
	    """
	    engine = NGEmbed.get_instance()
	    return engine.embed(text, normalize=normalize, is_query=is_query)


	def embed_batch(
	    texts: List[str],
	    normalize: bool = False,
	    is_query: bool = False,
	) -> List[np.ndarray]:
	    """Embed several texts with the process-wide NGEmbed engine.

	    Thin module-level shortcut: fetches the shared instance and forwards
	    all arguments to its ``embed_batch`` method.

	    Args:
	        texts: Texts to embed.
	        normalize: L2-normalize every resulting vector.
	        is_query: Prepend the configured query prefix to every text.

	    Returns:
	        One float32 numpy array per input text, in input order.
	    """
	    engine = NGEmbed.get_instance()
	    return engine.embed_batch(texts, normalize=normalize, is_query=is_query)