File size: 71,396 Bytes

bca5039

from __future__ import annotations
import torch
import numpy as np
from abc import ABC, abstractmethod
from typing import Dict, Union, Tuple, Optional, Callable, Any, List
import warnings
from collections import defaultdict
import datasets
from datasets import load_dataset

# Optional dependencies for spatial indexing
try:
    import faiss
    FAISS_AVAILABLE = True
except ImportError:
    FAISS_AVAILABLE = False

try:
    from sklearn.neighbors import NearestNeighbors
    SKLEARN_AVAILABLE = True
except ImportError:
    SKLEARN_AVAILABLE = False


class SpatialIndex:
    """Spatial indexing for fast similarity search."""

    def __init__(self, vectors: np.ndarray, token_ids: List[int], method: str = "auto"):
        self.token_ids = np.array(token_ids)
        self.method = method
        self._index = None

        if method == "auto":
            if FAISS_AVAILABLE and vectors.shape[0] > 1000:
                method = "faiss"
            elif SKLEARN_AVAILABLE:
                method = "sklearn"
            else:
                method = "linear"

        self._build_index(vectors, method)

    def _build_index(self, vectors: np.ndarray, method: str):
        if method == "faiss" and FAISS_AVAILABLE:
            # L1 distance approximation using L2 index with normalized vectors
            vectors_l2 = vectors / (np.linalg.norm(vectors, axis=1, keepdims=True) + 1e-8)
            self._index = faiss.IndexFlatIP(vectors_l2.shape[1])  # Inner product for normalized vectors
            self._index.add(vectors_l2.astype(np.float32))
            self.method = "faiss"

        elif method == "sklearn" and SKLEARN_AVAILABLE:
            # Use manhattan distance for true L1
            self._index = NearestNeighbors(
                metric='manhattan',
                algorithm='ball_tree',
                n_jobs=-1
            ).fit(vectors)
            self.method = "sklearn"
        else:
            # Fallback to linear search
            self._vectors = vectors
            self.method = "linear"

    def search_radius(self, query_vector: np.ndarray, max_distance: float, max_results: int = 1000) -> Tuple[
        List[int], List[float]]:
        """Find all points within max_distance using L1 metric."""
        if self.method == "sklearn":
            indices = self._index.radius_neighbors([query_vector], radius=max_distance)[1][0]
            if len(indices) > max_results:
                # Compute actual distances and take closest
                distances = np.sum(np.abs(self._vectors[indices] - query_vector), axis=1)
                top_k = np.argsort(distances)[:max_results]
                indices = indices[top_k]
            distances = np.sum(np.abs(self._vectors[indices] - query_vector), axis=1)
            return self.token_ids[indices].tolist(), distances.tolist()

        elif self.method == "faiss":
            # Approximate search using cosine similarity
            query_l2 = query_vector / (np.linalg.norm(query_vector) + 1e-8)
            similarities, indices = self._index.search(query_l2.reshape(1, -1).astype(np.float32), max_results)
            # Filter by converting similarity threshold to approximate distance
            threshold_sim = 1.0 - max_distance  # rough approximation
            mask = similarities[0] >= threshold_sim
            return self.token_ids[indices[0][mask]].tolist(), (1.0 - similarities[0][mask]).tolist()

        else:  # linear
            distances = np.sum(np.abs(self._vectors - query_vector), axis=1)
            mask = distances <= max_distance
            if np.sum(mask) > max_results:
                indices = np.argsort(distances)[:max_results]
                mask = np.zeros_like(distances, dtype=bool)
                mask[indices] = True
            return self.token_ids[mask].tolist(), distances[mask].tolist()


class GeometricVocab(ABC):
    """
    Optimized geometric vocabulary with spatial indexing and caching.
    """

    def __init__(self, dim: int):
        self.dim = int(dim)
        self._token_to_id: Dict[str, int] = {}
        self._id_to_token: Dict[int, str] = {}
        self._id_to_vec: Dict[int, np.ndarray] = {}
        self._id_to_volume: Dict[int, float] = {}
        self._id_to_provenance: Dict[int, dict] = {}
        self._valid_token_ids: set[int] = set()

        # Optimization caches
        self._normalized_cache: Dict[int, np.ndarray] = {}
        self._pooled_cache: Dict[int, np.ndarray] = {}
        self._spatial_index: Optional[SpatialIndex] = None
        self._index_dirty = False
        
        # NEW: Character-level cache for Unicode composition
        self._char_cache: Dict[str, np.ndarray] = {}
        self._char_lookups_saved = 0  # Statistics

    def _invalidate_caches(self):
        """Invalidate caches when vocabulary changes."""
        self._normalized_cache.clear()
        self._pooled_cache.clear()
        self._spatial_index = None
        self._index_dirty = True
        # Keep char cache across vocabulary changes as characters are stable

    def _ensure_spatial_index(self):
        """Build spatial index if needed."""
        if self._spatial_index is None or self._index_dirty:
            if len(self._valid_token_ids) < 10:
                return  # Too few tokens for indexing

            pooled_vectors = []
            token_ids = []
            for tid in sorted(self._valid_token_ids):
                pooled_vec = self._get_cached_pooled(tid)
                if pooled_vec is not None:
                    pooled_vectors.append(pooled_vec)
                    token_ids.append(tid)

            if pooled_vectors:
                self._spatial_index = SpatialIndex(
                    np.array(pooled_vectors),
                    token_ids,
                    method="auto"
                )
                self._index_dirty = False

    def _get_cached_pooled(self, token_id: int) -> Optional[np.ndarray]:
        """Get pooled vector with caching."""
        if token_id in self._pooled_cache:
            return self._pooled_cache[token_id]

        if token_id in self._id_to_vec:
            X = self._id_to_vec[token_id]
            pooled = X.mean(axis=0)
            self._pooled_cache[token_id] = pooled
            return pooled
        return None

    def _get_cached_normalized(self, token_id: int) -> Optional[np.ndarray]:
        """Get L1-normalized pooled vector with caching."""
        if token_id in self._normalized_cache:
            return self._normalized_cache[token_id]

        pooled = self._get_cached_pooled(token_id)
        if pooled is not None:
            normalized = pooled / (np.abs(pooled).sum() + 1e-8)
            self._normalized_cache[token_id] = normalized
            return normalized
        return None

    # --------------------------- abstract surface --------------------
    @abstractmethod
    def encode(self, token: str, *, return_id: bool = False) -> Union[np.ndarray, Tuple[np.ndarray, int]]:
        raise NotImplementedError

    @abstractmethod
    def get_score(self, token_or_id: Union[str, int]) -> float:
        raise NotImplementedError

    # --------------------------- basic queries (optimized) -----------------------
    def decode(self, token_id: int, fallback: str = "<unk>") -> Optional[str]:
        if token_id in self._id_to_token:
            return self._id_to_token[token_id]
        return fallback if fallback in self._token_to_id else None

    def decode_with_provenance(self, token_id: int, fallback: str = "<unk>") -> Tuple[Optional[str], Optional[dict]]:
        tok = self.decode(token_id, fallback=fallback)
        prov = self._id_to_provenance.get(token_id)
        return tok, prov

    def provenance(self, token_or_id: Union[str, int]) -> Optional[dict]:
        tid = token_or_id if isinstance(token_or_id, int) else self._token_to_id.get(token_or_id)
        return self._id_to_provenance.get(tid)

    def embedding(self, token_or_id: Union[str, int]) -> Optional[np.ndarray]:
        tid = token_or_id if isinstance(token_or_id, int) else self._token_to_id.get(token_or_id)
        return self._id_to_vec.get(tid)

    def pooled(self, token_or_id: Union[str, int], method: str = "mean") -> Optional[np.ndarray]:
        """Optimized pooled method with character caching"""
        
        # Fast path for single characters
        if isinstance(token_or_id, str) and len(token_or_id) == 1:
            if token_or_id in self._char_cache:
                self._char_lookups_saved += 1
                return self._char_cache[token_or_id].copy()  # Return copy to prevent mutation
        
        # Regular lookup
        tid = token_or_id if isinstance(token_or_id, int) else self._token_to_id.get(token_or_id)
        if tid is None:
            return None

        if method == "mean":
            pooled = self._get_cached_pooled(tid)
            
            # Cache single characters for future use
            if pooled is not None and isinstance(token_or_id, str) and len(token_or_id) == 1:
                self._char_cache[token_or_id] = pooled.copy()
            
            return pooled

        # Fallback for other methods
        X = self._id_to_vec.get(tid)
        if X is None:
            return None
        if method == "first":
            return X[0]
        if method == "sum":
            return X.sum(axis=0)
        raise ValueError(f"Invalid pooling method: {method}")

    def pooled_batch(self, tokens: List[Union[str, int]], method: str = "mean") -> List[Optional[np.ndarray]]:
        """Batch pooling with character-level caching for efficiency"""
        results = []
        
        for token in tokens:
            # Use optimized single pooled method which handles char caching
            results.append(self.pooled(token, method))
        
        return results

    # --------------------------- optimized similarity ---------------------
    def similarity(self, token_a: Union[str, int], token_b: Union[str, int]) -> float:
        """
        Optimized L1-normalized directional similarity using cached vectors.
        """
        tid_a = token_a if isinstance(token_a, int) else self._token_to_id.get(token_a)
        tid_b = token_b if isinstance(token_b, int) else self._token_to_id.get(token_b)

        if tid_a is None or tid_b is None:
            return -1.0

        a_norm = self._get_cached_normalized(tid_a)
        b_norm = self._get_cached_normalized(tid_b)

        if a_norm is None or b_norm is None:
            return -1.0

        return float(np.dot(a_norm, b_norm))

    def similarity_magnitude(self, token_a: Union[str, int], token_b: Union[str, int]) -> float:
        """
        Raw dot-product using cached pooled vectors.
        """
        tid_a = token_a if isinstance(token_a, int) else self._token_to_id.get(token_a)
        tid_b = token_b if isinstance(token_b, int) else self._token_to_id.get(token_b)

        if tid_a is None or tid_b is None:
            return -1.0

        a = self._get_cached_pooled(tid_a)
        b = self._get_cached_pooled(tid_b)

        if a is None or b is None:
            return -1.0

        return float(np.dot(a, b))

    # --------------------------- optimized spatial search ---------------------
    def extract_band(self, trajectory: np.ndarray, max_angle: float = 0.3, method: str = "pooled") -> Dict[
        str, np.ndarray]:
        """
        Optimized spatial search using indexing when available.
        """
        if trajectory.ndim == 2:
            direction = trajectory.mean(0)
        else:
            direction = trajectory
        direction = direction / (np.abs(direction).sum() + 1e-8)

        # Try spatial index first
        self._ensure_spatial_index()
        if self._spatial_index is not None:
            try:
                # Convert angle threshold to distance threshold (approximation)
                max_distance = max_angle * 2.0  # rough conversion
                token_ids, distances = self._spatial_index.search_radius(
                    direction, max_distance, max_results=1000
                )

                # Refine results with exact L1 similarity check
                out: Dict[str, np.ndarray] = {}
                for tid in token_ids:
                    tok = self._id_to_token.get(tid)
                    if tok is None:
                        continue
                    v_norm = self._get_cached_normalized(tid)
                    if v_norm is not None and float(np.dot(v_norm, direction)) >= 1.0 - max_angle:
                        out[tok] = self._id_to_vec[tid]
                return out

            except Exception as e:
                warnings.warn(f"Spatial index search failed: {e}, falling back to linear")

        # Fallback to linear search
        out: Dict[str, np.ndarray] = {}
        for tok, tid in self._token_to_id.items():
            v_norm = self._get_cached_normalized(tid)
            if v_norm is not None and float(np.dot(v_norm, direction)) >= 1.0 - max_angle:
                out[tok] = self._id_to_vec[tid]
        return out

    def find_similar_tokens(self, token: Union[str, int], k: int = 10, min_similarity: float = 0.5) -> List[
        Tuple[str, float]]:
        """
        Find k most similar tokens using spatial indexing when available.
        """
        tid = token if isinstance(token, int) else self._token_to_id.get(token)
        if tid is None:
            return []

        query_vec = self._get_cached_normalized(tid)
        if query_vec is None:
            return []

        self._ensure_spatial_index()
        if self._spatial_index is not None:
            try:
                # Use spatial index for approximate search
                max_distance = (1.0 - min_similarity) * 2.0
                token_ids, _ = self._spatial_index.search_radius(
                    query_vec, max_distance, max_results=k * 3  # Get extra for refinement
                )

                # Compute exact similarities and sort
                similarities = []
                for tid_cand in token_ids:
                    if tid_cand == tid:  # Skip self
                        continue
                    sim = self.similarity(tid, tid_cand)
                    if sim >= min_similarity:
                        tok = self._id_to_token.get(tid_cand)
                        if tok:
                            similarities.append((tok, sim))

                return sorted(similarities, key=lambda x: x[1], reverse=True)[:k]

            except Exception as e:
                warnings.warn(f"Spatial similarity search failed: {e}, falling back to linear")

        # Linear fallback
        similarities = []
        for tok_cand, tid_cand in self._token_to_id.items():
            if tid_cand == tid:
                continue
            sim = self.similarity(tid, tid_cand)
            if sim >= min_similarity:
                similarities.append((tok_cand, sim))

        return sorted(similarities, key=lambda x: x[1], reverse=True)[:k]

    # --------------------------- helpers exposed to callbacks --------
    def _helpers(self) -> Dict[str, Callable[..., np.ndarray]]:
        def _emb(x):
            e = self.embedding(x)
            return None if e is None else np.asarray(e, np.float32)

        def _poo(x):
            p = self.pooled(x)
            return None if p is None else np.asarray(p, np.float32)

        def _chars(s):
            # Use batch pooling for efficiency
            return self.pooled_batch(list(s)) if isinstance(s, str) else None

        return {"embedding": _emb, "pooled": _poo, "chars_pooled": _chars}

    # --------------------------- DEFAULT create_crystal (unicode path) ----
    def _default_create_crystal(self, config: dict, callback: Callable[..., np.ndarray]) -> np.ndarray:
        """
        Deterministic default when user leaves callback/create_crystal=None.
        """
        pool_type = config.get("pool_type") or "unicode"
        H = config["helpers"]
        token_plain = str(config["data"]["token"])
        d = int(config["dim"])

        c_uni = self._compose_unicode_center(token_plain, H, pool_type, d)
        c_defs = self._compose_wordnet_center(config.get("additional_definitions", []), H, pool_type, d)

        if pool_type == "combination":
            parts = [v for v in (c_uni, c_defs) if v is not None]
            c = np.mean(np.stack(parts, 0), 0) if parts else np.zeros(d, np.float32)
        elif pool_type == "wordnet":
            c = c_defs if c_defs is not None else np.zeros(d, np.float32)
        else:
            c = c_uni if c_uni is not None else np.zeros(d, np.float32)

        # L1 normalization only
        l1 = float(np.abs(c).sum()) + 1e-8
        c = c / l1
        return self._deterministic_pentachoron(c)

    def _default_unicode_callback(self, name: str, **kwargs) -> np.ndarray:
        raise NotImplementedError("Default callback is not invoked directly.")

    # --------------------------- universal builders (overrideable) ---
    def _compose_unicode_center(
            self, token_plain: str, H, pool_type: Optional[str], dim: int
    ) -> Optional[np.ndarray]:
        """
        Build a center vector from the token's Unicode characters - OPTIMIZED.
        """
        # Use batch pooling for all characters at once
        char_list = list(token_plain)
        pooled_chars = self.pooled_batch(char_list)
        
        vecs: List[np.ndarray] = []
        for pooled_v in pooled_chars:
            if pooled_v is None:
                continue
            v = np.asarray(pooled_v, np.float32)
            if v.shape[0] != dim:
                raise ValueError(f"Unicode pooled dim mismatch: got {v.shape[0]}, expected {dim}")
            vecs.append(v)

        if not vecs:
            return None

        stacked = np.stack(vecs, 0)

        if pool_type in (None, "unicode", "mean"):
            c = stacked.mean(axis=0)
        elif pool_type == "abs":
            c = np.abs(stacked).mean(axis=0)
        elif pool_type == "dot":
            c = stacked.mean(axis=0)
            c = c / (np.abs(c).sum() + 1e-8)  # L1 normalize
        elif pool_type == "mse":
            c = (stacked ** 2).mean(axis=0)
        elif pool_type == "max":
            c = stacked.max(axis=0)
        else:
            raise ValueError(f"Unsupported pool_type '{pool_type}'")

        return c.astype(np.float32, copy=False)

    def _compose_wordnet_center(
            self, definitions: List[str], H, pool_type: Optional[str], dim: int
    ) -> Optional[np.ndarray]:
        """Build a center vector from definition text characters - OPTIMIZED."""
        # Collect all characters from all definitions
        all_chars = []
        for text in definitions:
            all_chars.extend(list(str(text)))
        
        # Batch lookup
        pooled_chars = self.pooled_batch(all_chars)
        
        vecs: List[np.ndarray] = []
        for pooled_v in pooled_chars:
            if pooled_v is None:
                continue
            v = np.asarray(pooled_v, np.float32)
            if v.shape[0] != dim:
                raise ValueError(f"Definition pooled dim mismatch: got {v.shape[0]}, expected {dim}")
            vecs.append(v)

        if not vecs:
            return None

        stacked = np.stack(vecs, 0)

        if pool_type in (None, "unicode", "mean"):
            c = stacked.mean(axis=0)
        elif pool_type == "abs":
            c = np.abs(stacked).mean(axis=0)
        elif pool_type == "dot":
            c = stacked.mean(axis=0)
            c = c / (np.abs(c).sum() + 1e-8)  # L1 normalize
        elif pool_type == "mse":
            c = (stacked ** 2).mean(axis=0)
        elif pool_type == "max":
            c = stacked.max(axis=0)
        else:
            raise ValueError(f"Unsupported pool_type '{pool_type}'")

        return c.astype(np.float32, copy=False)

    def _deterministic_pentachoron(self, center_vec: np.ndarray) -> np.ndarray:
        """Universal pentachoron inflation (deterministic; overrideable)."""
        d = center_vec.shape[0]
        proposals = np.stack([
            center_vec,
            np.roll(center_vec, 1),
            np.roll(center_vec, 3) * np.sign(center_vec + 1e-8),
            np.roll(center_vec, 7) - center_vec,
            np.roll(center_vec, 11) + center_vec,
        ], 0).astype(np.float32)

        # L1 row norms
        norms = np.sum(np.abs(proposals), axis=1, keepdims=True) + 1e-8
        Q = proposals / norms

        # GS orthogonalization with L1 row renorm at each step
        for i in range(5):
            for j in range(i):
                Q[i] -= np.dot(Q[i], Q[j]) * Q[j]
            Q[i] /= (np.sum(np.abs(Q[i])) + 1e-8)

        gamma = np.array([1.0, 0.9, -0.8, 1.1, 1.2], np.float32)
        X = np.zeros((5, d), np.float32)
        for i in range(5):
            X[i] = center_vec + gamma[i] * Q[i]
        return X - X.mean(0, keepdims=True)

    # --------------------------- finalize + provenance (overrideable) ----
    def _finalize_crystal(self, X: np.ndarray) -> np.ndarray:
        X = np.asarray(X, np.float32, order='C')  # Ensure C-contiguous
        if X.shape != (5, self.dim):
            raise ValueError(f"Crystal must be shape (5, {self.dim}); got {X.shape}.")
        return X - X.mean(0, keepdims=True)

    def _auto_provenance_from_cfg(self, cfg: Dict[str, Any]) -> dict:
        token = cfg["data"]["token"]
        prov = {
            "source": "special/compose",
            "token": token,
            "pool_type": cfg.get("pool_type") or "unicode",
            "components": list(token),
            "additional_definitions": list(cfg.get("additional_definitions", [])),
        }
        if cfg.get("antonyms"):
            prov["antonyms"] = list(cfg["antonyms"])
        if cfg.get("inversion_formula") is not None:
            prov["inversion_formula"] = "user_supplied"
        return prov

    def _finalize_crystal_and_provenance(
            self, product: Union[np.ndarray, Dict[str, Any]], cfg: Dict[str, Any]
    ) -> Tuple[np.ndarray, dict]:
        # ndarray path
        if isinstance(product, np.ndarray):
            X = self._finalize_crystal(product)
            prov = self._auto_provenance_from_cfg(cfg)
            return X, prov

        # dict path
        if not isinstance(product, dict):
            raise TypeError(
                "create_crystal must return ndarray or dict with {'base':..., 'ops':..., 'provenance':...}.")
        base = np.asarray(product["base"], np.float32)
        X = base
        for op in product.get("ops", []):
            name = op.get("name")
            if name == "center":
                X -= X.mean(0, keepdims=True)
            elif name == "scale":
                X *= float(op.get("k", 1.0))
            elif name == "translate":
                t = np.asarray(op.get("t"), np.float32)
                if t.shape != (self.dim,):
                    raise ValueError(f"translate.t must be shape ({self.dim},)")
                X = X + t[None, :]
            elif name == "normalize_rows":
                n = np.sum(np.abs(X), axis=1, keepdims=True) + 1e-8
                X = X / n
            elif name == "align_to":
                v = np.asarray(op.get("v"), np.float32)
                if v.shape != (self.dim,):
                    raise ValueError(f"align_to.v must be shape ({self.dim},)")
                v = v / (np.abs(v).sum() + 1e-8)
                p = X.mean(0)
                p = p / (np.abs(p).sum() + 1e-8)
                alpha = float(op.get("alpha", 1.0))
                X = X + alpha * (v - p)[None, :]
            else:
                raise ValueError(f"Unsupported op '{name}'")
        prov = dict(product.get("provenance", {})) or self._auto_provenance_from_cfg(cfg)
        return self._finalize_crystal(X), prov

    # --------------------------- universal manifestation routine ----------
    def _manifest_special_tokens(
            self,
            base_set: Dict[str, int],
            create_crystal: Callable[[dict, Callable[..., np.ndarray]], Union[np.ndarray, Dict[str, Any]]],
            callback: Optional[Callable[..., np.ndarray]],
            create_config: Dict[str, Any],
    ) -> None:
        """Universal, deterministic manifestor with character pre-caching."""
        
        # NEW: Pre-cache all unique characters that will be needed
        unique_chars = set()
        for name in base_set.keys():
            token_plain = name.strip("<>").strip()
            unique_chars.update(token_plain)
            
        print(f"[⚡] Pre-caching {len(unique_chars)} unique characters...")
        for ch in unique_chars:
            _ = self.pooled(ch)  # Trigger caching
        
        helpers = self._helpers()

        for name, tid in base_set.items():
            # Keep if already present
            if tid in self._id_to_vec:
                self._token_to_id[name] = tid
                self._id_to_token.setdefault(tid, name)
                self._valid_token_ids.add(tid)
                continue

            # Build per-token config
            cfg = {
                "dim": self.dim,
                "pool_type": create_config.get("pool_type", None),
                "special_tokens": create_config.get("special_tokens"),
                "additional_definitions": create_config.get("additional_definitions", []),
                "antonyms": create_config.get("antonyms"),
                "inversion_formula": create_config.get("inversion_formula"),
                "data": {"token": name.strip("<>").strip(), "token_id": tid, "origin": "special"},
                "helpers": helpers,
            }

            if create_crystal is None:
                create_crystal = self._default_create_crystal

            product = create_crystal(cfg, callback) if callback is not None else create_crystal(cfg,
                                                                                                self._default_unicode_callback)
            X, prov = self._finalize_crystal_and_provenance(product, cfg)

            # Register
            self._token_to_id[name] = tid
            self._id_to_token[tid] = name
            self._id_to_vec[tid] = X.astype(np.float32, copy=False, order='C')
            self._id_to_provenance[tid] = prov
            self._valid_token_ids.add(tid)
            self._id_to_volume.setdefault(tid, 1.0)

            # Aliases
            for alias in (cfg.get("special_tokens") or []):
                alias = str(alias)
                self._token_to_id[alias] = tid
                self._id_to_token.setdefault(tid, alias)
            if cfg.get("special_tokens"):
                self._id_to_provenance[tid].setdefault("aliases", list(cfg["special_tokens"]))

            # Antonyms
            antonyms = cfg.get("antonyms") or []
            invf = cfg.get("inversion_formula")
            if invf:
                for anti in antonyms:
                    if anti in base_set:
                        anti_id = base_set[anti]
                        if anti_id not in self._id_to_vec:
                            X_inv = invf(X, cfg)  # must be deterministic
                            X_inv = self._finalize_crystal(X_inv)
                            self._token_to_id[anti] = anti_id
                            self._id_to_token[anti_id] = anti
                            self._id_to_vec[anti_id] = X_inv.astype(np.float32, copy=False, order='C')
                            inv_prov = {
                                "source": "inversion",
                                "of_token": name,
                                "of_token_id": tid,
                                "pool_type": cfg.get("pool_type") or "unicode",
                                "components": prov.get("components", []),
                                "additional_definitions": cfg.get("additional_definitions", []),
                                "ops": ["invert"],
                            }
                            self._id_to_provenance[anti_id] = inv_prov
                            self._valid_token_ids.add(anti_id)
                            self._id_to_volume.setdefault(anti_id, 1.0)

        # Invalidate caches after adding tokens
        self._invalidate_caches()
        
        if self._char_lookups_saved > 0:
            print(f"[✅] Character cache saved {self._char_lookups_saved} lookups")

    # --------------------------- basics -------------------------------
    def vocab_size(self) -> int:
        return len(self._token_to_id)

    def token_to_id(self, token: str) -> Optional[int]:
        return self._token_to_id.get(token)

    def id_to_token(self, token_id: int) -> Optional[str]:
        return self._id_to_token.get(token_id)

    def cache_stats(self) -> Dict[str, int]:
        """Get cache statistics."""
        return {
            "normalized_cache_size": len(self._normalized_cache),
            "pooled_cache_size": len(self._pooled_cache),
            "char_cache_size": len(self._char_cache),
            "char_lookups_saved": self._char_lookups_saved,
            "spatial_index_size": len(self._spatial_index.token_ids) if self._spatial_index else 0,
            "vocab_size": len(self._valid_token_ids)
        }

    def clear_caches(self):
        """Clear all caches to free memory."""
        self._invalidate_caches()
        self._char_cache.clear()
        self._char_lookups_saved = 0


from typing import List, Dict, Union, Optional, Tuple, Callable, Any

class PretrainedGeometricVocab(GeometricVocab):
    """
    Parquet-backed deterministic vocab with columnar load, duplicate-mean aggregation,
    pooled caching, and fast path for flat crystals.
    """
    def __init__(
        self,
        repo_id: str,
        dim: int,
        *,
        subset: str = "unicode", 
        split: str = "train_100d",
        base_set: Optional[Dict[str, int]] = None,
        create_config: Optional[Dict[str, Any]] = None,
        create_crystal: Optional[Callable[[dict, Callable[..., np.ndarray]], Union[np.ndarray, Dict[str, Any]]]] = None,
        callback: Optional[Callable[..., np.ndarray]] = None,
        manifest_specials: bool = True,
        # perf/robustness knobs
        store: str = "full",                 # "full" | "pooled" | "both"
        reshape_order: str = "C",
        vertex_count: int = 5,
        infer_dim: bool = True,
        strict_shapes: bool = False,
        # new perf knobs
        finalize_mode: str = "post_mean",    # "none" | "post_mean"
        cache_pooled: bool = True,
        streaming=False,
    ):
        super().__init__(dim)
        self.repo_id = str(repo_id)
        self._id_to_pooled: Dict[int, np.ndarray] = {}  # optional pooled cache

        # ---------- load split (columnar, minimal columns) ----------
        ds = load_dataset(self.repo_id, split=split)
        have = set(ds.column_names)
        wanted = ["token_id", "token", "crystal", "volume"]
        keep = [c for c in wanted if c in have]
        drop = [c for c in ds.column_names if c not in keep]
        if drop:
            ds = ds.remove_columns(drop)
        ds = ds.with_format("numpy", columns=keep)

        ids = ds["token_id"] if "token_id" in keep else np.array([], dtype=np.int64)
        toks = ds["token"]    if "token"    in keep else np.array([], dtype=object)
        cryst= ds["crystal"]  if "crystal"  in keep else np.array([], dtype=object)
        vols = ds["volume"]   if "volume"   in keep else None

        ids = np.asarray(ids).astype(np.int64, copy=False)
        toks = np.asarray(toks)

        # --------- shape helpers ----------
        def _coerce(raw: Any) -> np.ndarray:
            X = np.asarray(raw, np.float32)
            if X.ndim == 2:
                V, D = int(X.shape[0]), int(X.shape[1])
                if V != vertex_count:
                    raise ValueError(f"Crystal has {V} vertices, expected {vertex_count}.")
                if D != self.dim:
                    if infer_dim: self.dim = D
                    else: raise ValueError(f"Dim mismatch: got {D}, expected {self.dim}.")
                return X
            if X.ndim == 1:
                n = int(X.size)
                if n == vertex_count * self.dim:
                    return np.reshape(X, (vertex_count, self.dim), order=reshape_order)
                if infer_dim and n % vertex_count == 0:
                    self.dim = n // vertex_count
                    return np.reshape(X, (vertex_count, self.dim), order=reshape_order)
                if n == self.dim:
                    c = X / (np.abs(X).sum() + 1e-8)
                    return self._deterministic_pentachoron(c)
            raise ValueError(f"Unsupported crystal shape {X.shape if isinstance(X, np.ndarray) else type(X)}.")

        def _finalize_if_needed(X: np.ndarray) -> np.ndarray:
            if finalize_mode == "none":
                return np.asarray(X, np.float32, order="C")
            elif finalize_mode == "post_mean":
                return self._finalize_crystal(X)
            else:
                raise ValueError(f"finalize_mode must be 'none' or 'post_mean', got {finalize_mode!r}")

        vols_f = np.asarray(vols, dtype=np.float32) if vols is not None else None

        # ---------- FAST PATH: flat uniform crystals ----------
        # Try to stack into (N, L); succeeds when each row is the same length.
        fastpath_ok = False
        A = None  # (N, L) float32
        try:
            A = np.stack(cryst)  # may raise if jagged / object
            if A.ndim == 2 and A.dtype != object:
                A = A.astype(np.float32, copy=False)
                L = A.shape[1]
                if L % vertex_count == 0:
                    # infer or validate D
                    D = L // vertex_count
                    if self.dim != D:
                        if infer_dim:
                            self.dim = int(D)
                        else:
                            raise ValueError(f"Dim mismatch: got D={D}, expected dim={self.dim}.")
                    fastpath_ok = True
        except Exception:
            fastpath_ok = False

        if fastpath_ok and A is not None and len(ids) > 0:
            # reshape to (N, V, D)
            V = vertex_count
            D = self.dim
            A = A.reshape(-1, V, D, order=reshape_order)

            # sort by ids and reduceat to mean duplicates in pure NumPy
            order = np.argsort(ids, kind="stable")
            ids_sorted = ids[order]
            A_sorted = A[order]
            vols_sorted = vols_f[order] if vols_f is not None else None

            uniq_ids, idx, counts = np.unique(ids_sorted, return_index=True, return_counts=True)
            sums = np.add.reduceat(A_sorted, idx, axis=0)              # (K, V, D)
            means = sums / counts[:, None, None]                        # (K, V, D)

            if vols_sorted is not None:
                v_sums = np.add.reduceat(vols_sorted, idx)
                v_means = v_sums / counts.astype(np.float32)
            else:
                v_means = np.ones_like(uniq_ids, dtype=np.float32)

            # commit maps
            self._token_to_id.clear(); self._id_to_token.clear()
            self._id_to_vec.clear();   self._id_to_volume.clear(); self._valid_token_ids.clear()
            self._id_to_pooled.clear()

            # pick a representative token per id: first occurrence in sorted block
            toks_sorted = toks[order]
            rep_toks = toks_sorted[idx]

            for tid, tok, X_mean, v_m in zip(uniq_ids.tolist(), rep_toks.tolist(), means, v_means.tolist()):
                # cache pooled BEFORE finalize to preserve signal
                if cache_pooled:
                    self._id_to_pooled[tid] = X_mean.mean(axis=0).astype(np.float32, copy=False)
                X_store = _finalize_if_needed(X_mean)

                self._token_to_id[str(tok)] = tid
                self._id_to_token[tid] = str(tok)
                if store in ("full", "both"):
                    self._id_to_vec[tid] = np.asarray(X_store, np.float32, order="C")
                elif store == "pooled":
                    # store pooled as embedding if desired
                    self._id_to_vec[tid] = (self._id_to_pooled[tid] if cache_pooled
                                            else X_mean.mean(axis=0).astype(np.float32, copy=False))
                self._id_to_volume[tid] = float(v_m)
                self._valid_token_ids.add(tid)

        else:
            # ---------- FALLBACK: per-row coerce + dict mean ----------
            ids_int  = ids.tolist()
            toks_str = [str(x) for x in toks.tolist()]
            vols_f   = (vols_f.tolist() if vols_f is not None else [1.0] * len(ids_int))

            x_sum: Dict[int, np.ndarray] = {}
            v_sum: Dict[int, float]      = {}
            n_cnt: Dict[int, int]        = {}
            tok_pref: Dict[int, str]     = {}

            for tid, tok, raw, vol in zip(ids_int, toks_str, cryst, vols_f):
                X = _coerce(raw)  # [V,D] float32
                if tid not in x_sum:
                    x_sum[tid]  = X.astype(np.float32, copy=True)
                    v_sum[tid]  = float(vol)
                    n_cnt[tid]  = 1
                    tok_pref[tid] = tok
                else:
                    x_sum[tid] += X
                    v_sum[tid] += float(vol)
                    n_cnt[tid] += 1

            self._token_to_id.clear(); self._id_to_token.clear()
            self._id_to_vec.clear();   self._id_to_volume.clear(); self._valid_token_ids.clear()
            self._id_to_pooled.clear()

            for tid in x_sum.keys():  # order not critical; add sorted(tids) if you need determinism
                X_mean = x_sum[tid] / float(n_cnt[tid])
                if cache_pooled:
                    self._id_to_pooled[tid] = X_mean.mean(axis=0).astype(np.float32, copy=False)
                X_store = _finalize_if_needed(X_mean)

                tok    = tok_pref[tid]
                vol_m  = v_sum[tid] / float(n_cnt[tid])

                self._token_to_id[tok] = tid
                self._id_to_token[tid] = tok
                if store in ("full", "both"):
                    self._id_to_vec[tid] = np.asarray(X_store, np.float32, order="C")
                elif store == "pooled":
                    self._id_to_vec[tid] = (self._id_to_pooled[tid] if cache_pooled
                                            else X_mean.mean(axis=0).astype(np.float32, copy=False))
                self._id_to_volume[tid] = float(vol_m)
                self._valid_token_ids.add(tid)

        # ---------- specials ----------
        if manifest_specials and base_set:
            self._manifest_special_tokens(
                base_set=base_set,
                create_crystal=create_crystal,
                callback=callback,
                create_config=create_config or {}
            )

    # -------- override pooled() to use cache (if present) --------
    def pooled(self, token_or_id: Union[str, int], method: str = "mean") -> Optional[np.ndarray]:
        # Favor cached pooled when available; fallback to base (computes mean)
        tid = token_or_id if isinstance(token_or_id, int) else self._token_to_id.get(token_or_id)
        if tid is not None and tid in self._id_to_pooled:
            return self._id_to_pooled[tid]
        return super().pooled(token_or_id, method=method)

    # -------- SP-like surface --------
    def encode(self, token: str, *, return_id: bool = False) -> Union[np.ndarray, Tuple[np.ndarray, int]]:
        tid = self._token_to_id.get(token)
        if tid is None:
            unk_id = self._token_to_id.get("<unk>")
            if unk_id is None:
                raise KeyError(f"Token '{token}' not found and '<unk>' missing.")
            X = self._id_to_vec[unk_id]
            return (X, unk_id) if return_id else X
        X = self._id_to_vec[tid]
        return (X, tid) if return_id else X

    def get_score(self, token_or_id: Union[str, int]) -> float:
        tid = token_or_id if isinstance(token_or_id, int) else self._token_to_id.get(token_or_id, None)
        if tid is None or tid not in self._valid_token_ids:
            return -100.0
        vol = self._id_to_volume.get(tid, 1.0)
        return float(np.clip(vol / 10.0, 0.01, 1.0))

    # -------- Torch cache ----------
    def cache(self, tokens: Union[List[str], Dict[str, int]], device: str = "cpu", dtype: torch.dtype = torch.float32):
        tok_list = list(tokens.keys()) if isinstance(tokens, dict) else list(tokens)
        mats, pooled, keep = [], [], []
        for t in tok_list:
            X = self.embedding(t)
            v = self.pooled(t)
            if X is None or v is None:
                continue
            mats.append(torch.as_tensor(X, dtype=dtype))
            pooled.append(torch.as_tensor(v, dtype=dtype))
            keep.append(t)
        if not mats:
            raise ValueError("No valid tokens found in input.")
        return {
            "tokens": keep,
            "crystals": torch.stack(mats, 0).to(device),
            "pooled":   torch.stack(pooled, 0).to(device),
        }


    def _coerce_crystal_shape(
        self,
        raw: Any,
        *,
        vertex_count: int,
        reshape_order: str,
        infer_dim: bool,
        strict_shapes: bool
    ) -> np.ndarray:
        """
        Accepts raw crystal data and returns [vertex_count, self.dim] float32 C-order.

        Acceptable inputs:
          - [vertex_count, D]
          - [vertex_count * D] (flat)  -> reshaped to [vertex_count, D]
          - [D] (pooled center)       -> converted by deterministic pentachoron (fallback)
        """
        X = np.asarray(raw, dtype=np.float32)

        # Already [V, D]
        if X.ndim == 2:
            V, D = int(X.shape[0]), int(X.shape[1])
            if V != vertex_count:
                if strict_shapes:
                    raise ValueError(f"Crystal has {V} vertices, expected {vertex_count}.")
                # Gentle fallback: attempt to treat rows as vertices if divisible
                if V * D % vertex_count == 0 and infer_dim:
                    # e.g., [10, D] -> try to collapse/average into [5,D]? Not safe.
                    # Safer: hard error to avoid silent geometry change.
                    raise ValueError(f"Unexpected vertex rows {V}; refusing to coerce silently.")
                else:
                    raise ValueError(f"Crystal has {V} vertices, expected {vertex_count}.")
            # Update dim if needed
            if D != self.dim:
                if infer_dim:
                    self.dim = D
                else:
                    raise ValueError(f"Dim mismatch: got D={D}, expected dim={self.dim}.")
            # Ensure mean-centered (finalize handles centering)
            return X

        # Flat [V*D]
        if X.ndim == 1:
            n = int(X.size)
            # Exact match for flat crystal
            if n == vertex_count * self.dim:
                return np.reshape(X, (vertex_count, self.dim), order=reshape_order)

            # Infer D from total length if divisible
            if infer_dim and n % vertex_count == 0:
                inferred = n // vertex_count
                self.dim = int(inferred)
                return np.reshape(X, (vertex_count, self.dim), order=reshape_order)

            # Pooled [D]: inflate deterministically to [V, D]
            if n == self.dim:
                c = X / (np.abs(X).sum() + 1e-8)  # L1
                return self._deterministic_pentachoron(c)

            if strict_shapes:
                raise ValueError(
                    f"Cannot coerce crystal of length {n}. "
                    f"Expected {vertex_count*self.dim} (flat) or {self.dim} (pooled)."
                )
            # Conservative fallback: treat as pooled center with inferred D if reasonable
            if infer_dim and n > 0:
                self.dim = n
                c = X / (np.abs(X).sum() + 1e-8)
                return self._deterministic_pentachoron(c)

        raise ValueError(f"Unsupported crystal shape {X.shape} (ndim={X.ndim}).")


    # -------- Introspection --------
    def describe(self) -> Dict[str, Union[str, int]]:
        return {"repo": self.repo_id, "dimension": self.dim, "vocab_size": self.vocab_size()}


from __future__ import annotations
import torch
import numpy as np
from abc import ABC, abstractmethod
from typing import Dict, Union, Tuple, Optional, Callable, Any, List
import warnings
from collections import OrderedDict
import datasets
from datasets import load_dataset

# Global flag for warning suppression
SILENT_MODE = False

def set_silent_mode(silent: bool):
    """Set global silent mode for token synthesis warnings"""
    global SILENT_MODE
    SILENT_MODE = silent

class LRUCache(OrderedDict):
    """Simple LRU cache implementation"""
    def __init__(self, maxsize=128):
        super().__init__()
        self.maxsize = maxsize

    def __getitem__(self, key):
        value = super().__getitem__(key)
        self.move_to_end(key)
        return value

    def __setitem__(self, key, value):
        if key in self:
            self.move_to_end(key)
        super().__setitem__(key, value)
        if len(self) > self.maxsize:
            oldest = next(iter(self))
            del self[oldest]


class LazyGeometricVocab(GeometricVocab):
    """
    Lazy-loading geometric vocabulary that loads tokens on demand.
    Maintains a small working set in memory with LRU eviction.
    Supports automatic token synthesis for missing tokens.
    """
    
    def __init__(
        self,
        repo_id: str,
        dim: int,
        *,
        name: str = "unicode_100d",  # Updated default to match new structure
        split: str = "train",  # Updated default to "train"
        stream: bool = True,  # Use streaming by default to avoid bulk downloads
        base_set: Optional[Dict[str, int]] = None,
        create_config: Optional[Dict[str, Any]] = None,
        create_crystal: Optional[Callable] = None,
        callback: Optional[Callable] = None,
        manifest_specials: bool = True,
        # Lazy loading parameters
        cache_size: int = 1000,  # Max tokens to keep in memory
        preload_tokens: Optional[List[str]] = None,  # Critical tokens to preload
        index_cache_path: Optional[str] = None,  # Path to save/load index
        # Tokenization
        tokenizer: Optional[Callable[[str], List[str]]] = None,  # Custom tokenizer
        # Synthesis settings
        silent: bool = False,  # Suppress synthesis warnings
        # Performance knobs
        store: str = "full",
        reshape_order: str = "C",
        vertex_count: int = 5,
        infer_dim: bool = True,
        finalize_mode: str = "post_mean",
        cache_pooled: bool = True,
    ):
        super().__init__(dim)
        
        self.repo_id = repo_id
        self.name = name
        self.split = split
        self.stream = stream
        self.vertex_count = vertex_count
        self.reshape_order = reshape_order
        self.infer_dim = infer_dim
        self.finalize_mode = finalize_mode
        self.store = store
        self.cache_pooled = cache_pooled
        self.silent = silent
        
        # Initialize pooled dictionary that may be missing from parent class
        if not hasattr(self, '_id_to_pooled'):
            self._id_to_pooled = {}
        
        # For synthesis
        self.create_crystal_fn = create_crystal
        self.callback_fn = callback
        self.create_config = create_config or {}
        self._synthesized_tokens: set = set()
        self._next_synthetic_id = -1  # Use negative IDs for synthetic tokens
        
        # Tokenizer - default to simple split
        self.tokenizer = tokenizer or (lambda s: s.split())
        
        # LRU caches for lazy loading
        self._crystal_cache = LRUCache(maxsize=cache_size)
        self._pooled_lru = LRUCache(maxsize=cache_size * 2)  # Pooled vectors are smaller
        
        # Load dataset but don't fetch data yet
        self._dataset = None
        self._dataset_stream = None
        self._token_index: Dict[str, List[int]] = {}  # token -> [row indices]
        self._id_index: Dict[int, List[int]] = {}  # token_id -> [row indices]
        self._row_data: Dict[int, dict] = {}  # row -> cached data
        
        # Initialize index
        self._build_index(split, name)
        
        # Pre-load base characters for synthesis
        self._preload_synthesis_base()
        
        # Preload critical tokens if specified
        if preload_tokens:
            self._preload(preload_tokens)
            
        # Manifest special tokens
        if manifest_specials and base_set:
            self._manifest_special_tokens(
                base_set=base_set,
                create_crystal=create_crystal,
                callback=callback,
                create_config=create_config or {}
            )
    
    def _preload_synthesis_base(self):
        """Pre-load basic ASCII characters needed for synthesis"""
        # Essential characters that are commonly used in token synthesis
        base_chars = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789 .,!?-_()[]{}:;'\"")
        
        print(f"Pre-loading {len(base_chars)} base characters for synthesis...")
        loaded = 0
        for char in base_chars:
            tid = self._token_to_id.get(char)
            if tid:
                # Pre-load this character's embedding
                if self._load_crystal(tid) is not None:
                    loaded += 1
        print(f"Loaded {loaded} base characters")
    
    def _build_index(self, split: str, name: str):
        """Build token/id index without loading crystal data"""
        print(f"Building index for {self.repo_id}/{name}/{split}...")
        
        if self.stream:
            try:
                # Use streaming to avoid downloading all splits
                # Don't specify columns in streaming mode to avoid schema issues
                self._dataset_stream = load_dataset(
                    self.repo_id,
                    name=name,
                    split=split,
                    streaming=True
                )
                
                # Build index from streaming dataset
                for idx, row in enumerate(self._dataset_stream):
                    token = str(row["token"])
                    token_id = int(row["token_id"])
                    
                    # Token index
                    if token not in self._token_index:
                        self._token_index[token] = []
                    self._token_index[token].append(idx)
                    
                    # ID index
                    if token_id not in self._id_index:
                        self._id_index[token_id] = []
                    self._id_index[token_id].append(idx)
                    
                    # Update mappings (use first occurrence)
                    if token not in self._token_to_id:
                        self._token_to_id[token] = token_id
                        self._id_to_token[token_id] = token
                        self._valid_token_ids.add(token_id)
                
                print(f"Index built (streaming): {len(self._token_index)} unique tokens")
                
            except Exception as e:
                print(f"Streaming failed: {e}")
                print("Falling back to non-streaming mode...")
                self.stream = False
                # Recursive call with streaming disabled
                self._build_index(split, name)
                
        else:
            # Non-streaming mode - load dataset normally
            try:
                # Try with data_files to load only specific split
                data_files = f"data/{name}/{split}-*.parquet"
                ds = load_dataset(
                    self.repo_id,
                    data_files=data_files,
                    split="train"
                )
            except:
                # Fallback to normal loading
                try:
                    ds = load_dataset(
                        self.repo_id,
                        name=name,
                        split=split
                    )
                except Exception as e:
                    print(f"Failed to load dataset: {e}")
                    raise
            
            # Build indices
            for idx, row in enumerate(ds):
                token = str(row["token"])
                token_id = int(row["token_id"])
                
                # Token index
                if token not in self._token_index:
                    self._token_index[token] = []
                self._token_index[token].append(idx)
                
                # ID index
                if token_id not in self._id_index:
                    self._id_index[token_id] = []
                self._id_index[token_id].append(idx)
                
                # Update mappings (use first occurrence)
                if token not in self._token_to_id:
                    self._token_to_id[token] = token_id
                    self._id_to_token[token_id] = token
                    self._valid_token_ids.add(token_id)
            
            # Store dataset reference (will lazy load full data)
            self._dataset = ds
            print(f"Index built: {len(self._token_index)} unique tokens")
    
    def _load_row(self, row_idx: int) -> dict:
        """Load a single row from dataset"""
        if row_idx in self._row_data:
            return self._row_data[row_idx]
        
        # If streaming, need to load the full dataset on first data access
        if self.stream and self._dataset is None:
            print(f"Loading full dataset for {self.repo_id}/{self.name}/{self.split}...")
            try:
                # Try with data_files first
                data_files = f"data/{self.name}/{self.split}-*.parquet"
                self._dataset = load_dataset(
                    self.repo_id,
                    data_files=data_files,
                    split="train"
                )
            except:
                # Fallback to normal loading
                self._dataset = load_dataset(
                    self.repo_id,
                    name=self.name,
                    split=self.split
                )
            
        if self._dataset is None:
            raise RuntimeError("Dataset not initialized")
            
        row = self._dataset[row_idx]
        self._row_data[row_idx] = row
        return row
    
    def _load_crystal(self, token_id: int) -> Optional[np.ndarray]:
        """Load and aggregate crystal for a token_id"""
        if token_id in self._crystal_cache:
            return self._crystal_cache[token_id]
            
        if token_id not in self._id_index:
            return None
            
        row_indices = self._id_index[token_id]
        crystals = []
        volumes = []
        
        for idx in row_indices:
            row = self._load_row(idx)
            
            # Parse crystal
            raw_crystal = row.get("crystal")
            if raw_crystal is not None:
                X = self._coerce_crystal(raw_crystal)
                crystals.append(X)
                
                # Get volume if available
                vol = row.get("volume", 1.0)
                volumes.append(float(vol))
        
        if not crystals:
            return None
            
        # Average multiple occurrences
        if len(crystals) == 1:
            X_final = crystals[0]
            vol_final = volumes[0]
        else:
            X_final = np.mean(crystals, axis=0)
            vol_final = np.mean(volumes)
        
        # Finalize
        if self.finalize_mode == "post_mean":
            X_final = self._finalize_crystal(X_final)
            
        # Cache based on store mode
        if self.store in ("full", "both"):
            self._crystal_cache[token_id] = X_final
            self._id_to_vec[token_id] = X_final
            
        # Cache pooled if requested
        if self.cache_pooled:
            pooled = X_final.mean(axis=0)
            self._pooled_lru[token_id] = pooled
            if token_id not in self._id_to_pooled:
                self._id_to_pooled[token_id] = pooled
                
        # Store volume
        self._id_to_volume[token_id] = vol_final
        
        return X_final
    
    def _coerce_crystal(self, raw: Any) -> np.ndarray:
        """Convert raw crystal data to proper shape"""
        X = np.asarray(raw, dtype=np.float32)
        
        if X.ndim == 2:
            V, D = X.shape
            if V != self.vertex_count:
                raise ValueError(f"Expected {self.vertex_count} vertices, got {V}")
            if D != self.dim:
                if self.infer_dim:
                    self.dim = D
                else:
                    raise ValueError(f"Dimension mismatch: {D} vs {self.dim}")
            return X
            
        if X.ndim == 1:
            n = X.size
            if n == self.vertex_count * self.dim:
                return X.reshape((self.vertex_count, self.dim), order=self.reshape_order)
            if self.infer_dim and n % self.vertex_count == 0:
                self.dim = n // self.vertex_count
                return X.reshape((self.vertex_count, self.dim), order=self.reshape_order)
            if n == self.dim:
                # Pooled vector - expand to crystal
                c = X / (np.abs(X).sum() + 1e-8)
                return self._deterministic_pentachoron(c)
                
        raise ValueError(f"Cannot coerce crystal shape {X.shape}")
    
    def _synthesize_token(self, token: str) -> int:
        """Synthesize a new token embedding on-the-fly with fallback for missing chars."""
        # Generate a new ID for synthetic token
        tid = self._next_synthetic_id
        self._next_synthetic_id -= 1
        
        # Warn user unless silenced
        if not self.silent and not SILENT_MODE:
            warnings.warn(
                f"Token '{token}' synthesized - ensure you synthesize your tokens ahead of time.",
                UserWarning,
                stacklevel=3
            )
        
        # Track as synthesized
        self._synthesized_tokens.add(token)
        
        # Try to use character-based synthesis first
        try:
            # Check if all characters are available
            missing_chars = []
            for char in token:
                if char not in self._token_to_id and char not in self._char_cache:
                    missing_chars.append(char)
            
            # If missing chars, try to load or synthesize them first
            if missing_chars:
                for char in missing_chars:
                    char_tid = self._token_to_id.get(char)
                    if char_tid:
                        # Try to load it
                        self._load_crystal(char_tid)
                    else:
                        # Create a simple embedding for this character
                        self._synthesize_simple_char(char)
            
            # Now try the full synthesis
            helpers = self._helpers()
            cfg = {
                "dim": self.dim,
                "pool_type": self.create_config.get("pool_type", "unicode"),
                "data": {"token": token, "token_id": tid, "origin": "synthetic"},
                "helpers": helpers,
            }
            
            if self.create_crystal_fn is not None:
                product = self.create_crystal_fn(cfg, self.callback_fn)
            else:
                product = self._default_create_crystal(cfg, self._default_unicode_callback)
            
            X, prov = self._finalize_crystal_and_provenance(product, cfg)
            
        except Exception as e:
            # Fallback to simple random synthesis
            print(f"Character-based synthesis failed for '{token}': {e}. Using random synthesis.")
            X = self._synthesize_random_crystal(token)
            prov = {"source": "synthetic_random", "token": token}
        
        prov["synthetic"] = True
        
        # Register in all maps
        self._token_to_id[token] = tid
        self._id_to_token[tid] = token
        self._id_to_vec[tid] = X.astype(np.float32, copy=False, order='C')
        self._id_to_provenance[tid] = prov
        self._valid_token_ids.add(tid)
        self._id_to_volume[tid] = 1.0
        
        # Cache
        self._crystal_cache[tid] = X
        if self.cache_pooled:
            pooled = X.mean(axis=0)
            self._pooled_lru[tid] = pooled
            self._id_to_pooled[tid] = pooled
        
        return tid
    
    def _synthesize_simple_char(self, char: str):
        """Create a simple deterministic embedding for a single character"""
        import hashlib
        
        # Use character's unicode codepoint for deterministic generation
        if len(char) == 1:
            seed = ord(char)
        else:
            seed = int(hashlib.md5(char.encode()).hexdigest()[:8], 16)
        
        np.random.seed(seed)
        
        # Generate a simple vector based on character properties
        vec = np.random.randn(self.dim).astype(np.float32)
        vec = vec / (np.abs(vec).sum() + 1e-8)  # L1 normalize
        
        # Cache it
        self._char_cache[char] = vec
        
    def _synthesize_random_crystal(self, token: str) -> np.ndarray:
        """Fallback: create a deterministic random crystal based on token string"""
        import hashlib
        
        # Create deterministic seed from token
        seed = int(hashlib.md5(token.encode()).hexdigest()[:8], 16)
        np.random.seed(seed)
        
        # Generate a random crystal
        X = np.random.randn(self.vertex_count, self.dim).astype(np.float32)
        X = self._finalize_crystal(X)
        
        return X
    
    def _preload(self, tokens: List[str]):
        """Preload specific tokens into cache"""
        print(f"Preloading {len(tokens)} tokens...")
        for token in tokens:
            tid = self._token_to_id.get(token)
            if tid:
                self._load_crystal(tid)
    
    # Override base methods to use lazy loading with synthesis
    
    def embedding(self, token_or_id: Union[str, int], generate: bool = False) -> Optional[np.ndarray]:
        """Get embedding, loading if necessary, synthesizing if requested"""
        # Handle token ID input
        if isinstance(token_or_id, int):
            tid = token_or_id
            token = self._id_to_token.get(tid)
        else:
            token = token_or_id
            tid = self._token_to_id.get(token)
        
        if tid is not None:
            # Check cache first
            if tid in self._id_to_vec:
                return self._id_to_vec[tid]
            # Load on demand
            return self._load_crystal(tid)
        
        # Token not found - synthesize if requested
        if generate and token is not None:
            tid = self._synthesize_token(token)
            return self._id_to_vec[tid]
        
        return None
    
    def pooled(self, token_or_id: Union[str, int], method: str = "mean", generate: bool = False) -> Optional[np.ndarray]:
        """Get pooled vector, loading if necessary, synthesizing if requested"""
        # Handle token ID input
        if isinstance(token_or_id, int):
            tid = token_or_id
            token = self._id_to_token.get(tid)
        else:
            token = token_or_id
            tid = self._token_to_id.get(token)
        
        if tid is not None:
            # Check pooled cache
            if tid in self._pooled_lru:
                return self._pooled_lru[tid]
            if tid in self._id_to_pooled:
                return self._id_to_pooled[tid]
                
            # Load crystal and compute pooled
            X = self.embedding(tid, generate=False)
            if X is not None:
                if method == "mean":
                    pooled = X.mean(axis=0)
                    self._pooled_lru[tid] = pooled
                    return pooled
                elif method == "first":
                    return X[0]
                elif method == "sum":
                    return X.sum(axis=0)
                else:
                    raise ValueError(f"Unknown pooling method: {method}")
        
        # Token not found - synthesize if requested
        if generate and token is not None:
            tid = self._synthesize_token(token)
            return self.pooled(tid, method=method, generate=False)
            
        return None
    
    def encode(self, token: str, *, return_id: bool = False, generate: bool = False) -> Union[np.ndarray, Tuple[np.ndarray, int]]:
        """Encode token, loading if necessary, synthesizing if requested"""
        tid = self._token_to_id.get(token)
        
        if tid is None:
            if generate:
                # Synthesize new token
                tid = self._synthesize_token(token)
                X = self._id_to_vec[tid]
            else:
                # Fallback to UNK
                unk_id = self._token_to_id.get("<unk>")
                if unk_id is None:
                    # No UNK token - try to synthesize if allowed
                    if generate:
                        tid = self._synthesize_token(token)
                        X = self._id_to_vec[tid]
                    else:
                        raise KeyError(f"Token '{token}' not found and no <unk> token available")
                else:
                    X = self.embedding(unk_id, generate=False)
                    tid = unk_id
        else:
            X = self.embedding(tid, generate=False)
            if X is None:
                raise RuntimeError(f"Failed to load embedding for token '{token}'")
                
        return (X, tid) if return_id else X
    
    def get_score(self, token_or_id: Union[str, int]) -> float:
        """Get token score"""
        tid = token_or_id if isinstance(token_or_id, int) else self._token_to_id.get(token_or_id)
        if tid is None or tid not in self._valid_token_ids:
            return -100.0
            
        # Load volume if needed
        if tid not in self._id_to_volume:
            self._load_crystal(tid)
            
        vol = self._id_to_volume.get(tid, 1.0)
        return float(np.clip(vol / 10.0, 0.01, 1.0))
    
    def encode_batch(self, tokens: Union[str, List[str]], 
                     *, return_ids: bool = False,
                     prefetch: bool = True,
                     generate: bool = False) -> Union[List[np.ndarray], Tuple[List[np.ndarray], List[int]]]:
        """
        Encode a batch of tokens efficiently.
        
        Args:
            tokens: Either a string (will be tokenized) or list of token strings
            return_ids: Whether to return token IDs alongside embeddings
            prefetch: Whether to prefetch all tokens before encoding
            generate: If True, synthesize missing tokens
        
        Returns:
            List of embeddings, optionally with list of token IDs
        """
        # Handle string input - tokenize it
        if isinstance(tokens, str):
            tokens = self.tokenizer(tokens)
        
        if not isinstance(tokens, list):
            raise TypeError(f"Expected str or List[str], got {type(tokens)}")
        
        # Track which tokens need synthesis
        tokens_to_synthesize = []
        if generate:
            for token in tokens:
                if token not in self._token_to_id:
                    tokens_to_synthesize.append(token)
        
        # Warn about batch synthesis if needed
        if tokens_to_synthesize and not self.silent and not SILENT_MODE:
            warnings.warn(
                f"{len(tokens_to_synthesize)} tokens synthesized - ensure you synthesize your tokens ahead of time. "
                f"Synthesized: {tokens_to_synthesize[:5]}{'...' if len(tokens_to_synthesize) > 5 else ''}",
                UserWarning,
                stacklevel=2
            )
        
        # Prefetch existing tokens if requested
        if prefetch:
            self._prefetch_batch([t for t in tokens if t in self._token_to_id])
        
        # Encode all tokens
        embeddings = []
        ids = []
        
        for token in tokens:
            if return_ids:
                emb, tid = self.encode(token, return_id=True, generate=generate)
                embeddings.append(emb)
                ids.append(tid)
            else:
                emb = self.encode(token, return_id=False, generate=generate)
                embeddings.append(emb)
        
        return (embeddings, ids) if return_ids else embeddings
    
    def _prefetch_batch(self, tokens: List[str]):
        """
        Prefetch a batch of tokens efficiently.
        """
        # Collect all token IDs that need loading
        tokens_to_load = []
        for token in tokens:
            tid = self._token_to_id.get(token)
            if tid and tid not in self._crystal_cache and tid not in self._id_to_vec:
                tokens_to_load.append(tid)
        
        if not tokens_to_load:
            return  # Everything already cached
        
        # Load crystals for each token
        for tid in tokens_to_load:
            self._load_crystal(tid)
    
    def cache_stats(self) -> Dict[str, Any]:
        """Get cache statistics"""
        stats = super().cache_stats()
        stats.update({
            "crystal_cache_size": len(self._crystal_cache),
            "pooled_lru_size": len(self._pooled_lru),
            "rows_cached": len(self._row_data),
            "tokens_indexed": len(self._token_index),
            "ids_indexed": len(self._id_index),
            "synthesized_tokens": len(self._synthesized_tokens),
        })
        return stats
    
    def evict_from_cache(self, tokens: Optional[List[str]] = None):
        """Manually evict tokens from cache to free memory"""
        if tokens is None:
            # Clear all caches
            self._crystal_cache.clear()
            self._pooled_lru.clear()
            self._id_to_vec.clear()
            self._id_to_pooled.clear()
            self._row_data.clear()
        else:
            # Evict specific tokens
            for token in tokens:
                tid = self._token_to_id.get(token)
                if tid:
                    self._crystal_cache.pop(tid, None)
                    self._pooled_lru.pop(tid, None)
                    self._id_to_vec.pop(tid, None)
                    self._id_to_pooled.pop(tid, None)
    
    def get_synthesized_tokens(self) -> List[str]:
        """Get list of all tokens that were synthesized at runtime"""
        return list(self._synthesized_tokens)
    
    def is_synthesized(self, token: str) -> bool:
        """Check if a token was synthesized at runtime"""
        return token in self._synthesized_tokens




# For 100-dimensional embeddings
vocab = LazyGeometricVocab(
    repo_id="AbstractPhil/geometric-vocab",
    dim=64,
    name="unicode_64d",  # Specifies the dimension config
    split="train",        # Now always "train"
    stream=False,
    cache_size=1024,
    silent=False
)
FROZEN_VOCAB = []