File size: 12,668 Bytes

dbb04e4

"""

Contextual Query Masking via XOR Attention (Phase 4.0)

======================================================

Implements an XOR-based soft attention mechanism over Binary HDV space.



How it works:

  1. A "context key" is constructed by bundling recent HOT-tier vectors.

  2. A XOR attention mask is generated:  mask = query XOR context_key

     This creates a residual vector that is ORTHOGONAL to the context,

     effectively suppressing already-known dimensions and amplifying novel ones.

  3. Query results are re-ranked by a composite score:

        composite = alpha * raw_similarity + beta * novelty_boost(mask, mem_hdv)

  4. The mask is also available for downstream gap-detection.



Motivation (VSA theory):

  - XOR in binary HDV space is the self-inverse binding operator.

  - query.xor(context) ≈ "what about this query is NOT already represented in context?"

  - Hamming similarity(mask, candidate) ≈ novelty of candidate relative to context.



Phase 4.1: XOR-based Project Isolation

======================================

XORIsolationMask provides deterministic project-based memory isolation:



  - Each project_id derives a unique binary mask via SHA256(project_id) -> seed -> RNG

  - store(): masked_hdv = original_hdv XOR project_mask

  - query(): masked_query = query_hdv XOR project_mask (then search in masked space)

  - Memories from different projects are effectively orthogonal (~50% similarity)

"""

from __future__ import annotations

import hashlib
from dataclasses import dataclass, field
from typing import Dict, List, Optional, Tuple

import numpy as np
from loguru import logger

from .binary_hdv import BinaryHDV, majority_bundle


@dataclass
class AttentionConfig:
    """Tunable hyperparameters for XOR attention.

    ``alpha`` and ``beta`` weight the two terms of the composite score
    (``alpha * raw_similarity + beta * novelty``). Nothing is checked at
    construction time; call :meth:`validate` to verify the weights.
    """
    alpha: float = 0.6          # Weight for raw similarity
    beta: float = 0.4           # Weight for novelty score from XOR mask
    context_sample_n: int = 50  # How many HOT nodes to include in context key
    min_novelty_boost: float = 0.0  # Floor for novelty contribution
    enabled: bool = True        # On/off flag; nothing in this dataclass reads it

    def validate(self) -> None:
        """Check that ``alpha`` and ``beta`` form a convex combination.

        Raises:
            AssertionError: if ``alpha`` or ``beta`` lies outside [0, 1], or
                if their sum differs from 1.0 by 1e-6 or more.
        """
        # Raised explicitly instead of via ``assert`` so the checks still run
        # under ``python -O``. The exception type and messages are unchanged
        # for callers that already catch AssertionError.
        if not 0.0 <= self.alpha <= 1.0:
            raise AssertionError("alpha must be in [0, 1]")
        if not 0.0 <= self.beta <= 1.0:
            raise AssertionError("beta must be in [0, 1]")
        if not abs((self.alpha + self.beta) - 1.0) < 1e-6:
            raise AssertionError("alpha + beta must equal 1.0")


@dataclass
class AttentionResult:
    """Score breakdown for one memory candidate after contextual reranking."""
    node_id: str            # Identifier of the scored memory node
    raw_score: float        # Similarity reported by the initial retrieval
    novelty_score: float    # Novelty term that went into the composite
    composite_score: float  # Blended score used for ordering
    # Mask the candidate was scored against; left out of repr().
    attention_mask: Optional[BinaryHDV] = field(repr=False, default=None)


class XORAttentionMasker:
    """
    Contextual query masking using XOR binding in binary HDV space.

    Usage:
        masker = XORAttentionMasker(config)
        context_key = masker.build_context_key(context_vecs)
        mask = masker.build_attention_mask(query_vec, context_key)
        reranked = masker.rerank(raw_scores, memory_vectors, mask)
        scores = masker.extract_scores(reranked)
    """

    def __init__(self, config: Optional[AttentionConfig] = None):
        """Keep the given hyperparameters, or a default ``AttentionConfig()``."""
        self.config = config or AttentionConfig()

    def build_context_key(
        self,
        context_nodes_hdv: List[BinaryHDV],
        *,
        dimension: int = 16384,
    ) -> BinaryHDV:
        """
        Bundle HOT-tier vectors into a single context summary key.

        Args:
            context_nodes_hdv: Vectors to combine with ``majority_bundle``.
            dimension: Size passed to ``BinaryHDV.zeros`` when there is no
                context to bundle. Defaults to 16384, the value that was
                previously hard-coded for this fallback.

        Returns:
            ``majority_bundle(context_nodes_hdv)``, or
            ``BinaryHDV.zeros(dimension)`` when the list is empty.
        """
        if not context_nodes_hdv:
            # An empty list has no element to read a dimension from, so the
            # caller-supplied (or default) dimension is the only source.
            return BinaryHDV.zeros(dimension)

        return majority_bundle(context_nodes_hdv)

    def build_attention_mask(
        self,
        query_vec: BinaryHDV,
        context_key: BinaryHDV,
    ) -> BinaryHDV:
        """
        Compute the XOR attention mask: mask = query XOR context_key.

        The mask represents "query minus context": bits on which the query
        differs from what the system already holds in working memory.

        Args:
            query_vec: The query vector.
            context_key: Context summary, e.g. from ``build_context_key``.

        Returns:
            ``query_vec.xor_bind(context_key)``.
        """
        mask = query_vec.xor_bind(context_key)
        logger.debug(
            "Built XOR attention mask — "
            f"query/context Hamming dist = {query_vec.normalized_distance(context_key):.4f}"
        )
        return mask

    def novelty_score(self, mask: BinaryHDV, candidate_hdv: BinaryHDV) -> float:
        """
        Score a candidate against the attention mask.

        Returns ``mask.similarity(candidate_hdv)``; a higher value is treated
        as more "attention-worthy" given the query context. The value is
        expected to lie in [0, 1] for a Hamming similarity.
        """
        return mask.similarity(candidate_hdv)

    def rerank(
        self,
        raw_scores: Dict[str, float],
        memory_vectors: Dict[str, BinaryHDV],
        mask: BinaryHDV,
    ) -> List[AttentionResult]:
        """
        Re-rank retrieved memories using the composite XOR attention score.

        composite = alpha * raw_similarity + beta * novelty, where novelty is
        ``novelty_score(mask, vector)`` floored at ``min_novelty_boost``.

        Args:
            raw_scores: {node_id: raw_similarity} from initial retrieval.
            memory_vectors: {node_id: BinaryHDV} for novelty calculation.
                A node with no entry here (or a ``None`` entry) gets
                ``min_novelty_boost`` as its novelty.
            mask: XOR attention mask built from query and context.

        Returns:
            List of AttentionResult sorted by composite score, highest first.
            Every result references the same ``mask`` object.
        """
        cfg = self.config
        results: List[AttentionResult] = []

        for node_id, raw in raw_scores.items():
            hdv = memory_vectors.get(node_id)
            if hdv is None:
                # No vector available: fall back to the configured floor.
                novelty = cfg.min_novelty_boost
            else:
                novelty = max(self.novelty_score(mask, hdv), cfg.min_novelty_boost)

            composite = cfg.alpha * raw + cfg.beta * novelty

            results.append(
                AttentionResult(
                    node_id=node_id,
                    raw_score=raw,
                    novelty_score=novelty,
                    composite_score=composite,
                    attention_mask=mask,
                )
            )

        results.sort(key=lambda r: r.composite_score, reverse=True)
        return results

    def extract_scores(
        self, results: List[AttentionResult]
    ) -> List[Tuple[str, float]]:
        """Convert AttentionResult list to the standard (node_id, score) tuple format."""
        return [(r.node_id, r.composite_score) for r in results]


# ==============================================================================
# Phase 4.1: XOR-based Project Isolation
# ==============================================================================


@dataclass
class IsolationConfig:
    """Configuration for XOR-based project isolation.

    Nothing is checked at construction time; call :meth:`validate` to verify
    the dimension.
    """
    enabled: bool = True    # When False, XORIsolationMask.apply_mask is a no-op
    dimension: int = 16384  # Mask length in bits; masks are built from dimension // 8 bytes

    def validate(self) -> None:
        """Check that ``dimension`` is a positive multiple of 8.

        Raises:
            AssertionError: if ``dimension`` is not positive or is not
                divisible by 8.
        """
        # Raised explicitly instead of via ``assert`` so the checks still run
        # under ``python -O``. The exception type and messages are unchanged
        # for callers that already catch AssertionError.
        if not self.dimension > 0:
            raise AssertionError("dimension must be positive")
        if not self.dimension % 8 == 0:
            raise AssertionError("dimension must be multiple of 8")


class XORIsolationMask:
    """
    Deterministic per-project XOR masks for multi-tenant memory separation.

    Mask derivation:
        SHA256("mnemo_isolation_v1:" + project_id) -> first 8 digest bytes
        -> integer seed -> np.random.default_rng(seed) -> dimension // 8 random bytes

    How the mask is used:
        - store:   masked_hdv   = original_hdv XOR mask(project_id)
        - query:   masked_query = query_hdv    XOR mask(project_id)
        - inspect: original     = masked_hdv   XOR mask(project_id)

    Properties:
        - Deterministic: the mask depends only on project_id and the
          configured dimension.
        - Self-inverse: applying the same mask twice undoes it (XOR binding).
        - Cached: each project's mask is built once per instance.
        - Vectors masked for different projects are expected to look
          unrelated to each other (~50% similarity).

    Caveats:
        - The mask is a pure function of project_id and is reused for every
          vector of that project, so anyone who knows the project_id can
          rebuild it. Treat this as namespace separation whose strength rests
          on project_id secrecy, not as encryption.

    Usage:
        masker = XORIsolationMask(config)
        masked_hdv = masker.apply_mask(original_hdv, "project-alpha")
        masked_query = masker.apply_mask(query_hdv, "project-alpha")
        original = masker.remove_mask(masked_hdv, "project-alpha")
    """

    def __init__(self, config: Optional[IsolationConfig] = None):
        """Keep the given settings (or defaults) and start with an empty cache."""
        if not config:
            config = IsolationConfig()
        self.config = config
        self._mask_cache: Dict[str, BinaryHDV] = {}

    def _derive_seed(self, project_id: str) -> int:
        """
        Turn a project_id into a deterministic 64-bit RNG seed.

        Args:
            project_id: Unique project identifier string.

        Returns:
            The first 8 bytes of SHA256("mnemo_isolation_v1:<project_id>"),
            read as an unsigned big-endian integer.
        """
        payload = f"mnemo_isolation_v1:{project_id}".encode()
        leading = hashlib.sha256(payload).digest()[:8]
        return int.from_bytes(leading, byteorder="big", signed=False)

    def get_mask(self, project_id: str) -> BinaryHDV:
        """
        Return the isolation mask for a project, building it on first use.

        Args:
            project_id: Unique project identifier.

        Returns:
            BinaryHDV mask of dimension self.config.dimension; repeated calls
            with the same project_id return the cached object.
        """
        try:
            return self._mask_cache[project_id]
        except KeyError:
            pass  # First request for this project: build the mask below.

        seed = self._derive_seed(project_id)
        dimension = self.config.dimension

        # One random byte per 8 bits of the mask, drawn from the seeded RNG.
        raw = np.random.default_rng(seed).integers(
            0, 256, size=dimension // 8, dtype=np.uint8
        )

        built = BinaryHDV(data=raw, dimension=dimension)
        self._mask_cache[project_id] = built

        logger.debug(f"Generated isolation mask for project '{project_id}' (seed={seed})")
        return built

    def apply_mask(self, hdv: BinaryHDV, project_id: str) -> BinaryHDV:
        """
        XOR-bind a vector with its project's isolation mask.

        Args:
            hdv: The BinaryHDV to mask.
            project_id: Project identifier for mask derivation.

        Returns:
            ``hdv.xor_bind(mask)``, or ``hdv`` itself when isolation is
            disabled in the config.
        """
        if self.config.enabled:
            project_mask = self.get_mask(project_id)
            return hdv.xor_bind(project_mask)
        return hdv

    def remove_mask(self, masked_hdv: BinaryHDV, project_id: str) -> BinaryHDV:
        """
        Undo apply_mask() for the same project.

        Delegates to apply_mask(): XOR is self-inverse, so binding with the
        same mask a second time restores the input. Kept as a separate name
        for readability at call sites.

        Args:
            masked_hdv: The masked BinaryHDV.
            project_id: Project identifier used for masking.

        Returns:
            The result of apply_mask(masked_hdv, project_id).
        """
        return self.apply_mask(masked_hdv, project_id)

    def clear_cache(self) -> None:
        """Forget every cached mask (useful for testing)."""
        self._mask_cache.clear()

    def is_isolated(
        self,
        hdv_a: BinaryHDV,
        project_id_a: str,
        hdv_b: BinaryHDV,
        project_id_b: str,
        threshold: float = 0.55,
    ) -> bool:
        """
        Report whether two vectors look unrelated once each is masked.

        Args:
            hdv_a: First (unmasked) vector.
            project_id_a: First vector's project.
            hdv_b: Second (unmasked) vector.
            project_id_b: Second vector's project.
            threshold: Similarity below which the pair counts as isolated
                (default 0.55).

        Returns:
            False when both project ids are equal; otherwise whether the
            similarity of the two masked vectors is below ``threshold``.
        """
        if project_id_a == project_id_b:
            return False  # Same project: never reported as isolated.

        first = self.apply_mask(hdv_a, project_id_a)
        second = self.apply_mask(hdv_b, project_id_b)
        return first.similarity(second) < threshold