"""
Convergence Loop: the core reasoning mechanism.

Replaces attention in transformers. Multi-hop spatial search with query anchor.

How it works:
  1. Encode query → vector
  2. Search nearest neurons in the DB
  3. Blend neighbors weighted by confidence → activation
  4. Mix activation with original query (anchor prevents drift)
  5. Repeat until vector stabilizes (converged) or max hops (abort)

Converged on relevant neurons = answer found. Not converged, or converged on irrelevant neurons = "I don't know." (Invariant #4)

Each hop is inspectable — the trace shows exactly why the answer was found.
(Invariant #2: every answer has a source.)
"""

import copy
from dataclasses import dataclass, field

import numpy as np

from neuron import Neuron, NeuronDB


@dataclass
class Hop:
    """One step in the convergence trace, kept for inspectability.

    ConvergenceLoop.converge appends one Hop per completed iteration, and
    ConvergenceResult.trace renders them as text.
    """
    hop_number: int          # zero-based index of the hop within the loop
    neighbors: list          # [(neuron_id, confidence, similarity)]; similarity is cosine to `current`
    activation: np.ndarray   # blended neighbor vector, before the query anchor is mixed in
    current: np.ndarray      # vector after the anchor blend and re-normalization
    movement: float          # 1 - cosine similarity between `current` and the previous hop's vector


@dataclass
class ConvergenceResult:
    """Outcome of one ConvergenceLoop.converge call.

    Attributes:
        converged: whether the loop reported a stable, relevant answer.
        vector: the vector position the loop ended on.
        concepts: neurons that took part in the last completed hop.
        hops: one Hop record per completed iteration (the full trace).
        confidence: aggregate confidence attached to the result.
    """
    converged: bool
    vector: np.ndarray
    concepts: list
    hops: list = field(default_factory=list)
    confidence: float = 0.0

    def trace(self) -> str:
        """Render the convergence path as text (Invariant #2).

        Returns:
            A status header line followed by one line per hop listing the
            participating neuron ids with their confidences and the hop's
            movement.
        """
        verdict = "CONVERGED" if self.converged else "DID NOT CONVERGE"
        report = [f"Convergence: {verdict} (confidence={self.confidence:.3f})"]
        for step in self.hops:
            participants = ", ".join(
                f"n{neuron_id}({neuron_conf:.2f})"
                for neuron_id, neuron_conf, _similarity in step.neighbors
            )
            report.append(
                f"  Hop {step.hop_number}: [{participants}] "
                f"movement={step.movement:.4f}"
            )
        return "\n".join(report)


class ConvergenceLoop:
    """
    Multi-hop reasoning through spatial search.

    The convergence loop IS attention — but each hop is inspectable.
    Query anchor IS a residual connection — prevents drift.
    Convergence check IS the stopping criterion — no convergence = abstain.

    Transformer correspondence:
      - _weighted_blend() uses softmax (exponential sharpening) over
        confidence scores, identical to scaled dot-product attention.
      - Per-hop k and threshold schedules give functional layer
        specialization: early hops explore broadly, later hops focus.
      - Concept-to-concept attention (NxN among discovered neighbors)
        provides compositional reasoning — same as token-to-token
        attention in transformer self-attention.
    """

    def __init__(self, db: NeuronDB = None,
                 max_hops: int = 10,
                 k: int = 5,
                 convergence_threshold: float = 0.99,
                 min_confidence: float = 0.1,
                 min_relevance: float = 0.3,
                 temperature: float = 1.0,
                 search_fn=None,
                 blend_fn=None,
                 cosine_fn=None):
        """
        Args:
            db: NeuronDB to search in (optional if search_fn provided)
            max_hops: maximum reasoning steps before abort
            k: base number of neighbors to retrieve. The per-hop value is
               scaled from 1.5*k on the first hop down to 0.8*k on the
               last, truncated to an int and never below 2.
            convergence_threshold: cosine sim threshold for "stable"
            min_confidence: base minimum neuron confidence to participate.
                           Scaled from 1.0x on the first hop up to 1.5x on
                           the last.
            min_relevance: minimum cosine similarity between query and
                          best neighbor to accept convergence. Below this,
                          the system says "I don't know" even if the vector
                          stabilized. Invariant #4: honest about failure.
            temperature: softmax temperature for confidence weighting.
                        Higher = more uniform, lower = sharper.
                        Default 1.0 gives true softmax behavior.
                        Use float('inf') to recover pre-softmax linear
                        normalization for backward compatibility.
            search_fn: optional callable(query, k) → list of Neuron-like objects.
                      Allows plugging in sparse search or any other backend.
                      Each returned object must have .id, .vector, .confidence
                      and must support copy.copy plus assignment to
                      .confidence (needed by the mutual-attention step).
            blend_fn: optional callable(neurons) → blended vector.
                     If None, uses default _weighted_blend.
            cosine_fn: optional callable(a, b) → float similarity.
                      If None, uses default numpy cosine.
        """
        self.db = db
        self.max_hops = max_hops
        self.k = k
        self.convergence_threshold = convergence_threshold
        self.min_confidence = min_confidence
        self.min_relevance = min_relevance
        self.temperature = temperature
        self._search_fn = search_fn
        self._blend_fn = blend_fn
        self._cosine_fn = cosine_fn

    def converge(self, query_vector: np.ndarray) -> ConvergenceResult:
        """
        Run the convergence loop.

        Returns ConvergenceResult with converged=True if the vector
        stabilized on relevant neurons, converged=False otherwise (zero
        query, no neighbor above the confidence floor, stabilized on
        irrelevant neighbors, or max hops reached) — honest abstention.

        Per-hop specialization (like transformer layers having different
        learned parameters): early hops explore broadly (higher k, lower
        confidence threshold), later hops focus narrowly (lower k, higher
        threshold). This gives functional layer specialization without
        learned weights.

        Note: the concepts in the result are the mutual-attention wrappers,
        so their .confidence values are the boosted ones, not the stored
        ones.
        """
        query = np.array(query_vector, dtype=np.float32)
        query_norm = np.linalg.norm(query)
        if query_norm == 0:
            # A zero vector has no direction to search along.
            return ConvergenceResult(
                converged=False, vector=query, concepts=[], confidence=0.0
            )
        query = query / query_norm

        current = query.copy()
        hops = []
        last_concepts = []

        for hop_num in range(self.max_hops):
            previous = current.copy()

            # --- Per-hop specialization (transformer layer analogy) ---
            # Progress through the loop: 0.0 (first hop) → 1.0 (last hop)
            progress = hop_num / max(self.max_hops - 1, 1)

            # Early hops: explore broadly (more neighbors)
            # Later hops: focus narrowly (fewer neighbors)
            hop_k = max(2, int(self.k * (1.5 - 0.7 * progress)))

            # Early hops: accept lower confidence (explore)
            # Later hops: require higher confidence (focus)
            hop_min_conf = self.min_confidence * (1.0 + 0.5 * progress)

            # 1. Search nearest neurons with per-hop k
            if self._search_fn:
                neighbors = self._search_fn(current, k=hop_k)
            else:
                neighbors = self.db.search(current, k=hop_k)

            # Filter by per-hop minimum confidence (stored, un-boosted values)
            neighbors = [n for n in neighbors if n.confidence >= hop_min_conf]

            if not neighbors:
                # No neurons above confidence threshold — honest abort
                return ConvergenceResult(
                    converged=False, vector=current,
                    concepts=[], hops=hops, confidence=0.0,
                )

            # 2. Concept-to-concept attention (NxN among neighbors):
            #    neighbors that are similar to each other get their
            #    confidence amplified before blending.
            neighbors = self._mutual_attention(neighbors)

            # 3. Blend neighbors weighted by confidence → activation
            if self._blend_fn:
                activation = self._blend_fn(neighbors)
            else:
                activation = self._weighted_blend(neighbors)

            # 4. Anchor to query (prevents drift)
            #    Early hops: explore (more activation)
            #    Later hops: contract (more query anchor)
            alpha = hop_num / self.max_hops  # 0 on the first hop, < 1 on the last
            current = (1 - alpha) * activation + alpha * query

            # Re-normalize
            norm = np.linalg.norm(current)
            if norm > 0:
                current = current / norm

            # Similarity to the previous position, computed once and used
            # both for the trace (as movement) and for the stability check.
            step_sim = self._cosine_sim(current, previous)
            movement = 1.0 - float(step_sim)

            # Per-neighbor similarity to the new position, for the trace
            neighbor_info = [
                (n.id, n.confidence, float(self._cosine_sim(n.vector, current)))
                for n in neighbors
            ]

            hops.append(Hop(
                hop_number=hop_num,
                neighbors=neighbor_info,
                activation=activation.copy(),
                current=current.copy(),
                movement=movement,
            ))

            last_concepts = neighbors

            # 5. Check convergence: has the vector stopped moving?
            if step_sim >= self.convergence_threshold:
                # Vector stabilized — but are the neighbors actually relevant?
                best_relevance = max(
                    self._cosine_sim(n.vector, query) for n in neighbors
                )
                if best_relevance < self.min_relevance:
                    # Converged on garbage — honest abstention
                    return ConvergenceResult(
                        converged=False,
                        vector=current,
                        concepts=last_concepts,
                        hops=hops,
                        confidence=0.0,
                    )

                # CONVERGED on relevant neurons
                avg_confidence = np.mean([n.confidence for n in neighbors])
                return ConvergenceResult(
                    converged=True,
                    vector=current,
                    concepts=last_concepts,
                    hops=hops,
                    confidence=float(avg_confidence),
                )

        # DID NOT CONVERGE — "I don't know" (Invariant #4)
        avg_confidence = (
            np.mean([n.confidence for n in last_concepts])
            if last_concepts else 0.0
        )
        return ConvergenceResult(
            converged=False,
            vector=current,
            concepts=last_concepts,
            hops=hops,
            confidence=float(avg_confidence) * 0.5,  # penalize non-convergence
        )

    def _weighted_blend(self, neurons: list) -> np.ndarray:
        """
        Blend neuron vectors weighted by softmax over confidence scores.

        Weights are exp(c / T) / sum(exp(c / T)) with negative confidences
        floored at 0 first. Temperature controls sharpening:
          - T → 0: winner-take-all (hard attention); T is clamped to 1e-8
          - T = 1: standard softmax
          - large finite T: weights approach uniform
          - T == float('inf'): special-cased to linear normalization
            c / sum(c) (the pre-softmax behavior), NOT uniform weights

        If every floored confidence is 0, the weights are uniform.

        Returns:
            The weighted average vector as float32, scaled to unit length
            unless its norm is 0.
        """
        vectors = np.array([n.vector for n in neurons])
        confidences = np.array([n.confidence for n in neurons], dtype=np.float32)

        # Floor at 0 for weighting (negative confidence = no contribution)
        confidences = np.maximum(confidences, 0)

        if confidences.sum() == 0:
            weights = np.ones(len(neurons), dtype=np.float32) / len(neurons)
        elif self.temperature == float('inf'):
            # Backward compat: infinite temperature = linear normalization
            weights = confidences / confidences.sum()
        else:
            # Softmax with temperature: exp(c/T) / sum(exp(c/T))
            # Subtract max for numerical stability (standard softmax trick)
            scaled = confidences / max(self.temperature, 1e-8)
            scaled = scaled - scaled.max()
            exp_scaled = np.exp(scaled)
            weights = exp_scaled / exp_scaled.sum()

        blended = np.average(vectors, axis=0, weights=weights).astype(np.float32)

        norm = np.linalg.norm(blended)
        if norm > 0:
            blended = blended / norm

        return blended

    def _mutual_attention(self, neurons: list) -> list:
        """
        Concept-to-concept attention: NxN similarity among discovered
        neighbors. Neurons that are similar to the rest of the set get
        their confidence boosted; neurons dissimilar to the rest get it
        dampened.

        Each neuron's mutual score is its mean cosine similarity to the
        other neurons, and its confidence is multiplied by
        (1 + mutual score).

        Returns a list of shallow copies with adjusted confidence; the
        input objects are not modified. Copies are made with copy.copy, so
        this works for any object honoring the search_fn contract
        (.id, .vector, .confidence), not only Neuron instances. With 0 or 1
        neurons the input list itself is returned unchanged.
        """
        if len(neurons) <= 1:
            return neurons

        n = len(neurons)

        # Compute pairwise similarity — uses pluggable cosine if provided
        if self._cosine_fn:
            sim_matrix = np.zeros((n, n), dtype=np.float32)
            for i in range(n):
                for j in range(i + 1, n):
                    s = self._cosine_fn(neurons[i].vector, neurons[j].vector)
                    sim_matrix[i, j] = s
                    sim_matrix[j, i] = s
        else:
            vectors = np.array([nn.vector for nn in neurons])
            norms = np.linalg.norm(vectors, axis=1, keepdims=True)
            norms = np.maximum(norms, 1e-8)  # avoid division by zero
            normed = vectors / norms
            sim_matrix = normed @ normed.T  # NxN
            np.fill_diagonal(sim_matrix, 0.0)  # self-similarity must not count

        # Each neuron's mutual relevance = mean similarity to all others
        mutual_scores = sim_matrix.sum(axis=1) / max(n - 1, 1)

        # new_confidence = original * (1 + mutual_score).
        # The factor differs per neuron, so the relative order of
        # confidences can change.
        boosted = []
        for neuron, score in zip(neurons, mutual_scores):
            # Shallow copy keeps every attribute of the source object (and
            # shares its vector) while leaving the original untouched.
            wrapper = copy.copy(neuron)
            wrapper.confidence = neuron.confidence * (1.0 + float(score))
            boosted.append(wrapper)

        return boosted

    def _cosine_sim(self, a, b) -> float:
        """Cosine similarity of a and b.

        Delegates to the pluggable cosine_fn if one was provided; otherwise
        computes it with numpy and returns 0.0 when either vector has zero
        norm.
        """
        if self._cosine_fn:
            return self._cosine_fn(a, b)
        dot = float(np.dot(a, b))
        na = np.linalg.norm(a)
        nb = np.linalg.norm(b)
        if na == 0 or nb == 0:
            return 0.0
        return dot / (na * nb)


@dataclass
class MultiHopResult:
    """Outcome of multi-hop reasoning across several convergence rounds.

    Attributes:
        converged: overall verdict for the chained reasoning.
        concepts: concept set merged from all rounds.
        rounds: one per-round result object, in execution order.
        confidence: aggregate confidence of the merged result.
        vector: final vector position (None when not supplied).
    """
    converged: bool
    concepts: list
    rounds: list = field(default_factory=list)
    confidence: float = 0.0
    vector: np.ndarray = None

    def trace(self) -> str:
        """Render every round as text under a summary header (Invariant #2).

        Returns:
            A summary line, then for each round a "=== Round N ===" banner
            followed by that round's own trace() output.
        """
        outcome = "CONVERGED" if self.converged else "DID NOT CONVERGE"
        summary = (
            f"Multi-hop: {outcome} in {len(self.rounds)} round(s), "
            f"{len(self.concepts)} concepts (confidence={self.confidence:.3f})"
        )
        sections = [summary]
        for round_no, round_result in enumerate(self.rounds, start=1):
            sections.append(f"=== Round {round_no} ===")
            sections.append(round_result.trace())
        return "\n".join(sections)


class MultiHopConvergence:
    """
    Chained convergence: the concepts found in one round shift the query
    used by the next, so reasoning can cross concept boundaries.

    Round 1: query → converge → concepts A
    Round 2: query + concepts_A blend → converge → concepts B
    ...
    Stops when a round yields no new concepts or max rounds is reached.

    This is iterative retrieval-generation (ITER-RETGEN) done without
    a neural component. Each round is inspectable. Every shifted query is
    built from the ORIGINAL query, which prevents drift across rounds.
    """

    def __init__(self, loop: ConvergenceLoop,
                 max_rounds: int = 3,
                 concept_blend_weight: float = 0.4):
        """
        Args:
            loop: the underlying ConvergenceLoop
            max_rounds: maximum reasoning rounds
            concept_blend_weight: how much discovered concepts shift the query
                                 (0 = ignore concepts, 1 = ignore query)
        """
        self.loop = loop
        self.max_rounds = max_rounds
        self.concept_blend_weight = concept_blend_weight

    def reason(self, query_vector: np.ndarray) -> MultiHopResult:
        """
        Run multi-hop reasoning.

        Each round runs the underlying loop, collects the concepts not seen
        in earlier rounds, and blends their vectors into the original query
        to form the next round's query.

        Returns:
            MultiHopResult with converged=True when at least one round
            converged and at least one concept was collected; its
            confidence is then the mean confidence of all collected
            concepts. Otherwise converged=False with confidence 0.0. The
            vector is the last round's vector (or the query if no round
            ran).
        """
        anchor = np.array(query_vector, dtype=np.float32)
        anchor_norm = np.linalg.norm(anchor)
        if anchor_norm == 0:
            # Nothing to search for: abstain immediately.
            return MultiHopResult(
                converged=False, concepts=[], confidence=0.0,
                vector=anchor,
            )
        anchor = anchor / anchor_norm

        merged = []          # concepts accumulated over all rounds
        known_ids = set()    # ids already merged, to skip duplicates
        history = []         # per-round results
        probe = anchor.copy()
        final_round = self.max_rounds - 1

        for round_idx in range(self.max_rounds):
            outcome = self.loop.converge(probe)
            history.append(outcome)

            # Keep only concepts this chain has not produced before
            fresh = []
            for concept in outcome.concepts:
                if concept.id in known_ids:
                    continue
                fresh.append(concept)
                known_ids.add(concept.id)
            merged.extend(fresh)

            if round_idx == 0:
                # Opening round found nothing at all → give up
                if not outcome.converged and not outcome.concepts:
                    break
            elif not fresh:
                # Later round added nothing new → chain exhausted
                break

            if round_idx == final_round:
                # No further round to prepare a query for
                break

            if fresh:
                # Shift the search region: mix the new concepts into the
                # original query and re-normalize.
                concept_direction = self._blend_concepts(fresh)
                shift = self.concept_blend_weight
                probe = (1 - shift) * anchor + shift * concept_direction
                probe_norm = np.linalg.norm(probe)
                if probe_norm > 0:
                    probe = probe / probe_norm

        # Overall verdict
        stabilized = any(outcome.converged for outcome in history)
        succeeded = bool(merged) and stabilized
        score = (
            float(np.mean([concept.confidence for concept in merged]))
            if succeeded else 0.0
        )
        last_vector = history[-1].vector if history else anchor
        return MultiHopResult(
            converged=succeeded,
            concepts=merged,
            rounds=history,
            confidence=score,
            vector=last_vector,
        )

    def _blend_concepts(self, concepts: list) -> np.ndarray:
        """Return the confidence-weighted mean of the concept vectors.

        Confidences are floored at 0.01 before weighting. The result is
        float32 and scaled to unit length unless its norm is 0.
        """
        stacked = np.array([c.vector for c in concepts], dtype=np.float32)
        floored = np.array(
            [max(c.confidence, 0.01) for c in concepts], dtype=np.float32
        )
        mass = floored.sum()
        count = len(concepts)
        share = (
            floored / mass
            if mass != 0
            else np.ones(count, dtype=np.float32) / count
        )
        mixed = np.average(stacked, axis=0, weights=share).astype(np.float32)
        length = np.linalg.norm(mixed)
        return mixed / length if length > 0 else mixed