"""
NeuroScope — Activation Extraction Pipeline

Loads Qwen3-4B and extracts hidden states + attention patterns for visualization.
Includes a demo mode that generates realistic synthetic data for GPU-free UI testing.

Architecture reference (Qwen3-4B):
    - 36 hidden layers, 32 attention heads (GQA with 8 KV heads)
    - 2560 hidden dim, 80 head dim
    - RoPE positional encoding, SwiGLU MLP

Usage:
    from extraction import ActivationExtractor, ExtractionResult
    result = ActivationExtractor.generate_demo_data("Hello world")
    # or: extractor = ActivationExtractor(); extractor.load_model(); result = extractor.extract("Hello")
"""

import time
import numpy as np
from dataclasses import dataclass
from typing import Optional

# ---------------------------------------------------------------------------
# Qwen3-4B architecture defaults (ActivationExtractor replaces its own copies from the model config on load)
# ---------------------------------------------------------------------------
# Number of transformer blocks; hidden-state stacks carry one extra entry
# for the embedding output (see ExtractionResult.hidden_states).
DEFAULT_NUM_LAYERS = 36
# Number of attention heads reported per layer in attention tensors.
DEFAULT_NUM_HEADS = 32
# Key/value head count under grouped-query attention (not referenced
# elsewhere in this module).
DEFAULT_NUM_KV_HEADS = 8
# Width of each hidden-state vector.
DEFAULT_HIDDEN_DIM = 2560
# NOTE(review): derived here as hidden_dim // num_heads; some model configs
# declare head_dim independently of that ratio — confirm against the loaded
# model's config before relying on this value.
DEFAULT_HEAD_DIM = DEFAULT_HIDDEN_DIM // DEFAULT_NUM_HEADS  # 80


@dataclass
class ExtractionResult:
    """Structured output from a forward pass or demo data generation.

    Instances are built by ``ActivationExtractor.extract``,
    ``ActivationExtractor.generate_streaming`` and the two demo generators.
    The array fields are NumPy arrays for a single sequence (the batch
    dimension is stripped by the producers before stacking).
    """
    tokens: list[str]                # Decoded token strings
    hidden_states: np.ndarray        # (num_layers+1, seq_len, hidden_dim) — includes embedding layer
    attentions: np.ndarray           # (num_layers, num_heads, seq_len, seq_len)
    num_layers: int                  # Layer count the producer reports for this result
    num_heads: int                   # Attention head count the producer reports
    hidden_dim: int                  # Hidden-state width the producer reports
    inference_time: float            # Seconds
    is_demo: bool = False            # True when the data is synthetic rather than from a model


class ActivationExtractor:
    """Manages Qwen3-4B loading, inference, and activation capture."""

    def __init__(self):
        """Create an extractor with no model attached.

        The architecture attributes start at the module-level Qwen3-4B
        defaults; ``load_model`` replaces them with values read from the
        loaded model's config.
        """
        # Filled in by load_model().
        self.model = self.tokenizer = self.device = None
        # Provisional dimensions until a real config is available.
        self.num_layers, self.num_heads, self.hidden_dim = (
            DEFAULT_NUM_LAYERS,
            DEFAULT_NUM_HEADS,
            DEFAULT_HIDDEN_DIM,
        )
        # Gate checked by extract() / generate_streaming().
        self.model_loaded = False

    def load_model(
        self,
        model_name: str = "Qwen/Qwen3-4B",
        quantize: bool = False,
    ) -> str:
        """Load the tokenizer and causal LM, then record the architecture.

        The model is loaded in bfloat16 with ``device_map="auto"`` and eager
        attention, and switched to eval mode. On success the extractor's
        ``num_layers``, ``num_heads``, ``hidden_dim`` and ``device``
        attributes are refreshed from the model and ``model_loaded`` is set.

        Args:
            model_name: HuggingFace model identifier.
            quantize: If True, use bitsandbytes 4-bit NF4 quantization (~3 GB VRAM).

        Returns:
            Status string with detected architecture info.
        """
        import torch
        from transformers import AutoModelForCausalLM, AutoTokenizer

        # NOTE(review): trust_remote_code=True lets the hub repository run
        # its own Python on this machine — confirm it is needed for the
        # models this tool is expected to load.
        self.tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)

        quantization_config = None
        if quantize:
            from transformers import BitsAndBytesConfig
            # NOTE(review): compute dtype is float16 while the weights are
            # requested in bfloat16 below — confirm the mismatch is intended.
            quantization_config = BitsAndBytesConfig(
                load_in_4bit=True,
                bnb_4bit_compute_dtype=torch.float16,
                bnb_4bit_quant_type="nf4",
            )

        # Eager attention is requested explicitly; extract() and
        # generate_streaming() ask the model for attention weights.
        model_kwargs: dict = dict(
            dtype=torch.bfloat16,
            device_map="auto",
            trust_remote_code=True,
            attn_implementation="eager",
        )
        if quantization_config is not None:
            model_kwargs["quantization_config"] = quantization_config

        self.model = AutoModelForCausalLM.from_pretrained(model_name, **model_kwargs)
        self.model.eval()

        # Replace the Qwen3-4B defaults with what the loaded config reports.
        config = self.model.config
        self.num_layers = config.num_hidden_layers
        self.num_heads = config.num_attention_heads
        self.hidden_dim = config.hidden_size
        # Inputs are later moved to the device of the first parameter.
        first_param = next(self.model.parameters())
        self.device = first_param.device
        self.model_loaded = True

        status = (
            f"✅ Loaded {model_name}: {self.num_layers} layers, "
            f"{self.num_heads} heads, {self.hidden_dim} hidden dim, "
            f"device={self.device}"
        )
        return status

    def extract(self, prompt: str) -> ExtractionResult:
        """Run forward pass and extract all hidden states + attention weights.

        Uses HuggingFace native output_attentions / output_hidden_states for
        simplicity and broad model compatibility.
        """
        import torch

        if not self.model_loaded:
            raise RuntimeError(
                "Model not loaded. Call load_model() first or use generate_demo_data()."
            )

        t0 = time.time()

        inputs = self.tokenizer(prompt, return_tensors="pt").to(self.device)

        with torch.no_grad():
            outputs = self.model(
                **inputs,
                output_attentions=True,
                output_hidden_states=True,
            )

        inference_time = time.time() - t0

        # Decode token strings (clean up common BPE prefixes)
        token_ids = inputs.input_ids[0].tolist()
        tokens = self.tokenizer.convert_ids_to_tokens(token_ids)
        tokens = [self._clean_token(t) for t in tokens]

        # Stack hidden states → (num_layers+1, seq_len, hidden_dim)
        hidden_states = np.stack(
            [hs[0].float().cpu().numpy() for hs in outputs.hidden_states]
        )

        # Stack attentions → (num_layers, num_heads, seq_len, seq_len)
        attentions = np.stack(
            [attn[0].float().cpu().numpy() for attn in outputs.attentions]
        )

        return ExtractionResult(
            tokens=tokens,
            hidden_states=hidden_states,
            attentions=attentions,
            num_layers=self.num_layers,
            num_heads=self.num_heads,
            hidden_dim=self.hidden_dim,
            inference_time=inference_time,
            is_demo=False,
        )

    def generate_streaming(
        self,
        prompt: str,
        max_new_tokens: int = 32,
    ):
        """Generate tokens one-by-one, yielding ExtractionResult after each step.

        This is a Python generator. Each yield produces an ExtractionResult
        containing the full sequence so far (prompt + generated tokens) with
        fresh hidden states and attention weights.

        Args:
            prompt: Input text to continue generating from.
            max_new_tokens: Maximum number of new tokens to generate.

        Yields:
            ExtractionResult for the growing sequence after each new token.
        """
        import torch

        if not self.model_loaded:
            raise RuntimeError(
                "Model not loaded. Call load_model() first."
            )

        inputs = self.tokenizer(prompt, return_tensors="pt").to(self.device)
        input_ids = inputs.input_ids
        t0 = time.time()

        for step in range(max_new_tokens):
            with torch.no_grad():
                outputs = self.model(
                    input_ids=input_ids,
                    output_attentions=True,
                    output_hidden_states=True,
                )

            # Greedy decode next token
            next_token_id = outputs.logits[0, -1].argmax(dim=-1).unsqueeze(0).unsqueeze(0)

            # Check for EOS
            if next_token_id.item() == self.tokenizer.eos_token_id:
                break

            # Build result for current sequence
            token_ids = input_ids[0].tolist()
            tokens = self.tokenizer.convert_ids_to_tokens(token_ids)
            tokens = [self._clean_token(t) for t in tokens]

            hidden_states = np.stack(
                [hs[0].float().cpu().numpy() for hs in outputs.hidden_states]
            )
            attentions = np.stack(
                [attn[0].float().cpu().numpy() for attn in outputs.attentions]
            )

            yield ExtractionResult(
                tokens=tokens,
                hidden_states=hidden_states,
                attentions=attentions,
                num_layers=self.num_layers,
                num_heads=self.num_heads,
                hidden_dim=self.hidden_dim,
                inference_time=time.time() - t0,
                is_demo=False,
            )

            # Extend sequence for next iteration
            input_ids = torch.cat([input_ids, next_token_id], dim=-1)

    @staticmethod
    def generate_demo_streaming(
        prompt: str = "The quick brown fox jumps over the lazy dog",
        max_new_tokens: int = 12,
    ):
        """Yield demo ExtractionResults simulating token-by-token generation."""
        # Generate full demo data, then yield growing slices
        base = ActivationExtractor.generate_demo_data(prompt)
        # Simulate additional generated tokens
        rng = np.random.RandomState(99)
        gen_tokens = ["and", "then", "it", "ran", "across", "the",
                      "field", "into", "the", "forest", ".", "<eos>"]
        gen_tokens = gen_tokens[:max_new_tokens]

        all_tokens = list(base.tokens)
        all_hs = list(base.hidden_states.transpose(1, 0, 2))  # list of (n_layers+1, hidden_dim) per token
        all_attn = base.attentions.copy()  # will rebuild each step

        t0 = time.time()

        for step, tok in enumerate(gen_tokens):
            all_tokens.append(tok)
            seq_len = len(all_tokens)

            # Generate a new hidden state column for this token
            new_hs = np.zeros((base.num_layers + 1, base.hidden_dim), dtype=np.float32)
            for layer in range(base.num_layers + 1):
                base_mag = 5.0 + layer * 0.8
                noise = rng.randn(base.hidden_dim).astype(np.float32) * (1.0 + layer * 0.1)
                noise[:64] += base_mag * np.sin(
                    np.arange(64) * (seq_len) / 12.0
                ).astype(np.float32)
                new_hs[layer] = noise
            all_hs.append(new_hs)

            # Stack hidden states for current sequence
            hs_array = np.stack(all_hs, axis=1)  # (n_layers+1, seq_len, hidden_dim)

            # Rebuild attention matrices at new seq_len
            attn_array = np.zeros(
                (base.num_layers, base.num_heads, seq_len, seq_len),
                dtype=np.float32,
            )
            for layer in range(base.num_layers):
                for head in range(base.num_heads):
                    raw = np.tril(rng.exponential(0.5, (seq_len, seq_len)).astype(np.float32))
                    # Simple causal softmax
                    mask = np.triu(np.full((seq_len, seq_len), -1e9, dtype=np.float32), k=1)
                    logits = raw + mask
                    logits -= logits.max(axis=-1, keepdims=True)
                    exp = np.exp(logits)
                    attn_array[layer, head] = exp / (exp.sum(axis=-1, keepdims=True) + 1e-8)

            yield ExtractionResult(
                tokens=list(all_tokens),
                hidden_states=hs_array,
                attentions=attn_array,
                num_layers=base.num_layers,
                num_heads=base.num_heads,
                hidden_dim=base.hidden_dim,
                inference_time=time.time() - t0,
                is_demo=True,
            )
            time.sleep(0.3)  # Simulate generation delay

    # -------------------------------------------------------------------
    # Demo data generation (no GPU required)
    # -------------------------------------------------------------------

    @staticmethod
    def generate_demo_data(
        prompt: str = "The quick brown fox jumps over the lazy dog",
    ) -> ExtractionResult:
        """Generate synthetic data matching the default Qwen3-4B dimensions.

        The output is deterministic for a given prompt (fixed RNG seed) and
        is shaped to look plausible in the visualization views:
        - Attention: causal rows with a head-specific pattern per head
        - Magnitude: activation scale that grows with layer depth
        - Token-layer grid: per-token, per-layer variation
        - Scatter: position- and content-dependent structure per token

        Args:
            prompt: Text to "tokenize". It is split on whitespace with commas
                and periods separated out, and a BOS marker is prepended.

        Returns:
            ExtractionResult with ``is_demo=True``; ``inference_time`` is the
            wall-clock time spent generating the data.
        """
        t0 = time.time()
        rng = np.random.RandomState(42)

        # Simulate tokenization (split on whitespace, add BOS)
        raw_tokens = prompt.replace(",", " ,").replace(".", " .").split()
        tokens = ["<|im_start|>"] + raw_tokens
        seq_len = len(tokens)

        num_layers = DEFAULT_NUM_LAYERS
        num_heads = DEFAULT_NUM_HEADS
        hidden_dim = DEFAULT_HIDDEN_DIM

        # -- Layer-independent per-token inputs, computed once --------------
        # Row t holds sin(k * (t + 1) / 12) for k in 0..63.
        positions = np.arange(1, seq_len + 1)
        position_wave = np.sin(
            positions[:, None] * np.arange(64)[None, :] / 12.0
        ).astype(np.float32)
        # "Content" tokens are words longer than 3 characters; BOS never is.
        is_content = np.array(
            [False] + [len(word) > 3 for word in raw_tokens], dtype=bool
        )

        # -- Hidden states with depth-dependent structure --------------------
        hidden_states = np.zeros(
            (num_layers + 1, seq_len, hidden_dim), dtype=np.float32
        )
        for layer in range(num_layers + 1):
            # Base magnitude and noise both grow through the layers
            base_mag = 5.0 + layer * 0.8
            noise_scale = 1.0 + layer * 0.1
            hs = rng.randn(seq_len, hidden_dim).astype(np.float32) * noise_scale

            # Position-dependent sinusoidal bias on the first 64 features
            hs[:, :64] += base_mag * position_wave
            # Layer-specific feature band activation (70 features wide,
            # wrapping around hidden_dim and clipped at its end)
            band_start = (layer * 70) % hidden_dim
            band_end = min(band_start + 70, hidden_dim)
            hs[:, band_start:band_end] += base_mag * 0.5
            # Content words get stronger activations in middle layers
            if 10 <= layer <= 28:
                hs[is_content, :256] *= 1.3

            hidden_states[layer] = hs

        # -- Attention patterns with head specialization --------------------
        # Index helpers and the causal mask depend only on seq_len, so they
        # are built once rather than for every (layer, head) pair.
        rows = np.arange(seq_len)[:, None]
        cols = np.arange(seq_len)[None, :]
        local_window = (cols <= rows) & (cols >= rows - 3)
        diag = np.arange(seq_len)
        causal_mask = np.triu(
            np.full((seq_len, seq_len), -1e9, dtype=np.float32), k=1
        )

        attentions = np.zeros(
            (num_layers, num_heads, seq_len, seq_len), dtype=np.float32
        )
        for layer in range(num_layers):
            for head in range(num_heads):
                raw = np.tril(
                    rng.exponential(1.0, (seq_len, seq_len)).astype(np.float32)
                )

                # Head-type specialization: six patterns assigned round-robin
                head_type = head % 6
                if head_type == 0:
                    # Local window: the current token and the 3 before it
                    raw[local_window] *= 4.0
                elif head_type == 1:
                    # BOS / sink attention
                    raw[:, 0] *= 6.0
                elif head_type == 2:
                    # Previous-token attention
                    raw[diag[1:], diag[:-1]] *= 5.0
                elif head_type == 3:
                    # Copy / identity (diagonal)
                    raw[diag, diag] *= 5.0
                elif head_type == 4:
                    # Long-range (attend to early tokens)
                    raw[:, : min(3, seq_len)] *= 3.0
                # head_type == 5: uniform / mixed (no special pattern)

                # Causal softmax (max-subtracted for numerical stability)
                logits = raw + causal_mask
                logits -= logits.max(axis=-1, keepdims=True)
                exp = np.exp(logits)
                attentions[layer, head] = exp / (
                    exp.sum(axis=-1, keepdims=True) + 1e-8
                )

        inference_time = time.time() - t0

        return ExtractionResult(
            tokens=tokens,
            hidden_states=hidden_states,
            attentions=attentions,
            num_layers=num_layers,
            num_heads=num_heads,
            hidden_dim=hidden_dim,
            inference_time=inference_time,
            is_demo=True,
        )

    @staticmethod
    def _clean_token(tok: str) -> str:
        """Clean BPE artifacts from token string for display."""
        return (
            tok.replace("Ġ", " ")
            .replace("▁", " ")
            .replace("Ċ", "\\n")
            .replace("ĉ", "\\t")
        )