"""
model.py — MechanismBase
========================

The transformer decoder implementing P / G → Q.

Three configurations:
    SmallConfig  (~10M params) — appropriate for ~200K tokens.
                                 Generalizes. Recommended for current corpus.

    MediumConfig (~50M params) — appropriate for 500K–2M tokens.
                                 Use after expanding the training corpus.

    FullConfig   (~235M params) — appropriate for ~2M+ tokens.
                                  Use after expanding the training corpus.

Architecture maps to PL terminology:
    wte          — token embedding: seeds patterns P with initial loaded history
    wpe          — position encoding: adds positional loaded history
    PropagationBlock — one complete P / G → Q step:
                       attention = gradient family G applied to P
                       residual  = loaded history H_P accumulating
                       pre-norm  = coherence check before each propagation
                       MLP       = reconfiguration toward coherent state
    ln_f         — final coherence check
    lm_head      — output: weight-tied to wte (same carrier in and out)

Parameter counts (approximate):
    SmallConfig:   10.5M params
    FullConfig:   235.0M params
"""

import torch
import torch.nn as nn
import torch.nn.functional as F
from dataclasses import dataclass


# =============================================================================
# CONFIGURATIONS
# =============================================================================

@dataclass
class SmallConfig:
    """
    Smallest preset (~10M params), sized for corpora of 100K–500K tokens.

    This is the working configuration for the current corpus (~200K tokens).
    The author's notes report ~30 minutes of training on an RTX 4060 Ti and
    expect this size to generalize rather than merely memorize.

    Field order is significant: it defines the positional order of the
    generated ``__init__``.
    """

    # Carrier V — vocabulary size of the BPE tokenizer.
    vocab_size: int = 16384
    # Loaded history vector dimension (embedding width).
    n_embd: int = 256
    # Number of propagation steps (stacked blocks).
    n_layer: int = 8
    # Gradient families per step (attention heads).
    n_head: int = 8
    # Context window, in tokens.
    block_size: int = 256
    # Dropout probability.
    dropout: float = 0.1
    # Human-readable label for this preset.
    name: str = "SmallBase"


@dataclass
class MediumConfig:
    """
    Mid-sized preset (~50M params), sized for corpora of 500K–2M tokens.

    Intended for use once generate_data.py has been expanded to produce more
    derivation traces. The author's notes report ~2-3 hours of training on
    an RTX 4060 Ti.

    Field order is significant: it defines the positional order of the
    generated ``__init__``.
    """

    # Vocabulary size of the tokenizer.
    vocab_size: int = 16384
    # Embedding width.
    n_embd: int = 512
    # Number of stacked propagation blocks.
    n_layer: int = 12
    # Attention heads per block.
    n_head: int = 8
    # Context window, in tokens.
    block_size: int = 256
    # Dropout probability.
    dropout: float = 0.1
    # Human-readable label for this preset.
    name: str = "MediumBase"


@dataclass
class FullConfig:
    """
    Largest preset — the full AGI Base V1, nominally ~235M params.

    Sized for corpora of 2M+ tokens; requires expanding generate_data.py
    significantly (see comments there). The author's notes report ~6 hours
    of training on an RTX 4060 Ti when data is sufficient.

    NOTE(review): with the output head tied to the token embedding, these
    defaults appear to work out to roughly 219M unique parameters; ~235M
    matches the count with an untied head. Confirm against the model's
    parameter summary.

    Field order is significant: it defines the positional order of the
    generated ``__init__``.
    """

    # Vocabulary size of the tokenizer.
    vocab_size: int = 16384
    # Embedding width.
    n_embd: int = 1024
    # Number of stacked propagation blocks.
    n_layer: int = 16
    # Attention heads per block.
    n_head: int = 16
    # Context window, in tokens.
    block_size: int = 256
    # Dropout probability.
    dropout: float = 0.1
    # Human-readable label for this preset.
    name: str = "FullBase"


# Default configuration alias: code that refers to MechanismConfig gets
# SmallConfig, the preset intended for the current corpus. Rebind this name
# to MediumConfig or FullConfig to change the default.
MechanismConfig = SmallConfig


# =============================================================================
# PROPAGATION BLOCK
# =============================================================================

class PropagationBlock(nn.Module):
    """
    A single pre-norm transformer layer: one P / G → Q propagation step.

    In the file's PL terminology:
        attention   — gradient family G applied to pattern P
        residual    — loaded history H_P accumulating
        layer norm  — coherence check, performed BEFORE each sub-layer
        MLP         — reconfiguration toward a coherent state

    The config object must provide ``n_embd``, ``n_head`` and ``dropout``.
    """

    def __init__(self, config):
        super().__init__()
        width = config.n_embd
        p_drop = config.dropout

        # Attention sub-layer (batch-first layout).
        self.ln1 = nn.LayerNorm(width)
        self.attn = nn.MultiheadAttention(
            embed_dim=width,
            num_heads=config.n_head,
            dropout=p_drop,
            batch_first=True,
        )

        # Feed-forward sub-layer: expand 4x, GELU, project back, dropout.
        self.ln2 = nn.LayerNorm(width)
        expand = nn.Linear(width, 4 * width)
        contract = nn.Linear(4 * width, width)
        self.mlp = nn.Sequential(expand, nn.GELU(), contract, nn.Dropout(p_drop))

        # Dropout applied to the attention output before the residual add.
        self.drop = nn.Dropout(p_drop)

    def forward(self, x, attn_mask=None):
        """
        Apply self-attention and the MLP, each wrapped in a residual add.

        ``attn_mask`` is forwarded unchanged to the attention module; when it
        is None no mask is passed. Returns a tensor shaped like ``x``.
        """
        # Coherence check first, then self-attention (query = key = value).
        h = self.ln1(x)
        attended = self.attn(
            h, h, h,
            attn_mask=attn_mask,
            need_weights=False,
        )[0]

        # Residual stream accumulates loaded history.
        x = x + self.drop(attended)
        return x + self.mlp(self.ln2(x))


# =============================================================================
# MECHANISMBASE
# =============================================================================

class MechanismBase(nn.Module):
    """
    Decoder-only transformer language model: the mechanism instantiated in
    the weight carrier.

    wte        : token embedding — seeds patterns
    wpe        : learned position embedding — adds positional loaded history
    drop       : dropout applied to the summed embeddings
    h          : stack of ``config.n_layer`` propagation blocks
    ln_f       : final coherence check (LayerNorm)
    lm_head    : output projection, weight-tied to wte

    The config object must provide ``vocab_size``, ``n_embd``, ``n_layer``,
    ``n_head``, ``block_size``, ``dropout`` and ``name``.
    """

    def __init__(self, config):
        super().__init__()
        self.config = config

        self.wte     = nn.Embedding(config.vocab_size, config.n_embd)
        self.wpe     = nn.Embedding(config.block_size, config.n_embd)
        self.drop    = nn.Dropout(config.dropout)
        self.h       = nn.ModuleList(
            [PropagationBlock(config) for _ in range(config.n_layer)]
        )
        self.ln_f    = nn.LayerNorm(config.n_embd)
        self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)

        # Weight tying: input and output in the same carrier. Both modules
        # now hold the very same Parameter object.
        self.lm_head.weight = self.wte.weight

        self.apply(self._init_weights)

    def _init_weights(self, module):
        """
        Initialise Linear and Embedding weights from N(0, 0.02) and zero any
        Linear bias. Modules of other types keep their constructor defaults.
        """
        if isinstance(module, nn.Linear):
            nn.init.normal_(module.weight, mean=0.0, std=0.02)
            if module.bias is not None:
                nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Embedding):
            nn.init.normal_(module.weight, mean=0.0, std=0.02)

    def forward(self, idx, targets=None):
        """
        Compute next-token logits, and the cross-entropy loss if targets are
        given.

        Args:
            idx: 2-D tensor (batch, sequence) of token ids; the sequence
                length must not exceed ``config.block_size``.
            targets: optional tensor of target token ids with as many
                elements as ``idx``.

        Returns:
            ``(logits, loss)``: logits has one ``vocab_size`` vector per input
            position; loss is None when ``targets`` is None.
        """
        _, T = idx.shape
        assert T <= self.config.block_size, \
            f"Sequence length {T} exceeds block_size {self.config.block_size}"

        positions = torch.arange(T, device=idx.device)
        x = self.drop(self.wte(idx) + self.wpe(positions))

        # Causal mask: patterns attend only to prior loaded history
        causal_mask = nn.Transformer.generate_square_subsequent_mask(
            T, device=idx.device
        )

        for block in self.h:
            x = block(x, attn_mask=causal_mask)

        logits = self.lm_head(self.ln_f(x))

        loss = None
        if targets is not None:
            # reshape (not view): targets are commonly a non-contiguous slice
            # such as batch[:, 1:], on which view(-1) raises.
            loss = F.cross_entropy(
                logits.reshape(-1, logits.size(-1)),
                targets.reshape(-1),
            )

        return logits, loss

    @staticmethod
    def _filter_logits(logits, top_k, top_p):
        """Mask logits outside the top-k / nucleus (top-p) set with -inf, per row."""
        neg_inf = float("-inf")

        # Top-k: keep only entries at least as large as the k-th largest.
        if top_k is not None and top_k > 0:
            k = min(top_k, logits.size(-1))
            kth_largest = torch.topk(logits, k, dim=-1).values[..., -1:]
            logits = logits.masked_fill(logits < kth_largest, neg_inf)

        # Top-p: drop a token once the probability mass of the tokens ranked
        # strictly before it already exceeds top_p.
        if top_p is not None and top_p < 1.0:
            sorted_logits, sorted_idx = torch.sort(logits, descending=True, dim=-1)
            sorted_probs = F.softmax(sorted_logits, dim=-1)
            mass_before = torch.cumsum(sorted_probs, dim=-1) - sorted_probs
            remove = mass_before > top_p
            # Always keep the most likely token so a row is never left empty.
            remove[..., 0] = False
            sorted_logits = sorted_logits.masked_fill(remove, neg_inf)
            # sorted_idx is a permutation of each row, so every slot is written.
            logits = torch.zeros_like(logits).scatter_(-1, sorted_idx, sorted_logits)

        return logits

    @torch.no_grad()
    def generate(
        self,
        idx,
        max_new_tokens: int = 200,
        temperature: float = 0.8,
        top_k: int = 50,
        top_p: float = 0.9,
    ):
        """
        Autoregressive generation with temperature + top-k + top-p sampling.

        Every row of ``idx`` is extended independently, so any batch size is
        supported. The model runs in eval mode while sampling, and the
        train/eval mode it had on entry is restored before returning.

        Args:
            idx: 2-D tensor (batch, sequence) of prompt token ids.
            max_new_tokens: number of tokens to append to each row.
            temperature: divisor applied to the logits before sampling;
                0 selects greedy (argmax) decoding and skips the filters.
            top_k: keep only the k most likely tokens; None or <= 0 disables.
            top_p: nucleus threshold; None or >= 1.0 disables.

        Returns:
            Tensor (batch, sequence + max_new_tokens): prompt followed by the
            generated tokens.
        """
        was_training = self.training
        self.eval()
        try:
            for _ in range(max_new_tokens):
                # Feed at most the last block_size tokens as context.
                window = idx[:, -self.config.block_size:]
                logits, _ = self(window, None)
                next_logits = logits[:, -1, :]

                if temperature == 0:
                    # Greedy decoding; dividing by zero would yield inf/nan.
                    next_id = next_logits.argmax(dim=-1, keepdim=True)
                else:
                    next_logits = self._filter_logits(
                        next_logits / temperature, top_k, top_p
                    )
                    probs = F.softmax(next_logits, dim=-1)
                    next_id = torch.multinomial(probs, num_samples=1)

                idx = torch.cat([idx, next_id], dim=1)
        finally:
            # Do not leave a model that was training stuck in eval mode.
            self.train(was_training)

        return idx

    def count_parameters(self) -> int:
        """Return the total number of parameter elements in the model."""
        return sum(p.numel() for p in self.parameters())

    def parameter_summary(self) -> str:
        """Return a multi-line summary: config name, totals, embedding share, sizes."""
        total = self.count_parameters()
        embed = self.wte.weight.numel()
        lines = [
            f"  Configuration: {self.config.name}",
            f"  Total params:  {total:,}",
            f"  Embed params:  {embed:,} ({embed/total:.1%} of total)",
            f"  n_embd={self.config.n_embd}, "
            f"n_layer={self.config.n_layer}, "
            f"n_head={self.config.n_head}",
        ]
        return "\n".join(lines)


if __name__ == "__main__":
    # Script entry: instantiate the model once per preset and print its
    # parameter summary, followed by a blank separator line.
    for config_cls in (SmallConfig, MediumConfig, FullConfig):
        summary = MechanismBase(config_cls()).parameter_summary()
        print(summary)
        print()