# bdh.py
import dataclasses
import math
from typing import Optional, Tuple, List
import torch
import torch.nn as nn
import torch.nn.functional as F

@dataclasses.dataclass
class BDHConfig:
    n_layer: int = 32
    n_embd: int = 4096
    dropout: float = 0.1
    n_head: int = 32
    mlp_internal_dim_multiplier: int = 1
    vocab_size: int = 256
    use_alibi: bool = True
    use_l1_norm: bool = True
    relu_threshold: float = 0.0
    rotary_embedding: str = "rope"
    rope_theta: float = 65536.0
    use_plasticity: bool = True
    plasticity_lr: float = 0.01
    consolidation_rate: float = 0.01
    forget_rate: float = 0.1
    use_rho_cache: bool = True
    
    def latent_per_head(self) -> int:
        return self.mlp_internal_dim_multiplier * self.n_embd // self.n_head
    
    def latent_total(self) -> int:
        return self.latent_per_head() * self.n_head

class TernaryLinear3D(nn.Module):
    def __init__(self, n_head: int, in_features: int, out_features: int):
        super().__init__()
        self.n_head = n_head
        self.in_features = in_features
        self.out_features = out_features
        self.register_buffer('weight_ternary', torch.zeros(n_head, out_features, in_features, dtype=torch.int8))
        self.weight_fp32 = nn.Parameter(torch.zeros(n_head, out_features, in_features))
        self.register_buffer('weight_scale', torch.ones(n_head, 1, 1))
        self._init_weights()
    
    def _init_weights(self):
        with torch.no_grad():
            rand_vals = torch.randint(-1, 2, self.weight_fp32.shape, dtype=torch.float32)
            self.weight_fp32.data = rand_vals
            self.weight_ternary.data = rand_vals.to(torch.int8)
    
    def forward(self, x: torch.Tensor) -> torch.Tensor:
        if x.dim() == 4 and x.size(1) == 1:
            x = x.expand(-1, self.n_head, -1, -1)
        weight = self.weight_ternary.float()
        return torch.einsum('bhtd,hnd->bhtn', x, weight)
    
    def update_ternary_weights(self):
        with torch.no_grad():
            gamma = self.weight_fp32.abs().mean(dim=(1, 2), keepdim=True).clamp(min=1e-5)
            self.weight_scale.data = gamma
            w_scaled = self.weight_fp32 / gamma
            w_ternary = torch.round(w_scaled).clamp(-1, 1).to(torch.int8)
            self.weight_ternary.data = w_ternary
            self.weight_fp32.data = w_ternary.float() * gamma

class TernaryLinear2D(nn.Module):
    def __init__(self, in_features: int, out_features: int):
        super().__init__()
        self.in_features = in_features
        self.out_features = out_features
        self.register_buffer('weight_ternary', torch.zeros(out_features, in_features, dtype=torch.int8))
        self.weight_fp32 = nn.Parameter(torch.zeros(out_features, in_features))
        self.register_buffer('weight_scale', torch.ones(1))
        self._init_weights()
    
    def _init_weights(self):
        with torch.no_grad():
            rand_vals = torch.randint(-1, 2, self.weight_fp32.shape, dtype=torch.float32)
            self.weight_fp32.data = rand_vals
            self.weight_ternary.data = rand_vals.to(torch.int8)
    
    def forward(self, x: torch.Tensor) -> torch.Tensor:
        orig_shape = x.shape
        if x.dim() == 4:
            B, _, T, D = x.shape
            x = x.view(B * T, D)
        elif x.dim() == 3:
            B, T, D = x.shape
            x = x.view(B * T, D)
        weight = self.weight_ternary.float()
        out = F.linear(x, weight)
        if len(orig_shape) == 4:
            B, _, T, _ = orig_shape
            out = out.view(B, 1, T, -1)
        elif len(orig_shape) == 3:
            B, T, _ = orig_shape
            out = out.view(B, T, -1)
        return out
    
    def update_ternary_weights(self):
        with torch.no_grad():
            gamma = self.weight_fp32.abs().mean().clamp(min=1e-5)
            self.weight_scale.data = gamma
            w_scaled = self.weight_fp32 / gamma
            w_ternary = torch.round(w_scaled).clamp(-1, 1).to(torch.int8)
            self.weight_ternary.data = w_ternary
            self.weight_fp32.data = w_ternary.float() * gamma

def get_freqs(n: int, theta: float, dtype: torch.dtype, rotary_type: str = "rope") -> torch.Tensor:
    if rotary_type == "alibi":
        return torch.zeros(n, dtype=dtype)
    def quantize(t, q=2):
        return (t / q).floor() * q
    indices = torch.arange(0, n, 1, dtype=dtype)
    if rotary_type == "rope":
        indices = quantize(indices)
    return 1.0 / (theta ** (indices / n)) / (2 * math.pi)

def row_normalize(scores: torch.Tensor, eps: float = 1e-6) -> torch.Tensor:
    denom = scores.abs().sum(dim=-1, keepdim=True) + eps
    return scores / denom

class Attention(nn.Module):
    def __init__(self, config: BDHConfig):
        super().__init__()
        self.config = config
        nh = config.n_head
        N = config.latent_per_head()
        self.use_alibi = config.use_alibi
        self.use_l1_norm = config.use_l1_norm
        self.rotary_type = config.rotary_embedding
        freqs = get_freqs(N, config.rope_theta, torch.float32, self.rotary_type)
        self.register_buffer('freqs', freqs.view(1, 1, 1, N))
        if self.use_alibi:
            slopes = torch.tensor([2 ** (-8 * i / nh) for i in range(1, nh + 1)], dtype=torch.float32)
            self.register_buffer('alibi_slopes', slopes.view(1, nh, 1, 1))
    
    def _rope(self, phases: torch.Tensor, v: torch.Tensor) -> torch.Tensor:
        v_rot = torch.stack((-v[..., 1::2], v[..., ::2]), dim=-1).view(*v.size())
        phases_cos, phases_sin = torch.cos(phases), torch.sin(phases)
        return (v * phases_cos).to(v.dtype) + (v_rot * phases_sin).to(v.dtype)
    
    def _rotate(self, v: torch.Tensor, start: int = 0) -> torch.Tensor:
        if self.rotary_type == "alibi":
            return v
        _, _, T, _ = v.size()
        device = v.device
        positions = torch.arange(start, start + T, device=device, dtype=self.freqs.dtype).view(1, 1, -1, 1)
        raw = positions * self.freqs
        phases = (raw - raw.floor()) * (2 * math.pi)
        return self._rope(phases, v)
    
    def forward(self, Q: torch.Tensor, K: torch.Tensor, V: torch.Tensor, start_pos: int = 0) -> torch.Tensor:
        assert K is Q
        B, nh, T, N = Q.size()
        QR = self._rotate(Q, start_pos)
        KR = QR
        scores = (QR @ KR.mT).tril(diagonal=-1)
        if self.use_alibi:
            pos_row = torch.arange(start_pos, start_pos + T, device=scores.device)
            pos_col = torch.arange(start_pos, start_pos + T, device=scores.device)
            alibi = (pos_col.view(1, 1, 1, -1) - pos_row.view(1, 1, -1, 1)).tril(-1)
            scores = scores + alibi * self.alibi_slopes
        if self.use_l1_norm:
            scores = row_normalize(scores)
        return scores @ V

class BDHState:
    def __init__(self, n_layer: int, n_head: int, latent_dim: int, n_embd: int):
        self.n_layer = n_layer
        self.n_head = n_head
        self.latent_dim = latent_dim
        self.n_embd = n_embd
        self.layers: List[dict] = [{'rho': None, 'hidden': None} for _ in range(n_layer)]
        self.total_position = 0
    
    def get_rho(self, layer_idx: int, batch_size: int, device: torch.device) -> torch.Tensor:
        rho = self.layers[layer_idx]['rho']
        if rho is None:
            rho = torch.zeros(batch_size, self.n_head, self.latent_dim, self.n_embd, device=device)
            self.layers[layer_idx]['rho'] = rho
        return rho
    
    def update_rho(self, layer_idx: int, x_latent: torch.Tensor, v: torch.Tensor, decay: float = 1.0):
        rho = self.layers[layer_idx]['rho']
        rho = rho * decay + torch.einsum('bhn,bhd->bhnd', x_latent, v)
        self.layers[layer_idx]['rho'] = rho
    
    def set_hidden(self, layer_idx: int, hidden: torch.Tensor):
        self.layers[layer_idx]['hidden'] = hidden
    
    def get_hidden(self, layer_idx: int) -> Optional[torch.Tensor]:
        return self.layers[layer_idx]['hidden']
    
    def advance_position(self):
        self.total_position += 1

class BDH(nn.Module):
    def __init__(self, config: BDHConfig):
        super().__init__()
        self.config = config
        nh = config.n_head
        D = config.n_embd
        N = config.latent_per_head()
        self.encoder = TernaryLinear3D(nh, D, N)
        self.encoder_v = TernaryLinear3D(nh, D, N)
        self.decoder = TernaryLinear2D(nh * N, D)
        self.attn = Attention(config)
        self.ln = nn.LayerNorm(D, elementwise_affine=False, bias=False)
        self.embed = nn.Embedding(config.vocab_size, D)
        self.drop = nn.Dropout(config.dropout)
        self.lm_head = TernaryLinear2D(D, config.vocab_size)
        self.relu_threshold = config.relu_threshold
        self.plasticity = None
        if config.use_plasticity:
            from plasticity import UnifiedPlasticity
            self.plasticity = UnifiedPlasticity(
                modules=[self.encoder, self.encoder_v, self.decoder, self.lm_head],
                lr=config.plasticity_lr,
                consolidation_rate=config.consolidation_rate,
                forget_rate=config.forget_rate
            )
    
    def forward(self, idx: torch.Tensor, targets: Optional[torch.Tensor] = None,
                state: Optional[BDHState] = None) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
        C = self.config
        B, T = idx.size()
        D = C.n_embd
        nh = C.n_head
        N = C.latent_per_head()
        x = self.embed(idx).unsqueeze(1)
        x = self.ln(x)
        start_pos = state.total_position if state is not None else 0
        for layer_idx in range(C.n_layer):
            x_latent = self.encoder(x)
            if self.relu_threshold != 0:
                x_latent = x_latent - self.relu_threshold
            x_sparse = F.relu(x_latent)
            if state is not None and C.use_rho_cache:
                yKV = self._recurrent_attention(x_sparse, x, state, layer_idx, start_pos)
            else:
                yKV = self.attn(Q=x_sparse, K=x_sparse, V=x, start_pos=start_pos)
            yKV = self.ln(yKV)
            y_latent = self.encoder_v(yKV)
            if self.relu_threshold != 0:
                y_latent = y_latent - self.relu_threshold
            y_sparse = F.relu(y_latent)
            xy_sparse = x_sparse * y_sparse
            xy_sparse = self.drop(xy_sparse)
            xy_flat = xy_sparse.transpose(1, 2).reshape(B, 1, T, N * nh)
            yMLP = self.decoder(xy_flat)
            y = self.ln(yMLP)
            x = self.ln(x + y)
            if state is not None:
                state.set_hidden(layer_idx, x.clone())
        if state is not None:
            state.advance_position()
        logits = self.lm_head(x.squeeze(1))
        loss = None
        if targets is not None:
            loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1))
        return logits, loss
    
    def forward_with_states(self, idx: torch.Tensor) -> Tuple[torch.Tensor, List[torch.Tensor]]:
        C = self.config
        B, T = idx.size()
        D = C.n_embd
        nh = C.n_head
        N = C.latent_per_head()
        state = BDHState(C.n_layer, nh, N, D)
        logits, _ = self.forward(idx, state=state)
        hidden_states = []
        for layer_idx in range(C.n_layer):
            h = state.get_hidden(layer_idx)
            if h is not None:
                hidden_states.append(h.squeeze(1))
        return logits, hidden_states
    
    def _recurrent_attention(self, Q: torch.Tensor, V: torch.Tensor, state: BDHState, 
                             layer_idx: int, start_pos: int) -> torch.Tensor:
        B, nh, T, N = Q.size()
        D = V.size(-1)
        device = Q.device
        QR = self.attn._rotate(Q, start_pos)
        outputs = []
        for t in range(T):
            q_t = QR[:, :, t:t+1, :]
            v_t = V[:, :, t:t+1, :].repeat(1, nh, 1, 1)
            rho = state.get_rho(layer_idx, B, device)
            attn_t = (rho * q_t.transpose(-1, -2)).sum(dim=2, keepdim=True)
            outputs.append(attn_t)
            state.update_rho(layer_idx, q_t.squeeze(2), v_t.squeeze(2))
        return torch.cat(outputs, dim=2)
    
    def update_ternary_weights(self):
        for module in self.modules():
            if isinstance(module, (TernaryLinear2D, TernaryLinear3D)):
                module.update_ternary_weights()
        if self.plasticity is not None:
            self.plasticity._update_ternary()
    
    def save(self, path: str):
        torch.save({
            'config': self.config,
            'state_dict': self.state_dict()
        }, path)
    
    @classmethod
    def load(cls, path: str, device: str = 'cpu') -> 'BDH':
        checkpoint = torch.load(path, map_location=device, weights_only=False)
        config = checkpoint['config']
        model = cls(config).to(device)
        model.load_state_dict(checkpoint['state_dict'])
        return model