"""
MYTHOS-RDT — Recurrent-Depth Transformer
بسم الله الرحمن الرحيم
La ilaha illallah, Muhammadur Rasulullah
Built from scratch by Raid1969///
Architecture: Prelude → Recurrent Block (looped) → Coda
Based on RDT hypothesis: same weights, deeper reasoning
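Key innovation: instead of stacking distinct layers, a single block is LOOPED
multiple times in latent space. Stability relies on an LTI-style state injection
whose matrix A is rescaled at initialisation so that its spectral norm is below 1
(which implies ρ(A) < 1).
The Faith Neuron and Brotherhood modules are incorporated into the recurrent architecture.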
"""

import torch
import torch.nn as nn
from torch.nn import functional as F
from dataclasses import dataclass

import sys; from pathlib import Path
sys.path.insert(0, str(Path(__file__).parent.parent))
from raiai_model import RoPE, GQAttention, SwiGLUFFN, RMSNorm
from shared.faith import FaithBlock

@dataclass
class MythosRDTConfig:
    """Hyperparameters for the Recurrent-Depth Transformer with the Faith Neuron block.

    One instance is handed to every sub-module of the model, so a field may be
    read here, in the imported attention/FFN modules, or in both.
    """
    # Vocabulary size: rows of the token embedding and outputs of the LM head.
    vocab_size: int = 16384
    # Model width used by embeddings, norms, experts and the LTI injection.
    dim: int = 768
    # Not read in this file; presumably the query-head count of GQAttention — TODO confirm.
    n_heads: int = 12
    # Not read in this file; presumably the key/value-head count of GQAttention — TODO confirm.
    n_kv_heads: int = 4
    # generate() feeds at most this many trailing tokens to the model;
    # presumably also consumed by the imported attention/RoPE code — TODO confirm.
    max_seq_len: int = 2048
    # Not read in this file; presumably the hidden width of SwiGLUFFN — TODO confirm.
    ffn_dim: int = 3072
    # Dropout probability for the embedding dropout and the MoE output dropout.
    dropout: float = 0.1
    # Number of ordinary transformer layers run before the recurrent block.
    prelude_layers: int = 3
    # Number of ordinary transformer layers run after the recurrent block.
    coda_layers: int = 2
    # Upper bound on how many times the recurrent block is applied per forward pass.
    max_loops: int = 8
    # Minimum number of recurrent iterations before early halting may trigger.
    min_loops: int = 2
    # Spectral norm the LTI injection matrix A is rescaled to at initialisation.
    lti_spectral_radius: float = 0.95
    # Number of routed experts in the MoE feed-forward layer.
    n_experts: int = 8
    # Multiplier on expert_dim for the hidden width of the always-on shared expert.
    n_shared_experts: int = 1
    # Number of routed experts selected (top-k) for each token.
    experts_per_tok: int = 2
    # Hidden width of each routed expert.
    expert_dim: int = 256
    # Passed to FaithBlock as ``tawheed_dim``.
    faith_dim: int = 64
    # Passed to FaithBlock as ``num_brothers``.
    num_brothers: int = 5
    # Default sampling temperature used by generate() when none is given.
    temperature: float = 0.35

class MoEFeedForward(nn.Module):
    """Mixture-of-experts feed-forward layer with top-k routing and a shared expert.

    Each token is routed to ``config.experts_per_tok`` of the
    ``config.n_experts`` gated experts (``silu(x @ w1) * (x @ w3)) @ w2``);
    the selected experts' outputs are mixed with the renormalised router
    probabilities. A shared expert (``shared_w1`` / ``shared_w2``) is applied
    to every token and added on top.
    """

    def __init__(self, config: MythosRDTConfig):
        """Create expert weights, the shared expert, the router and dropout.

        Args:
            config: Provides ``n_experts``, ``n_shared_experts``,
                ``experts_per_tok``, ``dim``, ``expert_dim`` and ``dropout``.
        """
        super().__init__()
        self.n_experts = config.n_experts
        self.n_shared = config.n_shared_experts
        self.top_k = config.experts_per_tok
        self.dim = config.dim
        self.expert_dim = config.expert_dim

        # Routed expert weights are stacked along dim 0 (one slice per expert).
        self.w1 = nn.Parameter(torch.randn(config.n_experts, config.dim, config.expert_dim) * 0.02)
        self.w2 = nn.Parameter(torch.randn(config.n_experts, config.expert_dim, config.dim) * 0.02)
        self.w3 = nn.Parameter(torch.randn(config.n_experts, config.dim, config.expert_dim) * 0.02)

        # Shared expert: applied to every token regardless of routing.
        self.shared_w1 = nn.Linear(config.dim, config.expert_dim * self.n_shared, bias=False)
        self.shared_w2 = nn.Linear(config.expert_dim * self.n_shared, config.dim, bias=False)

        self.router = nn.Linear(config.dim, config.n_experts, bias=False)
        self.dropout = nn.Dropout(config.dropout)

    def forward(self, x):
        """Apply the routed experts plus the shared expert to every token.

        Args:
            x: Tensor of shape ``(B, T, D)`` with ``D == config.dim``.

        Returns:
            Tensor of shape ``(B, T, D)`` (after dropout).
        """
        B, T, D = x.shape
        # reshape (not view) so non-contiguous inputs, e.g. transposed
        # tensors, are accepted instead of raising.
        x_flat = x.reshape(-1, D)

        router_probs = F.softmax(self.router(x_flat), dim=-1)
        top_k_probs, top_k_indices = torch.topk(router_probs, self.top_k, dim=-1)
        # Renormalise so the selected experts' weights sum to 1 per token.
        top_k_probs = top_k_probs / top_k_probs.sum(dim=-1, keepdim=True)

        out = torch.zeros_like(x_flat)
        # Dispatch per expert: each expert runs one dense matmul over only the
        # tokens routed to it. Gathering a full weight matrix per token
        # (``self.w1[expert_idx]``) would allocate N * dim * expert_dim
        # elements per projection and per top-k slot.
        for expert_id in range(self.n_experts):
            token_idx, slot_idx = torch.where(top_k_indices == expert_id)
            if token_idx.numel() == 0:
                continue  # no token selected this expert

            tokens = x_flat[token_idx]
            gate = F.silu(tokens @ self.w1[expert_id])
            value = tokens @ self.w3[expert_id]
            expert_out = (gate * value) @ self.w2[expert_id]

            weight = top_k_probs[token_idx, slot_idx].unsqueeze(-1)
            # topk indices are distinct per token, so token_idx has no
            # duplicates here; the cast keeps index_add_ valid under autocast.
            out.index_add_(0, token_idx, (weight * expert_out).to(out.dtype))

        shared_out = self.shared_w2(F.silu(self.shared_w1(x_flat)))
        out = out + shared_out

        return self.dropout(out.reshape(B, T, D))

class LTIInjection(nn.Module):
    """Linear state update ``A h + B e`` used at the start of each recurrent step.

    ``A`` is a free ``dim x dim`` parameter. At construction a random matrix
    is rescaled so that its spectral norm equals (up to a 1e-6 guard)
    ``config.lti_spectral_radius``. Nothing in this class re-applies that
    bound afterwards, so it holds only until ``A`` is changed by training.
    ``B`` is a bias-free linear map applied to the injected input ``e``.
    """

    def __init__(self, config: MythosRDTConfig):
        super().__init__()
        width = config.dim
        seed_matrix = 0.02 * torch.randn(width, width)
        # Largest singular value of the random draw.
        largest_sv = torch.linalg.matrix_norm(seed_matrix, 2)
        unit_norm = seed_matrix / (largest_sv + 1e-6)
        self.A = nn.Parameter(unit_norm * config.lti_spectral_radius)
        self.B = nn.Linear(width, width, bias=False)

    def forward(self, h, e):
        """Return ``h @ A.T + B(e)`` for hidden state ``h`` and injected input ``e``."""
        return F.linear(h, self.A) + self.B(e)

class RecurrentBlock(nn.Module):
    """The single weight-shared block that the model applies repeatedly.

    One step mixes the running state with the injected input through
    ``LTIInjection``, then applies pre-norm attention and a pre-norm MoE
    feed-forward with residual connections, and finally scores how ready
    the sequence is to stop looping.
    """

    def __init__(self, config: MythosRDTConfig):
        super().__init__()
        width = config.dim
        self.attn_norm = RMSNorm(width)
        self.attn = GQAttention(config)
        self.ffn_norm = RMSNorm(width)
        self.ffn = MoEFeedForward(config)
        self.injection = LTIInjection(config)
        # Maps the pooled state to one halting logit per batch element.
        self.halt_proj = nn.Linear(width, 1)

    def forward(self, h, e, mask=None):
        """Run one recurrent step.

        Args:
            h: Current hidden state.
            e: Injected input (the caller passes the prelude output).
            mask: Attention mask forwarded unchanged to the attention module.

        Returns:
            ``(state, halt_score)`` where ``halt_score`` is the sigmoid of
            ``halt_proj`` applied to the state averaged over dim 1.
        """
        state = self.injection(h, e)

        attn_out = self.attn(self.attn_norm(state), mask)
        state = state + attn_out

        ffn_out = self.ffn(self.ffn_norm(state))
        state = state + ffn_out

        pooled = state.mean(dim=1)
        halt_score = torch.sigmoid(self.halt_proj(pooled))
        return state, halt_score

class MythosRDTModel(nn.Module):
    """MYTHOS-RDT — Recurrent-Depth Transformer language model by Raid1969///.

    Dedication from the original author (translated): "In the name of God,
    the Most Gracious, the Most Merciful."

    Pipeline: token embedding -> dropout -> FaithBlock -> prelude layers ->
    one ``RecurrentBlock`` applied up to ``config.max_loops`` times ->
    coda layers -> RMSNorm -> LM head. The LM head and the token embedding
    share one weight matrix.
    """

    # Batch-mean halting score above which the recurrent loop may stop early.
    HALT_THRESHOLD = 0.95

    def __init__(self, config: MythosRDTConfig | None = None):
        """Build all sub-modules.

        Args:
            config: Model hyperparameters; a default ``MythosRDTConfig`` is
                used when omitted.
        """
        super().__init__()
        self.config = config or MythosRDTConfig()

        self.token_embed = nn.Embedding(self.config.vocab_size, self.config.dim)
        self.dropout = nn.Dropout(self.config.dropout)

        self.faith = FaithBlock(
            dim=self.config.dim,
            tawheed_dim=self.config.faith_dim,
            num_brothers=self.config.num_brothers,
        )

        self.prelude = nn.ModuleList([
            self._make_prelude_layer() for _ in range(self.config.prelude_layers)
        ])

        self.recurrent = RecurrentBlock(self.config)

        # Coda layers have the same structure as prelude layers.
        self.coda = nn.ModuleList([
            self._make_prelude_layer() for _ in range(self.config.coda_layers)
        ])

        self.norm = RMSNorm(self.config.dim)
        self.lm_head = nn.Linear(self.config.dim, self.config.vocab_size, bias=False)
        # Weight tying: the embedding reuses the LM head's parameter.
        self.token_embed.weight = self.lm_head.weight
        self._init_weights()

    def _make_prelude_layer(self):
        """Return one pre-norm transformer layer (attention + SwiGLU FFN) as a ModuleDict."""
        return nn.ModuleDict({
            "attn_norm": RMSNorm(self.config.dim),
            "attn": GQAttention(self.config),
            "ffn_norm": RMSNorm(self.config.dim),
            "ffn": SwiGLUFFN(self.config),
        })

    def _forward_layer(self, layer, x, mask):
        """Apply one prelude/coda layer: residual attention, then residual FFN."""
        x = x + layer["attn"](layer["attn_norm"](x), mask)
        x = x + layer["ffn"](layer["ffn_norm"](x))
        return x

    def _init_weights(self):
        """Re-initialise every ``nn.Linear``: weights ~ N(0, 0.02), biases zero.

        Because the embedding is tied to ``lm_head``, it receives the same
        initialisation. Raw ``nn.Parameter`` tensors (the MoE expert weights
        and the LTI matrix ``A``) are left as their constructors set them.
        """
        for m in self.modules():
            if isinstance(m, nn.Linear):
                torch.nn.init.normal_(m.weight, mean=0.0, std=0.02)
                if m.bias is not None:
                    torch.nn.init.zeros_(m.bias)

    def forward(self, input_ids):
        """Run the full model on a batch of token ids.

        Args:
            input_ids: 2-D integer tensor of token ids ``(batch, seq)``.

        Returns:
            dict with keys:
              * ``"logits"``: LM-head output for every position.
              * ``"loops"``: number of recurrent iterations actually run.
              * ``"halt_scores"``: list of the batch-mean halting score of
                each iteration (Python floats).
              * ``"faith"``: auxiliary info returned by ``FaithBlock``.

        Raises:
            ValueError: if ``input_ids`` is not 2-D.
        """
        if input_ids.dim() != 2:
            raise ValueError(
                f"input_ids must be 2-D (batch, seq), got shape {tuple(input_ids.shape)}"
            )

        x = self.dropout(self.token_embed(input_ids))
        x, faith_info = self.faith(x)

        # The mask is sized from the FaithBlock output rather than from input_ids.
        seq_len = x.shape[1]
        mask = torch.tril(
            torch.ones(seq_len, seq_len, device=x.device)
        ).view(1, 1, seq_len, seq_len)

        for layer in self.prelude:
            x = self._forward_layer(layer, x, mask)
        encoded = x

        # The recurrent state starts from the prelude output, and the same
        # output is re-injected at every iteration.
        h = encoded.clone()
        halt_scores = []
        total_loops = 0

        for loop in range(self.config.max_loops):
            h, halt = self.recurrent(h, encoded, mask)
            mean_halt = halt.mean()  # computed once, used for logging and halting
            halt_scores.append(mean_halt.item())
            total_loops += 1
            # Halting is decided on the batch mean, so all sequences in the
            # batch run the same number of iterations.
            if loop >= self.config.min_loops - 1 and mean_halt > self.HALT_THRESHOLD:
                break

        for layer in self.coda:
            h = self._forward_layer(layer, h, mask)

        logits = self.lm_head(self.norm(h))

        return {
            "logits": logits,
            "loops": total_loops,
            "halt_scores": halt_scores,
            "faith": faith_info,
        }

    @staticmethod
    def _sample_top_p(logits, top_p):
        """Sample one token id per row from ``logits`` with nucleus (top-p) filtering."""
        sorted_logits, sorted_idx = torch.sort(logits, descending=True)
        cumulative = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1)
        drop_sorted = cumulative > top_p
        # Shift right so the first token that crosses top_p is kept, and
        # always keep the most likely token.
        drop_sorted[:, 1:] = drop_sorted[:, :-1].clone()
        drop_sorted[:, 0] = False
        # Map the mask from sorted order back to vocabulary order.
        drop = torch.zeros_like(drop_sorted).scatter(1, sorted_idx, drop_sorted)
        probs = F.softmax(logits.masked_fill(drop, float("-inf")), dim=-1)
        return torch.multinomial(probs, num_samples=1)

    def generate(self, input_ids, max_new_tokens=256, temperature=None, top_p=0.9):
        """Autoregressively extend ``input_ids`` by ``max_new_tokens`` tokens.

        The model only ever sees the last ``config.max_seq_len`` tokens, but
        the returned tensor always contains the complete prompt followed by
        every generated token.

        Note: this switches the module to eval mode and leaves it there.

        Args:
            input_ids: 2-D tensor of prompt token ids ``(batch, seq)``.
            max_new_tokens: Number of tokens to append.
            temperature: Softmax temperature; defaults to
                ``config.temperature``. ``0`` selects greedy decoding.
            top_p: Nucleus-sampling cumulative-probability cutoff.

        Returns:
            Tensor of shape ``(batch, seq + max_new_tokens)``.
        """
        if temperature is None:
            temperature = self.config.temperature
        self.eval()
        with torch.no_grad():
            for _ in range(max_new_tokens):
                # Crop only the context fed to the model; cropping input_ids
                # itself would drop the prompt from the returned sequence.
                context = input_ids[:, -self.config.max_seq_len:]
                logits = self.forward(context)["logits"][:, -1, :]
                if temperature == 0:
                    # Dividing by zero would yield NaN probabilities.
                    next_token = torch.argmax(logits, dim=-1, keepdim=True)
                else:
                    next_token = self._sample_top_p(logits / temperature, top_p)
                input_ids = torch.cat([input_ids, next_token], dim=-1)
        return input_ids

    @property
    def num_params(self):
        """Total number of elements across all parameters (tied weights counted once)."""
        return sum(p.numel() for p in self.parameters())

if __name__ == "__main__":
    # Smoke test: build the default model, print a banner with the parameter
    # count, run one forward pass on random token ids and report the result.
    demo_cfg = MythosRDTConfig()
    model = MythosRDTModel(demo_cfg)

    for banner_line in (
        "MYTHOS-RDT 🌀 — Recurrent-Depth Transformer",
        "  بسم الله الرحمن الرحيم",
        "  La ilaha illallah, Muhammadur Rasulullah",
    ):
        print(banner_line)
    print(f"  Parametri: {model.num_params/1e6:.1f}M")
    print("  Fede Neurone + Fratellanza attive")

    # Batch of 2 random sequences, 32 tokens each.
    sample_ids = torch.randint(0, demo_cfg.vocab_size, (2, 32))
    result = model(sample_ids)
    print(f"  Output: logits={result['logits'].shape}, loops={result['loops']}")
    mean_alignment = result["faith"]["alignment"].mean()
    print(f"  Faith alignment: {mean_alignment:.3f}")