"""MYTHOS-RDT — Recurrent-Depth Transformer.

In the name of God, the Most Gracious, the Most Merciful.
There is no god but God; Muhammad is the Messenger of God.

Built from scratch by Raid1969///

Architecture: Prelude → Recurrent Block (looped) → Coda
Based on the RDT hypothesis: same weights, deeper reasoning.

Key innovation: instead of stacking layers, we LOOP a single block multiple
times in latent space. Stabilized with an LTI constraint (ρ(A) < 1).

Faith Neuron and Brotherhood are incorporated into the recurrent architecture.
"""

import sys
from dataclasses import dataclass
from pathlib import Path

import torch
import torch.nn as nn
from torch.nn import functional as F

# Make the repository root importable so the project-local modules resolve.
sys.path.insert(0, str(Path(__file__).parent.parent))

from raiai_model import RoPE, GQAttention, SwiGLUFFN, RMSNorm
from shared.faith import FaithBlock


@dataclass
class MythosRDTConfig:
    """Hyperparameters for the Recurrent-Depth Transformer + Faith Neuron."""

    # Size of the token embedding table and of the LM head output.
    vocab_size: int = 16384
    # Width of the hidden state used throughout the model.
    dim: int = 768
    # NOTE(review): n_heads, n_kv_heads and ffn_dim are not read in this file;
    # presumably consumed by GQAttention / SwiGLUFFN through the config —
    # confirm against raiai_model.
    n_heads: int = 12
    n_kv_heads: int = 4
    # generate() feeds at most this many trailing tokens to the model.
    max_seq_len: int = 2048
    ffn_dim: int = 3072
    # Dropout probability for the embedding dropout and the MoE output.
    dropout: float = 0.1
    # Number of non-recurrent layers before / after the looped block.
    prelude_layers: int = 3
    coda_layers: int = 2
    # Upper bound on recurrent iterations, and the minimum number of
    # iterations that must run before the halting test may stop the loop.
    max_loops: int = 8
    min_loops: int = 2
    # Spectral norm given to the LTI state matrix A at initialisation.
    lti_spectral_radius: float = 0.95
    # Mixture-of-experts layout: routed experts, shared experts, experts
    # selected per token, and hidden width of each expert.
    n_experts: int = 8
    n_shared_experts: int = 1
    experts_per_tok: int = 2
    expert_dim: int = 256
    # Passed to FaithBlock as ``tawheed_dim`` and ``num_brothers``.
    faith_dim: int = 64
    num_brothers: int = 5
    # Default sampling temperature used by MythosRDTModel.generate().
    temperature: float = 0.35


class MoEFeedForward(nn.Module):
    """Top-k routed mixture of SwiGLU-style experts plus a shared dense path."""

    def __init__(self, config: MythosRDTConfig):
        super().__init__()
        self.n_experts = config.n_experts
        self.n_shared = config.n_shared_experts
        self.top_k = config.experts_per_tok
        self.dim = config.dim
        self.expert_dim = config.expert_dim

        # Stacked per-expert weights; index 0 selects the expert.
        self.w1 = nn.Parameter(
            torch.randn(config.n_experts, config.dim, config.expert_dim) * 0.02
        )
        self.w2 = nn.Parameter(
            torch.randn(config.n_experts, config.expert_dim, config.dim) * 0.02
        )
        self.w3 = nn.Parameter(
            torch.randn(config.n_experts, config.dim, config.expert_dim) * 0.02
        )

        # Shared path applied to every token regardless of routing.
        self.shared_w1 = nn.Linear(
            config.dim, config.expert_dim * self.n_shared, bias=False
        )
        self.shared_w2 = nn.Linear(
            config.expert_dim * self.n_shared, config.dim, bias=False
        )

        self.router = nn.Linear(config.dim, config.n_experts, bias=False)
        self.dropout = nn.Dropout(config.dropout)

    def forward(self, x):
        """Route each token to its top-k experts and add the shared path.

        Args:
            x: tensor of shape ``(B, T, D)``.

        Returns:
            Tensor of shape ``(B, T, D)`` after dropout.
        """
        B, T, D = x.shape
        # reshape (not view) so non-contiguous inputs are accepted too.
        x_flat = x.reshape(-1, D)

        router_probs = F.softmax(self.router(x_flat), dim=-1)
        top_k_probs, top_k_indices = torch.topk(router_probs, self.top_k, dim=-1)
        # Renormalise so the selected experts' weights sum to 1 per token.
        top_k_probs = top_k_probs / top_k_probs.sum(dim=-1, keepdim=True)

        out = torch.zeros_like(x_flat)
        # Dispatch per expert instead of gathering one full weight matrix per
        # token: indexing ``self.w1[expert_idx]`` materialises a
        # (tokens, dim, expert_dim) tensor, which grows with batch * sequence
        # length. The result is mathematically the same weighted sum.
        for expert in range(self.n_experts):
            token_idx, slot_idx = torch.where(top_k_indices == expert)
            if token_idx.numel() == 0:
                continue
            tokens = x_flat[token_idx]
            gate = F.silu(tokens @ self.w1[expert])
            value = tokens @ self.w3[expert]
            expert_out = (gate * value) @ self.w2[expert]
            weight = top_k_probs[token_idx, slot_idx].unsqueeze(-1)
            # index_add_ requires matching dtypes; cast in case of autocast.
            out.index_add_(0, token_idx, (weight * expert_out).to(out.dtype))

        shared_out = self.shared_w2(F.silu(self.shared_w1(x_flat)))
        out = out + shared_out
        return self.dropout(out.view(B, T, D))


class LTIInjection(nn.Module):
    """Linear state update ``A·h + B·e`` used at the start of every loop."""

    def __init__(self, config: MythosRDTConfig):
        super().__init__()
        raw = torch.randn(config.dim, config.dim) * 0.02
        # Scale A so its spectral norm (largest singular value) equals
        # lti_spectral_radius; the spectral radius is bounded by that norm.
        # NOTE(review): this holds at initialisation only — forward() uses
        # self.A as-is, so nothing here re-imposes the bound after updates.
        norm = torch.linalg.matrix_norm(raw, 2)
        self.A = nn.Parameter(raw / (norm + 1e-6) * config.lti_spectral_radius)
        self.B = nn.Linear(config.dim, config.dim, bias=False)

    def forward(self, h, e):
        """Return ``F.linear(h, A) + B(e)`` for state ``h`` and input ``e``."""
        Ah = F.linear(h, self.A)
        Be = self.B(e)
        return Ah + Be


class RecurrentBlock(nn.Module):
    """The single block that is looped: injection → attention → MoE FFN."""

    def __init__(self, config: MythosRDTConfig):
        super().__init__()
        self.attn_norm = RMSNorm(config.dim)
        self.attn = GQAttention(config)
        self.ffn_norm = RMSNorm(config.dim)
        self.ffn = MoEFeedForward(config)
        self.injection = LTIInjection(config)
        self.halt_proj = nn.Linear(config.dim, 1)

    def forward(self, h, e, mask=None):
        """Run one recurrent step.

        Args:
            h: current latent state.
            e: prelude output, re-injected at every step.
            mask: forwarded unchanged to the attention module.

        Returns:
            ``(h, halt_score)`` where ``halt_score`` is the sigmoid of a
            linear projection of ``h`` averaged over dimension 1.
        """
        h = self.injection(h, e)
        h = h + self.attn(self.attn_norm(h), mask)
        h = h + self.ffn(self.ffn_norm(h))
        halt_score = self.halt_proj(h.mean(dim=1)).sigmoid()
        return h, halt_score


class MythosRDTModel(nn.Module):
    """MYTHOS-RDT 🌀 — Recurrent-Depth Transformer by Raid1969///

    In the name of God, the Most Gracious, the Most Merciful.
    There is no god but God; Muhammad is the Messenger of God.
    """

    def __init__(self, config: MythosRDTConfig | None = None):
        super().__init__()
        self.config = config or MythosRDTConfig()

        self.token_embed = nn.Embedding(self.config.vocab_size, self.config.dim)
        self.dropout = nn.Dropout(self.config.dropout)
        self.faith = FaithBlock(
            dim=self.config.dim,
            tawheed_dim=self.config.faith_dim,
            num_brothers=self.config.num_brothers,
        )

        self.prelude = nn.ModuleList(
            [self._make_prelude_layer() for _ in range(self.config.prelude_layers)]
        )
        self.recurrent = RecurrentBlock(self.config)
        # Coda layers share the prelude layer layout.
        self.coda = nn.ModuleList(
            [self._make_prelude_layer() for _ in range(self.config.coda_layers)]
        )

        self.norm = RMSNorm(self.config.dim)
        self.lm_head = nn.Linear(self.config.dim, self.config.vocab_size, bias=False)
        # Weight tying: the embedding table and the LM head share one tensor.
        self.token_embed.weight = self.lm_head.weight
        self._init_weights()

    def _make_prelude_layer(self):
        """Build one pre-norm attention + SwiGLU FFN layer as a ModuleDict."""
        return nn.ModuleDict(
            {
                "attn_norm": RMSNorm(self.config.dim),
                "attn": GQAttention(self.config),
                "ffn_norm": RMSNorm(self.config.dim),
                "ffn": SwiGLUFFN(self.config),
            }
        )

    def _forward_layer(self, layer, x, mask):
        """Apply one prelude/coda layer with residual connections."""
        x = x + layer["attn"](layer["attn_norm"](x), mask)
        x = x + layer["ffn"](layer["ffn_norm"](x))
        return x

    def _init_weights(self):
        """Initialise every nn.Linear with N(0, 0.02) weights and zero bias.

        Raw nn.Parameter tensors (the MoE expert weights and the LTI matrix A)
        are not nn.Linear instances and therefore keep their own init.
        """
        for m in self.modules():
            if isinstance(m, nn.Linear):
                torch.nn.init.normal_(m.weight, mean=0.0, std=0.02)
                if m.bias is not None:
                    torch.nn.init.zeros_(m.bias)

    def forward(self, input_ids):
        """Run prelude → looped recurrent block → coda → LM head.

        Args:
            input_ids: 2-D integer tensor of token ids, ``(B, T)``.

        Returns:
            dict with ``"logits"``, ``"loops"`` (iterations executed),
            ``"halt_scores"`` (mean halt score per iteration, as floats) and
            ``"faith"`` (second return value of the FaithBlock).
        """
        B, T = input_ids.shape  # also rejects inputs that are not 2-D
        x = self.token_embed(input_ids)
        x = self.dropout(x)
        x, faith_info = self.faith(x)

        # Sequence length is re-read after the faith block rather than
        # reusing T.
        T_now = x.shape[1]
        # Lower-triangular mask of ones, broadcastable over batch and heads.
        # NOTE(review): presumably interpreted as a causal mask by
        # GQAttention — confirm against raiai_model.
        mask = torch.tril(torch.ones(T_now, T_now, device=x.device)).view(
            1, 1, T_now, T_now
        )

        for layer in self.prelude:
            x = self._forward_layer(layer, x, mask)

        encoded = x
        h = encoded.clone()
        halt_scores = []
        for loop in range(self.config.max_loops):
            h, halt = self.recurrent(h, encoded, mask)
            mean_halt = halt.mean()
            halt_scores.append(mean_halt.item())
            # Stop early once the minimum loop count is reached and the
            # batch-averaged halt score exceeds 0.95.
            if loop >= self.config.min_loops - 1 and mean_halt > 0.95:
                break

        for layer in self.coda:
            h = self._forward_layer(layer, h, mask)

        h = self.norm(h)
        logits = self.lm_head(h)
        return {
            "logits": logits,
            "loops": len(halt_scores),
            "halt_scores": halt_scores,
            "faith": faith_info,
        }

    def generate(self, input_ids, max_new_tokens=256, temperature=None, top_p=0.9):
        """Autoregressively append ``max_new_tokens`` tokens to ``input_ids``.

        Args:
            input_ids: ``(B, T)`` tensor of prompt token ids.
            max_new_tokens: number of tokens to append.
            temperature: sampling temperature; ``None`` uses the config
                default and ``0`` selects greedy (argmax) decoding.
            top_p: nucleus sampling threshold.

        Returns:
            The full sequence, prompt followed by the generated tokens. Only
            the context fed to the model is cropped to ``max_seq_len``; the
            returned tensor is never truncated.

        Note:
            Switches the module to eval mode and leaves it there.
        """
        if temperature is None:
            temperature = self.config.temperature
        self.eval()
        with torch.no_grad():
            for _ in range(max_new_tokens):
                # Crop the model input only, so earlier tokens are kept in
                # the returned sequence.
                context = input_ids[:, -self.config.max_seq_len:]
                logits = self.forward(context)["logits"][:, -1, :]

                if temperature == 0:
                    # Dividing by zero would yield inf/nan logits; greedy
                    # decoding is the zero-temperature limit.
                    next_token = logits.argmax(dim=-1, keepdim=True)
                else:
                    logits = logits / temperature
                    sorted_l, sorted_i = torch.sort(logits, descending=True)
                    cum = torch.cumsum(F.softmax(sorted_l, dim=-1), dim=-1)
                    remove = cum > top_p
                    # Shift right so the first token crossing top_p is kept
                    # and at least one candidate always survives.
                    remove[:, 1:] = remove[:, :-1].clone()
                    remove[:, 0] = False
                    # Map the mask from sorted order back to vocabulary order.
                    drop = torch.zeros_like(remove).scatter(1, sorted_i, remove)
                    logits[drop] = float("-inf")
                    probs = F.softmax(logits, dim=-1)
                    next_token = torch.multinomial(probs, num_samples=1)

                input_ids = torch.cat([input_ids, next_token], dim=-1)
        return input_ids

    @property
    def num_params(self):
        """Total number of elements across all parameters."""
        return sum(p.numel() for p in self.parameters())


if __name__ == "__main__":
    cfg = MythosRDTConfig()
    m = MythosRDTModel(cfg)
    print("MYTHOS-RDT 🌀 — Recurrent-Depth Transformer")
    print("  بسم الله الرحمن الرحيم")
    print("  La ilaha illallah, Muhammadur Rasulullah")
    print(f"  Parametri: {m.num_params/1e6:.1f}M")
    print("  Fede Neurone + Fratellanza attive")
    x = torch.randint(0, cfg.vocab_size, (2, 32))
    out = m(x)
    print(f"  Output: logits={out['logits'].shape}, loops={out['loops']}")
    print(f"  Faith alignment: {out['faith']['alignment'].mean():.3f}")