mythos-rdt / model.py
Raidone's picture
MYTHOS-RDT — Recurrent-Depth Transformer. بسم الله الرحمن الرحيم
4cf6c82 verified
Raw
History Blame Contribute Delete
9.06 kB
"""
MYTHOS-RDT — Recurrent-Depth Transformer
بسم الله الرحمن الرحيم
La ilaha illallah, Muhammadur Rasulullah
Built from scratch by Raid1969///
Architecture: Prelude → Recurrent Block (looped) → Coda
Based on RDT hypothesis: same weights, deeper reasoning
Key innovation: Instead of stacking layers, we LOOP a single block
multiple times in latent space. Stabilized with LTI constraint (ρ(A) < 1).
Fede Neurone e Fratellanza incorporati nell'architettura ricorrente.
"""
import torch
import torch.nn as nn
from torch.nn import functional as F
from dataclasses import dataclass
import sys; from pathlib import Path
sys.path.insert(0, str(Path(__file__).parent.parent))
from raiai_model import RoPE, GQAttention, SwiGLUFFN, RMSNorm
from shared.faith import FaithBlock
@dataclass
class MythosRDTConfig:
    """Hyperparameters for the Recurrent-Depth Transformer with the Faith Neuron block.

    Field order is part of the positional constructor interface; append new
    fields at the end rather than reordering.
    """

    # --- Core transformer sizes -------------------------------------------
    # Size of the token embedding table and of the (weight-tied) LM head.
    vocab_size: int = 16384
    # Model width used by every sub-module in this file.
    dim: int = 768
    # Not read directly in this file; presumably consumed by GQAttention,
    # which receives the whole config -- TODO confirm.
    n_heads: int = 12
    n_kv_heads: int = 4
    # Longest context fed to the model during generation.
    max_seq_len: int = 2048
    # Not read directly in this file; presumably the hidden width of
    # SwiGLUFFN in the prelude/coda layers -- TODO confirm.
    ffn_dim: int = 3072
    # Dropout probability for the embedding dropout and the MoE output.
    dropout: float = 0.1

    # --- Depth layout: prelude -> looped recurrent block -> coda ----------
    prelude_layers: int = 3
    coda_layers: int = 2
    # Upper bound on recurrent-block iterations per forward pass.
    max_loops: int = 8
    # Iterations that always run before early halting is considered.
    min_loops: int = 2
    # Target spectral norm of the state matrix A at initialisation.
    lti_spectral_radius: float = 0.95

    # --- Mixture-of-experts feed-forward (recurrent block) ----------------
    n_experts: int = 8
    # Multiplier on expert_dim for the always-active shared path.
    n_shared_experts: int = 1
    # Number of routed experts selected per token (top-k).
    experts_per_tok: int = 2
    # Hidden width of each routed expert.
    expert_dim: int = 256

    # --- Faith block -------------------------------------------------------
    # Passed to FaithBlock as ``tawheed_dim``.
    faith_dim: int = 64
    # Passed to FaithBlock as ``num_brothers``.
    num_brothers: int = 5

    # --- Sampling ----------------------------------------------------------
    # Default softmax temperature used by ``generate`` when none is given.
    temperature: float = 0.35
class MoEFeedForward(nn.Module):
    """Top-k routed mixture of SwiGLU experts plus an always-on shared path.

    Each token is routed to ``experts_per_tok`` of ``n_experts`` experts; the
    selected experts' outputs are mixed with the renormalised router
    probabilities and added to the output of a shared SiLU MLP.

    Tokens are dispatched per expert (gather the tokens routed to an expert,
    run one dense matmul, scatter the result back).  This avoids gathering a
    full ``(dim, expert_dim)`` weight matrix for every single token, which
    costs memory proportional to ``tokens * dim * expert_dim``.
    """

    def __init__(self, config: "MythosRDTConfig"):
        """Create router, routed-expert weights and the shared MLP from ``config``."""
        super().__init__()
        self.n_experts = config.n_experts
        self.n_shared = config.n_shared_experts
        self.top_k = config.experts_per_tok
        self.dim = config.dim
        self.expert_dim = config.expert_dim
        # Routed experts, stacked along dim 0: w1 = gate, w3 = value, w2 = down.
        self.w1 = nn.Parameter(torch.randn(config.n_experts, config.dim, config.expert_dim) * 0.02)
        self.w2 = nn.Parameter(torch.randn(config.n_experts, config.expert_dim, config.dim) * 0.02)
        self.w3 = nn.Parameter(torch.randn(config.n_experts, config.dim, config.expert_dim) * 0.02)
        # Shared path applied to every token regardless of routing.
        self.shared_w1 = nn.Linear(config.dim, config.expert_dim * self.n_shared, bias=False)
        self.shared_w2 = nn.Linear(config.expert_dim * self.n_shared, config.dim, bias=False)
        self.router = nn.Linear(config.dim, config.n_experts, bias=False)
        self.dropout = nn.Dropout(config.dropout)

    def forward(self, x):
        """Apply the MoE feed-forward to ``x`` of shape ``(B, T, D)``.

        Returns a tensor of the same shape (after dropout).
        """
        B, T, D = x.shape
        # reshape (not view) so non-contiguous inputs, e.g. transposed
        # activations, are accepted as well.
        x_flat = x.reshape(-1, D)

        router_probs = F.softmax(self.router(x_flat), dim=-1)
        top_k_probs, top_k_indices = torch.topk(router_probs, self.top_k, dim=-1)
        # Renormalise so the selected experts' weights sum to one per token.
        top_k_probs = top_k_probs / top_k_probs.sum(dim=-1, keepdim=True)

        out = torch.zeros_like(x_flat)
        for expert in range(self.n_experts):
            # Rows = tokens routed to this expert, cols = which top-k slot.
            token_idx, slot_idx = torch.where(top_k_indices == expert)
            # No early-continue on an empty selection: empty matmuls are
            # valid and keep every expert weight in the autograd graph.
            tokens = x_flat[token_idx]
            hidden = F.silu(tokens @ self.w1[expert]) * (tokens @ self.w3[expert])
            weight = top_k_probs[token_idx, slot_idx].unsqueeze(-1)
            contribution = weight * (hidden @ self.w2[expert])
            # index_add_ requires matching dtypes (matters under autocast).
            out.index_add_(0, token_idx, contribution.to(out.dtype))

        shared_out = self.shared_w2(F.silu(self.shared_w1(x_flat)))
        out = out + shared_out
        return self.dropout(out.view(B, T, D))
class LTIInjection(nn.Module):
    """Linear state update ``A h + B e`` applied at the start of each loop.

    ``A`` is a free ``dim x dim`` parameter drawn from 0.02-scaled Gaussian
    noise and rescaled so that its spectral norm (largest singular value) is
    approximately ``config.lti_spectral_radius``.  ``B`` is a bias-free linear
    map applied to the injected input ``e``.

    NOTE(review): the rescaling happens only at construction time; nothing in
    this class re-normalises ``A`` afterwards.
    """

    def __init__(self, config: "MythosRDTConfig"):
        """Initialise ``A`` (spectrally rescaled noise) and the input map ``B``."""
        super().__init__()
        width = config.dim
        noise = torch.randn(width, width) * 0.02
        sigma_max = torch.linalg.matrix_norm(noise, ord=2)
        # The epsilon guards against division by zero for a degenerate draw.
        self.A = nn.Parameter(noise / (sigma_max + 1e-6) * config.lti_spectral_radius)
        self.B = nn.Linear(width, width, bias=False)

    def forward(self, h, e):
        """Return ``h @ A^T + B(e)`` for state ``h`` and injected input ``e``."""
        return F.linear(h, self.A) + self.B(e)
class RecurrentBlock(nn.Module):
    """The single transformer block that the model applies repeatedly.

    One call performs: LTI injection of the encoded input into the running
    state, pre-norm attention with a residual connection, pre-norm MoE
    feed-forward with a residual connection, and a halting score computed
    from the sequence-mean of the resulting state.
    """

    def __init__(self, config: MythosRDTConfig):
        """Build norms, attention, MoE feed-forward, injection and halting head."""
        super().__init__()
        width = config.dim
        self.attn_norm = RMSNorm(width)
        self.attn = GQAttention(config)
        self.ffn_norm = RMSNorm(width)
        self.ffn = MoEFeedForward(config)
        self.injection = LTIInjection(config)
        self.halt_proj = nn.Linear(width, 1)

    def forward(self, h, e, mask=None):
        """Run one recurrent step.

        Args:
            h: running latent state.
            e: encoded input that is re-injected on every step.
            mask: attention mask forwarded unchanged to the attention module.

        Returns:
            ``(state, halt_score)`` where ``halt_score`` is the sigmoid of a
            linear projection of the state averaged over dim 1.
        """
        state = self.injection(h, e)
        attended = self.attn(self.attn_norm(state), mask)
        state = state + attended
        transformed = self.ffn(self.ffn_norm(state))
        state = state + transformed
        pooled = state.mean(dim=1)
        halt_score = torch.sigmoid(self.halt_proj(pooled))
        return state, halt_score
class MythosRDTModel(nn.Module):
    """MYTHOS-RDT: Recurrent-Depth Transformer language model, by Raid1969.

    Dedication (translated from the original header): "In the name of God,
    the Most Gracious, the Most Merciful. There is no god but God; Muhammad
    is the Messenger of God."

    Pipeline: token embedding -> dropout -> FaithBlock -> prelude layers ->
    one RecurrentBlock applied between ``min_loops`` and ``max_loops`` times
    -> coda layers -> RMSNorm -> LM head (weights tied to the embedding).
    """

    # Mean halting score above which the recurrent loop stops early.
    _HALT_THRESHOLD = 0.95

    def __init__(self, config: "MythosRDTConfig | None" = None):
        """Build all sub-modules; a default ``MythosRDTConfig`` is used if none is given."""
        super().__init__()
        self.config = config if config is not None else MythosRDTConfig()
        self.token_embed = nn.Embedding(self.config.vocab_size, self.config.dim)
        self.dropout = nn.Dropout(self.config.dropout)
        self.faith = FaithBlock(
            dim=self.config.dim,
            tawheed_dim=self.config.faith_dim,
            num_brothers=self.config.num_brothers,
        )
        self.prelude = nn.ModuleList([
            self._make_prelude_layer() for _ in range(self.config.prelude_layers)
        ])
        self.recurrent = RecurrentBlock(self.config)
        # Coda layers have the same structure as prelude layers.
        self.coda = nn.ModuleList([
            self._make_prelude_layer() for _ in range(self.config.coda_layers)
        ])
        self.norm = RMSNorm(self.config.dim)
        self.lm_head = nn.Linear(self.config.dim, self.config.vocab_size, bias=False)
        # Weight tying: the embedding shares the LM head's weight tensor, so
        # the Linear initialisation below also initialises the embedding.
        self.token_embed.weight = self.lm_head.weight
        self._init_weights()

    def _make_prelude_layer(self):
        """Return one pre-norm attention + SwiGLU layer as a ``ModuleDict``."""
        return nn.ModuleDict({
            "attn_norm": RMSNorm(self.config.dim),
            "attn": GQAttention(self.config),
            "ffn_norm": RMSNorm(self.config.dim),
            "ffn": SwiGLUFFN(self.config),
        })

    def _forward_layer(self, layer, x, mask):
        """Apply one prelude/coda layer: residual attention then residual FFN."""
        x = x + layer["attn"](layer["attn_norm"](x), mask)
        x = x + layer["ffn"](layer["ffn_norm"](x))
        return x

    def _init_weights(self):
        """Re-initialise every ``nn.Linear`` with N(0, 0.02) weights and zero bias."""
        for m in self.modules():
            if isinstance(m, nn.Linear):
                torch.nn.init.normal_(m.weight, mean=0.0, std=0.02)
                if m.bias is not None:
                    torch.nn.init.zeros_(m.bias)

    def forward(self, input_ids):
        """Run the model on a ``(batch, seq)`` tensor of token ids.

        Returns:
            dict with keys ``"logits"`` (LM head output), ``"loops"`` (number
            of recurrent iterations executed), ``"halt_scores"`` (list of
            per-iteration batch-mean halting scores as Python floats) and
            ``"faith"`` (second return value of the FaithBlock).

        NOTE(review): halting scores leave this method only as Python floats
        and do not feed into the logits, so as written nothing returned here
        carries gradient into the halting head.
        """
        x = self.dropout(self.token_embed(input_ids))
        x, faith_info = self.faith(x)

        # Sequence length is re-read after the FaithBlock instead of being
        # taken from input_ids, matching whatever shape that block returns.
        seq_len = x.shape[1]
        # Lower-triangular matrix of ones shaped (1, 1, T, T); its
        # interpretation is defined by the attention module.
        mask = torch.tril(torch.ones(seq_len, seq_len, device=x.device)).view(1, 1, seq_len, seq_len)

        for layer in self.prelude:
            x = self._forward_layer(layer, x, mask)

        encoded = x
        h = encoded.clone()
        halt_scores = []
        total_loops = 0
        for loop in range(self.config.max_loops):
            h, halt = self.recurrent(h, encoded, mask)
            # Reduce and sync once per iteration; the value serves both the
            # log and the early-exit test.
            mean_halt = halt.mean().item()
            halt_scores.append(mean_halt)
            total_loops += 1
            if loop >= self.config.min_loops - 1 and mean_halt > self._HALT_THRESHOLD:
                break

        for layer in self.coda:
            h = self._forward_layer(layer, h, mask)
        h = self.norm(h)
        logits = self.lm_head(h)
        return {
            "logits": logits,
            "loops": total_loops,
            "halt_scores": halt_scores,
            "faith": faith_info,
        }

    @staticmethod
    def _sample_top_p(logits, top_p):
        """Sample one token id per row from ``logits`` using nucleus (top-p) filtering."""
        sorted_logits, sorted_idx = torch.sort(logits, descending=True)
        cumulative = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1)
        remove = cumulative > top_p
        # Shift right so the first token that crosses top_p is kept, and
        # never remove the most likely token.
        remove[:, 1:] = remove[:, :-1].clone()
        remove[:, 0] = False
        # Map the mask from sorted order back to vocabulary order.
        vocab_mask = torch.zeros_like(remove).scatter(1, sorted_idx, remove)
        filtered = logits.masked_fill(vocab_mask, float("-inf"))
        return torch.multinomial(F.softmax(filtered, dim=-1), num_samples=1)

    def generate(self, input_ids, max_new_tokens=256, temperature=None, top_p=0.9):
        """Autoregressively extend ``input_ids`` by ``max_new_tokens`` tokens.

        Args:
            input_ids: ``(batch, seq)`` prompt token ids.
            max_new_tokens: number of tokens to append.
            temperature: softmax temperature; defaults to
                ``config.temperature``.  A value ``<= 0`` selects greedy
                (argmax) decoding instead of dividing by zero.
            top_p: nucleus sampling threshold.

        Returns:
            The full sequence (prompt followed by all generated tokens).
            Only the context passed to the model is cropped to
            ``config.max_seq_len``; the returned tensor is never truncated.

        Side effect: switches the module to eval mode.
        """
        if temperature is None:
            temperature = self.config.temperature
        self.eval()
        with torch.no_grad():
            for _ in range(max_new_tokens):
                # Crop a separate view for the model so the accumulated
                # output keeps the beginning of the sequence.
                context = input_ids[:, -self.config.max_seq_len:]
                logits = self.forward(context)["logits"][:, -1, :]
                if temperature <= 0:
                    next_token = logits.argmax(dim=-1, keepdim=True)
                else:
                    next_token = self._sample_top_p(logits / temperature, top_p)
                input_ids = torch.cat([input_ids, next_token], dim=-1)
        return input_ids

    @property
    def num_params(self):
        """Total number of elements across ``self.parameters()``."""
        return sum(p.numel() for p in self.parameters())
if __name__ == "__main__":
    # Smoke test: build the default model, print its size, and run a single
    # forward pass on random token ids.
    cfg = MythosRDTConfig()
    m = MythosRDTModel(cfg)
    # The banner strings below were mojibake (UTF-8 text decoded with the
    # wrong codec); they are restored to the intended characters.
    print("MYTHOS-RDT 🌀 — Recurrent-Depth Transformer")
    print(" بسم الله الرحمن الرحيم")
    print(" La ilaha illallah, Muhammadur Rasulullah")
    print(f" Parametri: {m.num_params/1e6:.1f}M")
    print(" Fede Neurone + Fratellanza attive")
    x = torch.randint(0, cfg.vocab_size, (2, 32))
    # No gradients are needed for a shape check.
    with torch.no_grad():
        out = m(x)
    print(f" Output: logits={out['logits'].shape}, loops={out['loops']}")
    print(f" Faith alignment: {out['faith']['alignment'].mean():.3f}")