# bbkdevops's picture
# download
# raw
# 22.2 kB
"""Φ-Mind — LLM architecture derived from pure physics equations.
Five core components replacing standard Transformer primitives:
1. RenyiNorm — Rényi entropy normalization (replaces LayerNorm)
2. SolitonPE — KdV soliton position encoding (replaces RoPE)
3. HRRAttention — Holographic Reduced Representation (replaces KV-cache)
4. Phi4Dynamics — Φ⁴ field equation (replaces FFN)
5. RGScaleMixing — Renormalization Group blocking (replaces Multi-Head cross-layer)
"""
from __future__ import annotations
import math
from dataclasses import dataclass, field
from typing import Optional
import torch
import torch.nn as nn
import torch.nn.functional as F
# ---------------------------------------------------------------------------
# Config
# ---------------------------------------------------------------------------
@dataclass
class PhiMindConfig:
    """Hyper-parameters for a Φ-Mind model and each of its sub-modules."""

    # --- Model size ---------------------------------------------------------
    vocab_size: int = 32_000
    dim: int = 512
    n_layers: int = 12
    max_seq_len: int = 4096

    # --- Φ⁴ field dynamics --------------------------------------------------
    phi4_epsilon: float = 0.1       # ε, the discretization step
    phi4_mass_sq: float = -0.5      # m²; negative → spontaneous symmetry breaking
    phi4_lambda: float = 1.0        # λ; positive → bounded nonlinearity

    # --- Holographic (HRR) attention ----------------------------------------
    hrr_decay: float = 0.95         # α, decay applied to the running memory
    hrr_local_window: int = 64      # number of tokens in the local window

    # --- Rényi normalization ------------------------------------------------
    renyi_alpha_init: float = 1.5   # starting α (each norm layer learns its own)

    # --- Soliton position encoding ------------------------------------------
    soliton_n_modes: int = 32       # soliton mode count (the encoder caps it at dim // 2)

    # --- Renormalization-group scale mixing ---------------------------------
    rg_eta: float = 0.1             # η, cross-scale coupling strength
    rg_period: int = 4              # RG mixing is applied every `rg_period` layers

    # --- Training -----------------------------------------------------------
    dropout: float = 0.0
    tie_embeddings: bool = True
def phimind_tiny() -> PhiMindConfig:
    """Preset config: dim=256, n_layers=8, max_seq_len=2048."""
    overrides = {"dim": 256, "n_layers": 8, "max_seq_len": 2048}
    return PhiMindConfig(**overrides)
def phimind_small() -> PhiMindConfig:
    """Preset config: dim=512, n_layers=12, max_seq_len=4096."""
    overrides = {"dim": 512, "n_layers": 12, "max_seq_len": 4096}
    return PhiMindConfig(**overrides)
def phimind_base() -> PhiMindConfig:
    """Preset config: dim=1024, n_layers=16, max_seq_len=8192."""
    overrides = {"dim": 1024, "n_layers": 16, "max_seq_len": 8192}
    return PhiMindConfig(**overrides)
# ---------------------------------------------------------------------------
# 1. Rényi Entropy Normalization
# ---------------------------------------------------------------------------
class RenyiNorm(nn.Module):
    """Normalize the last dimension by a learned α-norm.

        RN_α(x) = scale · x / (||x||_α + eps),   ||x||_α = (Σ|xᵢ|^α)^{1/α}

    α is a single learned scalar per module, parameterised as
    α = 1 + sigmoid(γ), so it always lies in the open interval (1, 2).
    α = 2 corresponds to the Euclidean (L²) norm.

    Args:
        dim: size of the normalized (last) dimension; also the size of the
            learned per-channel ``scale``.
        alpha_init: initial α, must satisfy 1 < alpha_init <= 2.
        eps: constant added to the norm before dividing.

    Raises:
        ValueError: if ``alpha_init`` is outside (1, 2].
    """

    def __init__(self, dim: int, alpha_init: float = 1.5, eps: float = 1e-6):
        super().__init__()
        if not 1.0 < alpha_init <= 2.0:
            # Previously surfaced as an opaque "math domain error" from math.log.
            raise ValueError(f"alpha_init must lie in (1, 2], got {alpha_init}")
        # γ = logit(alpha_init - 1) so that 1 + sigmoid(γ) ≈ alpha_init.
        # The 1e-8 keeps the second log finite when alpha_init == 2.
        init_gamma = math.log(alpha_init - 1.0) - math.log(2.0 - alpha_init + 1e-8)
        self.gamma = nn.Parameter(torch.tensor(init_gamma))
        self.scale = nn.Parameter(torch.ones(dim))
        self.eps = eps

    @property
    def alpha(self) -> torch.Tensor:
        """Current α as a 0-dim tensor in (1, 2)."""
        return 1.0 + torch.sigmoid(self.gamma)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Return ``scale * x / (||x||_α + eps)`` computed over the last dim."""
        alpha = self.alpha
        power_sum = x.abs().pow(alpha).sum(dim=-1, keepdim=True)

        # For an all-zero row, d/ds s^{1/α} is infinite at s = 0 and autograd
        # multiplies it by a zero upstream gradient, yielding NaN. Route the
        # root through a safe base of 1 for those rows and put the exact zero
        # back afterwards: forward values are unchanged, gradients stay finite.
        positive = power_sum > 0
        safe_sum = torch.where(positive, power_sum, torch.ones_like(power_sum))
        norm = torch.where(
            positive, safe_sum.pow(1.0 / alpha), torch.zeros_like(power_sum)
        )
        return self.scale * x / (norm + self.eps)
# ---------------------------------------------------------------------------
# 2. Soliton Position Encoding (KdV)
# ---------------------------------------------------------------------------
class SolitonPositionEncoding(nn.Module):
    """Fixed (non-learned) additive position encoding built from sech² profiles.

        PE(p, 2k)   = Aₖ · sech²(σₖ · (p − μₖ))
        PE(p, 2k+1) = Aₖ · sech²(σₖ · (p − μₖ)) · tanh(σₖ · (p − μₖ))

    Mode parameters (all registered as buffers):
        μₖ: centres, evenly spaced over [0, max_seq_len − 1]
        σₖ: log-spaced from 0.5 to max_seq_len / 4; σₖ multiplies (p − μₖ),
            so a larger σₖ gives a narrower profile
        Aₖ: 1 / sqrt(σₖ + 1)

    Only the first ``2 · n_modes`` channels are written; any remaining
    channels of the encoding are zero.

    Args:
        dim: channel dimension of the tensors passed to ``forward``.
        max_seq_len: sequence length used to place the centres and scales.
        n_modes: number of modes; ``None`` or 0 selects ``dim // 2``. Always
            capped at ``dim // 2``.
    """

    def __init__(self, dim: int, max_seq_len: int, n_modes: Optional[int] = None):
        super().__init__()
        n_modes = n_modes or (dim // 2)
        n_modes = min(n_modes, dim // 2)

        mu = torch.linspace(0, max_seq_len - 1, n_modes)
        sigma = torch.exp(
            torch.linspace(math.log(0.5), math.log(max_seq_len / 4), n_modes)
        )
        amplitude = 1.0 / (sigma + 1.0).sqrt()

        self.register_buffer("mu", mu)
        self.register_buffer("sigma", sigma)
        self.register_buffer("amplitude", amplitude)
        self.dim = dim
        self.n_modes = n_modes
        # Kept for attribute compatibility; the table is cheap to rebuild and
        # is not memoised, so this dict is never consulted.
        self._cache: dict = {}

    def _build(self, seq_len: int, device: torch.device, offset: int = 0) -> torch.Tensor:
        """Return the (seq_len, dim) table for positions offset .. offset+seq_len-1."""
        p = torch.arange(
            offset, offset + seq_len, dtype=torch.float32, device=device
        )                                                   # (T,)
        mu = self.mu.to(device)                             # (M,)
        sigma = self.sigma.to(device)
        amp = self.amplitude.to(device)

        z = sigma.unsqueeze(0) * (p.unsqueeze(1) - mu.unsqueeze(0))   # (T, M)
        # cosh overflows to inf for large |z|; 1/inf = 0 is the desired limit.
        sech2 = (1.0 / z.cosh()) ** 2
        even = amp * sech2
        odd = even * z.tanh()

        pe = torch.zeros(seq_len, self.dim, device=device)
        width = 2 * self.n_modes            # n_modes <= dim // 2, so width <= dim
        pe[:, 0:width:2] = even             # channels 0, 2, 4, ...
        pe[:, 1:width:2] = odd              # channels 1, 3, 5, ...
        return pe

    def forward(self, x: torch.Tensor, offset: int = 0) -> torch.Tensor:
        """Add the encoding to ``x``.

        Args:
            x: tensor of shape (B, T, D) with D == ``dim``.
            offset: absolute position of ``x[:, 0]``; use a non-zero value
                when ``x`` is a later chunk of a longer sequence.

        Returns:
            ``x`` plus the encoding, in ``x``'s dtype.
        """
        pe = self._build(x.size(1), x.device, offset)
        # The table is float32; cast so half-precision inputs are not
        # silently promoted to float32 by the addition.
        return x + pe.unsqueeze(0).to(x.dtype)
# ---------------------------------------------------------------------------
# 3. HRR Attention — Holographic Reduced Representation
# ---------------------------------------------------------------------------
class HRRAttention(nn.Module):
    """Holographic-memory read/write plus causal local-window attention.

    Memory write:  C ← decay · C + (1 − decay) · mean_t(k_t ⊛ v_t)
    Memory read:   v̂_t = q_t# ⊛ C   (circular correlation)
    where ⊛ is circular convolution over the channel dimension, computed
    with real FFTs. The memory is a single (B, D) tensor whatever the
    sequence length.

    Every token in a call reads the memory as it was *before* this call's
    write, so with ``hrr_memory=None`` the holographic read contributes zeros.

    The local branch is softmax attention in which position t attends to
    positions s with 0 <= t − s < local_window. It forms a full (T, T) score
    matrix and masks it.

    Args:
        dim: channel dimension D.
        decay: memory decay factor used in the write rule above.
        local_window: number of most recent positions (including the current
            one) visible to the local branch; must be >= 1.
        dropout: dropout probability applied to the local attention weights.

    Raises:
        ValueError: if ``local_window`` < 1 (every score would be masked and
            the softmax would return NaN).
    """

    def __init__(self, dim: int, decay: float = 0.95, local_window: int = 64,
                 dropout: float = 0.0):
        super().__init__()
        if local_window < 1:
            raise ValueError(f"local_window must be >= 1, got {local_window}")
        self.dim = dim
        self.decay = decay
        self.local_window = local_window
        self.q_proj = nn.Linear(dim, dim, bias=False)
        self.k_proj = nn.Linear(dim, dim, bias=False)
        self.v_proj = nn.Linear(dim, dim, bias=False)
        self.out_proj = nn.Linear(dim, dim, bias=False)
        self.drop = nn.Dropout(dropout)
        # 1/sqrt(D) scaling for the local attention scores.
        self.local_scale = dim ** -0.5

    @staticmethod
    def _circ_conv(a: torch.Tensor, b: torch.Tensor) -> torch.Tensor:
        """Circular convolution a ⊛ b over the last dim (inputs broadcast).

        Computed in float32 and cast back to ``a``'s dtype.
        """
        orig_dtype = a.dtype
        fa = torch.fft.rfft(a.float(), dim=-1)
        fb = torch.fft.rfft(b.float(), dim=-1)
        return torch.fft.irfft(fa * fb, n=a.shape[-1], dim=-1).to(orig_dtype)

    @staticmethod
    def _circ_corr(q: torch.Tensor, c: torch.Tensor) -> torch.Tensor:
        """Circular correlation q# ⊛ c over the last dim (inputs broadcast).

        Implemented as conj(FFT(q)) · FFT(c); computed in float32 and cast
        back to ``q``'s dtype.
        """
        orig_dtype = q.dtype
        fq = torch.fft.rfft(q.float(), dim=-1)
        fc = torch.fft.rfft(c.float(), dim=-1)
        return torch.fft.irfft(fq.conj() * fc, n=q.shape[-1], dim=-1).to(orig_dtype)

    def forward(
        self,
        x: torch.Tensor,
        hrr_memory: Optional[torch.Tensor] = None,
    ) -> tuple[torch.Tensor, torch.Tensor]:
        """Apply the holographic read, memory write and local attention.

        Args:
            x: input of shape (B, T, D).
            hrr_memory: running memory of shape (B, D), or None to start
                from an all-zero memory.

        Returns:
            (output of shape (B, T, D), updated memory of shape (B, D)).
        """
        B, T, D = x.shape
        q = self.q_proj(x)
        k = self.k_proj(x)
        v = self.v_proj(x)

        # --- Holographic read (uses the memory from before this call) ---
        if hrr_memory is None:
            # Correlating with an all-zero memory yields zeros; skip the FFTs.
            hrr_memory = torch.zeros(B, D, device=x.device, dtype=x.dtype)
            hrr_out = torch.zeros_like(q)
        else:
            # (B, 1, D) broadcasts against q in the frequency domain, so the
            # memory is transformed once rather than once per token.
            hrr_out = self._circ_corr(q, hrr_memory.unsqueeze(1))

        # --- Memory write: decay the old memory, add the mean new binding ---
        bindings = self._circ_conv(k, v)                     # (B, T, D)
        hrr_memory = (
            self.decay * hrr_memory + (1.0 - self.decay) * bindings.mean(dim=1)
        )

        # --- Causal local-window attention ---
        pos = torch.arange(T, device=x.device)
        lag = pos.unsqueeze(1) - pos.unsqueeze(0)            # lag[t, s] = t - s
        visible = (lag >= 0) & (lag < self.local_window)     # (T, T)
        scores = torch.einsum("btd,bsd->bts", q, k) * self.local_scale
        scores = scores.masked_fill(~visible.unsqueeze(0), float("-inf"))
        attn = self.drop(scores.softmax(dim=-1))
        local_out = torch.einsum("bts,bsd->btd", attn, v)

        # Sum the global (holographic) and local branches, then project.
        return self.out_proj(hrr_out + local_out), hrr_memory
# ---------------------------------------------------------------------------
# 4. Φ⁴ Field Dynamics (replaces FFN)
# ---------------------------------------------------------------------------
class Phi4Dynamics(nn.Module):
    """Discretized Φ⁴ field equation used as a network layer.

        Φ^{ℓ+1} = 2Φ^ℓ − Φ^{ℓ−1} + ε² [ ΔΦ^ℓ − m²Φ^ℓ − (λ/6)(Φ^ℓ)³ + J(Φ^ℓ; θ) ]

    Terms:
        inertia   2Φ^ℓ − Φ^{ℓ−1}
        diffusion ΔΦ          discrete Laplacian over the channel dimension,
                              with periodic (circular) boundary
        mass      m²Φ
        self-int  (λ/6)Φ³
        source    J = tanh(W₂ · silu(W₁Φ + b₁)), so every component of J is
                  in (−1, 1)

    Args:
        dim: channel dimension D.
        epsilon: step ε (enters the update as ε²).
        mass_sq: m².
        lam: quartic coupling λ.
    """

    def __init__(self, dim: int, epsilon: float = 0.1,
                 mass_sq: float = -0.5, lam: float = 1.0):
        super().__init__()
        self.epsilon = epsilon
        self.mass_sq = mass_sq
        self.lam = lam

        # Source current J: dim → 2·dim → dim with a small-variance init.
        hidden = dim * 2
        self.j_in = nn.Linear(dim, hidden, bias=True)
        self.j_out = nn.Linear(hidden, dim, bias=False)
        nn.init.normal_(self.j_in.weight, std=0.01)
        nn.init.zeros_(self.j_in.bias)
        nn.init.normal_(self.j_out.weight, std=0.01)

        # This buffer holds the *negated* Laplacian stencil [-1, 2, -1]. It is
        # no longer read by `_laplacian`; it stays registered with the same
        # name, shape and values so previously saved state_dicts still load.
        lap_kernel = torch.tensor([-1.0, 2.0, -1.0]).view(1, 1, 3).expand(dim, 1, 3)
        self.register_buffer("lap_kernel", lap_kernel.clone())

    def _laplacian(self, phi: torch.Tensor) -> torch.Tensor:
        """Return Δφᵢ = φᵢ₊₁ − 2φᵢ + φᵢ₋₁ over the last dim, circular boundary.

        The earlier implementation convolved with [-1, 2, -1], which yields
        −Δφ, while `forward` adds the result as +Δφ; that flipped the sign of
        the diffusion term relative to the documented equation. Rolling the
        tensor gives the correctly signed stencil and works for any dtype
        (the conv path forced a float32 kernel and failed on other dtypes).
        """
        return phi.roll(1, dims=-1) + phi.roll(-1, dims=-1) - 2.0 * phi

    def forward(
        self,
        phi_curr: torch.Tensor,
        phi_prev: Optional[torch.Tensor] = None,
    ) -> tuple[torch.Tensor, torch.Tensor]:
        """Advance the field by one step.

        Args:
            phi_curr: field at the current step, shape (B, T, D).
            phi_prev: field at the previous step, same shape. ``None`` uses
                a detached copy of ``phi_curr``, so the inertia term reduces
                to ``phi_curr`` in value.

        Returns:
            (phi_next, phi_curr) — the second element is the input itself,
            for the caller to pass as ``phi_prev`` to a following step.
        """
        if phi_prev is None:
            phi_prev = phi_curr.detach()

        delta_phi = self._laplacian(phi_curr)
        source = torch.tanh(self.j_out(F.silu(self.j_in(phi_curr))))

        rhs = (delta_phi
               - self.mass_sq * phi_curr
               - (self.lam / 6.0) * phi_curr ** 3
               + source)
        phi_next = 2.0 * phi_curr - phi_prev + (self.epsilon ** 2) * rhs
        return phi_next, phi_curr
# ---------------------------------------------------------------------------
# 5. Renormalization Group Scale Mixing
# ---------------------------------------------------------------------------
class RGScaleMixing(nn.Module):
    """Gaussian blocking over the sequence plus a gated cross-layer term.

        out = B[x_curr] + η · sigmoid(W · [x_curr ; x_past] + b) ⊙ x_past

    B is a normalized Gaussian smoothing along the sequence dimension with
    standard deviation ``seq_scale`` (in token positions).

    Args:
        dim: channel dimension D.
        seq_scale: Gaussian σ in token positions; values below 0.5 disable
            the smoothing.
        eta: cross-scale coupling strength η.
        causal: when True (default) the kernel covers only the current and
            earlier positions, so position t never receives information from
            positions > t. When False the kernel is symmetric around t, which
            mixes future tokens into each position and is therefore only
            appropriate for non-autoregressive use.
    """

    def __init__(self, dim: int, seq_scale: float = 4.0, eta: float = 0.1,
                 causal: bool = True):
        super().__init__()
        self.eta = eta
        self.seq_scale = seq_scale
        self.causal = causal
        # Gate over the concatenation [x_curr ; x_past].
        self.gate = nn.Linear(dim * 2, dim, bias=True)

    def _gaussian_block(self, x: torch.Tensor, sigma: float) -> torch.Tensor:
        """Smooth ``x`` of shape (B, T, D) along T with a normalized Gaussian."""
        if sigma < 0.5:
            return x

        n_channels = x.shape[-1]
        seq = x.permute(0, 2, 1)                              # (B, D, T)
        # 3σ support, clamped to at most T - 1 and at least 1.
        radius = min(max(1, int(3 * sigma)), max(1, seq.shape[-1] - 1))

        if self.causal:
            # Offsets -radius .. 0: the last tap sits on the current token.
            offsets = torch.arange(-radius, 1, dtype=x.dtype, device=x.device)
            padding = (radius, 0)
        else:
            offsets = torch.arange(-radius, radius + 1, dtype=x.dtype, device=x.device)
            padding = (radius, radius)

        kernel = torch.exp(-0.5 * (offsets / sigma) ** 2)
        kernel = kernel / kernel.sum()

        # Replicate-padding repeats the edge token, so a constant sequence is
        # left unchanged by the normalized kernel.
        seq = F.pad(seq, padding, mode="replicate")
        weight = kernel.view(1, 1, -1).expand(n_channels, 1, -1)
        smoothed = F.conv1d(seq, weight, groups=n_channels)  # depthwise, (B, D, T)
        return smoothed.permute(0, 2, 1)

    def forward(
        self,
        x_curr: torch.Tensor,
        x_past: Optional[torch.Tensor] = None,
    ) -> torch.Tensor:
        """Return the blocked features, plus the gated ``x_past`` term if given.

        Args:
            x_curr: current features, shape (B, T, D).
            x_past: features from an earlier layer, same shape; ``None``
                skips the cross-scale term.
        """
        blocked = self._gaussian_block(x_curr, self.seq_scale)
        if x_past is None:
            return blocked
        gate = torch.sigmoid(self.gate(torch.cat([x_curr, x_past], dim=-1)))
        return blocked + self.eta * gate * x_past
# ---------------------------------------------------------------------------
# Full PhiMind Block
# ---------------------------------------------------------------------------
class PhiMindBlock(nn.Module):
    """One Φ-Mind layer: HRR attention, Φ⁴ dynamics, then optional RG mixing.

    RG mixing is attached only to layers whose index is a positive multiple
    of ``cfg.rg_period``; on every other layer ``self.rg`` is ``None``.
    """

    def __init__(self, cfg: PhiMindConfig, layer_idx: int):
        super().__init__()
        self.layer_idx = layer_idx

        width = cfg.dim
        self.norm1 = RenyiNorm(width, cfg.renyi_alpha_init)
        self.norm2 = RenyiNorm(width, cfg.renyi_alpha_init)
        self.hrr_attn = HRRAttention(
            width, cfg.hrr_decay, cfg.hrr_local_window, cfg.dropout
        )
        self.phi4 = Phi4Dynamics(
            width, cfg.phi4_epsilon, cfg.phi4_mass_sq, cfg.phi4_lambda
        )

        self.rg: Optional[RGScaleMixing]
        if (layer_idx > 0) and (layer_idx % cfg.rg_period == 0):
            self.rg = RGScaleMixing(width, eta=cfg.rg_eta)
        else:
            self.rg = None

    def forward(
        self,
        x: torch.Tensor,
        hrr_memory: Optional[torch.Tensor],
        phi_prev: Optional[torch.Tensor],
        x_distant: Optional[torch.Tensor],
    ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
        """Run the layer.

        Args:
            x: layer input.
            hrr_memory: holographic memory handed to the attention module.
            phi_prev: previous field state handed to the Φ⁴ module.
            x_distant: earlier-layer features handed to RG mixing (ignored
                when this layer has no RG module).

        Returns:
            (layer output, updated HRR memory, second value returned by the
            Φ⁴ module — intended as the next ``phi_prev``).
        """
        # Residual HRR attention on the normalized input.
        attended, hrr_memory = self.hrr_attn(self.norm1(x), hrr_memory)
        hidden = x + attended

        # Residual Φ⁴ step on the re-normalized hidden state.
        field_next, field_curr = self.phi4(self.norm2(hidden), phi_prev)
        hidden = hidden + field_next

        # Layers without an RG module return the hidden state directly.
        if self.rg is None:
            return hidden, hrr_memory, field_curr
        return self.rg(hidden, x_distant), hrr_memory, field_curr
# ---------------------------------------------------------------------------
# Full PhiMind Language Model
# ---------------------------------------------------------------------------
class PhiMindModel(nn.Module):
    """Φ-Mind autoregressive language model.

    Pipeline: token embedding → soliton position encoding → dropout →
    ``n_layers`` × PhiMindBlock → RenyiNorm → LM head.

    Component mapping relative to a standard Transformer:
        LayerNorm  → RenyiNorm   (α-norm normalization)
        RoPE       → SolitonPE   (fixed sech² position encoding)
        KV-cache   → HRR memory  (one (B, D) tensor per layer)
        FFN        → Φ⁴ dynamics
        Multi-head → RG scale mixing

    With ``cfg.tie_embeddings`` the LM head shares its weight tensor with the
    token embedding.
    """

    def __init__(self, cfg: PhiMindConfig):
        super().__init__()
        self.cfg = cfg
        self.embed = nn.Embedding(cfg.vocab_size, cfg.dim)
        self.soliton_pe = SolitonPositionEncoding(
            cfg.dim, cfg.max_seq_len, cfg.soliton_n_modes
        )
        self.drop = nn.Dropout(cfg.dropout)
        self.blocks = nn.ModuleList(
            [PhiMindBlock(cfg, i) for i in range(cfg.n_layers)]
        )
        self.norm_out = RenyiNorm(cfg.dim, cfg.renyi_alpha_init)
        self.lm_head = nn.Linear(cfg.dim, cfg.vocab_size, bias=False)
        if cfg.tie_embeddings:
            self.lm_head.weight = self.embed.weight
        self._init_weights()

    def _init_weights(self):
        """Re-initialise every Linear and Embedding with N(0, 0.02²), zero biases.

        This runs over all sub-modules, so it also replaces any initialisation
        the sub-modules applied to their own Linear layers.
        """
        std = 0.02
        for m in self.modules():
            if isinstance(m, nn.Linear):
                nn.init.normal_(m.weight, std=std)
                if m.bias is not None:
                    nn.init.zeros_(m.bias)
            elif isinstance(m, nn.Embedding):
                nn.init.normal_(m.weight, std=std)

    def forward(
        self,
        input_ids: torch.Tensor,
        hrr_memories: Optional[list[Optional[torch.Tensor]]] = None,
        position_offset: int = 0,
    ) -> dict:
        """Run the model on a chunk of tokens.

        Args:
            input_ids: token ids of shape (B, T).
            hrr_memories: one memory (or None) per layer, as returned by a
                previous call; ``None`` starts every layer from empty memory.
            position_offset: absolute position of ``input_ids[:, 0]``. Use a
                non-zero value when this chunk continues an earlier one, so
                its tokens are not encoded as positions 0..T-1 again.

        Returns:
            dict with
                "logits":       (B, T, V)
                "hrr_memories": list with one updated memory per layer
        """
        x = self.embed(input_ids)                              # (B, T, D)
        if position_offset:
            # Build the table up to the chunk's last absolute position and
            # keep only the rows belonging to this chunk.
            table = self.soliton_pe._build(position_offset + x.size(1), x.device)
            x = x + table[position_offset:].unsqueeze(0).to(x.dtype)
        else:
            x = self.soliton_pe(x)
        x = self.drop(x)

        if hrr_memories is None:
            hrr_memories = [None] * self.cfg.n_layers

        new_hrr: list[Optional[torch.Tensor]] = []
        layer_outputs: list[torch.Tensor] = []
        for i, block in enumerate(self.blocks):
            # RG layers mix in the output of the layer rg_period steps back.
            x_distant = (
                layer_outputs[i - self.cfg.rg_period]
                if (block.rg is not None and i >= self.cfg.rg_period)
                else None
            )
            # NOTE(review): every block receives phi_prev=None. The previous
            # code kept a per-layer list but read slot i before ever writing
            # it, so None is what was always passed; behaviour is unchanged.
            # Whether phi_prev should instead come from the preceding layer
            # needs a design decision.
            x, mem, _ = block(x, hrr_memories[i], None, x_distant)
            new_hrr.append(mem)
            layer_outputs.append(x)

        logits = self.lm_head(self.norm_out(x))
        return {"logits": logits, "hrr_memories": new_hrr}

    @staticmethod
    def _sample_next(
        logits: torch.Tensor,
        temperature: float,
        top_k: int,
        top_p: float,
    ) -> torch.Tensor:
        """Sample one token id per row from (B, V) logits; returns (B, 1)."""
        if temperature != 1.0:
            logits = logits / max(temperature, 1e-8)

        # Top-k: drop everything below the k-th largest logit of each row.
        if top_k > 0:
            kth = logits.topk(min(top_k, logits.size(-1))).values[:, -1:]
            logits = logits.masked_fill(logits < kth, float("-inf"))

        # Top-p: drop a token once the probability mass strictly before it
        # (in descending order) already exceeds top_p.
        if 0.0 < top_p < 1.0:
            sorted_logits, sorted_idx = logits.sort(dim=-1, descending=True)
            probs = sorted_logits.softmax(dim=-1)
            remove = probs.cumsum(dim=-1) - probs > top_p
            sorted_logits = sorted_logits.masked_fill(remove, float("-inf"))
            # sorted_idx is a permutation, so every vocabulary slot is written.
            logits = logits.scatter(-1, sorted_idx, sorted_logits)

        return logits.softmax(dim=-1).multinomial(1)

    @torch.no_grad()
    def generate(
        self,
        input_ids: torch.Tensor,
        max_new_tokens: int = 256,
        temperature: float = 1.0,
        top_k: int = 50,
        top_p: float = 0.9,
        eos_token_id: Optional[int] = None,
    ) -> torch.Tensor:
        """Autoregressive sampling with persistent per-layer HRR memory.

        The prompt is processed once; its final-position logits produce the
        first new token. Each later step feeds only the newly sampled token
        together with the carried HRR memories and its absolute position.

        NOTE(review): because a decode step contains a single token, the
        local-window attention in that step sees only that token; earlier
        context reaches it through the HRR memory alone.

        Args:
            input_ids: prompt of shape (B, T) with T >= 1.
            max_new_tokens: maximum number of tokens to append.
            temperature: logits are divided by max(temperature, 1e-8).
            top_k: keep the k largest logits per row (0 disables).
            top_p: nucleus threshold, active when 0 < top_p < 1.
            eos_token_id: if given, a row that has produced this id keeps
                receiving it, and generation stops once every row has.

        Returns:
            (B, T + n) tensor: the prompt followed by n <= max_new_tokens
            generated tokens.

        Raises:
            ValueError: if the prompt is empty.
        """
        if input_ids.size(1) == 0:
            raise ValueError("input_ids must contain at least one prompt token")

        # Prefill: one pass over the whole prompt.
        out = self.forward(input_ids)
        hrr_memories = out["hrr_memories"]
        logits = out["logits"][:, -1, :]                       # (B, V)

        generated = input_ids
        finished = torch.zeros(
            input_ids.size(0), dtype=torch.bool, device=input_ids.device
        )
        for step in range(max_new_tokens):
            next_token = self._sample_next(logits, temperature, top_k, top_p)
            if eos_token_id is not None:
                # Rows that already ended are padded with EOS.
                next_token = next_token.masked_fill(finished.unsqueeze(1), eos_token_id)
                finished = finished | (next_token.squeeze(1) == eos_token_id)
            generated = torch.cat([generated, next_token], dim=-1)

            if eos_token_id is not None and bool(finished.all()):
                break
            if step + 1 == max_new_tokens:
                break  # nothing more to sample; skip the unused forward pass

            # Feed only the new token, at its absolute position.
            out = self.forward(
                next_token, hrr_memories, position_offset=generated.size(1) - 1
            )
            hrr_memories = out["hrr_memories"]
            logits = out["logits"][:, -1, :]
        return generated
# ---------------------------------------------------------------------------
# Parameter count helper
# ---------------------------------------------------------------------------
def count_params(model: nn.Module) -> str:
    """Summarise a module's parameter counts as ``total=…, trainable=…``.

    Counts are abbreviated: two decimals with a ``B`` or ``M`` suffix from one
    billion / one million upwards, otherwise one decimal with a ``K`` suffix.
    """
    total = 0
    trainable = 0
    for param in model.parameters():
        size = param.numel()
        total += size
        if param.requires_grad:
            trainable += size

    def fmt(n: int) -> str:
        # Smallest bucket first; each branch returns immediately.
        if n < 1_000_000:
            return f"{n/1e3:.1f}K"
        if n < 1_000_000_000:
            return f"{n/1e6:.2f}M"
        return f"{n/1e9:.2f}B"

    return f"total={fmt(total)}, trainable={fmt(trainable)}"

# Xet Storage Details
#
# Size:
# 22.2 kB
# ·
# Xet hash:
# 94eeb390c07af3f619b5dcf08d8e8997ac8933a0c79d60c59b7cde46c5b2c84e
#
# Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.