Buckets:

bbkdevops
/

unicosys-hypergraph-bucket

Files

xet

bbkdevops/unicosys-hypergraph-bucket / tinymind-native-colab-handoff /bundle /model /phimind.py

bbkdevops

about 1 month ago

download

raw

22.2 kB

	"""Φ-Mind — LLM architecture derived from pure physics equations.

	Five core components replacing standard Transformer primitives:
	1. RenyiNorm — Rényi entropy normalization (replaces LayerNorm)
	2. SolitonPE — KdV soliton position encoding (replaces RoPE)
	3. HRRAttention — Holographic Reduced Representation (replaces KV-cache)
	4. Phi4Dynamics — Φ⁴ field equation (replaces FFN)
	5. RGScaleMixing — Renormalization Group blocking (replaces Multi-Head cross-layer)
	"""

	from __future__ import annotations

	import math
	from dataclasses import dataclass, field
	from typing import Optional

	import torch
	import torch.nn as nn
	import torch.nn.functional as F


	# ---------------------------------------------------------------------------
	# Config
	# ---------------------------------------------------------------------------

	@dataclass
	class PhiMindConfig:
	    """Hyper-parameters shared by every Φ-Mind component.

	    Field order is part of the interface (the dataclass accepts positional
	    arguments), so new fields should only ever be appended.

	    Raises:
	        ValueError: from ``__post_init__`` when a value is outside the range
	            the model code in this file can handle.
	    """

	    vocab_size: int = 32_000
	    dim: int = 512
	    n_layers: int = 12
	    max_seq_len: int = 4096
	    # Φ⁴ field params
	    phi4_epsilon: float = 0.1  # discretization step ε
	    phi4_mass_sq: float = -0.5  # m² < 0 → spontaneous symmetry breaking
	    phi4_lambda: float = 1.0  # λ > 0 → bounded nonlinearity
	    # HRR
	    hrr_decay: float = 0.95  # memory decay α
	    hrr_local_window: int = 64  # local token window
	    # Rényi norm
	    renyi_alpha_init: float = 1.5  # initial α (learned per layer)
	    # Soliton PE
	    soliton_n_modes: int = 32  # number of soliton modes (capped at dim // 2)
	    # RG scale mixing
	    rg_eta: float = 0.1  # cross-scale coupling strength
	    rg_period: int = 4  # apply RG every N layers
	    # Training
	    dropout: float = 0.0
	    tie_embeddings: bool = True

	    def __post_init__(self) -> None:
	        """Fail early on settings that otherwise break deep inside the model."""
	        # HRRAttention computes dim ** -0.5, which is undefined for dim == 0.
	        if self.dim < 1:
	            raise ValueError(f"dim must be >= 1, got {self.dim}")
	        # SolitonPositionEncoding takes log(max_seq_len / 4).
	        if self.max_seq_len < 1:
	            raise ValueError(f"max_seq_len must be >= 1, got {self.max_seq_len}")
	        # rg_period is used as a modulus and as a look-back distance in layers.
	        if self.rg_period < 1:
	            raise ValueError(f"rg_period must be >= 1, got {self.rg_period}")
	        # A window below 1 masks every attention score, so softmax returns NaN.
	        if self.hrr_local_window < 1:
	            raise ValueError(
	                f"hrr_local_window must be >= 1, got {self.hrr_local_window}"
	            )
	        # RenyiNorm inverts α = 1 + sigmoid(γ), which needs α in (1, 2].
	        if not 1.0 < self.renyi_alpha_init <= 2.0:
	            raise ValueError(
	                f"renyi_alpha_init must be in (1, 2], got {self.renyi_alpha_init}"
	            )


	def phimind_tiny() -> PhiMindConfig:
	    """Return the smallest preset: width 256, 8 layers, 2048-token context."""
	    preset = PhiMindConfig(max_seq_len=2048, n_layers=8, dim=256)
	    return preset


	def phimind_small() -> PhiMindConfig:
	    """Return the small preset: width 512, 12 layers, 4096-token context."""
	    preset = PhiMindConfig(max_seq_len=4096, n_layers=12, dim=512)
	    return preset


	def phimind_base() -> PhiMindConfig:
	    """Return the base preset: width 1024, 16 layers, 8192-token context."""
	    preset = PhiMindConfig(max_seq_len=8192, n_layers=16, dim=1024)
	    return preset


	# ---------------------------------------------------------------------------
	# 1. Rényi Entropy Normalization
	# ---------------------------------------------------------------------------

	class RenyiNorm(nn.Module):
	    """Normalize by the α-norm with one learned α per layer.

	    RN_α(x) = scale · x / (||x||_α + ε)   where ||x||_α = (Σ|xᵢ|^α)^{1/α}

	    α is parameterized as α = 1 + σ(γ), which lies in the open interval
	    (1, 2); for large |γ| it can round to an endpoint in floating point.
	    Special cases: α = 2 is plain L² normalization (no mean-centering),
	    α → 1 approaches L¹ normalization.

	    Args:
	        dim: size of the last (feature) axis; one learned scale per feature.
	        alpha_init: starting value of α, must lie in [1, 2].
	        eps: added to the norm in the denominator.

	    Raises:
	        ValueError: if ``alpha_init`` is outside [1, 2].
	    """

	    def __init__(self, dim: int, alpha_init: float = 1.5, eps: float = 1e-6):
	        super().__init__()
	        if not 1.0 <= alpha_init <= 2.0:
	            raise ValueError(f"alpha_init must be in [1, 2], got {alpha_init}")
	        # γ = logit(alpha_init - 1). The probability is clamped away from both
	        # endpoints so that alpha_init == 1.0 or 2.0 gives a finite γ instead
	        # of a math domain error.
	        p = min(max(alpha_init - 1.0, 1e-8), 1.0 - 1e-8)
	        init_gamma = math.log(p) - math.log1p(-p)
	        self.gamma = nn.Parameter(torch.tensor(init_gamma))
	        self.scale = nn.Parameter(torch.ones(dim))
	        self.eps = eps

	    @property
	    def alpha(self) -> torch.Tensor:
	        """Current exponent α = 1 + sigmoid(γ) as a 0-dim tensor."""
	        return 1.0 + torch.sigmoid(self.gamma)

	    def forward(self, x: torch.Tensor) -> torch.Tensor:
	        """Divide ``x`` by its α-norm over the last axis and apply ``scale``."""
	        alpha = self.alpha
	        power_sum = x.abs().pow(alpha).sum(dim=-1, keepdim=True)

	        # An all-zero row makes power_sum exactly 0. The derivative of
	        # s ** (1/α) at s = 0 is infinite, and autograd multiplies it by a
	        # zero upstream gradient, giving NaN. Routing zero rows around the
	        # pow keeps the forward values unchanged and the gradients finite.
	        positive = power_sum > 0
	        safe_sum = torch.where(positive, power_sum, torch.ones_like(power_sum))
	        norm = torch.where(
	            positive, safe_sum.pow(1.0 / alpha), torch.zeros_like(power_sum)
	        )
	        return self.scale * x / (norm + self.eps)


	# ---------------------------------------------------------------------------
	# 2. Soliton Position Encoding (KdV)
	# ---------------------------------------------------------------------------

	class SolitonPositionEncoding(nn.Module):
	    """Fixed (non-learned) position encoding built from KdV soliton profiles.

	    PE(p, 2k)   = Aₖ · sech²(σₖ · (p − μₖ))
	    PE(p, 2k+1) = Aₖ · sech²(σₖ · (p − μₖ)) · tanh(σₖ · (p − μₖ))

	    Locality: sech²(x) ~ exp(-2|x|), so each mode decays exponentially with
	    distance from its centre μₖ. Channels beyond ``2 * n_modes`` stay zero.

	    Args:
	        dim: feature width of the tensors the encoding is added to.
	        max_seq_len: sets the span of the centres μₖ and the largest σₖ.
	        n_modes: number of soliton modes; ``None`` or 0 means ``dim // 2``.
	            Always capped at ``dim // 2``.
	    """

	    def __init__(self, dim: int, max_seq_len: int, n_modes: int | None = None):
	        super().__init__()
	        n_modes = n_modes or (dim // 2)
	        n_modes = min(n_modes, dim // 2)

	        # μₖ: centres spread uniformly across the sequence.
	        # σₖ: log-spaced from 0.5 to max_seq_len / 4. σ multiplies the
	        #     distance (p − μ), so it is an inverse width: small σ gives a
	        #     broad bump, large σ a sharp one.
	        mu = torch.linspace(0, max_seq_len - 1, n_modes)
	        sigma = torch.exp(
	            torch.linspace(math.log(0.5), math.log(max_seq_len / 4), n_modes)
	        )
	        amplitude = 1.0 / (sigma + 1.0).sqrt()  # A = (σ + 1)^{-1/2}

	        self.register_buffer("mu", mu)
	        self.register_buffer("sigma", sigma)
	        self.register_buffer("amplitude", amplitude)

	        self.dim = dim
	        self.n_modes = n_modes

	    def _build(
	        self, seq_len: int, device: torch.device, offset: int = 0
	    ) -> torch.Tensor:
	        """Return the float32 table for positions ``offset .. offset+seq_len-1``.

	        Shape is ``(seq_len, dim)``.
	        """
	        positions = torch.arange(
	            offset, offset + seq_len, dtype=torch.float32, device=device
	        )  # (T,)
	        mu = self.mu.to(device)  # (M,)
	        sigma = self.sigma.to(device)
	        amp = self.amplitude.to(device)

	        z = sigma.unsqueeze(0) * (positions.unsqueeze(1) - mu.unsqueeze(0))  # (T, M)
	        # cosh overflows to inf for large |z|; 1 / inf is 0, so no NaN appears.
	        sech2 = (1.0 / z.cosh()) ** 2
	        envelope = amp * sech2  # (T, M)

	        pe = torch.zeros(seq_len, self.dim, device=device)
	        # Mode k fills channel 2k (even profile) and 2k + 1 (odd profile).
	        pe[:, 0 : 2 * self.n_modes : 2] = envelope
	        pe[:, 1 : 2 * self.n_modes : 2] = envelope * z.tanh()
	        return pe  # (T, D)

	    def forward(self, x: torch.Tensor, offset: int = 0) -> torch.Tensor:
	        """Add the encoding to ``x`` of shape (B, T, D).

	        Args:
	            x: input activations.
	            offset: absolute position of ``x[:, 0]``. The default 0 encodes
	                positions 0..T-1; a caller feeding a sequence in pieces can
	                pass the number of tokens already consumed.

	        Returns:
	            Tensor with the same shape and dtype as ``x``.
	        """
	        seq_len = x.size(1)
	        # Cast to the activation dtype so half/bfloat16 inputs are not
	        # silently promoted to float32 by the addition.
	        pe = self._build(seq_len, x.device, offset).to(x.dtype)
	        return x + pe.unsqueeze(0)


	# ---------------------------------------------------------------------------
	# 3. HRR Attention — Holographic Reduced Representation
	# ---------------------------------------------------------------------------

	class HRRAttention(nn.Module):
	    """Holographic-memory attention plus a causal local window.

	    Encoding:  C = Σᵢ kᵢ ⊛ vᵢ   (circular convolution via FFT)
	    Retrieval: v̂ ≈ q# ⊛ C       (circular correlation = conjugate in frequency)

	    The running memory is one D-vector per batch row, independent of context
	    length. Retrieval reads the memory as it was *before* this call's tokens
	    were bound, so a call with no incoming memory gets no holographic term.

	    The local path is implemented with a dense (T, T) score matrix and a band
	    mask: position t attends to s with 0 <= t - s < local_window.

	    Args:
	        dim: feature width D.
	        decay: weight α kept from the old memory on each update.
	        local_window: number of most recent positions (including the current
	            one) visible to the local path.
	        dropout: dropout probability on the local attention weights.

	    Raises:
	        ValueError: if ``local_window`` is below 1 (every score would be
	            masked and the softmax would return NaN).
	    """

	    def __init__(self, dim: int, decay: float = 0.95, local_window: int = 64,
	                 dropout: float = 0.0):
	        super().__init__()
	        if local_window < 1:
	            raise ValueError(f"local_window must be >= 1, got {local_window}")
	        self.dim = dim
	        self.decay = decay
	        self.local_window = local_window

	        self.q_proj = nn.Linear(dim, dim, bias=False)
	        self.k_proj = nn.Linear(dim, dim, bias=False)
	        self.v_proj = nn.Linear(dim, dim, bias=False)
	        self.out_proj = nn.Linear(dim, dim, bias=False)
	        self.drop = nn.Dropout(dropout)

	        # Scale for local window scores
	        self.local_scale = dim ** -0.5

	    @staticmethod
	    def _circ_conv(a: torch.Tensor, b: torch.Tensor) -> torch.Tensor:
	        """Circular convolution a ⊛ b over the last axis; inputs broadcast.

	        The FFT runs in float32 and the result is cast back to ``a.dtype``.
	        """
	        orig_dtype = a.dtype
	        fa = torch.fft.rfft(a.float(), dim=-1)
	        fb = torch.fft.rfft(b.float(), dim=-1)
	        return torch.fft.irfft(fa * fb, n=a.shape[-1], dim=-1).to(orig_dtype)

	    @staticmethod
	    def _circ_corr(q: torch.Tensor, c: torch.Tensor) -> torch.Tensor:
	        """Circular correlation q# ⊛ c over the last axis; inputs broadcast.

	        The FFT runs in float32 and the result is cast back to ``q.dtype``.
	        """
	        orig_dtype = q.dtype
	        fq = torch.fft.rfft(q.float(), dim=-1)
	        fc = torch.fft.rfft(c.float(), dim=-1)
	        return torch.fft.irfft(fq.conj() * fc, n=q.shape[-1], dim=-1).to(orig_dtype)

	    def forward(
	        self,
	        x: torch.Tensor,
	        hrr_memory: Optional[torch.Tensor] = None,
	    ) -> tuple[torch.Tensor, torch.Tensor]:
	        """Run holographic retrieval, memory update and local attention.

	        Args:
	            x: (B, T, D) input.
	            hrr_memory: (B, D) running holographic memory, or None to start
	                from an all-zero memory.

	        Returns:
	            ``(output, updated_memory)`` with shapes (B, T, D) and (B, D).
	        """
	        B, T, D = x.shape

	        q = self.q_proj(x)  # (B, T, D)
	        k = self.k_proj(x)
	        v = self.v_proj(x)

	        # --- Holographic memory retrieval ---
	        if hrr_memory is None:
	            # Correlating with an all-zero memory yields zeros; skip the FFTs.
	            hrr_memory = torch.zeros(B, D, device=x.device, dtype=x.dtype)
	            hrr_out = None
	        else:
	            # Keep the memory at (B, 1, D): its spectrum is computed once per
	            # batch row and broadcast over T instead of being recomputed for
	            # T identical copies.
	            hrr_out = self._circ_corr(q, hrr_memory.unsqueeze(1))  # (B, T, D)

	        # --- Update memory: C_new = α·C_old + (1-α)·mean_t(k_t ⊛ v_t) ---
	        new_bindings = self._circ_conv(k, v)  # (B, T, D)
	        delta = new_bindings.mean(dim=1)  # (B, D)
	        hrr_memory = self.decay * hrr_memory + (1.0 - self.decay) * delta

	        # --- Local window causal attention ---
	        t_idx = torch.arange(T, device=x.device)  # (T,)
	        diff = t_idx.unsqueeze(1) - t_idx.unsqueeze(0)  # diff[t, s] = t - s
	        local_causal = (diff >= 0) & (diff < self.local_window)  # (T, T)

	        scores = torch.einsum("btd,bsd->bts", q, k) * self.local_scale  # (B, T, T)
	        scores = scores.masked_fill(~local_causal.unsqueeze(0), float("-inf"))
	        attn = self.drop(scores.softmax(dim=-1))
	        local_out = torch.einsum("bts,bsd->btd", attn, v)  # (B, T, D)

	        # Combine holographic (global) + local
	        combined = local_out if hrr_out is None else hrr_out + local_out
	        return self.out_proj(combined), hrr_memory


	# ---------------------------------------------------------------------------
	# 4. Φ⁴ Field Dynamics (replaces FFN)
	# ---------------------------------------------------------------------------

	class Phi4Dynamics(nn.Module):
	    """Discretized Φ⁴ field equation used as the feed-forward layer.

	    Φ^{ℓ+1} = 2Φ^ℓ − Φ^{ℓ-1} + ε²[ΔΦ − m²Φ − (λ/6)Φ³ + J(Φ;θ)]

	    where:
	    - inertia   = 2Φ^ℓ − Φ^{ℓ-1}
	    - diffusion = ΔΦ       (discrete Laplacian across the hidden dim, circular)
	    - mass      = m²Φ      (m² < 0 gives the double-well potential)
	    - self-int  = (λ/6)Φ³
	    - source    = J(Φ;θ) = tanh(W₂ · silu(W₁Φ + b₁)), bounded in (-1, 1)

	    Args:
	        dim: feature width D.
	        epsilon: discretization step ε.
	        mass_sq: m².
	        lam: λ.
	    """

	    def __init__(self, dim: int, epsilon: float = 0.1,
	                 mass_sq: float = -0.5, lam: float = 1.0):
	        super().__init__()
	        self.epsilon = epsilon
	        self.mass_sq = mass_sq
	        self.lam = lam

	        # Learned source current J; small init keeps it a perturbation.
	        hidden = dim * 2
	        self.j_in = nn.Linear(dim, hidden, bias=True)
	        self.j_out = nn.Linear(hidden, dim, bias=False)
	        nn.init.normal_(self.j_in.weight, std=0.01)
	        nn.init.zeros_(self.j_in.bias)
	        nn.init.normal_(self.j_out.weight, std=0.01)

	        # Discrete Laplacian stencil: Δhᵢ = h_{i+1} − 2hᵢ + h_{i−1}.
	        # The stencil was previously stored negated ([-1, 2, -1]) while the
	        # update still added it, which applied −ΔΦ (anti-diffusion) instead of
	        # the documented +ΔΦ. The buffer keeps its (dim, 1, 3) shape so
	        # existing state dicts still load; a loaded state dict overwrites this
	        # value with whatever stencil the checkpoint was saved with.
	        lap_kernel = torch.tensor([1.0, -2.0, 1.0]).view(1, 1, 3).expand(dim, 1, 3)
	        self.register_buffer("lap_kernel", lap_kernel.clone())

	    def _laplacian(self, phi: torch.Tensor) -> torch.Tensor:
	        """Discrete Laplacian across the hidden dim with wrap-around ends.

	        phi: (B, T, D). Returns a tensor of the same shape and dtype.
	        """
	        B, T, D = phi.shape
	        # Treat (B*T) as batch and D as the 1-D spatial axis.
	        x = phi.reshape(B * T, 1, D)
	        x = F.pad(x, (1, 1), mode="circular")
	        # Match the input dtype: a hard float32 kernel makes conv1d raise when
	        # the module has been converted to another precision.
	        kernel = self.lap_kernel[:1].to(x.dtype)
	        lap = F.conv1d(x, kernel)  # (B*T, 1, D)
	        return lap.squeeze(1).reshape(B, T, D)

	    def forward(
	        self,
	        phi_curr: torch.Tensor,
	        phi_prev: Optional[torch.Tensor] = None,
	    ) -> tuple[torch.Tensor, torch.Tensor]:
	        """Advance the field by one step.

	        Args:
	            phi_curr: (B, T, D) field at the current layer.
	            phi_prev: (B, T, D) field at the previous layer. None substitutes
	                a detached copy of ``phi_curr``, so the inertia term has the
	                value ``phi_curr`` but only the ``2 * phi_curr`` part carries
	                gradient.

	        Returns:
	            ``(phi_next, phi_curr)`` — the second item is the unchanged input,
	            for the caller to pass on as the next ``phi_prev``.
	        """
	        if phi_prev is None:
	            phi_prev = phi_curr.detach()

	        delta_phi = self._laplacian(phi_curr)

	        # Source current J — bounded by tanh.
	        j = torch.tanh(self.j_out(F.silu(self.j_in(phi_curr))))

	        # Right-hand side of the Φ⁴ equation.
	        rhs = (delta_phi
	               - self.mass_sq * phi_curr
	               - (self.lam / 6.0) * phi_curr ** 3
	               + j)

	        phi_next = 2.0 * phi_curr - phi_prev + (self.epsilon ** 2) * rhs

	        return phi_next, phi_curr


	# ---------------------------------------------------------------------------
	# 5. Renormalization Group Scale Mixing
	# ---------------------------------------------------------------------------

	class RGScaleMixing(nn.Module):
	    """Wilson-style RG blocking plus a gated cross-scale term.

	    x^{ℓ+1} = B[x^ℓ] + η · S[x^ℓ, x^{ℓ-rg_period}]

	    Blocking: Gaussian smoothing over the sequence axis.
	    Cross-scale: sigmoid gate computed from the current and the older
	    features, applied to the older features.

	    Args:
	        dim: feature width D.
	        seq_scale: Gaussian σ measured in token positions; values below 0.5
	            disable the smoothing.
	        eta: cross-scale coupling strength η.
	        causal: when True (default) the smoothing kernel covers only the
	            current and earlier positions, so output t never depends on
	            inputs after t. The symmetric kernel lets every position read up
	            to ``radius`` future tokens, which leaks the prediction target
	            in an autoregressive model. Pass False for the symmetric kernel.
	    """

	    def __init__(self, dim: int, seq_scale: float = 4.0, eta: float = 0.1,
	                 causal: bool = True):
	        super().__init__()
	        self.eta = eta
	        self.seq_scale = seq_scale  # Gaussian σ in token positions
	        self.causal = causal

	        # Cross-scale gating
	        self.gate = nn.Linear(dim * 2, dim, bias=True)

	    def _gaussian_block(self, x: torch.Tensor, sigma: float) -> torch.Tensor:
	        """Gaussian smoothing over the sequence axis. x: (B, T, D) → (B, T, D)."""
	        if sigma < 0.5:
	            return x
	        channels = x.shape[-1]
	        xp = x.permute(0, 2, 1)  # (B, D, T) for a depthwise conv over T
	        # Kernel reach: 3σ, at least 1, at most T - 1 (but never below 1).
	        radius = min(max(1, int(3 * sigma)), max(1, xp.shape[-1] - 1))

	        if self.causal:
	            # Taps at lags radius..0. With left-only padding, the last tap
	            # (lag 0) lines up with the current token.
	            taps = torch.arange(-radius, 1, dtype=x.dtype, device=x.device)
	            padding = (radius, 0)
	        else:
	            taps = torch.arange(-radius, radius + 1, dtype=x.dtype, device=x.device)
	            padding = (radius, radius)

	        kernel = torch.exp(-0.5 * (taps / sigma) ** 2)
	        kernel = kernel / kernel.sum()  # weights sum to 1

	        xp = F.pad(xp, padding, mode="replicate")
	        depthwise = kernel.view(1, 1, -1).expand(channels, 1, kernel.numel())
	        out = F.conv1d(xp, depthwise, groups=channels)  # (B, D, T)
	        return out.permute(0, 2, 1)  # (B, T, D)

	    def forward(
	        self,
	        x_curr: torch.Tensor,
	        x_past: Optional[torch.Tensor] = None,
	    ) -> torch.Tensor:
	        """Smooth ``x_curr`` and optionally add the gated older features.

	        Args:
	            x_curr: (B, T, D) current features.
	            x_past: (B, T, D) features from an earlier layer; None skips the
	                cross-scale term.

	        Returns:
	            (B, T, D) tensor.
	        """
	        blocked = self._gaussian_block(x_curr, self.seq_scale)

	        if x_past is None:
	            return blocked

	        combined = torch.cat([x_curr, x_past], dim=-1)
	        cross = self.eta * torch.sigmoid(self.gate(combined)) * x_past
	        return blocked + cross


	# ---------------------------------------------------------------------------
	# Full PhiMind Block
	# ---------------------------------------------------------------------------

	class PhiMindBlock(nn.Module):
	    """One Φ-Mind layer: HRR attention, Φ⁴ update, and optional RG mixing."""

	    def __init__(self, cfg: PhiMindConfig, layer_idx: int):
	        super().__init__()
	        self.layer_idx = layer_idx

	        width = cfg.dim
	        alpha0 = cfg.renyi_alpha_init

	        # Pre-norms for the two residual branches.
	        self.norm1 = RenyiNorm(width, alpha0)
	        self.norm2 = RenyiNorm(width, alpha0)

	        self.hrr_attn = HRRAttention(
	            width, cfg.hrr_decay, cfg.hrr_local_window, cfg.dropout
	        )
	        self.phi4 = Phi4Dynamics(
	            width, cfg.phi4_epsilon, cfg.phi4_mass_sq, cfg.phi4_lambda
	        )

	        # Layer 0 never mixes; after that every rg_period-th layer does.
	        self.rg: Optional[RGScaleMixing]
	        if layer_idx > 0 and layer_idx % cfg.rg_period == 0:
	            self.rg = RGScaleMixing(width, eta=cfg.rg_eta)
	        else:
	            self.rg = None

	    def forward(
	        self,
	        x: torch.Tensor,
	        hrr_memory: Optional[torch.Tensor],
	        phi_prev: Optional[torch.Tensor],
	        x_distant: Optional[torch.Tensor],
	    ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
	        """Apply the layer to ``x``.

	        Returns a 3-tuple: the layer output, the updated HRR memory, and the
	        second value returned by the Φ⁴ layer (the normalized tensor it was
	        given as its current field).
	        """
	        # Residual branch 1: HRR attention on the normalized input.
	        attended, hrr_memory = self.hrr_attn(self.norm1(x), hrr_memory)
	        x = x + attended

	        # Residual branch 2: Φ⁴ dynamics on the normalized result.
	        field = self.norm2(x)
	        stepped, phi_prev_out = self.phi4(field, phi_prev)
	        x = x + stepped

	        # RG scale mixing only exists on the layers selected in __init__.
	        if self.rg is None:
	            return x, hrr_memory, phi_prev_out

	        mixed = self.rg(x, x_distant)
	        return mixed, hrr_memory, phi_prev_out


	# ---------------------------------------------------------------------------
	# Full PhiMind Language Model
	# ---------------------------------------------------------------------------

	class PhiMindModel(nn.Module):
	    """Φ-Mind autoregressive language model.

	    Component mapping relative to a standard Transformer:
	    LayerNorm  → RenyiNorm   (α-norm normalization)
	    RoPE       → SolitonPE   (KdV soliton position encoding)
	    KV-Cache   → HRR         (holographic memory, one D-vector per layer)
	    FFN        → Φ⁴ dynamics (field-equation update)
	    Multi-Head → RG scale mixing (Gaussian blocking + cross-layer gate)
	    """

	    def __init__(self, cfg: PhiMindConfig):
	        super().__init__()
	        self.cfg = cfg

	        self.embed = nn.Embedding(cfg.vocab_size, cfg.dim)
	        self.soliton_pe = SolitonPositionEncoding(
	            cfg.dim, cfg.max_seq_len, cfg.soliton_n_modes
	        )
	        self.drop = nn.Dropout(cfg.dropout)

	        self.blocks = nn.ModuleList(
	            [PhiMindBlock(cfg, i) for i in range(cfg.n_layers)]
	        )

	        self.norm_out = RenyiNorm(cfg.dim, cfg.renyi_alpha_init)
	        self.lm_head = nn.Linear(cfg.dim, cfg.vocab_size, bias=False)

	        if cfg.tie_embeddings:
	            # Output projection shares the embedding matrix.
	            self.lm_head.weight = self.embed.weight

	        self._init_weights()

	    def _init_weights(self):
	        """Draw every Linear/Embedding weight from N(0, 0.02²); zero all biases.

	        NOTE(review): this also re-initializes the Phi4Dynamics source layers,
	        replacing the std=0.01 init they set for themselves.
	        """
	        std = 0.02
	        for m in self.modules():
	            if isinstance(m, nn.Linear):
	                nn.init.normal_(m.weight, std=std)
	                if m.bias is not None:
	                    nn.init.zeros_(m.bias)
	            elif isinstance(m, nn.Embedding):
	                nn.init.normal_(m.weight, std=std)

	    def forward(
	        self,
	        input_ids: torch.Tensor,
	        hrr_memories: Optional[list[Optional[torch.Tensor]]] = None,
	        position_offset: int = 0,
	    ) -> dict:
	        """Run the model on a chunk of tokens.

	        Args:
	            input_ids: (B, T) token ids.
	            hrr_memories: one (B, D) memory (or None) per layer; None starts
	                every layer from an empty memory.
	            position_offset: absolute position of ``input_ids[:, 0]``. Use a
	                non-zero value when feeding a sequence in pieces so each
	                token gets the encoding of its true position.

	        Returns:
	            dict with
	              ``logits``: (B, T, V)
	              ``hrr_memories``: list of (B, D) tensors, one per layer

	        Raises:
	            ValueError: if ``position_offset`` is negative.
	        """
	        if position_offset < 0:
	            raise ValueError(f"position_offset must be >= 0, got {position_offset}")

	        x = self.embed(input_ids)  # (B, T, D)
	        if position_offset:
	            # Build the table up to the last requested position and keep the
	            # rows for this chunk.
	            table = self.soliton_pe._build(position_offset + x.size(1), x.device)
	            x = x + table[position_offset:].to(x.dtype).unsqueeze(0)
	        else:
	            x = self.soliton_pe(x)
	        x = self.drop(x)

	        if hrr_memories is None:
	            hrr_memories = [None] * self.cfg.n_layers

	        rg_period = self.cfg.rg_period
	        new_hrr: list[Optional[torch.Tensor]] = []
	        layer_outputs: list[torch.Tensor] = []

	        for i, block in enumerate(self.blocks):
	            # Output from rg_period layers back, only for blocks that mix.
	            x_distant = (
	                layer_outputs[i - rg_period]
	                if (block.rg is not None and i >= rg_period)
	                else None
	            )

	            # NOTE(review): phi_prev is always None here. The earlier
	            # per-layer list was written after each block ran and never read
	            # again, so every block already received None. Feeding the
	            # previous layer's field would match the Φ⁴ docstring, but it
	            # changes the computation and needs a retrain — confirm intent.
	            x, mem, _ = block(x, hrr_memories[i], None, x_distant)
	            new_hrr.append(mem)
	            layer_outputs.append(x)

	        x = self.norm_out(x)
	        logits = self.lm_head(x)

	        return {"logits": logits, "hrr_memories": new_hrr}

	    @staticmethod
	    def _sample_next(
	        logits: torch.Tensor, temperature: float, top_k: int, top_p: float
	    ) -> torch.Tensor:
	        """Sample one token id per row from (B, V) logits. Returns (B, 1)."""
	        # float32 keeps the temperature division from overflowing in half
	        # precision; all ops below are out-of-place so the caller's logits
	        # are left untouched.
	        logits = logits.float()
	        if temperature != 1.0:
	            logits = logits / max(temperature, 1e-8)

	        # Top-k: drop everything below the k-th largest logit.
	        if top_k > 0:
	            kth = logits.topk(min(top_k, logits.size(-1))).values[:, -1:]
	            logits = logits.masked_fill(logits < kth, float("-inf"))

	        # Top-p (nucleus): drop a token once the probability mass ranked
	        # ahead of it already exceeds top_p; the top token always survives.
	        if 0.0 < top_p < 1.0:
	            sorted_logits, sorted_idx = logits.sort(dim=-1, descending=True)
	            probs = sorted_logits.softmax(dim=-1)
	            remove = probs.cumsum(dim=-1) - probs > top_p
	            sorted_logits = sorted_logits.masked_fill(remove, float("-inf"))
	            logits = logits.scatter(-1, sorted_idx, sorted_logits)

	        return logits.softmax(dim=-1).multinomial(1)

	    @torch.no_grad()
	    def generate(
	        self,
	        input_ids: torch.Tensor,
	        max_new_tokens: int = 256,
	        temperature: float = 1.0,
	        top_k: int = 50,
	        top_p: float = 0.9,
	        eos_token_id: Optional[int] = None,
	    ) -> torch.Tensor:
	        """Autoregressive generation with persistent HRR memory.

	        The prompt is processed once. The first new token is sampled from the
	        prompt's final-position logits, and every later step feeds only the
	        newest token together with the carried memories and its absolute
	        position. (Re-feeding the last prompt token would bind it into the
	        memory a second time.)

	        Args:
	            input_ids: (B, T) prompt, T >= 1.
	            max_new_tokens: upper bound on generated tokens; <= 0 returns
	                the prompt unchanged.
	            temperature: logits are divided by ``max(temperature, 1e-8)``.
	            top_k: keep the k largest logits (0 disables).
	            top_p: nucleus threshold (values outside (0, 1) disable).
	            eos_token_id: stop once every row emits this id in the same step.

	        Returns:
	            (B, T + n) tensor: the prompt followed by the generated tokens.

	        Raises:
	            ValueError: if the prompt is empty.
	        """
	        if input_ids.size(1) == 0:
	            raise ValueError("generate() needs at least one prompt token")

	        generated = input_ids
	        if max_new_tokens <= 0:
	            return generated

	        # Prefill: positions 0..T-1, empty memories.
	        out = self.forward(input_ids)
	        hrr_memories = out["hrr_memories"]
	        logits = out["logits"][:, -1, :]  # (B, V)

	        for step in range(max_new_tokens):
	            next_token = self._sample_next(logits, temperature, top_k, top_p)
	            generated = torch.cat([generated, next_token], dim=-1)

	            if eos_token_id is not None and (next_token == eos_token_id).all():
	                break
	            if step + 1 == max_new_tokens:
	                break  # nothing left to sample; skip the extra forward pass

	            out = self.forward(
	                next_token,
	                hrr_memories,
	                position_offset=generated.size(1) - 1,
	            )
	            hrr_memories = out["hrr_memories"]
	            logits = out["logits"][:, -1, :]

	        return generated


	# ---------------------------------------------------------------------------
	# Parameter count helper
	# ---------------------------------------------------------------------------

	def count_params(model: nn.Module) -> str:
	    """Summarize a module's parameter counts as ``total=…, trainable=…``.

	    Counts are rendered with a B, M or K suffix (two decimals for B and M,
	    one for K; anything under a million is shown in thousands).
	    """

	    def _human(count: int) -> str:
	        for threshold, suffix in ((1_000_000_000, "B"), (1_000_000, "M")):
	            if count >= threshold:
	                return f"{count / threshold:.2f}{suffix}"
	        return f"{count / 1_000:.1f}K"

	    n_total = 0
	    n_trainable = 0
	    for param in model.parameters():
	        size = param.numel()
	        n_total += size
	        if param.requires_grad:
	            n_trainable += size

	    return f"total={_human(n_total)}, trainable={_human(n_trainable)}"

Xet Storage Details

Size:: 22.2 kB
Xet hash:: 94eeb390c07af3f619b5dcf08d8e8997ac8933a0c79d60c59b7cde46c5b2c84e

Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.