Buckets:
| """Φ-Mind — LLM architecture derived from pure physics equations. | |
| Five core components replacing standard Transformer primitives: | |
| 1. RenyiNorm — Rényi entropy normalization (replaces LayerNorm) | |
| 2. SolitonPE — KdV soliton position encoding (replaces RoPE) | |
| 3. HRRAttention — Holographic Reduced Representation (replaces KV-cache) | |
| 4. Phi4Dynamics — Φ⁴ field equation (replaces FFN) | |
| 5. RGScaleMixing — Renormalization Group blocking (replaces Multi-Head cross-layer) | |
| """ | |
| from __future__ import annotations | |
| import math | |
| from dataclasses import dataclass, field | |
| from typing import Optional | |
| import torch | |
| import torch.nn as nn | |
| import torch.nn.functional as F | |
| # --------------------------------------------------------------------------- | |
| # Config | |
| # --------------------------------------------------------------------------- | |
@dataclass
class PhiMindConfig:
    """Hyper-parameters for a Φ-Mind model.

    Declared as a dataclass so that individual fields can be overridden by
    keyword, e.g. ``PhiMindConfig(dim=256, n_layers=8)``. Without the
    decorator the class has no generated ``__init__`` and such calls raise
    ``TypeError``.
    """

    vocab_size: int = 32_000
    dim: int = 512
    n_layers: int = 12
    max_seq_len: int = 4096
    # Φ⁴ field params
    phi4_epsilon: float = 0.1      # discretization step ε
    phi4_mass_sq: float = -0.5     # m² < 0 → spontaneous symmetry breaking
    phi4_lambda: float = 1.0       # λ > 0 → bounded nonlinearity
    # HRR
    hrr_decay: float = 0.95        # memory decay α
    hrr_local_window: int = 64     # local token window
    # Rényi norm
    renyi_alpha_init: float = 1.5  # initial α (learned per layer)
    # Soliton PE
    soliton_n_modes: int = 32      # number of soliton modes (capped at dim // 2)
    # RG scale mixing
    rg_eta: float = 0.1            # cross-scale coupling strength
    rg_period: int = 4             # apply RG every N layers
    # Training
    dropout: float = 0.0
    tie_embeddings: bool = True
def phimind_tiny() -> PhiMindConfig:
    """Return the tiny preset: dim 256, 8 layers, 2048-token context."""
    overrides = {"dim": 256, "n_layers": 8, "max_seq_len": 2048}
    return PhiMindConfig(**overrides)
def phimind_small() -> PhiMindConfig:
    """Return the small preset: dim 512, 12 layers, 4096-token context."""
    overrides = {"dim": 512, "n_layers": 12, "max_seq_len": 4096}
    return PhiMindConfig(**overrides)
def phimind_base() -> PhiMindConfig:
    """Return the base preset: dim 1024, 16 layers, 8192-token context."""
    overrides = {"dim": 1024, "n_layers": 16, "max_seq_len": 8192}
    return PhiMindConfig(**overrides)
| # --------------------------------------------------------------------------- | |
| # 1. Rényi Entropy Normalization | |
| # --------------------------------------------------------------------------- | |
class RenyiNorm(nn.Module):
    """Normalize the last dimension by a learned α-norm.

        RN_α(x) = scale · x / (||x||_α + ε),   ||x||_α = (Σ|xᵢ|^α)^{1/α}

    α is a single learned scalar per module, parameterised as
    α = 1 + σ(γ), so it always lies strictly between 1 and 2. As α → 2 the
    denominator approaches the Euclidean norm of the feature vector.

    Args:
        dim: size of the normalized (last) dimension; sets the length of the
            learned per-channel ``scale``.
        alpha_init: initial value of α, must satisfy 1 < alpha_init <= 2.
        eps: added to the norm to avoid division by zero.

    Raises:
        ValueError: if ``alpha_init`` is outside (1, 2].
    """

    def __init__(self, dim: int, alpha_init: float = 1.5, eps: float = 1e-6):
        super().__init__()
        if not 1.0 < alpha_init <= 2.0:
            raise ValueError(f"alpha_init must lie in (1, 2], got {alpha_init}")
        # γ = logit(alpha_init - 1) so that 1 + σ(γ) ≈ alpha_init.
        # The 1e-8 keeps the logit finite when alpha_init == 2.
        init_gamma = math.log(alpha_init - 1.0) - math.log(2.0 - alpha_init + 1e-8)
        self.gamma = nn.Parameter(torch.tensor(init_gamma))
        self.scale = nn.Parameter(torch.ones(dim))
        self.eps = eps

    @property
    def alpha(self) -> torch.Tensor:
        """Current exponent α = 1 + σ(γ) as a 0-dim tensor."""
        # Must be a property: forward() reads ``self.alpha`` as a value.
        return 1.0 + torch.sigmoid(self.gamma)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Return ``scale * x / (||x||_α + eps)`` computed over the last dim."""
        alpha = self.alpha
        norm = x.abs().pow(alpha).sum(dim=-1, keepdim=True).pow(1.0 / alpha)
        return self.scale * x / (norm + self.eps)
| # --------------------------------------------------------------------------- | |
| # 2. Soliton Position Encoding (KdV) | |
| # --------------------------------------------------------------------------- | |
class SolitonPositionEncoding(nn.Module):
    """Fixed (non-learned) additive position encoding built from sech² bumps.

        PE(p, 2k)   = Aₖ · sech²(σₖ · (p − μₖ))
        PE(p, 2k+1) = Aₖ · sech²(σₖ · (p − μₖ)) · tanh(σₖ · (p − μₖ))

    Only the first ``2 * n_modes`` channels are populated; the remaining
    channels of the table are zero.

    Args:
        dim: embedding width D.
        max_seq_len: used to place the centres μₖ (uniformly on
            [0, max_seq_len - 1]) and to set the upper end of the σₖ range.
        n_modes: number of (even, odd) channel pairs; ``None`` or 0 means
            ``dim // 2``. Always capped at ``dim // 2``.
    """

    def __init__(self, dim: int, max_seq_len: int, n_modes: int | None = None):
        super().__init__()
        n_modes = min(n_modes or (dim // 2), dim // 2)
        # μₖ: centres spread uniformly across the sequence.
        mu = torch.linspace(0, max_seq_len - 1, n_modes)
        # σₖ: log-spaced multipliers of (p − μₖ), from 0.5 to max_seq_len / 4.
        sigma = torch.exp(
            torch.linspace(math.log(0.5), math.log(max_seq_len / 4), n_modes)
        )
        amplitude = 1.0 / (sigma + 1.0).sqrt()
        self.register_buffer("mu", mu)
        self.register_buffer("sigma", sigma)
        self.register_buffer("amplitude", amplitude)
        self.dim = dim
        self.n_modes = n_modes
        # Longest table built so far, per device. Rows depend only on the
        # position index, so shorter requests are served by slicing.
        self._cache: dict[torch.device, torch.Tensor] = {}

    def _build(self, seq_len: int, device: torch.device) -> torch.Tensor:
        """Compute the (seq_len, dim) float32 table for positions 0..seq_len-1."""
        pos = torch.arange(seq_len, dtype=torch.float32, device=device)  # (T,)
        mu = self.mu.to(device)                                          # (M,)
        sigma = self.sigma.to(device)
        amp = self.amplitude.to(device)
        z = sigma.unsqueeze(0) * (pos.unsqueeze(1) - mu.unsqueeze(0))    # (T, M)
        # For large |z| cosh overflows to inf, which makes sech² exactly 0.
        sech2 = (1.0 / z.cosh()) ** 2
        even = amp * sech2
        odd = even * z.tanh()
        pe = torch.zeros(seq_len, self.dim, device=device)
        width = 2 * self.n_modes
        pe[:, 0:width:2] = even   # channels 0, 2, 4, ...
        pe[:, 1:width:2] = odd    # channels 1, 3, 5, ...
        return pe

    def _table(self, seq_len: int, device: torch.device) -> torch.Tensor:
        """Return the first ``seq_len`` rows of the cached table, growing it if needed."""
        table = self._cache.get(device)
        if table is None or table.size(0) < seq_len:
            table = self._build(seq_len, device)
            self._cache[device] = table
        return table[:seq_len]

    def forward(self, x: torch.Tensor, offset: int = 0) -> torch.Tensor:
        """Add the encoding for positions ``offset .. offset + T - 1`` to ``x``.

        Args:
            x: tensor whose dim 1 is the sequence axis and whose last dim is
               ``dim`` (the table is broadcast over dim 0).
            offset: absolute position of the first token in ``x``; the
               default 0 reproduces the previous behaviour.

        Returns:
            Tensor with the same shape and dtype as ``x``.
        """
        seq_len = x.size(1)
        pe = self._table(offset + seq_len, x.device)[offset:]
        # Cast so half/bfloat16 activations are not promoted to float32.
        return x + pe.unsqueeze(0).to(x.dtype)
| # --------------------------------------------------------------------------- | |
| # 3. HRR Attention — Holographic Reduced Representation | |
| # --------------------------------------------------------------------------- | |
class HRRAttention(nn.Module):
    """Holographic (HRR) memory read/write plus local causal attention.

    Encoding:  bindings kᵢ ⊛ vᵢ (circular convolution via FFT) are averaged
               over the tokens of the call and blended into a running
               memory vector with decay α.
    Retrieval: q# ⊛ C (circular correlation) against the memory *as it was
               before this call*, so retrieval never sees the current chunk.
    Local:     softmax attention where position t attends to positions s
               with 0 <= t - s < local_window. This branch materialises a
               full (T, T) score matrix.

    The memory is one vector of size ``dim`` per batch row, independent of
    how many tokens have been written into it.

    Args:
        dim: model width D.
        decay: α in C_new = α·C_old + (1 − α)·mean_t(k_t ⊛ v_t).
        local_window: width of the causal window; must be >= 1, otherwise
            every score is masked and the softmax yields NaN.
        dropout: dropout probability applied to the local attention weights.
    """

    def __init__(self, dim: int, decay: float = 0.95, local_window: int = 64,
                 dropout: float = 0.0):
        super().__init__()
        self.dim = dim
        self.decay = decay
        self.local_window = local_window
        self.q_proj = nn.Linear(dim, dim, bias=False)
        self.k_proj = nn.Linear(dim, dim, bias=False)
        self.v_proj = nn.Linear(dim, dim, bias=False)
        self.out_proj = nn.Linear(dim, dim, bias=False)
        self.drop = nn.Dropout(dropout)
        # Scale for local window scores
        self.local_scale = dim ** -0.5

    # Both helpers take no ``self``; they must be static because forward()
    # calls them as ``self._circ_conv(k, v)`` / ``self._circ_corr(q, c)``.
    @staticmethod
    def _circ_conv(a: torch.Tensor, b: torch.Tensor) -> torch.Tensor:
        """Circular convolution a ⊛ b over the last dim, computed via FFT.

        Inputs are cast to float32 for the FFT and the result is cast back
        to the dtype of ``a``.
        """
        orig_dtype = a.dtype
        fa = torch.fft.rfft(a.float(), dim=-1)
        fb = torch.fft.rfft(b.float(), dim=-1)
        return torch.fft.irfft(fa * fb, n=a.shape[-1], dim=-1).to(orig_dtype)

    @staticmethod
    def _circ_corr(q: torch.Tensor, c: torch.Tensor) -> torch.Tensor:
        """Circular correlation of q with c (conjugate of q in frequency space).

        ``c`` may have size-1 dimensions that broadcast against ``q``.
        """
        orig_dtype = q.dtype
        fq = torch.fft.rfft(q.float(), dim=-1)
        fc = torch.fft.rfft(c.float(), dim=-1)
        return torch.fft.irfft(fq.conj() * fc, n=q.shape[-1], dim=-1).to(orig_dtype)

    def forward(
        self,
        x: torch.Tensor,
        hrr_memory: Optional[torch.Tensor] = None,
    ) -> tuple[torch.Tensor, torch.Tensor]:
        """
        x: (B, T, D)
        hrr_memory: (B, D) — running holographic memory, None on first call
        Returns: (output (B,T,D), updated hrr_memory (B,D))
        """
        B, T, D = x.shape
        q = self.q_proj(x)  # (B, T, D)
        k = self.k_proj(x)
        v = self.v_proj(x)

        # --- Holographic memory retrieval (uses the memory from before this call) ---
        if hrr_memory is None:
            hrr_memory = torch.zeros(B, D, device=x.device, dtype=x.dtype)
        # (B, 1, D) broadcasts over T inside the FFT product, so the memory
        # spectrum is computed once per batch row instead of once per token.
        hrr_out = self._circ_corr(q, hrr_memory.unsqueeze(1))  # (B, T, D)

        # --- Update memory: C_new = α·C_old + (1-α)·mean_t(k_t ⊛ v_t) ---
        delta = self._circ_conv(k, v).mean(dim=1)  # (B, D)
        hrr_memory = self.decay * hrr_memory + (1.0 - self.decay) * delta

        # --- Local window causal attention ---
        # Position t attends to positions s where s <= t AND t - s < local_window.
        t_idx = torch.arange(T, device=x.device)
        diff = t_idx.unsqueeze(1) - t_idx.unsqueeze(0)  # diff[t, s] = t - s
        local_causal = (diff >= 0) & (diff < self.local_window)  # (T, T)
        scores = torch.einsum("btd,bsd->bts", q, k) * self.local_scale  # (B, T, T)
        scores = scores.masked_fill(~local_causal.unsqueeze(0), float("-inf"))
        attn = self.drop(scores.softmax(dim=-1))
        local_out = torch.einsum("bts,bsd->btd", attn, v)  # (B, T, D)

        # Combine holographic (global) + local
        out = self.out_proj(hrr_out + local_out)
        return out, hrr_memory
| # --------------------------------------------------------------------------- | |
| # 4. Φ⁴ Field Dynamics (replaces FFN) | |
| # --------------------------------------------------------------------------- | |
class Phi4Dynamics(nn.Module):
    """Discretized Φ⁴ field equation used as the feed-forward sub-layer.

        Φ_next = 2Φ − Φ_prev + ε²[ΔΦ − m²Φ − (λ/6)Φ³ + J(Φ;θ)]

    where
      - 2Φ − Φ_prev is the inertia term,
      - ΔΦ is the discrete Laplacian across the hidden dimension with
        circular boundary, Δhᵢ = h_{i+1} − 2hᵢ + h_{i-1},
      - m², λ are the fixed constants ``mass_sq`` and ``lam``,
      - J = tanh(W₂ · silu(W₁Φ + b₁)) is a learned source bounded to (−1, 1).

    The stored ``lap_kernel`` buffer holds [-1, 2, -1], i.e. −Δ. It is kept
    unchanged so existing state_dicts still load; ``_laplacian`` negates the
    convolution so that +Δ enters the update, as in the equation above.

    Args:
        dim: hidden width D.
        epsilon: step ε (the update uses ε²).
        mass_sq: m².
        lam: λ.
    """

    def __init__(self, dim: int, epsilon: float = 0.1,
                 mass_sq: float = -0.5, lam: float = 1.0):
        super().__init__()
        self.epsilon = epsilon
        self.mass_sq = mass_sq
        self.lam = lam
        # Learned source current J(Φ; θ) = tanh(W₂ · silu(W₁Φ + b₁)).
        hidden = dim * 2
        self.j_in = nn.Linear(dim, hidden, bias=True)
        self.j_out = nn.Linear(hidden, dim, bias=False)
        nn.init.normal_(self.j_in.weight, std=0.01)
        nn.init.zeros_(self.j_in.bias)
        nn.init.normal_(self.j_out.weight, std=0.01)
        # Negated Laplacian stencil; only row 0 is used by _laplacian.
        lap_kernel = torch.tensor([-1.0, 2.0, -1.0]).view(1, 1, 3).expand(dim, 1, 3)
        self.register_buffer("lap_kernel", lap_kernel.clone())

    def _laplacian(self, phi: torch.Tensor) -> torch.Tensor:
        """Return Δφ across the hidden dim with circular boundary.

        phi: (B, T, D). The result has the same shape and dtype.
        """
        B, T, D = phi.shape
        # Treat (B*T) as batch and D as a 1-D spatial axis with one channel.
        x = phi.reshape(B * T, 1, D)
        x = F.pad(x, (1, 1), mode="circular")
        # Match the input dtype so non-float32 inputs do not hit a conv1d
        # dtype mismatch.
        kernel = self.lap_kernel[:1].to(x.dtype)
        neg_lap = F.conv1d(x, kernel)  # buffer encodes −Δ
        return (-neg_lap).squeeze(1).reshape(B, T, D)

    def forward(
        self,
        phi_curr: torch.Tensor,
        phi_prev: Optional[torch.Tensor] = None,
    ) -> tuple[torch.Tensor, torch.Tensor]:
        """
        phi_curr: (B, T, D) — field at the current layer
        phi_prev: (B, T, D) — field at the previous layer; None means
                  ``phi_curr.detach()`` is used, so no gradient flows
                  through the −Φ_prev term
        Returns: (phi_next (B,T,D), phi_curr unchanged, for use as the next phi_prev)
        """
        if phi_prev is None:
            phi_prev = phi_curr.detach()
        delta_phi = self._laplacian(phi_curr)
        # Source current J, bounded by tanh.
        j = torch.tanh(self.j_out(F.silu(self.j_in(phi_curr))))
        rhs = (delta_phi
               - self.mass_sq * phi_curr
               - (self.lam / 6.0) * phi_curr ** 3
               + j)
        phi_next = 2.0 * phi_curr - phi_prev + (self.epsilon ** 2) * rhs
        return phi_next, phi_curr
| # --------------------------------------------------------------------------- | |
| # 5. Renormalization Group Scale Mixing | |
| # --------------------------------------------------------------------------- | |
class RGScaleMixing(nn.Module):
    """Gaussian blocking over the sequence plus gated cross-layer mixing.

        out = B[x_curr] + η · σ(gate([x_curr, x_past])) · x_past

    ``B`` is a depthwise Gaussian smoothing along the sequence axis. By
    default it is causal: the output at position t is a weighted average of
    positions t - radius .. t only. A symmetric kernel would mix positions
    t+1 .. t+radius into position t, leaking future tokens in an
    autoregressive model.

    Args:
        dim: model width D.
        seq_scale: Gaussian σ, in token positions.
        eta: cross-scale coupling strength η.
        causal: if True (default) use the one-sided kernel described above.
            ``False`` selects the symmetric kernel over t - r .. t + r with
            the radius clamped to the sequence length.
    """

    def __init__(self, dim: int, seq_scale: float = 4.0, eta: float = 0.1,
                 causal: bool = True):
        super().__init__()
        self.eta = eta
        self.seq_scale = seq_scale  # Gaussian σ in token positions
        self.causal = causal
        # Cross-scale gating
        self.gate = nn.Linear(dim * 2, dim, bias=True)

    def _gaussian_block(self, x: torch.Tensor, sigma: float) -> torch.Tensor:
        """Gaussian blur over the sequence dimension. x: (B, T, D).

        Returns ``x`` itself when ``sigma < 0.5``.
        """
        if sigma < 0.5:
            return x
        channels = x.shape[-1]
        xp = x.permute(0, 2, 1)  # (B, D, T) for a depthwise conv over T
        if self.causal:
            # The radius must not depend on T; otherwise the weights applied
            # at position t would change as the sequence grows.
            radius = max(1, int(3 * sigma))
            offsets = torch.arange(-radius, 1, dtype=x.dtype, device=x.device)
            padding = (radius, 0)
        else:
            radius = min(max(1, int(3 * sigma)), max(1, xp.shape[-1] - 1))
            offsets = torch.arange(-radius, radius + 1, dtype=x.dtype, device=x.device)
            padding = (radius, radius)
        kernel = torch.exp(-0.5 * (offsets / sigma) ** 2)
        kernel = kernel / kernel.sum()
        # Replicate padding repeats the edge token, so a constant sequence
        # stays constant after the blur.
        xp = F.pad(xp, padding, mode="replicate")
        weight = kernel.view(1, 1, -1).expand(channels, 1, -1)
        out = F.conv1d(xp, weight, groups=channels)  # (B, D, T)
        return out.permute(0, 2, 1)  # (B, T, D)

    def forward(
        self,
        x_curr: torch.Tensor,
        x_past: Optional[torch.Tensor] = None,
    ) -> torch.Tensor:
        """
        x_curr: (B, T, D)
        x_past: (B, T, D) features from an earlier layer (None → skip cross-scale)
        Returns: (B, T, D)
        """
        blocked = self._gaussian_block(x_curr, self.seq_scale)
        if x_past is None:
            return blocked
        combined = torch.cat([x_curr, x_past], dim=-1)
        cross = self.eta * torch.sigmoid(self.gate(combined)) * x_past
        return blocked + cross
| # --------------------------------------------------------------------------- | |
| # Full PhiMind Block | |
| # --------------------------------------------------------------------------- | |
class PhiMindBlock(nn.Module):
    """One Φ-Mind layer: HRR attention, then Φ⁴ dynamics, then optional RG mixing.

    An ``RGScaleMixing`` module is attached only when ``layer_idx`` is a
    positive multiple of ``cfg.rg_period``; otherwise ``self.rg`` is None.
    """

    def __init__(self, cfg: PhiMindConfig, layer_idx: int):
        super().__init__()
        self.layer_idx = layer_idx
        width = cfg.dim
        alpha0 = cfg.renyi_alpha_init
        self.norm1 = RenyiNorm(width, alpha0)
        self.norm2 = RenyiNorm(width, alpha0)
        self.hrr_attn = HRRAttention(
            width, cfg.hrr_decay, cfg.hrr_local_window, cfg.dropout
        )
        self.phi4 = Phi4Dynamics(
            width, cfg.phi4_epsilon, cfg.phi4_mass_sq, cfg.phi4_lambda
        )
        if layer_idx > 0 and layer_idx % cfg.rg_period == 0:
            self.rg: Optional[RGScaleMixing] = RGScaleMixing(width, eta=cfg.rg_eta)
        else:
            self.rg = None

    def forward(
        self,
        x: torch.Tensor,
        hrr_memory: Optional[torch.Tensor],
        phi_prev: Optional[torch.Tensor],
        x_distant: Optional[torch.Tensor],
    ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
        """Run the layer on ``x``.

        Args:
            x: layer input.
            hrr_memory: holographic memory handed to the attention module.
            phi_prev: previous field handed to the Φ⁴ module.
            x_distant: earlier-layer features handed to the RG module; only
                read when this block has one.

        Returns:
            ``(x_out, updated hrr_memory, phi_prev_out)`` where
            ``phi_prev_out`` is the second value returned by ``self.phi4``.
        """
        # Attention sub-layer with residual connection.
        attn_out, hrr_memory = self.hrr_attn(self.norm1(x), hrr_memory)
        x = x + attn_out

        # Φ⁴ sub-layer with residual connection.
        field = self.norm2(x)
        phi_next, phi_prev_out = self.phi4(field, phi_prev)
        x = x + phi_next

        # RG scale mixing replaces x outright on the layers that have it.
        if self.rg is None:
            return x, hrr_memory, phi_prev_out
        return self.rg(x, x_distant), hrr_memory, phi_prev_out
| # --------------------------------------------------------------------------- | |
| # Full PhiMind Language Model | |
| # --------------------------------------------------------------------------- | |
class PhiMindModel(nn.Module):
    """Φ-Mind autoregressive language model.

    Pipeline: token embedding → soliton position encoding → dropout →
    ``cfg.n_layers`` PhiMindBlocks → RenyiNorm → linear LM head. When
    ``cfg.tie_embeddings`` is set, the head shares its weight with the
    embedding.

    Component mapping relative to a standard Transformer:
        LayerNorm  → RenyiNorm
        RoPE       → SolitonPositionEncoding
        KV-cache   → HRR memory (one (B, D) vector per layer)
        FFN        → Φ⁴ dynamics
        Multi-Head → RG scale mixing
    """

    def __init__(self, cfg: PhiMindConfig):
        super().__init__()
        self.cfg = cfg
        self.embed = nn.Embedding(cfg.vocab_size, cfg.dim)
        self.soliton_pe = SolitonPositionEncoding(
            cfg.dim, cfg.max_seq_len, cfg.soliton_n_modes
        )
        self.drop = nn.Dropout(cfg.dropout)
        self.blocks = nn.ModuleList(
            [PhiMindBlock(cfg, i) for i in range(cfg.n_layers)]
        )
        self.norm_out = RenyiNorm(cfg.dim, cfg.renyi_alpha_init)
        self.lm_head = nn.Linear(cfg.dim, cfg.vocab_size, bias=False)
        if cfg.tie_embeddings:
            self.lm_head.weight = self.embed.weight
        self._init_weights()

    def _init_weights(self):
        """Initialise every Linear and Embedding weight with N(0, 0.02²).

        NOTE(review): this walks all sub-modules, so it also overwrites the
        std=0.01 initialisation that Phi4Dynamics applies to its own
        j_in / j_out layers. Confirm whether that override is intended.
        """
        std = 0.02
        for m in self.modules():
            if isinstance(m, nn.Linear):
                nn.init.normal_(m.weight, std=std)
                if m.bias is not None:
                    nn.init.zeros_(m.bias)
            elif isinstance(m, nn.Embedding):
                nn.init.normal_(m.weight, std=std)

    def forward(
        self,
        input_ids: torch.Tensor,
        hrr_memories: Optional[list[Optional[torch.Tensor]]] = None,
    ) -> dict:
        """
        input_ids: (B, T)
        hrr_memories: optional list with one entry per layer (None entries
            start that layer from an empty memory)
        Returns dict with:
            logits: (B, T, V)
            hrr_memories: list of (B, D) tensors, one per layer
        """
        x = self.embed(input_ids)  # (B, T, D)
        x = self.soliton_pe(x)
        x = self.drop(x)
        if hrr_memories is None:
            hrr_memories = [None] * self.cfg.n_layers
        new_hrr: list[Optional[torch.Tensor]] = []
        layer_outputs: list[torch.Tensor] = []
        rg_period = self.cfg.rg_period
        for i, block in enumerate(self.blocks):
            # Output of the layer rg_period steps back, for RG mixing.
            x_distant = (
                layer_outputs[i - rg_period]
                if (block.rg is not None and i >= rg_period)
                else None
            )
            # NOTE(review): every block receives phi_prev=None, exactly as
            # before this rewrite. The old per-layer list was read at index i
            # before index i was ever written, so it was always None and
            # Phi4Dynamics always fell back to phi_curr.detach(). Threading
            # the previous layer's field through would change the model's
            # numerics, so it is deliberately left as-is. Confirm intent.
            x, mem, _ = block(x, hrr_memories[i], None, x_distant)
            new_hrr.append(mem)
            layer_outputs.append(x)
        x = self.norm_out(x)
        logits = self.lm_head(x)
        return {"logits": logits, "hrr_memories": new_hrr}

    @torch.no_grad()
    def generate(
        self,
        input_ids: torch.Tensor,
        max_new_tokens: int = 256,
        temperature: float = 1.0,
        top_k: int = 50,
        top_p: float = 0.9,
        eos_token_id: Optional[int] = None,
    ) -> torch.Tensor:
        """Sample up to ``max_new_tokens`` tokens after ``input_ids``.

        The first step runs the whole prompt and samples from the logits of
        its last position. Every later step feeds only the newly sampled
        token together with the HRR memories returned by the previous step.
        Each token is therefore processed exactly once.

        Args:
            input_ids: (B, T) prompt token ids.
            max_new_tokens: upper bound on generated tokens.
            temperature: logits are divided by ``max(temperature, 1e-8)``
                unless it equals 1.0.
            top_k: keep only the k largest logits (0 disables).
            top_p: nucleus threshold; applied only when 0 < top_p < 1.
            eos_token_id: stop once *every* batch row sampled this id in the
                same step.

        Returns:
            (B, T + n) tensor: the prompt followed by the sampled tokens.

        NOTE(review): on incremental steps the forward pass sees T == 1, so
        the position encoding is evaluated at position 0 and the local
        attention window contains only the new token. Earlier context
        reaches the model solely through the HRR memories.
        """
        hrr_memories: list[Optional[torch.Tensor]] = [None] * self.cfg.n_layers
        generated = input_ids
        step_input = input_ids  # full prompt first, then one token at a time
        for _ in range(max_new_tokens):
            out = self(step_input, hrr_memories)
            hrr_memories = out["hrr_memories"]
            logits = out["logits"][:, -1, :]  # (B, V)
            if temperature != 1.0:
                logits = logits / max(temperature, 1e-8)
            # Top-k
            if top_k > 0:
                v, _ = logits.topk(min(top_k, logits.size(-1)))
                logits[logits < v[:, -1:]] = float("-inf")
            # Top-p (nucleus): drop tokens whose preceding cumulative mass
            # already exceeds top_p; the most likely token is always kept.
            if 0.0 < top_p < 1.0:
                sorted_logits, sorted_idx = logits.sort(dim=-1, descending=True)
                sorted_probs = sorted_logits.softmax(dim=-1)
                remove = sorted_probs.cumsum(dim=-1) - sorted_probs > top_p
                sorted_logits[remove] = float("-inf")
                logits.scatter_(-1, sorted_idx, sorted_logits)
            next_token = logits.softmax(dim=-1).multinomial(1)
            generated = torch.cat([generated, next_token], dim=-1)
            if eos_token_id is not None and (next_token == eos_token_id).all():
                break
            step_input = next_token
        return generated
| # --------------------------------------------------------------------------- | |
| # Parameter count helper | |
| # --------------------------------------------------------------------------- | |
def count_params(model: nn.Module) -> str:
    """Summarise the parameter count of ``model`` as a short string.

    Returns ``"total=<n>, trainable=<n>"`` where each count is rendered with
    a B / M / K suffix; "trainable" counts parameters with
    ``requires_grad`` set.
    """
    total = 0
    trainable = 0
    for param in model.parameters():
        size = param.numel()
        total += size
        if param.requires_grad:
            trainable += size

    # (threshold, divisor, decimals, suffix), checked from largest to smallest.
    units = (
        (1_000_000_000, 1e9, 2, "B"),
        (1_000_000, 1e6, 2, "M"),
    )

    def fmt(n: int) -> str:
        for threshold, divisor, decimals, suffix in units:
            if n >= threshold:
                return f"{n / divisor:.{decimals}f}{suffix}"
        # Everything below one million is shown in thousands.
        return f"{n / 1e3:.1f}K"

    return f"total={fmt(total)}, trainable={fmt(trainable)}"
Xet Storage Details
- Size:
- 22.2 kB
- Xet hash:
- 94eeb390c07af3f619b5dcf08d8e8997ac8933a0c79d60c59b7cde46c5b2c84e
·
Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.