KitsuVp
/

NeoLLM

@@ -783,41 +783,9 @@ class LeviathanGenerator(nn.Module):
     matches the authors' ``1 + wd_i`` parameterization so phi ≈ 1.0 at init
     and the product of d_seed factors starts near 1.0 instead of ~10^{-21}.
-    Compile-stability note: the main KHRONOS product is evaluated by chunks
-    over the seed dimension.  This preserves the exact separable product but
-    avoids materializing the full [N, d_seed, krank] tensor that triggers
-    very large Inductor/Triton BMM graphs at batch_size × seq_len = 32768.
     FP8 note: Leviathan deliberately stores the shared JTok-M seed projection
     as raw Parameters rather than nn.Linear.  This keeps the generator outside
     TorchAO Float8Linear conversion even if an external FP8 filter is too broad.
-    **Frequency-based codebook ordering (optional)**
-    By default, the base-k decomposition maps token indices directly to
-    codebook coordinates via arithmetic: token x → (x // b², x // b % b, x % b).
-    This assigns coordinates based on index position, which is arbitrary with
-    respect to linguistic meaning under BPE tokenisation.
-    When ``set_freq_order`` is called with a frequency-rank tensor, the
-    decomposition maps tokens through their frequency rank first:
-    token x → rank_freq[x] → (rank // b², rank // b % b, rank % b).
-    This makes tokens with similar corpus frequency share codebook entries,
-    introducing pre-existing statistical structure into the gradient of W_res
-    from step 0. Since token frequency correlates with distributional behaviour
-    (Zipfian distribution, syntactic category, semantic class), the gradient
-        ∂L/∂W_res = Σ_x δ_x · z̃_x^T
-    has low-rank structure immediately exploitable by Conda's SVD projection,
-    analogous to how the dense embedding table E gradient has low-rank structure
-    from the language distribution. Without this ordering, the SVD finds only
-    noise until codebooks organise through training, delaying Conda's advantage.
-    If ``set_freq_order`` is never called, ``freq_order`` remains None and the
-    module behaves identically to the original implementation — the feature is
-    fully opt-in and backward compatible.
     """
     def __init__(self, config: NeoLLMConfig):
@@ -842,21 +810,11 @@ class LeviathanGenerator(nn.Module):
         self.spline_degree = spline_degree
         self.krank         = krank
         self.hidden_size   = hidden_size
-        # Chunk size over d_seed used by the KHRONOS log-product.  The default
-        # 16 keeps the largest per-head intermediate at [N, 16, krank] instead
-        # of [N, 128, krank] while preserving the exact product algebra.
-        self.khronos_chunk_size = int(getattr(config, "generator_khronos_chunk_size", 16))
-        self.khronos_chunk_size = max(1, min(self.khronos_chunk_size, d_seed))
         # ── Stage 1: shared codebook lookup ──────────────────────────────
         # Produces z [N, d_seed] — the raw seed before any per-head
         # preprocessing. This is the only shared computation across heads.
         self.codebooks = nn.Parameter(torch.empty(k, b, d_seed))
-        # Frequency-based codebook ordering (opt-in via set_freq_order).
-        # Non-persistent: not saved to checkpoints, must be set at load time.
-        self.register_buffer("freq_order", None, persistent=False)
         # Shared knot grid — fixed, not learned.
         # Used by both the generator heads and the JTok-M shared path.
         self.register_buffer(
@@ -927,48 +885,14 @@ class LeviathanGenerator(nn.Module):
             torch.empty(num_modes, krank, hidden_size)
         )
-    def set_freq_order(self, freq_order: torch.Tensor) -> None:
-        """
-        Register a frequency-rank mapping to structure codebook coordinates.
-        Must be called after model instantiation and after any device transfer
-        (.to(device), .cuda(), etc.) since the buffer is non-persistent and
-        is not saved to checkpoints.
-        Args:
-            freq_order: Long tensor of shape ``(vocab_size,)`` where
-                ``freq_order[x]`` is the frequency rank of token x in the
-                training corpus (rank 0 = most frequent token). Typically
-                computed as ``torch.argsort(token_counts, descending=True)``.
-        Example::
-            counts = compute_token_frequencies(tokenizer, dataset)  # [V]
-            ranks  = torch.argsort(counts, descending=True)         # [V]
-            model.model.token_generator.set_freq_order(ranks)
-        """
-        if freq_order.shape[0] != self.codebooks.shape[1] ** self.k:
-            # Soft warning: shape mismatch may indicate wrong vocab size.
-            # Not a hard error since vocab_size in config may be padded.
-            pass
-        self.freq_order = freq_order.long().to(self.codebooks.device)
     def _base_k_decompose(self, token_ids: torch.Tensor) -> torch.Tensor:
         """
         Deterministic base-b decomposition: i → (i_0, ..., i_{k-1}).
-        When ``freq_order`` is set, token indices are remapped through their
-        frequency rank before decomposition. This ensures that tokens sharing
-        codebook entries are similar in corpus frequency rather than arbitrary
-        in BPE index space, providing pre-existing low-rank gradient structure
-        for Conda's SVD projection from step 0.
-        Without ``freq_order``: x → (x // b^{k-1}, ..., x % b)
-        With    ``freq_order``: x → freq_order[x] → (rank // b^{k-1}, ..., rank % b)
         """
         ids = token_ids.long().clone()
-        if self.freq_order is not None:
-            ids = self.freq_order[ids]
         coords = torch.empty(
             *token_ids.shape, self.k,
@@ -1058,26 +982,20 @@ class LeviathanGenerator(nn.Module):
         m: int,
     ) -> torch.Tensor:
         """
-        Forward completo para el cabezal m del generator sin materializar
-        ``per_dim`` completo.
-        Matemática preservada:
             phi[n, d, k] = Σ_g B[n, d, g] · (1 + wd[m, d, g, k])
             modes[n, k]  = Π_d phi[n, d, k]
             out[n, :]    = modes[n, :] @ W_out[m]
-        La implementación acumula el producto en log-space por chunks de la
-        dimensión ``d_seed``:
-            log|Π_d phi_d| = Σ_chunks Σ_{d∈chunk} log|phi_d|
-        Esto evita el tensor gigante [N, d_seed, krank].  Con N=32768,
-        d_seed=128 y krank=64, ese tensor tendría 268,435,456 elementos.  Con
-        chunk=16, el mayor tensor equivalente baja a [N, 16, krank], una octava
-        parte, sin cambiar la fórmula del artículo.
         """
-        d   = self.d_seed
-        kr  = self.krank
-        csz = self.khronos_chunk_size
         # ── Proyección lineal para el cabezal m ──────────────────────────
         proj_w = self.head_proj_weight[m * d : (m + 1) * d]       # [d_seed, d_seed]
@@ -1098,45 +1016,34 @@ class LeviathanGenerator(nn.Module):
         # ── Sigmoid(x/2) → coordenada latente en [0,1]^d_seed ────────────
         zh = torch.sigmoid(zh / 2.0).clamp(0.0, 1.0)              # [N, d_seed]
-        # ── KHRONOS chunked log-product ─────────────────────────────────
-        # Accumulators have only [N, krank], never [N, d_seed, krank].
-        log_mag_acc = torch.zeros(zh.shape[0], kr, device=zh.device, dtype=torch.float32)
-        neg_count_acc = torch.zeros(zh.shape[0], kr, device=zh.device, dtype=torch.int32)
         grid = self.knot_grid.float().view(1, 1, -1)              # [1, 1, n_knots]
-        for start in range(0, d, csz):
-            stop = min(start + csz, d)
-            # B-spline only for this seed-dimension chunk.
-            zh_c = zh[:, start:stop]                              # [N, c]
-            sc_c = self.head_scale[m, start:stop].float().view(1, -1, 1)
-            dist = (zh_c.unsqueeze(-1) - grid).abs() * sc_c        # [N, c, n_knots]
-            B_c = torch.where(
-                dist < 0.5,
-                0.75 - dist ** 2,
-                torch.where(dist < 1.5, 0.5 * (1.5 - dist) ** 2, torch.zeros_like(dist)),
-            )                                                      # [N, c, n_knots]
-            B_c = self._normalize_bspline_basis(B_c)
-            # phi_c[n, c, k] = Σ_g B_c[n, c, g] * (1 + wd[m, c, g, k])
-            effective_spline_c = 1.0 + self.head_spline_delta[m, start:stop].float()
-            phi_c = torch.einsum(
-                "ncg,cgk->nck",
-                B_c,
-                effective_spline_c,
-            )                                                      # [N, c, krank]
-            log_mag_acc = log_mag_acc + torch.log(phi_c.abs() + 1e-9).sum(dim=1)
-            neg_count_acc = neg_count_acc + (phi_c < 0).to(torch.int32).sum(dim=1)
-        prod_sign = 1.0 - 2.0 * (neg_count_acc % 2).float()       # [N, krank]
-        modes_m   = prod_sign * torch.exp(log_mag_acc)            # [N, krank]
         # ── Proyección de salida del cabezal ─────────────────────────────
         out_m = (
             modes_m.to(self.head_out_weight.dtype)
             @ self.head_out_weight[m]
-        )                                                          # [N, hidden_size]
         return out_m
     def _khronos_all_heads(
@@ -1277,34 +1184,15 @@ class LeviathanGenerator(nn.Module):
                 analysis.z_tilde = z_tilde.detach()
                 analysis.B_vals  = B_vals.detach()
-        # ── Per-head generator path (secuencial, un cabezal a la vez) ──────
-        # ORIGINAL PROBLEM: el path vectorizado anterior procesaba los M
-        # cabezales en paralelo con kernels fusionados:
-        #
-        #   _bspline_basis_all_heads → [N, M, d_seed, n_knots]   ← TENSOR GIGANTE
-        #   _khronos_all_heads       → per_dim [N, M, d_seed, krank] ← AÚN MAYOR
-        #
-        # Con N=B*S=32768, M=8, d_seed=128, n_knots=32, krank=16:
-        #   [N,M,d_seed,n_knots] = 32768 × 8 × 128 × 32 × 4 bytes ≈ 512 MB
-        #   [N,M,d_seed,krank]   = 32768 × 8 × 128 × 16 × 4 bytes ≈ 256 MB
-        # Estos tensores viven simultáneamente en el pool de CUDAGraphs,
-        # causando OOM en el backward cuando se suman las activaciones guardadas
-        # de las 12 capas del decoder.
-        #
-        # SOLUCIÓN (equivalente a la impl. JAX de Reza):
-        #   Loop Python sobre M=8 cabezales (count fijo → TorchDynamo unrollea
-        #   en 8 secuencias de ops estáticas sin graph breaks).
-        #   Cada cabezal materializa como máximo [N, d_seed, krank] ≈ 32 MB.
-        #   La suma se acumula in-place → el tensor del cabezal anterior puede
-        #   ser liberado por el allocator antes de procesar el siguiente.
         #
-        # Por qué NO vmap(chunk_size=1):
-        #   vmap requiere que la función sea "pura" (sin acceso a self.*).
-        #   head_norm_eps, knot_grid y los parámetros indexados [m] se pasan
-        #   implícitamente a través del closure. Con vmap habría que
-        #   stack_module_state + functional_call, lo que añade overhead de
-        #   instrumentación sin beneficio real ya que el loop estático es
-        #   igualmente trazable por el compilador y produce el mismo grafo.
         target_dtype = self.codebooks.dtype
         e = torch.zeros(N, self.hidden_size, device=token_ids.device, dtype=target_dtype)

     matches the authors' ``1 + wd_i`` parameterization so phi ≈ 1.0 at init
     and the product of d_seed factors starts near 1.0 instead of ~10^{-21}.
     FP8 note: Leviathan deliberately stores the shared JTok-M seed projection
     as raw Parameters rather than nn.Linear.  This keeps the generator outside
     TorchAO Float8Linear conversion even if an external FP8 filter is too broad.
     """
     def __init__(self, config: NeoLLMConfig):
         self.spline_degree = spline_degree
         self.krank         = krank
         self.hidden_size   = hidden_size
         # ── Stage 1: shared codebook lookup ──────────────────────────────
         # Produces z [N, d_seed] — the raw seed before any per-head
         # preprocessing. This is the only shared computation across heads.
         self.codebooks = nn.Parameter(torch.empty(k, b, d_seed))
         # Shared knot grid — fixed, not learned.
         # Used by both the generator heads and the JTok-M shared path.
         self.register_buffer(
             torch.empty(num_modes, krank, hidden_size)
         )
     def _base_k_decompose(self, token_ids: torch.Tensor) -> torch.Tensor:
         """
         Deterministic base-b decomposition: i → (i_0, ..., i_{k-1}).
+        Maps token indices directly to codebook coordinates via arithmetic:
+        token x → (x // b^{k-1}, ..., x % b).
         """
         ids = token_ids.long().clone()
         coords = torch.empty(
             *token_ids.shape, self.k,
         m: int,
     ) -> torch.Tensor:
         """
+        Full forward pass for generator head m, without splitting the
+        ``d_seed`` dimension into chunks.
+        Math applied directly:
             phi[n, d, k] = Σ_g B[n, d, g] · (1 + wd[m, d, g, k])
             modes[n, k]  = Π_d phi[n, d, k]
             out[n, :]    = modes[n, :] @ W_out[m]
+        This version materializes the full ``phi`` tensor with shape
+        ``[N, d_seed, krank]`` for each head.  It is more direct and removes the
+        chunked handling of the KHRONOS product, at the cost of higher VRAM use.
         """
+        d  = self.d_seed
+        kr = self.krank
         # ── Proyección lineal para el cabezal m ──────────────────────────
         proj_w = self.head_proj_weight[m * d : (m + 1) * d]       # [d_seed, d_seed]
         # ── Sigmoid(x/2) → coordenada latente en [0,1]^d_seed ────────────
         zh = torch.sigmoid(zh / 2.0).clamp(0.0, 1.0)              # [N, d_seed]
+        # ── KHRONOS full log-product, no chunking ────────────────────────
         grid = self.knot_grid.float().view(1, 1, -1)              # [1, 1, n_knots]
+        sc   = self.head_scale[m].float().view(1, -1, 1)          # [1, d_seed, 1]
+        dist = (zh.unsqueeze(-1) - grid).abs() * sc               # [N, d_seed, n_knots]
+        B = torch.where(
+            dist < 0.5,
+            0.75 - dist ** 2,
+            torch.where(dist < 1.5, 0.5 * (1.5 - dist) ** 2, torch.zeros_like(dist)),
+        )                                                        # [N, d_seed, n_knots]
+        B = self._normalize_bspline_basis(B)
+        effective_spline = 1.0 + self.head_spline_delta[m].float()
+        phi = torch.einsum(
+            "ndg,dgk->ndk",
+            B,
+            effective_spline,
+        )                                                        # [N, d_seed, krank]
+        log_mag   = torch.log(phi.abs() + 1e-9).sum(dim=1)        # [N, krank]
+        num_neg   = (phi < 0).to(torch.int32).sum(dim=1)          # [N, krank]
+        prod_sign = 1.0 - 2.0 * (num_neg % 2).float()             # [N, krank]
+        modes_m   = prod_sign * torch.exp(log_mag)                # [N, krank]
         # ── Proyección de salida del cabezal ─────────────────────────────
         out_m = (
             modes_m.to(self.head_out_weight.dtype)
             @ self.head_out_weight[m]
+        )                                                        # [N, hidden_size]
         return out_m
     def _khronos_all_heads(
                 analysis.z_tilde = z_tilde.detach()
                 analysis.B_vals  = B_vals.detach()
+        # ── Per-head generator path, no chunking over d_seed ───────────────
+        # Each LEV head is evaluated in full:
+        #   B      [N, d_seed, n_knots]
+        #   phi    [N, d_seed, krank]
+        #   modes  [N, krank]
         #
+        # This version removes the chunked accumulation of the KHRONOS product.
+        # It keeps the per-head loop so the heads stay independent, but within
+        # each head it materializes the full-shape tensors.
         target_dtype = self.codebooks.dtype
         e = torch.zeros(N, self.hidden_size, device=token_ids.device, dtype=target_dtype)