ishanjmukherjee committed
Commit 43539ed · Parent(s): 8182ebe

Copy Python verbatim from vortex

Files changed (10)
  1. attention.py +999 -0
  2. cache.py +62 -0
  3. engine.py +597 -0
  4. generation.py +373 -0
  5. layers.py +272 -0
  6. model.py +937 -0
  7. positional_embeddings.py +114 -0
  8. sample.py +60 -0
  9. special_tokens_map.json +1 -0
  10. utils.py +251 -0
attention.py ADDED
@@ -0,0 +1,999 @@
+ import math
+ from functools import partial
+
+ import torch
+ import torch.nn as nn
+ from einops import rearrange, repeat
+
+ from .utils import get_dim_for_local_rank
+
+ # Not bothering with ops right now
+ # try:
+ #     from vortex.ops import (
+ #         local_flash_attn_kvpacked_func,
+ #         local_flash_attn_qkvpacked_func,
+ #         local_flash_attn_varlen_kvpacked_func,
+ #         local_flash_attn_varlen_qkvpacked_func,
+ #         local_flash_attn_with_kvcache,
+ #     )
+ # except ImportError:
+ #     local_flash_attn_varlen_qkvpacked_func, local_flash_attn_varlen_kvpacked_func = (
+ #         None,
+ #         None,
+ #     )
+ #     local_flash_attn_qkvpacked_func, local_flash_attn_kvpacked_func = None, None
+ #     local_flash_attn_with_kvcache = None
+
+ local_flash_attn_varlen_qkvpacked_func, local_flash_attn_varlen_kvpacked_func = (
+     None,
+     None,
+ )
+ local_flash_attn_qkvpacked_func, local_flash_attn_kvpacked_func = None, None
+ local_flash_attn_with_kvcache = None
+
+ FusedDense, ColumnParallelLinear, RowParallelLinear = None, None, None
+
+ from .rotary import RotaryEmbedding
+
+
+ # From https://github.com/ofirpress/attention_with_linear_biases/blob/4b92f28a005ead2567abe2359f633e73e08f3833/fairseq/models/transformer.py#L742
+ def get_alibi_slopes(nheads):
+     def get_slopes_power_of_2(nheads):
+         start = 2 ** (-(2 ** -(math.log2(nheads) - 3)))
+         ratio = start
+         return [start * ratio**i for i in range(nheads)]
+
+     if math.log2(nheads).is_integer():
+         return get_slopes_power_of_2(nheads)
+     else:
+         closest_power_of_2 = 2 ** math.floor(math.log2(nheads))
+         return (
+             get_slopes_power_of_2(closest_power_of_2)
+             + get_alibi_slopes(2 * closest_power_of_2)[0::2][: nheads - closest_power_of_2]
+         )
+
+
+ class FlashSelfAttention(nn.Module):
+     """Implement the scaled dot product attention with softmax.
+     Arguments
+     ---------
+         softmax_scale: The temperature to use for the softmax attention.
+                        (default: 1/sqrt(d_keys) where d_keys is computed at
+                        runtime)
+         attention_dropout: The dropout rate to apply to the attention
+                            (default: 0.0)
+     """
+
+     def __init__(
+         self,
+         layer_number,
+         causal=False,
+         softmax_scale=None,
+         attention_dropout=0.0,
+         window_size=(-1, -1),
+         alibi_slopes=None,
+         deterministic=False,
+     ):
+         super().__init__()
+         assert local_flash_attn_varlen_qkvpacked_func is not None, "FlashAttention is not installed"
+         assert local_flash_attn_qkvpacked_func is not None, "FlashAttention is not installed"
+         self.layer_number = layer_number
+         self.causal = causal
+         self.softmax_scale = softmax_scale
+         self.drop = nn.Dropout(attention_dropout)
+         self.register_buffer("alibi_slopes", alibi_slopes, persistent=False)
+         self.window_size = window_size
+         self.deterministic = deterministic
+
+     def forward(self, qkv, causal=None, cu_seqlens=None, max_seqlen=None):
+         """Implements the multihead softmax attention.
+         Arguments
+         ---------
+             qkv: The tensor containing the query, key, and value.
+                 If cu_seqlens is None and max_seqlen is None, then qkv has shape (B, S, 3, H, D).
+                 If cu_seqlens is not None and max_seqlen is not None, then qkv has shape
+                 (total, 3, H, D), where total is the sum of the sequence lengths in the batch.
+             causal: if passed, will override self.causal
+             cu_seqlens: (batch_size + 1,), dtype torch.int32. The cumulative sequence lengths
+                 of the sequences in the batch, used to index into qkv.
+             max_seqlen: int. Maximum sequence length in the batch.
+         Returns:
+         --------
+             out: (total, H, D) if cu_seqlens is not None and max_seqlen is not None,
+                 else (B, S, H, D).
+         """
+         assert qkv.dtype in [torch.float16, torch.bfloat16]
+         assert qkv.is_cuda
+
+         causal = self.causal if causal is None else causal
+         unpadded = cu_seqlens is not None
+         if self.alibi_slopes is not None:
+             self.alibi_slopes = self.alibi_slopes.to(torch.float32)
+         if unpadded:
+             assert cu_seqlens.dtype == torch.int32
+             assert max_seqlen is not None
+             assert isinstance(max_seqlen, int)
+             return local_flash_attn_varlen_qkvpacked_func(
+                 qkv,
+                 cu_seqlens,
+                 max_seqlen,
+                 self.drop.p if self.training else 0.0,
+                 softmax_scale=self.softmax_scale,
+                 causal=causal,
+                 alibi_slopes=self.alibi_slopes,
+                 window_size=self.window_size,
+                 deterministic=self.deterministic,
+             )
+         else:
+             y = local_flash_attn_qkvpacked_func(
+                 qkv,
+                 self.drop.p if self.training else 0.0,
+                 softmax_scale=self.softmax_scale,
+                 causal=causal,
+                 alibi_slopes=self.alibi_slopes,
+                 window_size=self.window_size,
+                 deterministic=self.deterministic,
+             )
+
+         return y
+
+
141
+ class FlashCrossAttention(nn.Module):
142
+ """Implement the scaled dot product attention with softmax.
143
+ Arguments
144
+ ---------
145
+ softmax_scale: The temperature to use for the softmax attention.
146
+ (default: 1/sqrt(d_keys) where d_keys is computed at
147
+ runtime)
148
+ attention_dropout: The dropout rate to apply to the attention
149
+ (default: 0.0)
150
+ """
151
+
152
+ def __init__(
153
+ self,
154
+ causal=False,
155
+ softmax_scale=None,
156
+ attention_dropout=0.0,
157
+ alibi_slopes=None,
158
+ window_size=(-1, -1),
159
+ deterministic=False,
160
+ ):
161
+ super().__init__()
162
+ assert local_flash_attn_varlen_kvpacked_func is not None, "FlashAttention is not installed"
163
+ assert local_flash_attn_kvpacked_func is not None, "FlashAttention is not installed"
164
+ self.causal = causal
165
+ self.softmax_scale = softmax_scale
166
+ self.drop = nn.Dropout(attention_dropout)
167
+ self.register_buffer("alibi_slopes", alibi_slopes, persistent=False)
168
+ self.window_size = window_size
169
+ self.deterministic = deterministic
170
+
171
+ def forward(
172
+ self,
173
+ q,
174
+ kv,
175
+ causal=None,
176
+ cu_seqlens=None,
177
+ max_seqlen=None,
178
+ cu_seqlens_k=None,
179
+ max_seqlen_k=None,
180
+ ):
181
+ """Implements the multihead softmax attention.
182
+ Arguments
183
+ ---------
184
+ q: The tensor containing the query. (B, Sq, H, D)
185
+ kv: The tensor containing the key and value. (B, Sk, 2, H_k, D)
186
+ causal: if passed, will override self.causal
187
+ cu_seqlens: (batch_size + 1,), dtype torch.int32. The cumulative sequence lengths
188
+ of the sequences in the batch, used to index into q.
189
+ max_seqlen: int. Maximum sequence length in the batch of q.
190
+ cu_seqlens_k: (batch_size + 1,), dtype torch.int32. The cumulative sequence lengths
191
+ of the sequences in the batch, used to index into kv.
192
+ max_seqlen_k: int. Maximum sequence length in the batch of k and v.
193
+ """
194
+ assert q.dtype in [torch.float16, torch.bfloat16]
195
+ assert q.is_cuda and kv.is_cuda
196
+ causal = self.causal if causal is None else causal
197
+ unpadded = cu_seqlens is not None
198
+ if self.alibi_slopes is not None:
199
+ self.alibi_slopes = self.alibi_slopes.to(torch.float32)
200
+ if unpadded:
201
+ assert cu_seqlens.dtype == torch.int32
202
+ assert max_seqlen is not None
203
+ assert isinstance(max_seqlen, int)
204
+ assert cu_seqlens_k is not None
205
+ assert cu_seqlens_k.dtype == torch.int32
206
+ assert max_seqlen_k is not None
207
+ assert isinstance(max_seqlen_k, int)
208
+ return local_flash_attn_varlen_kvpacked_func(
209
+ q,
210
+ kv,
211
+ cu_seqlens,
212
+ cu_seqlens_k,
213
+ max_seqlen,
214
+ max_seqlen_k,
215
+ self.drop.p if self.training else 0.0,
216
+ softmax_scale=self.softmax_scale,
217
+ causal=causal,
218
+ alibi_slopes=self.alibi_slopes,
219
+ window_size=self.window_size,
220
+ deterministic=self.deterministic,
221
+ )
222
+ else:
223
+ batch_size, seqlen_q = q.shape[0], q.shape[1]
224
+ seqlen_k = kv.shape[1]
225
+ assert kv.shape[0] == batch_size and kv.shape[4] == q.shape[3]
226
+ return local_flash_attn_kvpacked_func(
227
+ q,
228
+ kv,
229
+ self.drop.p if self.training else 0.0,
230
+ causal=causal,
231
+ softmax_scale=self.softmax_scale,
232
+ alibi_slopes=self.alibi_slopes,
233
+ window_size=self.window_size,
234
+ deterministic=self.deterministic,
235
+ )
236
+
237
+
238
+ class SelfAttention(nn.Module):
239
+ """Implement the scaled dot product attention with softmax.
240
+ Arguments
241
+ ---------
242
+ softmax_scale: The temperature to use for the softmax attention.
243
+ (default: 1/sqrt(d_keys) where d_keys is computed at
244
+ runtime)
245
+ attention_dropout: The dropout rate to apply to the attention
246
+ (default: 0.0)
247
+ """
248
+
249
+ def __init__(self, causal=False, softmax_scale=None, attention_dropout=0.0):
250
+ super().__init__()
251
+ self.causal = causal
252
+ self.softmax_scale = softmax_scale
253
+ self.drop = nn.Dropout(attention_dropout)
254
+
255
+ def forward(self, qkv, causal=None, key_padding_mask=None):
256
+ """Implements the multihead softmax attention.
257
+ Arguments
258
+ ---------
259
+ qkv: The tensor containing the query, key, and value. (B, S, 3, H, D)
260
+ causal: if passed, will override self.causal
261
+ key_padding_mask: boolean mask to apply to the attention weights. True means to keep,
262
+ False means to mask out. (B, S)
263
+ """
264
+ q, k, v = qkv.unbind(dim=2) # each: (B, T, H, D)
265
+ q = q.permute(0, 2, 1, 3) # (B, H, T, D)
266
+ k = k.permute(0, 2, 1, 3)
267
+ v = v.permute(0, 2, 1, 3)
268
+ batch_size, num_heads, seqlen, d = q.shape
269
+
270
+ scale = self.softmax_scale if self.softmax_scale is not None else 1.0 / math.sqrt(d)
271
+ q = q * (scale * math.sqrt(d))
272
+
273
+ attn_mask = None
274
+ if key_padding_mask is not None:
275
+ attn_mask = torch.where(
276
+ repeat(key_padding_mask, "b s -> b t s", t=seqlen),
277
+ 0.0,
278
+ -10000.0,
279
+ )
280
+
281
+ output = torch.nn.functional.scaled_dot_product_attention(
282
+ q,
283
+ k,
284
+ v,
285
+ attn_mask=attn_mask,
286
+ dropout_p=self.drop.p if self.training else 0.0,
287
+ is_causal=(self.causal if causal is None else causal),
288
+ )
289
+
290
+ output = output.permute(0, 2, 1, 3)
291
+ return output
292
+
293
+
294
+ class CrossAttention(nn.Module):
295
+ """Implement the scaled dot product attention with softmax.
296
+ Arguments
297
+ ---------
298
+ softmax_scale: The temperature to use for the softmax attention.
299
+ (default: 1/sqrt(d_keys) where d_keys is computed at
300
+ runtime)
301
+ attention_dropout: The dropout rate to apply to the attention
302
+ (default: 0.0)
303
+ """
304
+
305
+ def __init__(self, causal=False, softmax_scale=None, attention_dropout=0.0):
306
+ super().__init__()
307
+ self.causal = causal
308
+ self.softmax_scale = softmax_scale
309
+ self.drop = nn.Dropout(attention_dropout)
310
+
311
+ def forward(self, q, kv, causal=None, key_padding_mask=None):
312
+ """Implements the multihead softmax attention.
313
+ Arguments
314
+ ---------
315
+ q: The tensor containing the query. (B, Sq, H, D)
316
+ kv: The tensor containing the key and value. (B, Sk, 2, H_k, D)
317
+ causal: if passed, will override self.causal
318
+ key_padding_mask: boolean mask to apply to the attention weights. True means to keep,
319
+ False means to mask out. (B, Sk)
320
+ """
321
+ batch_size, seqlen_q = q.shape[0], q.shape[1]
322
+ causal = self.causal if causal is None else causal
323
+ seqlen_k = kv.shape[1]
324
+ assert kv.shape[0] == batch_size and kv.shape[4] == q.shape[3]
325
+ if kv.shape[3] != q.shape[2]: # MQA/GQA
326
+ kv = repeat(kv, "... hkv d -> ... (hkv g) d", g=q.shape[2] // kv.shape[3])
327
+ k, v = kv.unbind(dim=2)
328
+ softmax_scale = self.softmax_scale or 1.0 / math.sqrt(q.shape[-1])
329
+ scores = torch.einsum("bthd,bshd->bhts", q, k * softmax_scale)
330
+ if key_padding_mask is not None:
331
+ padding_mask = torch.full(
332
+ (batch_size, seqlen_k),
333
+ -10000.0,
334
+ dtype=scores.dtype,
335
+ device=scores.device,
336
+ )
337
+ padding_mask.masked_fill_(key_padding_mask, 0.0)
338
+ # TD [2022-09-30]: Adding is faster than masked_fill_ (idk why, just better kernel I guess)
339
+ scores = scores + rearrange(padding_mask, "b s -> b 1 1 s")
340
+ if causal:
341
+ # causal mask needs to take into account the difference between seqlen_q and seqlen_k
342
+ row_idx = rearrange(torch.arange(seqlen_q, device=q.device, dtype=torch.long), "s -> s 1")
343
+ col_idx = torch.arange(seqlen_k, device=kv.device, dtype=torch.long)
344
+ sk = seqlen_k if key_padding_mask is None else rearrange(key_padding_mask.sum(-1), "b -> b 1 1 1")
345
+ causal_mask = col_idx > row_idx + sk - seqlen_q
346
+ scores = scores.masked_fill(causal_mask, -10000.0)
347
+ attention = torch.softmax(scores, dim=-1, dtype=v.dtype)
348
+ attention_drop = self.drop(attention)
349
+ output = torch.einsum("bhts,bshd->bthd", attention_drop, v)
350
+ return output
351
+
352
+
353
+ class LinearResidual(nn.Linear):
354
+ """Wrap nn.Linear to return the residual as well. For compatibility with FusedDense."""
355
+
356
+ def forward(self, input: torch.Tensor) -> torch.Tensor:
357
+ return super().forward(input), input
358
+
359
+
360
+ def _update_kv_cache(kv, inference_params, layer_idx):
361
+ """kv: (batch_size, seqlen, 2, nheads, head_dim) or (batch_size, 1, 2, nheads, head_dim)"""
362
+ # Pre-allocate memory for key-values for inference.
363
+ num_heads, head_dim = kv.shape[-2:]
364
+ if layer_idx not in inference_params.key_value_memory_dict:
365
+ kv_cache = torch.empty(
366
+ inference_params.max_batch_size,
367
+ inference_params.max_seqlen,
368
+ 2,
369
+ num_heads,
370
+ head_dim,
371
+ dtype=kv.dtype,
372
+ device=kv.device,
373
+ )
374
+ inference_params.key_value_memory_dict[layer_idx] = kv_cache
375
+ else:
376
+ kv_cache = inference_params.key_value_memory_dict[layer_idx]
377
+ # Adjust key and value for inference
378
+ batch_start = inference_params.batch_size_offset
379
+ batch_end = batch_start + kv.shape[0]
380
+ sequence_start = inference_params.seqlen_offset
381
+ sequence_end = sequence_start + kv.shape[1]
382
+ assert batch_end <= kv_cache.shape[0]
383
+ assert sequence_end <= kv_cache.shape[1]
384
+ assert kv_cache is not None
385
+ kv_cache[batch_start:batch_end, sequence_start:sequence_end, ...] = kv
386
+ return kv_cache[batch_start:batch_end, :sequence_end, ...]
387
+
388
+
389
+ class MHA(nn.Module):
390
+ """Multi-head self-attention and cross-attention"""
391
+
392
+ def __init__(
393
+ self,
394
+ embed_dim,
395
+ num_heads,
396
+ num_heads_kv=None,
397
+ cross_attn=False,
398
+ qkv_proj_bias=True,
399
+ out_proj_bias=True,
400
+ dropout=0.0,
401
+ softmax_scale=None,
402
+ causal=False,
403
+ layer_idx=None,
404
+ dwconv=False,
405
+ rotary_emb_dim=0,
406
+ rotary_emb_base=10000.0,
407
+ rotary_emb_scale_base=None,
408
+ rotary_emb_interleaved=False,
409
+ use_alibi=False,
410
+ window_size=(-1, -1),
411
+ fused_bias_fc=False,
412
+ use_flash_attn=False,
413
+ return_residual=False,
414
+ checkpointing=False,
415
+ device=None,
416
+ dtype=None,
417
+ ) -> None:
418
+ """
419
+ num_heads_kv: can be used to toggle MQA / GQA. If None, use num_heads.
420
+ return_residual: whether to return the input x along with the output. This is for
421
+ performance reason: for post-norm architecture, returning the input allows us
422
+ to fuse the backward of nn.Linear with the residual connection.
423
+ """
424
+ factory_kwargs = {"device": device, "dtype": dtype}
425
+ super().__init__()
426
+ self.embed_dim = embed_dim
427
+ self.cross_attn = cross_attn
428
+ self.causal = causal
429
+ self.layer_idx = layer_idx
430
+ self.dwconv = dwconv
431
+ self.rotary_emb_dim = rotary_emb_dim
432
+ self.use_flash_attn = use_flash_attn
433
+ self.return_residual = return_residual
434
+ self.checkpointing = checkpointing
435
+ if use_alibi:
436
+ assert use_flash_attn, "ALiBi code path requires flash_attn"
437
+ alibi_slopes = torch.tensor(get_alibi_slopes(num_heads), device=device)
438
+ else:
439
+ alibi_slopes = None
440
+ if window_size != (-1, -1):
441
+ assert use_flash_attn, "Local (sliding window) attention code path requires flash_attn"
442
+
443
+ self.num_heads = num_heads
444
+ self.num_heads_kv = num_heads_kv if num_heads_kv is not None else num_heads
445
+ assert self.num_heads % self.num_heads_kv == 0, "num_heads must be divisible by num_heads_kv"
446
+ assert self.embed_dim % num_heads == 0, "embed_dim must be divisible by num_heads"
447
+ self.head_dim = self.embed_dim // num_heads
448
+ qkv_dim = self.head_dim * (self.num_heads + 2 * self.num_heads_kv)
449
+ kv_dim = 2 * self.head_dim * self.num_heads_kv
450
+
451
+ if self.rotary_emb_dim > 0:
452
+ assert not cross_attn, "MHA with rotary embedding does not support cross-attention yet"
453
+ assert RotaryEmbedding is not None, "rotary_emb is not installed"
454
+ self.rotary_emb = RotaryEmbedding(
455
+ self.rotary_emb_dim,
456
+ base=rotary_emb_base,
457
+ scale_base=rotary_emb_scale_base,
458
+ interleaved=rotary_emb_interleaved,
459
+ device=device,
460
+ )
461
+
462
+ if fused_bias_fc and FusedDense is None:
463
+ raise ImportError("fused_dense is not installed")
464
+ linear_cls = nn.Linear if not fused_bias_fc else FusedDense
465
+ linear_resid_cls = LinearResidual if not fused_bias_fc else partial(FusedDense, return_residual=True)
466
+ wqkv_cls = linear_cls if not self.return_residual else linear_resid_cls
467
+ inner_attn_cls = (
468
+ partial(
469
+ FlashSelfAttention,
470
+ layer_number=self.layer_idx,
471
+ alibi_slopes=alibi_slopes,
472
+ window_size=window_size,
473
+ )
474
+ if use_flash_attn
475
+ else SelfAttention
476
+ )
477
+ inner_cross_attn_cls = (
478
+ partial(FlashCrossAttention, alibi_slopes=alibi_slopes, window_size=window_size)
479
+ if use_flash_attn
480
+ else CrossAttention
481
+ )
482
+ if not self.cross_attn:
483
+ self.Wqkv = wqkv_cls(embed_dim, qkv_dim, bias=qkv_proj_bias, **factory_kwargs)
484
+ else:
485
+ self.Wq = linear_cls(embed_dim, embed_dim, bias=qkv_proj_bias, **factory_kwargs)
486
+ self.Wkv = wqkv_cls(embed_dim, kv_dim, bias=qkv_proj_bias, **factory_kwargs)
487
+ if self.dwconv:
488
+ if self.num_heads_kv == self.num_heads:
489
+ self.dwconv_qkv = nn.Conv1d(qkv_dim, qkv_dim, kernel_size=3, padding=2, groups=qkv_dim)
490
+ else:
491
+ self.dwconv_q = nn.Conv1d(embed_dim, embed_dim, kernel_size=3, padding=2, groups=embed_dim)
492
+ self.dwconv_kv = nn.Conv1d(kv_dim, kv_dim, kernel_size=3, padding=2, groups=kv_dim)
493
+ self.inner_attn = inner_attn_cls(
494
+ causal=causal,
495
+ softmax_scale=softmax_scale,
496
+ attention_dropout=dropout,
497
+ )
498
+ self.inner_cross_attn = inner_cross_attn_cls(
499
+ causal=causal, softmax_scale=softmax_scale, attention_dropout=dropout
500
+ )
501
+ self.out_proj = linear_cls(embed_dim, embed_dim, bias=out_proj_bias, **factory_kwargs)
502
+
503
+ def allocate_inference_cache(self, batch_size, max_seqlen, dtype=None):
504
+ dtype = self.out_proj.weight.dtype if dtype is None else dtype
505
+ device = self.out_proj.weight.device
506
+ return torch.empty(
507
+ batch_size,
508
+ max_seqlen,
509
+ 2,
510
+ self.num_heads_kv,
511
+ self.head_dim,
512
+ dtype=dtype,
513
+ device=device,
514
+ )
515
+
516
+ def _update_kv_cache(self, kv, inference_params):
517
+ """kv: (batch_size, seqlen, 2, nheads, head_dim) or (batch_size, 1, 2, nheads, head_dim)"""
518
+ assert not self.dwconv, "Generation does not support dwconv yet"
519
+ assert self.layer_idx is not None, "Generation requires layer_idx in the constructor"
520
+ return _update_kv_cache(kv, inference_params, self.layer_idx)
521
+
522
+ def _apply_rotary_update_kvcache_attention(self, q, kv, inference_params):
523
+ """
524
+ Fast path that combine 3 steps: apply rotary to Q and K, update kv cache, and apply attention.
525
+ q: (batch_size, seqlen_q, nheads, head_dim)
526
+ kv: (batch_size, seqlen_k, 2, nheads_kv, head_dim)
527
+ """
528
+ assert inference_params is not None and inference_params.seqlen_offset > 0
529
+ assert self.use_flash_attn
530
+ if self.rotary_emb_dim > 0:
531
+ assert self.rotary_emb.scale is None, "This code path does not support xPos"
532
+ self.rotary_emb._update_cos_sin_cache(inference_params.max_seqlen, device=q.device, dtype=q.dtype)
533
+ rotary_cos, rotary_sin = (
534
+ self.rotary_emb._cos_cached,
535
+ self.rotary_emb._sin_cached,
536
+ )
537
+ else:
538
+ rotary_cos, rotary_sin = None, None
539
+ batch = q.shape[0]
540
+ kv_cache = inference_params.key_value_memory_dict[self.layer_idx][:batch]
541
+ cache_seqlens = (
542
+ inference_params.lengths_per_sample[:batch]
543
+ if inference_params.lengths_per_sample is not None
544
+ else inference_params.seqlen_offset
545
+ )
546
+ alibi_slopes = getattr(self.inner_cross_attn, "alibi_slopes", None)
547
+ context = local_flash_attn_with_kvcache(
548
+ q,
549
+ kv_cache[:, :, 0],
550
+ kv_cache[:, :, 1],
551
+ kv[:, :, 0],
552
+ kv[:, :, 1],
553
+ rotary_cos=rotary_cos,
554
+ rotary_sin=rotary_sin,
555
+ cache_seqlens=cache_seqlens,
556
+ softmax_scale=self.inner_cross_attn.softmax_scale,
557
+ causal=self.inner_cross_attn.causal,
558
+ rotary_interleaved=self.rotary_emb.interleaved if self.rotary_emb_dim > 0 else False,
559
+ alibi_slopes=alibi_slopes,
560
+ )
561
+ return context
562
+
563
+ def _update_kvcache_attention(self, q, kv, inference_params):
564
+ """Write kv to inference_params, then do attention"""
565
+ if inference_params.seqlen_offset == 0 or local_flash_attn_with_kvcache is None or not self.use_flash_attn:
566
+ # TODO: this only uses seqlen_offset and not lengths_per_sample.
567
+ kv = self._update_kv_cache(kv, inference_params)
568
+ return self.inner_cross_attn(q, kv)
569
+ else:
570
+ batch = q.shape[0]
571
+ kv_cache = inference_params.key_value_memory_dict[self.layer_idx][:batch]
572
+ cache_seqlens = (
573
+ inference_params.lengths_per_sample[:batch]
574
+ if inference_params.lengths_per_sample is not None
575
+ else inference_params.seqlen_offset
576
+ )
577
+ alibi_slopes = getattr(self.inner_cross_attn, "alibi_slopes", None)
578
+ return local_flash_attn_with_kvcache(
579
+ q,
580
+ kv_cache[:, :, 0],
581
+ kv_cache[:, :, 1],
582
+ kv[:, :, 0],
583
+ kv[:, :, 1],
584
+ cache_seqlens=cache_seqlens,
585
+ softmax_scale=self.inner_cross_attn.softmax_scale,
586
+ causal=self.inner_cross_attn.causal,
587
+ alibi_slopes=alibi_slopes,
588
+ )
589
+
590
+ def forward(
591
+ self,
592
+ x,
593
+ x_kv=None,
594
+ key_padding_mask=None,
595
+ cu_seqlens=None,
596
+ max_seqlen=None,
597
+ mixer_subset=None,
598
+ inference_params=None,
599
+ **kwargs,
600
+ ):
601
+ """
602
+ Arguments:
603
+ x: (batch, seqlen, hidden_dim) (where hidden_dim = num heads * head dim) if
604
+ cu_seqlens is None and max_seqlen is None, else (total, hidden_dim) where total
605
+ is the is the sum of the sequence lengths in the batch.
606
+ x_kv: (batch, seqlen, hidden_dim), only applicable for cross-attention. If None, use x.
607
+ cu_seqlens: (batch_size + 1,), dtype torch.int32. The cumulative sequence lengths
608
+ of the sequences in the batch, used to index into x. Only applicable when using
609
+ FlashAttention.
610
+ max_seqlen: int. Maximum sequence length in the batch.
611
+ key_padding_mask: boolean mask, True means to keep, False means to mask out.
612
+ (batch, seqlen). Only applicable when not using FlashAttention.
613
+ mixer_subset: for cross-attention only. If not None, will take a subset of x
614
+ before applying the query projection. Useful for e.g., ViT where we only care
615
+ about the CLS token in the last layer.
616
+ inference_params: for generation. Adapted from Megatron-LM (and Apex)
617
+ https://github.com/NVIDIA/apex/blob/3ff1a10f72ec07067c4e44759442329804ac5162/apex/transformer/testing/standalone_transformer_lm.py#L470
618
+ """
619
+ if cu_seqlens is not None:
620
+ assert max_seqlen is not None
621
+ assert key_padding_mask is None
622
+ assert self.use_flash_attn
623
+ assert not self.dwconv
624
+ assert self.rotary_emb_dim == 0
625
+ if key_padding_mask is not None:
626
+ assert cu_seqlens is None
627
+ assert max_seqlen is None
628
+ assert not self.use_flash_attn
629
+ if inference_params is not None:
630
+ assert key_padding_mask is None
631
+ assert cu_seqlens is None and max_seqlen is None
632
+ assert not self.dwconv
633
+
634
+ kwargs = (
635
+ {"cu_seqlens": cu_seqlens, "max_seqlen": max_seqlen, **kwargs}
636
+ if self.use_flash_attn
637
+ else {"key_padding_mask": key_padding_mask, **kwargs}
638
+ )
639
+ seqlen_offset = (
640
+ 0
641
+ if inference_params is None
642
+ else (
643
+ inference_params.lengths_per_sample
644
+ if inference_params.lengths_per_sample is not None
645
+ else inference_params.seqlen_offset
646
+ )
647
+ )
648
+ rotary_max_seqlen = inference_params.max_seqlen if inference_params is not None else None
649
+ batch, seqlen = x.shape[:2]
650
+ if not self.cross_attn and self.num_heads_kv == self.num_heads:
651
+ assert x_kv is None and mixer_subset is None
652
+ if not self.return_residual:
653
+ qkv = self.Wqkv(x)
654
+ else:
655
+ qkv, x = self.Wqkv(x)
656
+ if self.dwconv:
657
+ qkv = rearrange(
658
+ self.dwconv_qkv(rearrange(qkv, "b s d -> b d s"))[..., :-2],
659
+ "b d s -> b s d",
660
+ ).contiguous()
661
+ qkv = rearrange(qkv, "... (three h d) -> ... three h d", three=3, d=self.head_dim)
662
+ if (
663
+ inference_params is None
664
+ or inference_params.seqlen_offset == 0
665
+ or (self.rotary_emb_dim == 0 or self.rotary_emb_dim % 16 != 0)
666
+ or not self.use_flash_attn
667
+ ):
668
+ if self.rotary_emb_dim > 0:
669
+ qkv = self.rotary_emb(qkv, seqlen_offset=seqlen_offset, max_seqlen=rotary_max_seqlen)
670
+ if inference_params is None:
671
+ if not self.checkpointing:
672
+ context = self.inner_attn(qkv, **kwargs)
673
+ else:
674
+ context = torch.utils.checkpoint.checkpoint(self.inner_attn, qkv, **kwargs)
675
+ else:
676
+ context = self._update_kvcache_attention(qkv[:, :, 0], qkv[:, :, 1:], inference_params)
677
+ else:
678
+ context = self._apply_rotary_update_kvcache_attention(qkv[:, :, 0], qkv[:, :, 1:], inference_params)
679
+ else:
680
+ if self.cross_attn:
681
+ if not self.return_residual:
682
+ q = self.Wq(x if mixer_subset is None else x[:, mixer_subset])
683
+ kv = self.Wkv(x_kv if x_kv is not None else x)
684
+ else:
685
+ if x_kv is not None:
686
+ kv, x_kv = self.Wkv(x_kv)
687
+ else:
688
+ kv, x = self.Wkv(x)
689
+ q = self.Wq(x if mixer_subset is None else x[:, mixer_subset])
690
+ else:
691
+ assert self.num_heads_kv != self.num_heads
692
+ if not self.return_residual:
693
+ qkv = self.Wqkv(x)
694
+ else:
695
+ qkv, x = self.Wqkv(x)
696
+ q = qkv[..., : self.num_heads * self.head_dim]
697
+ kv = qkv[..., self.num_heads * self.head_dim :]
698
+ q = rearrange(q, "... (h d) -> ... h d", d=self.head_dim)
699
+ kv = rearrange(kv, "... (two hkv d) -> ... two hkv d", two=2, d=self.head_dim)
700
+ if self.dwconv:
701
+ q = rearrange(
702
+ self.dwconv_q(rearrange(q, "b s d -> b d s"))[..., :-2],
703
+ "b d s -> b s d",
704
+ ).contiguous()
705
+ kv = rearrange(
706
+ self.dwconv_kv(rearrange(kv, "b s d -> b d s"))[..., :-2],
707
+ "b d s -> b s d",
708
+ ).contiguous()
709
+ if (
710
+ inference_params is None
711
+ or inference_params.seqlen_offset == 0
712
+ or (self.rotary_emb_dim == 0 or self.rotary_emb_dim % 16 != 0)
713
+ or not self.use_flash_attn
714
+ ):
715
+ if self.rotary_emb_dim > 0:
716
+ q, kv = self.rotary_emb(q, kv, seqlen_offset=seqlen_offset, max_seqlen=rotary_max_seqlen)
717
+ if inference_params is None:
718
+ if not self.checkpointing:
719
+ context = self.inner_cross_attn(q, kv, **kwargs)
720
+ else:
721
+ context = torch.utils.checkpoint.checkpoint(self.inner_cross_attn, q, kv, **kwargs)
722
+ else:
723
+ context = self._update_kvcache_attention(q, kv, inference_params)
724
+ else:
725
+ context = self._apply_rotary_update_kvcache_attention(q, kv, inference_params)
726
+ out = self.out_proj(rearrange(context, "... h d -> ... (h d)"))
727
+ return out if not self.return_residual else (out, x)
728
+
729
+
730
+ class ParallelMHA(nn.Module):
731
+ """Multi-head self-attention and cross-attention"""
732
+
733
+ def __init__(
734
+ self,
735
+ embed_dim,
736
+ num_heads,
737
+ process_group,
738
+ num_heads_kv=None,
739
+ qkv_proj_bias=True,
740
+ out_proj_bias=True,
741
+ dropout=0.0,
742
+ softmax_scale=None,
743
+ causal=False,
744
+ layer_idx=None,
745
+ rotary_emb_dim=0,
746
+ rotary_emb_base=10000.0,
747
+ rotary_emb_scale_base=None,
748
+ rotary_emb_interleaved=False,
749
+ use_alibi=False,
750
+ window_size=(-1, -1),
751
+ use_flash_attn=False,
752
+ checkpointing=False,
753
+ sequence_parallel=True,
754
+ device=None,
755
+ dtype=None,
756
+ ) -> None:
757
+ factory_kwargs = {"device": device, "dtype": dtype}
758
+ super().__init__()
759
+ self.embed_dim = embed_dim
760
+ self.causal = causal
761
+ self.layer_idx = layer_idx
762
+ self.rotary_emb_dim = rotary_emb_dim
763
+ self.use_flash_attn = use_flash_attn
764
+ self.checkpointing = checkpointing
765
+ self.process_group = process_group
766
+ self.world_size = process_group.size()
767
+ self.local_rank = torch.distributed.get_rank(process_group)
768
+
769
+ self.num_heads = num_heads
770
+ assert self.embed_dim % self.num_heads == 0, "embed_dim must be divisible by num_heads"
771
+
772
+ self.num_heads_kv = num_heads_kv if num_heads_kv is not None else num_heads
773
+ assert self.num_heads % self.num_heads_kv == 0, "num_heads must be divisible by num_heads_kv"
774
+
775
+ self.num_heads_per_rank = get_dim_for_local_rank(self.num_heads, self.world_size, self.local_rank)
776
+ self.num_heads_kv_per_rank = get_dim_for_local_rank(self.num_heads_kv, self.world_size, self.local_rank)
777
+ self.head_dim = self.embed_dim // num_heads
778
+ qkv_dim = self.head_dim * (self.num_heads + 2 * self.num_heads_kv)
779
+
780
+ if use_alibi:
781
+ assert use_flash_attn, "ALiBi code path requires flash_attn"
782
+ num_heads_local = math.ceil(self.num_heads / self.world_size)
783
+ alibi_slopes = torch.tensor(
784
+ get_alibi_slopes(num_heads)[
785
+ self.local_rank * num_heads_local : (self.local_rank + 1) * num_heads_local
786
+ ],
787
+ device=device,
788
+ )
789
+ else:
790
+ alibi_slopes = None
791
+ if window_size != (-1, -1):
792
+ assert use_flash_attn, "Local (sliding window) attention code path requires flash_attn"
793
+
794
+ if self.rotary_emb_dim > 0:
795
+ assert RotaryEmbedding is not None, "rotary_emb is not installed"
796
+ self.rotary_emb = RotaryEmbedding(
797
+ self.rotary_emb_dim,
798
+ base=rotary_emb_base,
799
+ scale_base=rotary_emb_scale_base,
800
+ interleaved=rotary_emb_interleaved,
801
+ device=device,
802
+ )
803
+
804
+ if ColumnParallelLinear is None or RowParallelLinear is None:
805
+ raise ImportError("fused_dense is not installed")
806
+ self.Wqkv = ColumnParallelLinear(
807
+ embed_dim,
808
+ qkv_dim,
809
+ process_group,
810
+ bias=qkv_proj_bias,
811
+ sequence_parallel=sequence_parallel,
812
+ multiple_of=self.head_dim * (self.num_heads // self.num_heads_kv + 2),
813
+ **factory_kwargs,
814
+ )
815
+ inner_attn_cls = (
816
+ partial(FlashSelfAttention, alibi_slopes=alibi_slopes, window_size=window_size)
817
+ if use_flash_attn
818
+ else SelfAttention
819
+ )
820
+ inner_cross_attn_cls = (
821
+ partial(FlashCrossAttention, alibi_slopes=alibi_slopes, window_size=window_size)
822
+ if use_flash_attn
823
+ else CrossAttention
824
+ )
825
+ self.inner_attn = inner_attn_cls(causal=causal, softmax_scale=softmax_scale, attention_dropout=dropout)
826
+ self.inner_cross_attn = inner_cross_attn_cls(
827
+ causal=causal, softmax_scale=softmax_scale, attention_dropout=dropout
828
+ )
829
+ self.out_proj = RowParallelLinear(
830
+ embed_dim,
831
+ embed_dim,
832
+ process_group,
833
+ bias=out_proj_bias,
834
+ sequence_parallel=sequence_parallel,
835
+ multiple_of=self.head_dim,
836
+ **factory_kwargs,
837
+ )
838
+
839
+ def allocate_inference_cache(self, batch_size, max_seqlen, dtype=None):
840
+ dtype = self.out_proj.weight.dtype if dtype is None else dtype
841
+ device = self.out_proj.weight.device
842
+ return torch.empty(
843
+ batch_size,
844
+ max_seqlen,
845
+ 2,
846
+ self.num_heads_kv_per_rank,
847
+ self.head_dim,
848
+ dtype=dtype,
849
+ device=device,
850
+ )
851
+
852
+ def _update_kv_cache(self, kv, inference_params):
853
+ """kv: (batch_size, seqlen, 2, nheads, head_dim) or (batch_size, 1, 2, nheads, head_dim)"""
854
+ assert self.layer_idx is not None, "Generation requires layer_idx in the constructor"
855
+ return _update_kv_cache(kv, inference_params, self.layer_idx)
856
+
857
+ def _apply_rotary_update_kvcache_attention(self, q, kv, inference_params):
858
+ """
859
+ Fast path that combine 3 steps: apply rotary to Q and K, update kv cache, and apply attention.
860
+ q: (batch_size, seqlen_q, nheads, head_dim)
861
+ kv: (batch_size, seqlen_k, 2, nheads_kv, head_dim)
862
+ """
863
+ assert inference_params is not None and inference_params.seqlen_offset > 0
864
+ assert self.use_flash_attn
865
+ if self.rotary_emb_dim > 0:
866
+ assert self.rotary_emb.scale is None, "This code path does not support xPos"
867
+ self.rotary_emb._update_cos_sin_cache(inference_params.max_seqlen, device=q.device, dtype=q.dtype)
868
+ rotary_cos, rotary_sin = (
869
+ self.rotary_emb._cos_cached,
870
+ self.rotary_emb._sin_cached,
871
+ )
872
+ else:
873
+ rotary_cos, rotary_sin = None, None
874
+ batch = q.shape[0]
875
+ kv_cache = inference_params.key_value_memory_dict[self.layer_idx][:batch]
876
+ cache_seqlens = (
877
+ inference_params.lengths_per_sample[:batch]
878
+ if inference_params.lengths_per_sample is not None
879
+ else inference_params.seqlen_offset
880
+ )
881
+ alibi_slopes = getattr(self.inner_cross_attn, "alibi_slopes", None)
882
+ context = local_flash_attn_with_kvcache(
883
+ q,
884
+ kv_cache[:, :, 0],
885
+ kv_cache[:, :, 1],
886
+ kv[:, :, 0],
887
+ kv[:, :, 1],
888
+ rotary_cos=rotary_cos,
889
+ rotary_sin=rotary_sin,
890
+ cache_seqlens=cache_seqlens,
891
+ softmax_scale=self.inner_cross_attn.softmax_scale,
892
+ causal=self.inner_cross_attn.causal,
893
+ rotary_interleaved=self.rotary_emb.interleaved if self.rotary_emb_dim > 0 else False,
894
+ alibi_slopes=alibi_slopes,
895
+ )
896
+ return context
897
+
898
+ def _update_kvcache_attention(self, q, kv, inference_params):
899
+ """Write kv to inference_params, then do attention"""
900
+ if inference_params.seqlen_offset == 0 or not self.use_flash_attn:
901
+ # TODO: this only uses seqlen_offset and not lengths_per_sample.
902
+ kv = self._update_kv_cache(kv, inference_params)
903
+ return self.inner_cross_attn(q, kv)
904
+ else:
905
+ batch = q.shape[0]
906
+ kv_cache = inference_params.key_value_memory_dict[self.layer_idx][:batch]
907
+ cache_seqlens = (
908
+ inference_params.lengths_per_sample[:batch]
909
+ if inference_params.lengths_per_sample is not None
910
+ else inference_params.seqlen_offset
911
+ )
912
+ alibi_slopes = getattr(self.inner_cross_attn, "alibi_slopes", None)
913
+ context = local_flash_attn_with_kvcache(
914
+ q,
915
+ kv_cache[:, :, 0],
916
+ kv_cache[:, :, 1],
917
+ kv[:, :, 0],
918
+ kv[:, :, 1],
919
+ cache_seqlens=cache_seqlens,
920
+ softmax_scale=self.inner_cross_attn.softmax_scale,
921
+ causal=self.inner_cross_attn.causal,
922
+ alibi_slopes=alibi_slopes,
923
+ )
924
+ return context
925
+
926
+ def forward(self, x, seqlen=None, inference_params=None, **kwargs):
927
+ """
928
+ Arguments:
929
+ x: (batch, seqlen, hidden_dim) (where hidden_dim = num heads * head dim) if seqlen=None.
930
+ If seqlen is not None, x is (batch * seqlen, hidden_dim). This is so that when we
931
+ split x during sequence parallel, we split the batch * seqlen dimension
932
+ (in case batch is small).
933
+ """
934
+ qkv = self.Wqkv(x)
935
+ if seqlen is not None:
936
+ qkv = rearrange(qkv, "(b s) ... -> b s ...", s=seqlen)
937
+ seqlen_offset = (
938
+ 0
939
+ if inference_params is None
940
+ else (
941
+ inference_params.lengths_per_sample
942
+ if inference_params.lengths_per_sample is not None
943
+ else inference_params.seqlen_offset
944
+ )
945
+ )
946
+ rotary_max_seqlen = inference_params.max_seqlen if inference_params is not None else None
947
+ if self.num_heads_kv == self.num_heads:
948
+ qkv = rearrange(qkv, "b s (three h d) -> b s three h d", three=3, d=self.head_dim)
949
+ if (
950
+ inference_params is None
951
+ or inference_params.seqlen_offset == 0
952
+ or (self.rotary_emb_dim == 0 or self.rotary_emb_dim % 16 != 0)
953
+ or not self.use_flash_attn
954
+ ):
955
+ if self.rotary_emb_dim > 0:
956
+ qkv = self.rotary_emb(qkv, seqlen_offset=seqlen_offset, max_seqlen=rotary_max_seqlen)
957
+ if inference_params is None:
958
+ if not self.checkpointing:
959
+ context = self.inner_attn(qkv, **kwargs)
960
+ else:
961
+ context = torch.utils.checkpoint.checkpoint(self.inner_attn, qkv, **kwargs)
962
+ else:
963
+ context = self._update_kvcache_attention(qkv[:, :, 0], qkv[:, :, 1:], inference_params)
964
+ else:
965
+ context = self._apply_rotary_update_kvcache_attention(qkv[:, :, 0], qkv[:, :, 1:], inference_params)
966
+ else:
967
+ q = rearrange(
968
+ qkv[..., : self.num_heads_per_rank * self.head_dim],
969
+ "... (h d) -> ... h d",
970
+ d=self.head_dim,
971
+ )
972
+ kv = rearrange(
973
+ qkv[..., self.num_heads_per_rank * self.head_dim :],
974
+ "... (two hkv d) -> ... two hkv d",
975
+ two=2,
976
+ d=self.head_dim,
977
+ )
978
+ if (
979
+ inference_params is None
980
+ or inference_params.seqlen_offset == 0
981
+ or (self.rotary_emb_dim == 0 or self.rotary_emb_dim % 16 != 0)
982
+ or not self.use_flash_attn
983
+ ):
984
+ if self.rotary_emb_dim > 0:
985
+ q, kv = self.rotary_emb(q, kv, seqlen_offset=seqlen_offset, max_seqlen=rotary_max_seqlen)
986
+ if inference_params is None:
987
+ if not self.checkpointing:
988
+ context = self.inner_cross_attn(q, kv, **kwargs)
989
+ else:
990
+ context = torch.utils.checkpoint.checkpoint(self.inner_cross_attn, q, kv, **kwargs)
991
+ else:
992
+ context = self._update_kvcache_attention(q, kv, inference_params)
993
+ else:
994
+ context = self._apply_rotary_update_kvcache_attention(q, kv, inference_params)
995
+ context = rearrange(context, "b s h d -> b s (h d)")
996
+ if seqlen is not None:
997
+ context = rearrange(context, "b s d -> (b s) d")
998
+ out = self.out_proj(context)
999
+ return out
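A quick sanity check of get_alibi_slopes from the top of attention.py (a sketch, not part of the committed file; the package name in the import is a hypothetical placeholder). For a power-of-two head count the slopes are the geometric sequence 2^-(i+1); other head counts interleave slopes from the surrounding power-of-two sizes:

    # Hypothetical package name; adjust to wherever these files actually live.
    from evo_model.attention import get_alibi_slopes

    # Power-of-two head count: slopes 2^-(i+1) for i = 0..7.
    print(get_alibi_slopes(8))
    # [0.5, 0.25, 0.125, 0.0625, 0.03125, 0.015625, 0.0078125, 0.00390625]

    # Non-power-of-two head count: 8 base slopes plus 4 interleaved from the
    # 16-head sequence, one slope per head.
    print(len(get_alibi_slopes(12)))  # 12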
cache.py ADDED
@@ -0,0 +1,62 @@
+ # Copied verbatim from vortex
+ # Copyright (c) 2024, Michael Poli.
+
+
+ from dataclasses import dataclass, field
+ from typing import Optional
+
+ from torch import Tensor
+
+
+ # https://github.com/Dao-AILab/flash-attention/blob/main/flash_attn/utils/generation.py
+ @dataclass
+ class InferenceParams:
+     """Inference parameters that are passed to the main model in order
+     to efficiently calculate and store the context during inference."""
+
+     max_seqlen: int
+     max_batch_size: int
+     seqlen_offset: int = 0
+     batch_size_offset: int = 0
+     key_value_memory_dict: dict = field(default_factory=dict)
+     lengths_per_sample: Optional[Tensor] = None
+
+     def reset(self, max_seqlen, max_batch_size):
+         self.max_seqlen = max_seqlen
+         self.max_batch_size = max_batch_size
+         self.seqlen_offset = 0
+         if self.lengths_per_sample is not None:
+             self.lengths_per_sample.zero_()
+
+
+ @dataclass
+ class HyenaCascadeIIRInferenceParams:
+     """Inference parameters passed to long Hyena blocks with recurrent mode."""
+
+     fir_filter_length: int = 3
+     state_dim: int = 16
+     seqlen_offset: int = 0
+     fir_state_dict: dict = field(default_factory=dict)
+     state_dict: dict = field(default_factory=dict)
+
+     def reset(self):
+         self.fir_filter_length = 3
+         self.state_dim = 16
+         self.seqlen_offset = 0
+
+
+ @dataclass
+ class HyenaCascadeFIRInferenceParams:
+     """Inference parameters passed to short and medium Hyena blocks."""
+
+     fir_filter_length: int = 3
+     fir_inner_filter_length: int = 4
+     seqlen_offset: int = 0
+     fir_inner_state_dict: dict = field(default_factory=dict)
+     fir_state_dict: dict = field(default_factory=dict)
+     state_dict: dict = field(default_factory=dict)
+
+     def reset(self):
+         self.fir_filter_length = 3
+         self.fir_inner_filter_length = 4
+         self.seqlen_offset = 0
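A minimal usage sketch (not part of the committed files; the package name in the imports is a hypothetical placeholder): InferenceParams is the object that attention.py's _update_kv_cache reads and writes during generation, with the caller advancing seqlen_offset between steps:

    import torch
    # Hypothetical package name; adjust to the actual layout of these files.
    from evo_model.attention import MHA
    from evo_model.cache import InferenceParams

    # One self-attention layer on CPU/float32 (the pure-PyTorch, non-flash path).
    mha = MHA(embed_dim=256, num_heads=8, causal=True, layer_idx=0)

    params = InferenceParams(max_seqlen=128, max_batch_size=2)
    # Optional: pre-allocate the per-layer KV cache that _update_kv_cache
    # would otherwise create lazily on first use.
    params.key_value_memory_dict[0] = mha.allocate_inference_cache(2, 128)

    x = torch.randn(2, 16, 256)
    out = mha(x, inference_params=params)    # prefill: caches positions 0..15
    params.seqlen_offset += x.shape[1]

    nxt = torch.randn(2, 1, 256)
    out = mha(nxt, inference_params=params)  # one decode step against the cache
    params.seqlen_offset += 1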
engine.py ADDED
@@ -0,0 +1,597 @@
+ # Copied verbatim from vortex
+ # Copyright (c) 2024, Michael Poli.
+
+ import gc
+
+ import torch
+ import torch.nn.functional as F
+
+ try:
+     pass
+ except:
+     pass
+ from .utils import column_split
+ from .rich_logging import activations_logger
+
+ IIR_PREFILL_MODES = [
+     "recurrence",
+     "modal-fft",
+     "hybrid-modal-recurrence",
+     "modal-scan",
+     "canonical-fft",
+     "iir-fir-caching",
+ ]
+
+
+ def adjust_filter_shape_for_broadcast(u, h):
+     h = h.squeeze()  # Standardize to [D, L] from [1, D, L] and [D, 1, L]
+
+     # Case: u: [B, D, L], k_f: [D, L]
+     if len(u.shape) > len(h.shape):
+         h = h.unsqueeze(0)
+
+     # Case: u: [B, D1, D2, L], k_f: [B, D, L]
+     if len(u.shape) > 3:
+         h = h.unsqueeze(1)
+     return h
+
+
+ def fftconv_func(
+     u,
+     k,
+     D,
+     dropout_mask,
+     gelu=True,
+     k_rev=None,
+     bidirectional=False,
+     print_activations=False,
+     layer_idx=None,
+     **kwargs,
+ ):
+     seqlen = u.shape[-1]
+     fft_size = 2 * seqlen
+
+     k_f = torch.fft.rfft(k, n=fft_size) / fft_size
+     k_f = adjust_filter_shape_for_broadcast(u, k_f)
+     k = k.squeeze()
+
+     if bidirectional:
+         u_f = torch.fft.rfft(u.to(dtype=k.dtype), n=fft_size)
+         k, k2 = k.split(k.shape[1] // 2, dim=1)
+         k2_f = torch.fft.rfft(k2, n=fft_size) / fft_size
+         y1 = u_f * k_f
+         y2 = u_f.conj() * k2_f.conj()
+
+         y = torch.fft.irfft(y1 + y2, n=fft_size, norm="forward")[..., :seqlen]
+
+     else:
+         if k_rev is not None:
+             k_rev_f = torch.fft.rfft(k_rev, n=fft_size) / fft_size
+             k_f = k_f + k_rev_f.conj()
+
+         u_f = torch.fft.rfft(u.to(dtype=k.dtype), n=fft_size)
+
+         y = torch.fft.irfft(u_f * k_f, n=fft_size, norm="forward")[..., :seqlen]
+
+     if print_activations:
+         activations_logger.info(f"post fftconv pre bias {y} {y.min()} {y.max()}")
+
+     out = y + u * D.unsqueeze(-1)
+
+     if print_activations:
+         activations_logger.info(f"post fftconv post bias {out} {out.min()} {out.max()}")
+
+     return out.to(dtype=u.dtype)
+
+
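A small numerical check of fftconv_func above (a sketch, not part of the commit; the package name in the import is a hypothetical placeholder): for a short filter it should agree with a causal depthwise conv1d plus the u * D skip term:

    import torch
    import torch.nn.functional as F
    # Hypothetical package name; adjust to the actual layout of these files.
    from evo_model.engine import fftconv_func

    B, C, L, K = 2, 4, 32, 5
    u = torch.randn(B, C, L)
    k = torch.zeros(C, L)
    k[:, :K] = torch.randn(C, K)      # short causal filter, zero-padded to length L
    D = torch.randn(C)

    y_fft = fftconv_func(u, k, D, dropout_mask=None, gelu=False)

    # Reference: depthwise causal conv1d (cross-correlation with the flipped
    # kernel and left padding) plus the skip connection.
    w = k[:, :K].flip(-1).unsqueeze(1)                      # (C, 1, K)
    y_ref = F.conv1d(F.pad(u, (K - 1, 0)), w, groups=C) + u * D.unsqueeze(-1)
    print(torch.allclose(y_fft, y_ref, atol=1e-4))          # expected: True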
87
+ def canonicalize_modal_system(poles, residues):
88
+ """Canonicalize a modal system.
89
+
90
+ Args:
91
+ poles (Tensor): The poles of the system.
92
+ residues (Tensor): The residues of the system.
93
+
94
+ Returns:
95
+ Tuple[Tensor, Tensor]: The canonicalized poles and residues.
96
+ """
97
+ raise NotImplementedError
98
+
99
+
100
+ def list_tensors(idx):
101
+ for obj in gc.get_objects():
102
+ try:
103
+ if torch.is_tensor(obj) and isinstance(obj, torch.Tensor):
104
+ # dump to log
105
+ print(type(obj), obj.size())
106
+ el = obj[0]
107
+ with open(f"tensors_{idx}.txt", "a") as f:
108
+ f.write(f"{type(obj)} {obj.size()} {el}\n")
109
+ except Exception:
110
+ pass
111
+
112
+
113
+ class HyenaInferenceEngine:
114
+ def __init__(
115
+ self,
116
+ fir_fn=None,
117
+ iir_prefill_style="modal-fft",
118
+ layer_idx=None,
119
+ ground_truth_activations_path=None,
120
+ print_activations=False,
121
+ hyena_flip_x1x2=False,
122
+ ) -> None:
123
+ self.fir_fn = fir_fn
124
+ assert iir_prefill_style in IIR_PREFILL_MODES, f"iir_prefill_style must be one of {IIR_PREFILL_MODES}"
125
+ self.iir_prefill_style = iir_prefill_style
126
+ self.layer_idx = layer_idx
127
+ self.low_mem_mode = False
128
+ self.ground_truth_activations_path = ground_truth_activations_path
129
+ self.print_activations = print_activations
130
+ self.hyena_flip_x1x2 = hyena_flip_x1x2
131
+
132
+ def parallel_fir(
133
+ self,
134
+ fir_fn,
135
+ u,
136
+ weight,
137
+ bias,
138
+ L,
139
+ dims,
140
+ groups=None,
141
+ gated_bias=False,
142
+ column_split_hyena=False,
143
+ dim_last=True,
144
+ fir_length=3,
145
+ gate=False,
146
+ inference_params=None,
147
+ prefill_mode=None,
148
+ padding_mask=None,
149
+ ):
150
+ L = u.shape[1] if dim_last else u.shape[2]
151
+ if gate:
152
+ hidden_size, num_attention_heads, hidden_size_per_attention_head, _, _ = dims
153
+ # Compatibility with training infra that column splits the projections
154
+ if column_split_hyena:
155
+ x2, x1, v = column_split(u, num_attention_heads, hidden_size_per_attention_head)
156
+ else:
157
+ x2, x1, v = u.split([hidden_size, hidden_size, hidden_size], dim=1)
158
+ if self.hyena_flip_x1x2:
159
+ x1, x2 = x2, x1
160
+ u = x1 * v
161
+
162
+ if self.print_activations:
163
+ activations_logger.info(f"q: {x2}, {x2.min()}, {x2.max()}")
164
+ activations_logger.info(f"k: {x1}, {x1.min()}, {x1.max()}")
165
+ activations_logger.info(f"v: {v}, {v.min()}, {v.max()}")
166
+ activations_logger.info(f"post pregate: {u}, {u.min()}, {u.max()}")
167
+
168
+ # prepare input layout, dimensions and dispatch to fir kernel
169
+ # Deprecated
170
+ if fir_fn != torch.nn.functional.conv1d:
171
+ if dim_last:
172
+ u = u.permute(0, 2, 1) # B, D, L
173
+ z = fir_fn(u)[:, :L] # B, L, D
174
+
175
+ elif fir_length >= 128:
176
+ with torch.autocast("cuda"):
177
+ z = fftconv_func(
178
+ u.to(torch.float32),
179
+ weight[:, :, :L].to(torch.float32),
180
+ bias,
181
+ None,
182
+ gelu=False,
183
+ bidirectional=False,
184
+ print_activations=self.print_activations,
185
+ groups=groups,
186
+ layer_idx=self.layer_idx,
187
+ )
188
+ z = z.to(u.dtype)
189
+ else:
190
+ if dim_last:
191
+ u = u.permute(0, 2, 1) # B, D, L
192
+
193
+ if groups is None:
194
+ g = u.shape[1]
195
+ else:
196
+ g = groups
197
+
198
+ z = fir_fn(
199
+ u.to(torch.float32),
200
+ weight.to(torch.float32),
201
+ bias=None,
202
+ stride=1,
203
+ padding=fir_length - 1,
204
+ groups=u.shape[1], # always set to D, regardless of filter grouping
205
+ )[..., :L]
206
+ if self.print_activations:
207
+ activations_logger.info(f"post filter: {z}, {z.min()}, {z.max()}")
208
+
209
+ z = z.to(u.dtype)
210
+
211
+ if gated_bias is False:
212
+ if self.print_activations:
213
+ activations_logger.info(f"post dw conv {z} {z.min()} {z.max()}")
214
+ # if self.ground_truth_activations_path:
215
+ # z_savanna = torch.load(f"{self.ground_truth_activations_path}/post_dw_conv_{self.layer_idx}.pt")
216
+ # z_savanna = z_savanna.permute(1, 2, 0)
217
+ # z_diff = (z.squeeze() - z_savanna.squeeze()).abs().max()
218
+ # activations_logger.info(f"dw_conv_diff: {z_diff}")
219
+
220
+ if bias is not None:
221
+ if gated_bias:
222
+ z = z + bias[None, :, None] * u
223
+ else:
224
+ z = z + bias[None, :, None]
225
+
226
+ # handle padding post fir, the only place with biases
227
+ if type(padding_mask) == torch.Tensor:
228
+ z = z * padding_mask[:, None]
229
+
230
+ if gate:
231
+ # if self.layer_idx == 1:
232
+ # breakpoint()
233
+ z = x2 * z
234
+
235
+ if self.print_activations:
236
+ activations_logger.info(f"hyena filter: {weight}, {weight.min()}, {weight.max()}")
237
+ activations_logger.info(f"post postgate: {z}, {z.min()}, {z.max()}")
238
+ # if self.ground_truth_activations_path:
239
+ # q_savanna = torch.load(f"{self.ground_truth_activations_path}/q_{self.layer_idx}.pt")
240
+ # k_savanna = torch.load(f"{self.ground_truth_activations_path}/k_{self.layer_idx}.pt")
241
+ # v_savanna = torch.load(f"{self.ground_truth_activations_path}/v_{self.layer_idx}.pt")
242
+
243
+ # q_diff = (x2 - q_savanna).abs()
244
+ # k_diff = (x1 - k_savanna).abs()
245
+ # v_diff = (v - v_savanna).abs()
246
+
247
+ # activations_logger.info(f"q_diff: {q_diff.max()}, {q_diff.mean()}")
248
+ # activations_logger.info(f"k_diff: {k_diff.max()}, {k_diff.mean()}")
249
+ # activations_logger.info(f"v_diff: {v_diff.max()}, {v_diff.mean()}")
250
+
251
+ # h_savanna = torch.load(f"/home/zymrael/checkpoints/evo2/activations/savanna/hyena_filter_{self.layer_idx}.pt")
252
+ # h_diff = (weight[..., :h_savanna.shape[-1]].squeeze() - h_savanna.squeeze()).abs()
253
+
254
+ # activations_logger.info(f"h_diff: {h_diff.max()}, {h_diff.mean()}")
255
+
256
+ if inference_params is not None:
257
+ fir_state = u[..., -fir_length + 1 :]
258
+ else:
259
+ fir_state = None
260
+
261
+ return z, fir_state
262
+
263
+ def parallel_iir(
264
+ self,
265
+ z_pre,
266
+ h,
267
+ D,
268
+ L,
269
+ poles,
270
+ residues,
271
+ t,
272
+ dims,
273
+ layer_idx,
274
+ inference_params=None,
275
+ prefill_style="fft",
276
+ fftconv_fn=None,
277
+ padding_mask=None,
278
+ use_flashfft=False,
279
+ column_split_hyena=False,
280
+ long_fir_threshold=None,
281
+ ):
282
+ """Compute the output state of the short convolutional filter."""
283
+ fft_size = 2 * L
284
+ hidden_size, num_attention_heads, hidden_size_per_attention_head, _, _ = dims
285
+ # Compatibility with training infra that column splits the projections
286
+ if column_split_hyena:
287
+ z = z_pre.reshape(
288
+ z_pre.shape[0],
289
+ num_attention_heads,
290
+ 3 * hidden_size_per_attention_head,
291
+ z_pre.shape[2],
292
+ )
293
+ x2, x1, v = (
294
+ z[:, :, :hidden_size_per_attention_head],
295
+ z[
296
+ :,
297
+ :,
298
+ hidden_size_per_attention_head : 2 * hidden_size_per_attention_head,
299
+ ],
300
+ z[:, :, 2 * hidden_size_per_attention_head :],
301
+ )
302
+ x2, x1, v = (
303
+ x2.reshape(x2.shape[0], -1, x2.shape[-1]),
304
+ x1.reshape(x1.shape[0], -1, x1.shape[-1]),
305
+ v.reshape(v.shape[0], -1, v.shape[-1]),
306
+ )
307
+ else:
308
+ x2, x1, v = z_pre.split([hidden_size, hidden_size, hidden_size], dim=1)
309
+
310
+ if self.hyena_flip_x1x2:
311
+ x1, x2 = x2, x1
312
+
313
+ x1v = x1 * v
314
+
315
+ if inference_params is not None and prefill_style == "recurrence":
316
+ y = self.prefill_via_direct_recurrence(
317
+ inference_params=inference_params,
318
+ x1v=x1v,
319
+ L=L,
320
+ poles=poles,
321
+ residues=residues,
322
+ )
323
+
324
+ else:
325
+ if use_flashfft and (L % 2) == 0: # only works with even L
326
+ y = fftconv_fn(
327
+ x1v.to(dtype=torch.bfloat16).contiguous(),
328
+ h.to(dtype=torch.float32),
329
+ )
330
+ X_s = None
331
+
332
+ elif long_fir_threshold is None:
333
+ H = torch.fft.rfft(h.to(dtype=torch.float32), n=fft_size) / fft_size
334
+ X_s = torch.fft.fft(x1v.to(dtype=torch.float32), n=fft_size)
335
+ X = X_s[..., : H.shape[-1]]
336
+ if len(z_pre.shape) > 3:
337
+ H = H.unsqueeze(1)
338
+ y = torch.fft.irfft(X * H, n=fft_size, norm="forward")[..., :L]
339
+
340
+ else:
341
+ assert h.shape[0] == 1, "batch size must be 1 for long_fir_threshold"
342
+ h = h[0][:, None] # rearrange to d, 1, l for depthwise conv1d
343
+ h = h[..., :long_fir_threshold]
344
+ y = F.conv1d(
345
+ x1v,
346
+ h.to(dtype=x1v.dtype),
347
+ stride=1,
348
+ groups=x1v.shape[1],
349
+ padding=h.shape[-1] - 1,
350
+ )[..., :L]
351
+ # if self.layer_idx == 2:
352
+ # breakpoint()
353
+ y = y.to(dtype=x1v.dtype)
354
+ y = (y + x1v * D.unsqueeze(-1)) * x2
355
+
356
+ if self.print_activations:
357
+ activations_logger.info(f"hyena filter: {h}, {h.min()}, {h.max()}")
358
+ activations_logger.info(f"post hyena iir gate: {y}, {y.min()}, {y.max()}")
359
+ activations_logger.info(f"q: {x2}, {x2.min()}, {x2.max()}")
360
+ activations_logger.info(f"k: {x1}, {x1.min()}, {x1.max()}")
361
+ activations_logger.info(f"v: {v}, {v.min()}, {v.max()}")
362
+ # if self.ground_truth_activations_path:
363
+ # q_savanna = torch.load(f"{self.ground_truth_activations_path}/q_{self.layer_idx}.pt")
364
+ # k_savanna = torch.load(f"{self.ground_truth_activations_path}/k_{self.layer_idx}.pt")
365
+ # v_savanna = torch.load(f"{self.ground_truth_activations_path}/v_{self.layer_idx}.pt")
366
+
367
+ # q_diff = (x2 - q_savanna).abs()
368
+ # k_diff = (x1 - k_savanna).abs()
369
+ # v_diff = (v - v_savanna).abs()
370
+
371
+ # activations_logger.info(f"q_diff: {q_diff.max()}, {q_diff.mean()}")
372
+ # activations_logger.info(f"k_diff: {k_diff.max()}, {k_diff.mean()}")
373
+ # activations_logger.info(f"v_diff: {v_diff.max()}, {v_diff.mean()}")
374
+
375
+ # h_savanna = torch.load(f"/home/zymrael/checkpoints/evo2/activations/savanna/hyena_filter_{self.layer_idx}.pt")
376
+
377
+ # h_diff = (h[..., :h_savanna.shape[-1]].squeeze() - h_savanna.squeeze()).abs()
378
+ # activations_logger.info(f"h_diff: {h_diff.max()}, {h_diff.mean()}")
379
+
380
+ if inference_params is not None:
381
+ if prefill_style == "fft":
382
+ self.prefill_via_modal_fft(
383
+ inference_params=inference_params,
384
+ x1v=x1v,
385
+ X_s=X_s,
386
+ L=L,
387
+ t=t,
388
+ poles=poles,
389
+ dims=dims,
390
+ layer_idx=layer_idx,
391
+ use_flashfft=use_flashfft,
392
+ fftconv_fn=fftconv_fn,
393
+ )
394
+
395
+ elif prefill_style == "recurrence":
396
+ # recurrent prefill is done before
397
+ pass
398
+ else:
399
+ raise NotImplementedError
400
+ if self.low_mem_mode:
401
+ # TODO: smarter gc
402
+ del z_pre, x2, x1, v, x1v, h, poles, residues
403
+ torch.cuda.empty_cache()
404
+
405
+ return y.permute(0, 2, 1)
406
+
407
+ def step_fir(self, u, fir_state, weight, bias=None, gated_bias=False, flip_filter=False):
408
+ """Steps forward FIR filters in the architecture.
409
+
410
+ FIR filters generally include truncated convolutions in Hyena with an explicit or hybrid time-domain parametrization:
411
+ * Short FIR filters in Hyena featurizers
412
+ * Short and medium FIR filters in Hyena operators
413
+
414
+ Note:
415
+ `fir_state` contains the last (FIR filter length - 1) elements of `u`: `u_{L-2}, u_{L-1}, ...`
416
+ We assume dimensions of `short_filter_weight` to be `[d, 1, short_filter_len]`.
417
+ """
418
+ weight = weight.squeeze()
419
+
420
+ cache_size = fir_state.shape[-1]
421
+ filter_length = weight.shape[-1]
422
+ if flip_filter:
423
+ weight = weight.flip(-1)
424
+ weight = weight[..., -cache_size - 1 :].unsqueeze(0)
425
+ else:
426
+ weight = weight[..., : cache_size + 1].unsqueeze(0)
427
+
428
+ input_dtype = u.dtype
429
+ weight = weight.to(torch.float32)
430
+ u = u.to(torch.float32)
431
+ fir_state = fir_state.to(torch.float32)
432
+ bias = bias.to(torch.float32) if bias is not None else None
433
+
434
+ h0, h = weight[..., -1], weight[..., :-1]
435
+ y = h0 * u + torch.sum(fir_state * h, dim=-1)
436
+
437
+ if bias is not None:
438
+ if gated_bias:
439
+ y = y + bias * u
440
+ else:
441
+ y = y + bias
442
+
443
+ # Update the state
444
+ if cache_size < filter_length - 1:
445
+ fir_state = torch.cat([fir_state, u[..., None]], dim=-1)
446
+ else:
447
+ fir_state = torch.roll(fir_state, -1, dims=2)
448
+ fir_state[..., -1] = u
449
+
450
+ return y.to(input_dtype), fir_state
451
+
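# For intuition, a minimal sketch (editorial, not part of the vortex source; shapes and
# values are made up) of the single-step FIR update that `step_fir` performs, ignoring
# bias, dtype casts, and `flip_filter`:
import torch

b, d, K = 2, 4, 3                      # hypothetical batch, channels, filter length
weight = torch.randn(d, K)             # per-channel FIR taps
fir_state = torch.randn(b, d, K - 1)   # cache of the last K-1 inputs
u_t = torch.randn(b, d)                # current input step

# One step: newest tap applied to u_t, older taps applied to the cached inputs.
h0, h = weight[..., -1], weight[..., :-1]
y_t = h0 * u_t + torch.sum(fir_state * h, dim=-1)

# Roll the cache so it again holds the most recent K-1 inputs.
fir_state = torch.cat([fir_state[..., 1:], u_t[..., None]], dim=-1)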
452
+ def step_iir(self, x2, x1, v, D, residues, poles, iir_state, iir_groups=1):
453
+ # TODO: kernelize
454
+ x1v = x1 * v
455
+ poles = torch.exp(poles) # poles arg contains log_poles
456
+ poles = poles[..., 0][None] # squeeze dummy seqlen dim and add dummy batch dim
457
+ residues = residues[None] # add dummy batch dim
458
+ iir_state = poles * iir_state + x1v[..., None]
459
+
460
+ res_state = torch.sum(residues * iir_state, dim=-1)
461
+
462
+ if iir_groups > 1:
463
+ raise NotImplementedError
464
+ # if self.layer_idx == 2:
465
+ # breakpoint()
466
+ y = x2 * (res_state + D * x1v)
467
+
468
+ return y, iir_state
469
+
470
+ def prefill_via_fir_caching(self, u, inference_params, L, *args, **kwargs):
471
+ """Turns the IIR filter into a FIR and uses a cache for decoding."""
472
+ raise NotImplementedError(":)")
473
+
474
+ def prefill_via_direct_recurrence(self, inference_params, x1v, L, residues, poles, *args, **kwargs) -> torch.Tensor:
475
+ """
476
+ Compute the IIR state via explicit recurrence (modal form)
477
+
478
+ This is the most memory efficient prefilling method for Hyena filters.
479
+
480
+ Note:
481
+ dtypes: [state: float32, poles: float32, x1v: bfloat16, output: bfloat16]
482
+ """
483
+ state_dim = poles.shape[1]
484
+ x1v_ = x1v[..., None, None] # b, d, l, sdim, reim
485
+ x1v_ = x1v_.repeat(1, 1, 1, state_dim, 2) # b, d, l, sdim, reim
486
+ x1v_[..., 1] = 0
487
+
488
+ state = 0 * x1v_[:, :, 0]
489
+ output = 0 * x1v_[:, :, :, 0, 0] # b, d, l
490
+
491
+ # suppress dummy seqlen dimension
492
+ poles = poles[:, :, 0][None]
493
+ residues = residues[:, :, 0][None].repeat(x1v_.shape[0], 1, 1, 1) # b, d, sdim, reim
494
+
495
+ # state: b, d, sdim, reim
496
+ # poles: 1, d, sdim, reim
497
+ # x1v_: b, d, l, sdim, reim
498
+ for i in range(L):
499
+ state[..., 0] = poles[..., 0] * state[..., 0] - poles[..., 1] * state[..., 1] + x1v_[:, :, i, :, 0]
500
+ state[..., 1] = poles[..., 0] * state[..., 1] + poles[..., 1] * state[..., 0] + x1v_[:, :, i, :, 1]
501
+ output[:, :, i] = torch.sum(residues * state, dim=-2)[..., 0] # .real
502
+
503
+ inference_params.state_dict[self.layer_idx] = state.to(dtype=torch.float32)
504
+
505
+ return output
506
+
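# The split real/imaginary updates in `prefill_via_direct_recurrence` are the expanded
# form of a complex first-order recurrence. A rough illustrative equivalent using torch
# complex tensors (editorial sketch; shapes and values are assumptions, not vortex code):
import torch

b, d, L, sdim = 1, 2, 5, 3
poles = 0.1 * torch.randn(d, sdim, dtype=torch.cfloat)   # complex poles p_s
residues = torch.randn(d, sdim, dtype=torch.cfloat)      # complex residues R_s
x = torch.randn(b, d, L)

state = torch.zeros(b, d, sdim, dtype=torch.cfloat)
output = torch.zeros(b, d, L)
for i in range(L):
    state = poles * state + x[..., i, None]              # s_t = p * s_{t-1} + x_t
    output[..., i] = torch.sum(residues * state, dim=-1).real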
507
+ def prefill_via_hybrid_recurrence(self, inference_params, u, log_poles, x1v_f_a, L, *args, **kwargs):
508
+ """
509
+ Compute the IIR state via hybrid recurrence-convolution over blocks
510
+ """
511
+ raise NotImplementedError(":)")
512
+
513
+ def prefill_via_scan(self, u, inference_params=None, *args, **kwargs):
514
+ raise NotImplementedError
515
+
516
+ def prefill_via_canonical_fft(self, u, inference_params=None, *args, **kwargs):
517
+ """
518
+ Compute the IIR state via a single FFT
519
+
520
+ This is the most memory efficient "parallelized" prefilling method for Hyena.
521
+
522
+ From: https://arxiv.org/abs/2310.18780
523
+ """
524
+ raise NotImplementedError(":)")
525
+
526
+ def prefill_via_modal_fft(
527
+ self,
528
+ inference_params,
529
+ x1v,
530
+ L,
531
+ poles,
532
+ t,
533
+ dims,
534
+ layer_idx,
535
+ X_s=None,
536
+ use_flashfft=False,
537
+ fftconv_fn=None,
538
+ state_dtype=torch.float32,
539
+ *args,
540
+ **kwargs,
541
+ ):
542
+ """
543
+ Compute the IIR state via a single FFT
544
+ """
545
+ # When the model has a long convolution derived from a recurrence in modal form and prefill_style is "fft",
546
+ # we split the filter into poles and residues and reuse FFT computation on the input.
547
+ hidden_size, _, _, state_size, hyena_filter_groups = dims
548
+
549
+ assert X_s is not None
550
+ bs = x1v.shape[0]
551
+ fft_size = 2 * L
552
+ # poles = torch.view_as_complex(poles.to(torch.float32))
553
+ state_s = (poles.to(torch.float32) * t).exp()
554
+
555
+ # state_s = poles**t
556
+ state_S = torch.fft.fft(state_s, n=fft_size).repeat(bs, 1, 1, 1) # B, D, state_dim, 2 * L
557
+ if hyena_filter_groups > 1:
558
+ state_S = state_S.repeat_interleave(hidden_size // hyena_filter_groups, 1)
559
+ state = torch.fft.ifft(X_s[..., None, :] * state_S, n=fft_size)
560
+ inference_params.state_dict[layer_idx] = state[..., L - 1].to(dtype=state_dtype)
561
+
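# Sanity check on the FFT prefill (editorial sketch, not vortex code): the modal state
# after L steps, sum_t p**(L-1-t) * x_t, is one entry of the causal convolution of the
# input with the geometric sequence p**t, which the FFT product above evaluates at once.
import torch

L = 8
p = torch.tensor(0.9 + 0.0j)                 # a single modal pole, for illustration
x = torch.randn(L, dtype=torch.cfloat)

# Direct recurrence: s_t = p * s_{t-1} + x_t
s = torch.zeros((), dtype=torch.cfloat)
for t in range(L):
    s = p * s + x[t]

# FFT route: convolve x with [p^0, p^1, ..., p^{L-1}] and read off position L-1.
fft_size = 2 * L
kernel = p ** torch.arange(L, dtype=torch.float32)
conv = torch.fft.ifft(torch.fft.fft(x, n=fft_size) * torch.fft.fft(kernel, n=fft_size))
assert torch.allclose(s, conv[L - 1], atol=1e-4)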
562
+ def _compute_state(self, log_poles, u, t, L, *args, **kwargs):
563
+ """
564
+ Compute the IIR state given an input `u` and log_poles of the modal system.
565
+ """
566
+ bs = u.shape[0]
567
+ fft_size = 2 * L
568
+ U = torch.fft.rfft(u.to(torch.float32), n=fft_size)
569
+ fft_size = 2 * L
570
+ x = (log_poles * t).exp()
571
+ # [batch, hidden_size, state_dim, 2 * seqlen]
572
+ X = torch.fft.fft(x, n=fft_size).repeat(bs, 1, 1, 1)
573
+ state = torch.fft.ifft(U[..., None, :] * X, n=fft_size)[..., :L]
574
+ return state
575
+
576
+
577
+ # This class doesn't appear to be used anywhere; consider commenting it out or removing it.
578
+ class HyenaFilter:
579
+ """Handles Hyena filter computations including FFT and direct convolution."""
580
+
581
+ def __init__(self, use_flash_fft=False):
582
+ self.use_flash_fft = use_flash_fft
583
+
584
+ def fft_conv(self, u, k, D, **kwargs):
585
+ """FFT-based convolution implementation."""
586
+ seqlen = u.shape[-1]
587
+ fft_size = 2 * seqlen
588
+
589
+ k_f = self._prepare_filter(k, u, fft_size)
590
+ y = self._compute_fft_conv(u, k_f, fft_size, seqlen, **kwargs)
591
+
592
+ return y + u * D.unsqueeze(-1)
593
+
594
+ def _prepare_filter(self, k, u, fft_size):
595
+ """Prepare filter for FFT convolution."""
596
+ k_f = torch.fft.rfft(k, n=fft_size) / fft_size
597
+ return adjust_filter_shape_for_broadcast(u, k_f)
generation.py ADDED
@@ -0,0 +1,373 @@
1
+ # Copied verbatim from vortex
2
+ # Copyright (c) 2024, Michael Poli.
3
+
4
+ from dataclasses import dataclass
5
+
6
+ import torch
7
+ import sys
8
+ import numpy as np
9
+
10
+ from .sample import sample
11
+ from .tokenizer import CharLevelTokenizer
12
+ from .utils import print_rank_0
13
+
14
+
15
+ class Generator:
16
+ def __init__(self, model, tokenizer, top_k=50, top_p=0.7, temperature=1):
17
+ self.model = model
18
+ self.tokenizer = tokenizer
19
+ self.top_k = top_k
20
+ self.top_p = top_p
21
+ self.temperature = temperature
22
+ self.untils = ["\n\n"]
23
+
24
+ def generate(
25
+ self,
26
+ device: str,
27
+ input_string: str = None,
28
+ input_ids: torch.Tensor = None,
29
+ num_tokens: int = 32,
30
+ cached_generation: bool = True,
31
+ force_prompt_threshold: int = None,
32
+ max_seqlen: int = None,
33
+ print_generation: bool = True,
34
+ verbose: bool = False,
35
+ skip_special_tokens: bool = False,
36
+ stop_at_eos: bool = True,
37
+ inference_params_dict: dict = None,
38
+ token_callback=lambda i: None,
39
+ ) -> tuple[torch.Tensor, torch.Tensor, dict]:
40
+ """
41
+ Generates using the model with optional cached sampling replay.
42
+
43
+ This method enables passing in and returning the `inference_params_dict` for
44
+ replaying cached sampling from a given state, for example for beam search.
45
+
46
+ Args:
47
+ device: The device to run the model on.
48
+ input_string: The input prompt to generate from.
49
+ input_ids: The input prompt token ids to generate from.
50
+ num_tokens: The number of tokens to generate.
51
+ cached_generation: Whether to use cached generation. Defaults to True.
52
+ force_prompt_threshold: Number of tokens to prefill in parallel before
53
+ switching to prompt forcing. Used to reduce peak memory usage and
54
+ support longer prompts. Defaults to None.
55
+ max_seqlen: Maximum sequence length to generate. Determines the max size
56
+ of the cache if larger. Otherwise automatically determined using
57
+ prompt length + num_tokens. Defaults to None.
58
+ print_generation: Whether to print generated tokens. Defaults to True.
59
+ verbose: Whether to print verbose output. Defaults to False.
60
+ skip_special_tokens: Whether to skip special tokens. Defaults to False.
61
+ stop_at_eos: Whether to stop generation at EOS token. Defaults to True.
62
+ inference_params_dict: Dictionary of inference parameters to use for
63
+ replaying cached sampling. Defaults to None.
64
+ token_callback: Optional callback function called after each token is
65
+ generated. Defaults to a no-op.
66
+
67
+ Returns:
68
+ tuple: The generated token ids, the per-step logits, and the inference parameters
69
+ dictionary, which can be used to replay the exact same sampling sequence.
70
+ """
71
+ if isinstance(self.tokenizer.eos, int):
72
+ eos_token_ids = torch.LongTensor([self.tokenizer.eos]).to(device)
73
+ else:
74
+ eos_token_ids = self.tokenizer.tokenize(self.tokenizer.eos).to(device)
75
+
76
+ if input_ids is None:
77
+ input = self.tokenizer.tokenize(input_string)
78
+ if isinstance(input, list):
79
+ input = torch.LongTensor(input).unsqueeze(0).to(device)
80
+ else:
81
+ input = input.unsqueeze(0).to(device)
82
+ else:
83
+ input = input_ids
84
+ x = input
85
+
86
+ if max_seqlen is not None:
87
+ x = x[:, -max_seqlen:]
88
+
89
+ num_tokens = int(num_tokens)
90
+ batch_size = x.shape[0]
91
+
92
+ prompt_length = x.shape[1]
93
+ prompt_forcing = inference_params_dict is None and force_prompt_threshold is not None and prompt_length > force_prompt_threshold
94
+ if prompt_forcing:
95
+ forced_prompt_length = prompt_length - force_prompt_threshold
96
+ x_force = x[:, force_prompt_threshold:]
97
+ x = x[:, :force_prompt_threshold]
98
+ else:
99
+ forced_prompt_length = 0
100
+ tot_length = prompt_length + num_tokens
101
+ if max_seqlen is not None:
102
+ if max_seqlen > tot_length:
103
+ tot_length = max_seqlen
104
+
105
+ generation = torch.empty(
106
+ x.shape[0],
107
+ num_tokens,
108
+ dtype=torch.long,
109
+ device=x.device,
110
+ )
111
+
112
+ scores = torch.empty(
113
+ x.shape[0],
114
+ num_tokens,
115
+ self.tokenizer.vocab_size,
116
+ dtype=torch.float,
117
+ device=x.device,
118
+ )
119
+
120
+ if inference_params_dict is not None:
121
+ cached_generation = True
122
+ prefilled = True
123
+ # Ensure that the cached data is loaded on the correct device.
124
+ if any(data.device != x.device for data in inference_params_dict["hcl"].fir_state_dict.values()):
125
+ for key, data in inference_params_dict["mha"].key_value_memory_dict.items():
126
+ inference_params_dict["mha"].key_value_memory_dict[key] = data.to(x.device)
127
+ for key, data in inference_params_dict["hcl"].fir_state_dict.items():
128
+ inference_params_dict["hcl"].fir_state_dict[key] = data.to(x.device)
129
+ for key, data in inference_params_dict["hcl"].state_dict.items():
130
+ inference_params_dict["hcl"].state_dict[key] = data.to(x.device)
131
+ for key, data in inference_params_dict["hcm"].fir_inner_state_dict.items():
132
+ inference_params_dict["hcm"].fir_inner_state_dict[key] = data.to(x.device)
133
+ for key, data in inference_params_dict["hcm"].fir_state_dict.items():
134
+ inference_params_dict["hcm"].fir_state_dict[key] = data.to(x.device)
135
+ for key, data in inference_params_dict["hcm"].state_dict.items():
136
+ inference_params_dict["hcm"].state_dict[key] = data.to(x.device)
137
+ for key, data in inference_params_dict["hcs"].fir_state_dict.items():
138
+ inference_params_dict["hcs"].fir_state_dict[key] = data.to(x.device)
139
+ for key, data in inference_params_dict["hcs"].fir_inner_state_dict.items():
140
+ inference_params_dict["hcs"].fir_inner_state_dict[key] = data.to(x.device)
141
+ for key, data in inference_params_dict["hcs"].state_dict.items():
142
+ inference_params_dict["hcs"].state_dict[key] = data.to(x.device)
143
+ inference_params_dict["mha"].max_batch_size = batch_size
144
+ elif cached_generation:
145
+ inference_params_dict = self.model.initialize_inference_params(max_seqlen=tot_length)
146
+ inference_params_dict["mha"].max_batch_size = batch_size
147
+ prefilled = False
148
+ else:
149
+ inference_params_dict = None
150
+ prefilled = False
151
+
152
+ if verbose:
153
+ mem_after_tok = torch.cuda.memory_allocated(device=x.device) / 1e9
154
+ print_rank_0(f"Memory after tokenization: {mem_after_tok} GB")
155
+ print_rank_0("Starting generation...")
156
+ if input_string is not None:
157
+ print_rank_0("Prompt: " + input_string)
158
+ else:
159
+ print_rank_0(f"Prompt ids: {input_ids} {input_ids.shape}")
160
+
161
+ i = 0
162
+ for i in range(forced_prompt_length + num_tokens):
163
+ post_prefill = prefilled or (cached_generation and i > 0)
164
+
165
+ # prefill then process only the last token
166
+ if post_prefill:
167
+ x = x[:, -1:]
168
+ seqlen_offset = inference_params_dict["mha"].seqlen_offset
169
+
170
+ if seqlen_offset == 0:
171
+ if prompt_forcing:
172
+ seqlen_offset = force_prompt_threshold
173
+ else:
174
+ seqlen_offset = input.shape[-1]
175
+ inference_params_dict["mha"].seqlen_offset = seqlen_offset
176
+ inference_params_dict["hcl"].seqlen_offset = seqlen_offset
177
+ inference_params_dict["hcm"].seqlen_offset = seqlen_offset
178
+ inference_params_dict["hcs"].seqlen_offset = seqlen_offset
179
+ else:
180
+ inference_params_dict["mha"].seqlen_offset += 1
181
+ inference_params_dict["hcl"].seqlen_offset += 1
182
+ inference_params_dict["hcm"].seqlen_offset += 1
183
+ inference_params_dict["hcs"].seqlen_offset += 1
184
+
185
+ # do forward pass with no gradient
186
+ with torch.inference_mode():
187
+ logits, inference_params_dict = self.model(
188
+ x,
189
+ inference_params_dict=inference_params_dict,
190
+ )
191
+
192
+ token_callback(i)
193
+
194
+ last_logits = logits[:, -1]
195
+
196
+ if prompt_forcing and i < forced_prompt_length:
197
+ new_idx = x_force[:, i]
198
+ else:
199
+ new_idx = sample(
200
+ last_logits,
201
+ top_k=self.top_k,
202
+ top_p=self.top_p,
203
+ temperature=self.temperature,
204
+ )
205
+
206
+ if stop_at_eos and (generation[0, -1:] == eos_token_ids).all():
207
+ print("Stopping generation at EOS")
208
+
209
+ if print_generation and verbose and batch_size == 1:
210
+ print(
211
+ f"{self.tokenizer.detokenize([new_idx.item()])}",
212
+ end=" ",
213
+ flush=True,
214
+ )
215
+
216
+ if prompt_forcing:
217
+ if i >= forced_prompt_length:
218
+ scores[:, i - forced_prompt_length] = last_logits
219
+ generation[:, i - forced_prompt_length] = new_idx
220
+ else:
221
+ scores[:, i] = last_logits
222
+ generation[:, i] = new_idx
223
+
224
+ if post_prefill:
225
+ x = new_idx[:, None]
226
+ else:
227
+ x = torch.cat([x, new_idx[:, None]], dim=-1)
228
+
229
+ if verbose:
230
+ y = self.tokenizer.detokenize_batch(generation[:, : i + 1])
231
+
232
+ for until in self.untils:
233
+ if until in y:
234
+ y = y.split(until)[0]
235
+ break
236
+
237
+ print(f"\nInput: {input_string}, Output: {y}")
238
+
239
+ mem_end = torch.cuda.memory_allocated(device=x.device) / 1e9
240
+ print(f"Memory after generation: {mem_end} GB")
241
+
242
+ return generation[:, : i + 1], scores[:, : i + 1], inference_params_dict
243
+
244
+
245
+ def logits_to_logprobs(logits: torch.Tensor, tokens: torch.Tensor) -> torch.Tensor:
246
+ """Convert logits to log probabilities."""
247
+ probs = torch.log_softmax(logits, dim=-1)
248
+ return torch.gather(probs, -1, tokens.unsqueeze(-1)).squeeze(-1)
249
+
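# A small usage sketch for `logits_to_logprobs` (editorial; the tensors are made up):
import torch

logits = torch.tensor([[[2.0, 0.0, -1.0], [0.5, 0.5, 0.5]]])  # (batch=1, seq=2, vocab=3)
tokens = torch.tensor([[0, 2]])                                # token ids actually chosen
logprobs = logits_to_logprobs(logits, tokens)                  # (1, 2) per-token log-probabilities
print(logprobs.exp())                                          # probabilities of the chosen tokens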
250
+
251
+ def prepare_batch(
252
+ seqs: list[str], tokenizer: CharLevelTokenizer, prepend_bos: bool = False, device: str = "cuda:0"
253
+ ) -> tuple[torch.Tensor, list[int]]:
254
+ """Prepare a batch of sequences for the model."""
255
+ if prepend_bos:
256
+ seqs = [tokenizer.bos + seq for seq in seqs]
257
+
258
+ tokens = [tokenizer.tokenize(seq) for seq in seqs]
259
+ if isinstance(tokens[0], list):
260
+ tokens = [torch.tensor(t, dtype=torch.long) for t in tokens]
261
+
262
+ max_len = max(len(t) for t in tokens)
263
+ batch = torch.zeros((len(tokens), max_len), dtype=torch.long)
264
+
265
+ for i, t in enumerate(tokens):
266
+ batch[i, : len(t)] = t
267
+
268
+ return batch.to(device), [len(t) for t in tokens]
269
+
270
+
271
+ @dataclass(kw_only=True)
272
+ class GenerationOutput:
273
+ sequences: list[str]
274
+ logits: list[torch.Tensor]
275
+ logprobs_mean: list[float]
276
+
277
+
278
+ def generate(
279
+ *,
280
+ prompt_seqs: list[str],
281
+ model,
282
+ tokenizer: CharLevelTokenizer,
283
+ n_tokens: int = 100,
284
+ temperature: float = 0.0,
285
+ top_k: int = 1,
286
+ top_p: float = 1.0,
287
+ batched: bool = True,
288
+ prepend_bos: bool = False,
289
+ force_prompt_threshold: int = 1000,
290
+ cached_generation: bool = True,
291
+ verbose: int = 1,
292
+ device: str = "cuda:0",
293
+ **kwargs,
294
+ ) -> GenerationOutput:
295
+ """
296
+ Performs generation from a list of prompts.
297
+ If all prompts are the same length, this can do batched generation.
298
+ Also supports cached generation for efficient sampling.
299
+ """
300
+ model.eval()
301
+
302
+ g = Generator(
303
+ model,
304
+ tokenizer,
305
+ top_k=top_k,
306
+ top_p=top_p,
307
+ temperature=temperature,
308
+ )
309
+
310
+ uniform_lengths = all(len(s) == len(prompt_seqs[0]) for s in prompt_seqs)
311
+
312
+ if batched and uniform_lengths:
313
+ input_ids_list = [
314
+ prepare_batch(
315
+ prompt_seqs,
316
+ tokenizer,
317
+ prepend_bos=prepend_bos,
318
+ device=device,
319
+ )[0]
320
+ ]
321
+ else:
322
+ sys.stderr.write("WARNING: Batched generation is turned off.\n")
323
+ input_ids_list = [
324
+ prepare_batch(
325
+ [prompt_seq],
326
+ tokenizer,
327
+ prepend_bos=prepend_bos,
328
+ device=device,
329
+ )[0]
330
+ for prompt_seq in prompt_seqs
331
+ ]
332
+
333
+ generated_seqs, generated_scores, logitss = [], [], []
334
+ for input_ids in input_ids_list:
335
+ batch_size = input_ids.shape[0]
336
+
337
+ output_ids, logits, _ = g.generate(
338
+ input_ids=input_ids,
339
+ num_tokens=n_tokens,
340
+ device=device,
341
+ print_generation=(verbose > 1),
342
+ verbose=(verbose > 1),
343
+ stop_at_eos=False,
344
+ force_prompt_threshold=force_prompt_threshold,
345
+ cached_generation=cached_generation,
346
+ **kwargs,
347
+ )
348
+
349
+ if verbose > 1:
350
+ print("input_ids.shape", input_ids.shape)
351
+ print("output_ids.shape", output_ids.shape)
352
+ print("logits.shape", logits.shape)
353
+
354
+ generated_seqs_batch = list(tokenizer.detokenize_batch(output_ids))
355
+ assert len(generated_seqs_batch) == batch_size
356
+ generated_seqs += generated_seqs_batch
357
+ logitss.append(logits)
358
+
359
+ logprobs = logits_to_logprobs(logits, output_ids)
360
+ logprobs = logprobs.float().cpu().numpy()
361
+
362
+ generated_scores += [np.mean(logprobs[idx]) for idx in range(batch_size)]
363
+
364
+ assert len(generated_seqs) == len(generated_scores) == len(prompt_seqs)
365
+ if verbose:
366
+ for seq, score, prompt in zip(generated_seqs, generated_scores, prompt_seqs):
367
+ print(f'Prompt: "{prompt}",\tOutput: "{seq}",\tScore: {score}')
368
+
369
+ return GenerationOutput(
370
+ sequences=generated_seqs,
371
+ logits=logitss,
372
+ logprobs_mean=generated_scores,
373
+ )
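# Editorial usage sketch for the module-level `generate` helper above: `model` and
# `tokenizer` are assumed to be a loaded StripedHyena and CharLevelTokenizer from this
# repo, and the prompt strings are placeholders.
out = generate(
    prompt_seqs=["ACGTACGT", "ACGTTTTT"],   # equal lengths, so batched generation is used
    model=model,
    tokenizer=tokenizer,
    n_tokens=16,
    temperature=1.0,
    top_k=4,
    device="cuda:0",
    verbose=1,
)
print(out.sequences)        # generated continuations
print(out.logprobs_mean)    # mean per-token log-probability of each continuation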
layers.py ADDED
@@ -0,0 +1,272 @@
1
+ # Copied verbatim from vortex (minus the commented out code)
2
+ # Copyright (c) 2024, Michael Poli.
3
+
4
+ import torch
5
+ import torch.nn as nn
6
+ import torch.nn.functional as F
7
+ from torch import Tensor
8
+ from typing import Callable
9
+ from .utils import grab_first_if_tuple
10
+
11
+ from transformer_engine.pytorch import Linear
12
+ from transformer_engine.common.recipe import Format, DelayedScaling
13
+ import transformer_engine.pytorch as te
14
+
15
+ # Not bothering with ops right now (which is an interface with custom Triton
16
+ # kernels)
17
+ # try:
18
+ # from hyena_ops import hyena_se_fwd, hyena_mr_fwd, hyena_li_fwd
19
+ # except ImportError:
20
+ # hyena_se_fwd, hyena_mr_fwd, hyena_li_fwd = None, None, None
21
+
22
+ hyena_se_fwd, hyena_mr_fwd, hyena_li_fwd = None, None, None
23
+
24
+
25
+ def set_format_recipe():
26
+ fp8_format = Format.HYBRID # E4M3 during forward pass, E5M2 during backward pass
27
+ fp8_recipe = DelayedScaling(fp8_format=fp8_format, amax_history_len=16, amax_compute_algo="max")
28
+ return fp8_format, fp8_recipe
29
+
30
+
31
+ class TELinear(Linear):
32
+ """
33
+ Wrapper for Transformer-Engine's `Linear` layer.
34
+
35
+ Note that if Megatron's parallel_state has not been initialized
36
+ yet, the tp_group passed to TE will be None and must be set later
37
+ via set_tensor_parallel_group().
38
+ """
39
+
40
+ def __init__(
41
+ self,
42
+ input_size: int,
43
+ output_size: int,
44
+ init_method: Callable,
45
+ bias: bool = True,
46
+ skip_bias_add: bool = False,
47
+ use_fp8: bool = False,
48
+ **kwargs,
49
+ ):
50
+ # Parameters are initialized at higher precision even if fp8
51
+ # is used
52
+ params_dtype = torch.bfloat16
53
+
54
+ # TE returns a zero length Tensor when bias=False and
55
+ # return_bias=True, but we prefer None. So in that case we
56
+ # tell TE to not return the bias, and return None
57
+ # ourselves. This way our forward always returns two values
58
+ # and we don't have to deal with the zero length Tensor.
59
+ self.te_return_bias = skip_bias_add and bias
60
+
61
+ self.use_fp8_input_projections = use_fp8
62
+ if use_fp8:
63
+ self.fp8_format, self.fp8_recipe = set_format_recipe()
64
+
65
+ super().__init__(
66
+ in_features=input_size,
67
+ out_features=output_size,
68
+ sequence_parallel=False,
69
+ fuse_wgrad_accumulation=False,
70
+ tp_group=None,
71
+ tp_size=1,
72
+ init_method=init_method,
73
+ params_dtype=params_dtype,
74
+ parallel_mode=None,
75
+ bias=bias,
76
+ return_bias=self.te_return_bias,
77
+ **kwargs,
78
+ )
79
+
80
+ def forward(self, x):
81
+ if self.use_fp8_input_projections:
82
+ with te.fp8_autocast(enabled=True, fp8_recipe=self.fp8_recipe):
83
+ out = super().forward(x)
84
+ else:
85
+ out = super().forward(x)
86
+
87
+ # TE only returns a tuple when return_bias is True, otherwise
88
+ # it returns a single Tensor, we always want to return two
89
+ # values regardless of the arguments.
90
+ if self.te_return_bias:
91
+ return out
92
+ return out, None
93
+
94
+
95
+ class FlexLinear:
96
+ """
97
+ Megatron and Transformer Engine linear layer compatible with fp8, bf16, fp16 and fp32
98
+ """
99
+
100
+ def __new__(
101
+ self,
102
+ input_size,
103
+ output_size,
104
+ config,
105
+ parallel_mode: str,
106
+ bias: bool = False,
107
+ skip_bias_add: bool = True,
108
+ use_fp8: bool = False,
109
+ input_is_parallel=False, # for row parallel
110
+ gather_output: bool = True, # for column parallel
111
+ parallel_output: bool = False, # for row parallel
112
+ **kwargs,
113
+ ):
114
+ # use_fp8 = config.use_fp8_linears
115
+ self.config = config
116
+ instance = None
117
+
118
+ if use_fp8:
119
+ instance = TELinear(
120
+ input_size=input_size,
121
+ output_size=output_size,
122
+ config=self.config,
123
+ parallel_mode=parallel_mode,
124
+ bias=bias,
125
+ skip_bias_add=skip_bias_add,
126
+ **kwargs,
127
+ )
128
+
129
+ return instance
130
+
131
+
132
+ class RMSNorm(torch.nn.Module):
133
+ def __init__(self, config):
134
+ super(RMSNorm, self).__init__()
135
+ self.eps, self.hidden_size = config.eps, config.hidden_size
136
+ self.scale = torch.nn.Parameter(torch.ones(self.hidden_size, dtype=config.params_dtype))
137
+ self.register_parameter("scale", self.scale)
138
+ self.use_flash_rmsnorm = config.get("use_flash_rmsnorm", False)
139
+
140
+ if self.use_flash_rmsnorm:
141
+ from flash_attn.ops.rms_norm import rms_norm as rmsnorm_func
142
+
143
+ self.rmsnorm_func = rmsnorm_func
144
+
145
+ def forward(self, x):
146
+ if self.use_flash_rmsnorm:
147
+ return self.rmsnorm_func(x, self.scale, self.eps)
148
+ else:
149
+ y = x / (x.norm(2, dim=-1, keepdim=True) * self.hidden_size ** (-1.0 / 2) + self.eps)
150
+ return self.scale * y
151
+
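# For reference (editorial note): `x.norm(2, dim=-1, keepdim=True) * hidden_size ** (-1/2)`
# equals the root-mean-square of `x`, so the forward above is RMSNorm with `eps` added
# outside the square root. A quick numerical check of that equivalence:
import torch

d = 8
x = torch.randn(3, d)
eps = 1e-6

rms = x.pow(2).mean(dim=-1, keepdim=True).sqrt()                    # root-mean-square of x
y_ref = x / (rms + eps)
y_fwd = x / (x.norm(2, dim=-1, keepdim=True) * d ** (-0.5) + eps)   # same expression as forward()
assert torch.allclose(y_ref, y_fwd, atol=1e-5)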
152
+
153
+ class ParallelGatedMLP(nn.Module):
154
+ def __init__(
155
+ self,
156
+ config,
157
+ layer_idx,
158
+ ):
159
+ super().__init__()
160
+
161
+ self.layer_idx = layer_idx
162
+ multiple_of = config.get("inner_size_multiple_of", 64)
163
+ self.act_type = config.get("mlp_activation", "gelu")
164
+ if self.act_type == "gelu":
165
+ self.act = F.gelu
166
+ elif self.act_type == "silu":
167
+ self.act = F.silu
168
+ else:
169
+ raise NotImplementedError
170
+
171
+ if self.layer_idx > 0 and config.get("evo2_style_activations", False):
172
+ self.act = nn.Identity()
173
+
174
+ self.multiple_of = multiple_of * config.model_parallel_size
175
+
176
+ inner_size = int(2 * config.hidden_size * 4 / 3)
177
+ inner_size = self.multiple_of * ((inner_size + self.multiple_of - 1) // self.multiple_of)
178
+ inner_size = config.get("inner_mlp_size", inner_size)
179
+
180
+ self.l1 = nn.Linear(
181
+ in_features=config.hidden_size,
182
+ out_features=inner_size,
183
+ bias=False,
184
+ )
185
+ self.l2 = nn.Linear(
186
+ in_features=config.hidden_size,
187
+ out_features=inner_size,
188
+ bias=False,
189
+ )
190
+ self.l3 = nn.Linear(
191
+ in_features=inner_size,
192
+ out_features=config.hidden_size,
193
+ bias=False,
194
+ )
195
+
196
+ def forward(self, z):
197
+ z1, z2 = self.l1(z), self.l2(z)
198
+ z1, z2 = grab_first_if_tuple(z1), grab_first_if_tuple(z2)
199
+ y = self.l3(self.act(z1) * z2)
200
+ return grab_first_if_tuple(y)
201
+
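# A small worked example (editorial) of the inner width computation above, assuming
# hidden_size=4096, the default multiple_of=64, and model_parallel_size=1:
hidden_size, multiple_of = 4096, 64

inner_size = int(2 * hidden_size * 4 / 3)                                   # 10922
inner_size = multiple_of * ((inner_size + multiple_of - 1) // multiple_of)  # rounded up to a multiple of 64
print(inner_size)  # 10944, unless the config overrides it via "inner_mlp_size"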
202
+
203
+ class Embedding(nn.Module):
204
+ _train_dtype = "bf16"
205
+
206
+ def __init__(self, config):
207
+ super().__init__()
208
+ self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=0)
209
+
210
+ def embed(self, input_ids, position_ids=None, tokentype_ids=None):
211
+ embeddings = self.word_embeddings(input_ids)
212
+ return embeddings
213
+
214
+ def unembed(self, u):
215
+ weight = self.word_embeddings.weight
216
+ return torch.matmul(u, weight.t())
217
+
218
+
219
+ class VocabParallelEmbedding(nn.Embedding):
220
+ "Adapted from https://github.com/Dao-AILab/flash-attention/blob/main/flash_attn/modules/embedding.py"
221
+
222
+ def __init__(self, config):
223
+ vocab_size, process_group, padding_idx = (
224
+ config.vocab_size,
225
+ config.get("process_group", None),
226
+ config.get("padding_idx", None),
227
+ )
228
+ self.process_group = process_group
229
+ if process_group is not None:
230
+ world_size = torch.distributed.get_world_size(process_group)
231
+ if vocab_size % world_size != 0:
232
+ raise ValueError(f"vocab_size ({vocab_size}) must be divisible by " f"world_size ({world_size})")
233
+ if world_size > 1 and padding_idx is not None:
234
+ raise RuntimeError("ParallelEmbedding does not support padding_idx")
235
+ else:
236
+ world_size = 1
237
+ super().__init__(
238
+ vocab_size // world_size,
239
+ embedding_dim=config.hidden_size,
240
+ padding_idx=padding_idx,
241
+ )
242
+
243
+ def forward(self, input: Tensor) -> Tensor:
244
+ if self.process_group is None:
245
+ return super().forward(input)
246
+ else:
247
+ rank = torch.distributed.get_rank(self.process_group)
248
+ vocab_size = self.num_embeddings
249
+ vocab_start_index, vocab_end_index = (
250
+ rank * vocab_size,
251
+ (rank + 1) * vocab_size,
252
+ )
253
+ # Create a mask of valid vocab ids (1 means it needs to be masked).
254
+ input_ids_mask = (input < vocab_start_index) | (input >= vocab_end_index)
255
+ input = input - vocab_start_index
256
+ input[input_ids_mask] = 0
257
+ embeddings = super().forward(input)
258
+ embeddings[input_ids_mask] = 0.0
259
+ # Reduce to the global process group
260
+ torch.distributed.all_reduce(embeddings, group=self.process_group)
261
+ return embeddings
262
+
263
+ def unembed(self, u: Tensor) -> Tensor:
264
+ if self.process_group is None:
265
+ return u @ self.weight.T
266
+ else:
267
+ raise NotImplementedError
268
+
269
+
270
+ class VocabParallelUnembedding(VocabParallelEmbedding):
271
+ def forward(self, input: Tensor) -> Tensor:
272
+ return self.unembed(input)
model.py ADDED
@@ -0,0 +1,937 @@
1
+ # Copied verbatim from vortex
2
+
3
+ # Copyright (c) 2024, Michael Poli.
4
+
5
+ import math
6
+ import torch
7
+ import torch.nn as nn
8
+ import torch.nn.functional as F
9
+
10
+ from .cache import (
11
+ InferenceParams,
12
+ HyenaCascadeFIRInferenceParams,
13
+ HyenaCascadeIIRInferenceParams,
14
+ )
15
+ from .engine import HyenaInferenceEngine
16
+ from .layers import (
17
+ ParallelGatedMLP,
18
+ RMSNorm,
19
+ VocabParallelEmbedding,
20
+ VocabParallelUnembedding,
21
+ TELinear,
22
+ )
23
+ from .utils import (
24
+ Lambda,
25
+ column_split,
26
+ interleave,
27
+ print_rank_0,
28
+ move_to_device,
29
+ fixup_fp8_extra_states,
30
+ fixup_te_workspace,
31
+ )
32
+ from .rich_logging import activations_logger, enable_activations_logging
33
+
34
+ import logging
35
+ from tqdm import tqdm
36
+
37
+ from .attention import MHA
38
+
39
+ try:
40
+ from vortex.model.positional_embeddings import swap_mha_rope
41
+ except ImportError:
42
+ "could not import swap_mha_rope from src.positional_embeddings"
43
+
44
+
45
+ class AttentionBlock(nn.Module):
46
+ def __init__(self, config, layer_idx) -> None:
47
+ super().__init__()
48
+ self.config = config
49
+ self.pre_norm, self.post_norm = RMSNorm(config), RMSNorm(config)
50
+ self.layer_idx = layer_idx
51
+ self.print_activations = config.get("print_activations", False)
52
+ self.proj_groups = config.get("proj_groups", 1)
53
+ dtype = config.get("attn_block_dtype", torch.bfloat16)
54
+ mlp_dtype = config.get("mlp_dtype", torch.bfloat16)
55
+ self.num_attention_heads = config.num_attention_heads
56
+ self.hidden_size = config.hidden_size
57
+ self.hidden_size_per_attention_head = config.hidden_size // config.num_attention_heads
58
+
59
+ self.counter = 0
60
+ self.inner_mha_cls = MHA(
61
+ embed_dim=config.hidden_size,
62
+ num_heads=config.num_attention_heads,
63
+ num_heads_kv=config.num_attention_heads // self.proj_groups,
64
+ rotary_emb_dim=config.hidden_size // config.num_attention_heads,
65
+ qkv_proj_bias=config.get("qkv_proj_bias", True),
66
+ rotary_emb_base=config.get("rotary_emb_base", 1000000),
67
+ causal=True,
68
+ layer_idx=layer_idx,
69
+ out_proj_bias=config.get("mha_out_proj_bias", True),
70
+ use_flash_attn=self.config.use_flash_attn,
71
+ ).to(dtype=dtype)
72
+
73
+ # check if using interpolated rotary pos emb from config, and swap the rope emb
74
+ if config.get("use_interpolated_rotary_pos_emb", False):
75
+ swap_mha_rope(
76
+ mha=self.inner_mha_cls,
77
+ kwargs_new_rope={"scaling_factor": config.get("rotary_emb_scaling_factor", 1.0)},
78
+ )
79
+
80
+ if self.config.get("smeared_gqa", False):
81
+ self.inner_mha_cls.num_heads_kv = self.inner_mha_cls.num_heads
82
+ self.inner_mha_cls.rotary_emb.register_buffer("inv_freq", self.inner_mha_cls.rotary_emb.inv_freq)
83
+
84
+ self.mlp = ParallelGatedMLP(config, layer_idx).to(dtype=mlp_dtype)
85
+
86
+ def forward(self, u, inference_params=None, padding_mask=None, *args, **kwargs):
87
+ if (
88
+ type(padding_mask) == torch.Tensor
89
+ ): # workaround for masking bug in FA. This works because Wqkv does not have bias
90
+ # and attention scores will be also automatically zeroed.
91
+ u = u * padding_mask[..., None]
92
+
93
+ if self.print_activations:
94
+ activations_logger.info(f"pre mha: {u}")
95
+
96
+ u = (
97
+ self.inner_mha_cls(
98
+ self.pre_norm(u),
99
+ inference_params=inference_params,
100
+ )
101
+ + u
102
+ )
103
+ if self.print_activations:
104
+ activations_logger.info(f"post mha: {u}")
105
+
106
+ if type(padding_mask) == torch.Tensor: # guard against bias
107
+ u = u * padding_mask[..., None]
108
+
109
+ if self.print_activations:
110
+ activations_logger.info(f"pre mlp: {u} {u.min()} {u.max()} {self.mlp.__class__}")
111
+ activations_logger.info(
112
+ f"post mlp norm: {self.post_norm(u)} {self.post_norm(u).min()} {self.post_norm(u).max()}"
113
+ )
114
+ activations_logger.info(
115
+ f"post mlp: {self.mlp(self.post_norm(u))} {self.mlp(self.post_norm(u)).min()} {self.mlp(self.post_norm(u)).max()}"
116
+ )
117
+
118
+ u = self.mlp(self.post_norm(u)) + u
119
+ return u, None
120
+
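# Schematically (editorial sketch, not vortex code), AttentionBlock.forward is the
# standard pre-norm residual pattern; stripped of logging and padding handling it is:
def attention_block(u, pre_norm, post_norm, mha, mlp):
    u = mha(pre_norm(u)) + u      # attention sub-block with residual
    u = mlp(post_norm(u)) + u     # gated-MLP sub-block with residual
    return u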
121
+
122
+ class HyenaCascade(nn.Module):
123
+ def __init__(self, config, layer_idx, hyena_filter_groups=None, fir_inner_filter_length=None) -> None:
124
+ super().__init__()
125
+ self.config = config
126
+ self.layer_idx = layer_idx
127
+ self.hyena_filter_groups = hyena_filter_groups
128
+ self.print_activations = config.get("print_activations", False)
129
+ self.ground_truth_activations_path = config.get("ground_truth_activations_path", None)
130
+
131
+ self.use_flashfft = config.get("use_flashfft", False)
132
+ self.state_size = config.state_size
133
+ self.hidden_size = config.hidden_size
134
+ self.num_filters = config.num_filters
135
+ self.inference_mode = config.get("inference_mode", True)
136
+ self.counter = 0
137
+ self.column_split_hyena = config.get("column_split_hyena", True)
138
+ self.hyena_flip_x1x2 = config.get("hyena_flip_x1x2", False)
139
+
140
+ assert self.hidden_size % self.num_filters == 0 and self.num_filters <= self.hidden_size
141
+
142
+ # attention heads are not used except to split post short_filter
143
+ # projections in the same way as the checkpoint
144
+ self.num_attention_heads = config.num_attention_heads
145
+ self.hidden_size_per_attention_head = self.hidden_size // self.num_attention_heads
146
+
147
+ self.fir_inner_filter_length = fir_inner_filter_length
148
+ self.short_filter_length = config.short_filter_length
149
+ self.short_filter_weight = nn.Parameter(torch.randn(3 * config.hidden_size, 1, config.short_filter_length))
150
+ self.short_filter_bias = nn.Parameter(torch.randn(3 * config.hidden_size)) if config.short_filter_bias else None
151
+
152
+ self.engine = HyenaInferenceEngine(
153
+ layer_idx=layer_idx,
154
+ ground_truth_activations_path=self.ground_truth_activations_path,
155
+ print_activations=self.print_activations,
156
+ hyena_flip_x1x2=config.get("hyena_flip_x1x2", False),
157
+ )
158
+ self.use_flash_depthwise = config.get("use_flash_depthwise", False)
159
+ self.data_dtype = None
160
+
161
+ if self.use_flash_depthwise:
162
+ try:
163
+ from flashfftconv import FlashDepthwiseConv1d
164
+
165
+ self.fir_fn = FlashDepthwiseConv1d(
166
+ channels=3 * self.hidden_size,
167
+ kernel_size=self.short_filter_length,
168
+ padding=self.short_filter_length - 1,
169
+ weights=self.short_filter_weight,
170
+ bias=self.short_filter_bias,
171
+ device=None,
172
+ dtype=self.config.get("depthwise_dtype", torch.bfloat16),
173
+ )
174
+ except ImportError:
175
+ "flashfftconv not installed"
176
+ else:
177
+ self.fir_fn = F.conv1d
178
+
179
+ self.fir_inner_fn = F.conv1d
180
+
181
+ self.fftconv_fn = None
182
+ self.long_fir_threshold = config.get("long_fir_threshold", None)
183
+ if self.long_fir_threshold is not None:
184
+ assert self.use_flashfft is False, "long_fir_threshold not compatible with fused flashfft"
185
+
186
+ self.num_systems = self.hyena_filter_groups
187
+ self.channels_per_group = self.hidden_size // self.hyena_filter_groups
188
+
189
+ if self.fir_inner_filter_length:
190
+ self.h = nn.Parameter(torch.randn(self.hyena_filter_groups, 1, fir_inner_filter_length))
191
+
192
+ if fir_inner_filter_length >= 128:
193
+ self.D = nn.Parameter(torch.zeros(self.hidden_size))
194
+
195
+ if fir_inner_filter_length < 128:
196
+ self.D = None
197
+
198
+ else:
199
+ log_poles = torch.randn(self.num_systems, self.state_size, 1, dtype=torch.float32)
200
+
201
+ # TODO: bring over init from internals
202
+ # poles[..., 0] = 1e-2 * torch.randn(self.num_systems, self.state_size, 1)
203
+ # poles[..., 1] = 1e-3 * torch.randn(self.num_systems, self.state_size, 1)
204
+
205
+ self.log_poles = nn.Parameter(log_poles)
206
+ self.residues = nn.Parameter(torch.randn(self.num_systems, self.state_size, dtype=torch.float32))
207
+ self.D = nn.Parameter(torch.zeros(self.hidden_size))
208
+ self.h = None
209
+ self.t = None
210
+
211
+ def forward(self, u, inference_params=None, padding_mask=None, *args, **kwargs):
212
+ if inference_params is not None and self.layer_idx in inference_params.fir_state_dict.keys():
213
+ return self.sequential_forward(u, inference_params)
214
+
215
+ else:
216
+ return self.parallel_forward(u, inference_params, padding_mask)
217
+
218
+ def parallel_forward(self, u, inference_params=None, padding_mask=None):
219
+ L = u.shape[1]
220
+ dims = (
221
+ self.hidden_size,
222
+ self.num_attention_heads,
223
+ self.hidden_size_per_attention_head,
224
+ self.state_size,
225
+ self.hyena_filter_groups,
226
+ )
227
+ if self.print_activations:
228
+ activations_logger.info(f"pre 1 parallel fir: {u}, {u.min()}, {u.max()}")
229
+
230
+ z_pre, fir_state = self.engine.parallel_fir(
231
+ self.fir_fn,
232
+ u,
233
+ self.short_filter_weight,
234
+ self.short_filter_bias,
235
+ L,
236
+ dims=dims,
237
+ gate=False,
238
+ column_split_hyena=self.column_split_hyena,
239
+ fir_length=self.short_filter_length,
240
+ inference_params=inference_params,
241
+ padding_mask=padding_mask,
242
+ dim_last=True,
243
+ )
244
+
245
+ if inference_params:
246
+ inference_params.fir_state_dict[self.layer_idx] = fir_state
247
+
248
+ if self.config.interleave:
249
+ z_pre = interleave(z_pre)
250
+
251
+ if self.h is None:
252
+ h, _, _, _ = self.compute_filter(L, u.device)
253
+ else:
254
+ h = self.h
255
+
256
+ D = self.D
257
+
258
+ if self.hyena_filter_groups > 1:
259
+ h = h.repeat_interleave(self.hidden_size // self.hyena_filter_groups, 0)
260
+
261
+ # if inference_params is not None, we plan to perform generation:
262
+ # prefilling is handled by the engine.
263
+ if self.fir_inner_filter_length is not None:
264
+ if self.print_activations:
265
+ activations_logger.info(
266
+ f"pre 2 parallel fir: {z_pre}, {z_pre.min()}, {z_pre.max()}, {self.fir_inner_filter_length}"
267
+ )
268
+ y, fir_inner_state = self.engine.parallel_fir(
269
+ self.fir_inner_fn,
270
+ z_pre,
271
+ h,
272
+ D,
273
+ L,
274
+ dims=dims,
275
+ gate=True,
276
+ gated_bias=self.fir_inner_filter_length >= 128,
277
+ dim_last=False,
278
+ column_split_hyena=self.column_split_hyena,
279
+ fir_length=self.fir_inner_filter_length,
280
+ inference_params=inference_params,
281
+ padding_mask=padding_mask,
282
+ groups=self.hyena_filter_groups,
283
+ )
284
+ if self.print_activations:
285
+ activations_logger.info(f"post 2 parallel fir: {y}, {y.min()}, {y.max()}")
286
+ y = y.permute(0, 2, 1)
287
+ if inference_params:
288
+ inference_params.fir_inner_state_dict[self.layer_idx] = fir_inner_state
289
+ else:
290
+ if self.print_activations:
291
+ activations_logger.info(f"pre 2 parallel iir: {z_pre}, {z_pre.min()}, {z_pre.max()}")
292
+ y = self.engine.parallel_iir(
293
+ z_pre,
294
+ h,
295
+ D,
296
+ L,
297
+ t=self.t,
298
+ poles=self.log_poles,
299
+ residues=self.residues,
300
+ dims=dims,
301
+ inference_params=inference_params,
302
+ layer_idx=self.layer_idx,
303
+ prefill_style=self.config.get("prefill_style", "fft"),
304
+ use_flashfft=self.use_flashfft,
305
+ fftconv_fn=self.fftconv_fn,
306
+ column_split_hyena=self.column_split_hyena,
307
+ long_fir_threshold=self.long_fir_threshold,
308
+ padding_mask=padding_mask,
309
+ )
310
+ if self.print_activations:
311
+ activations_logger.info(f"post 2 parallel iir: {y}, {y.min()}, {y.max()}")
312
+
313
+ return y, inference_params
314
+
315
+ def sequential_forward(self, u, inference_params):
316
+ if self.data_dtype is None:
317
+ self.data_dtype = u.dtype
318
+
319
+ if len(u.shape) > 2:
320
+ u = u[:, -1]
321
+
322
+ z_pre, fir_state = self.engine.step_fir(
323
+ u,
324
+ inference_params.fir_state_dict[self.layer_idx],
325
+ weight=self.short_filter_weight,
326
+ bias=self.short_filter_bias,
327
+ )
328
+ inference_params.fir_state_dict[self.layer_idx] = fir_state
329
+
330
+ if self.config.interleave:
331
+ z_pre = interleave(z_pre)
332
+
333
+ x2, x1, v = (
334
+ column_split(z_pre, self.num_attention_heads, self.hidden_size_per_attention_head)
335
+ if self.column_split_hyena
336
+ else z_pre.split([self.hidden_size, self.hidden_size, self.hidden_size], dim=1)
337
+ )
338
+
339
+ if self.hyena_flip_x1x2:
340
+ x1, x2 = x2, x1
341
+
342
+ if self.fir_inner_filter_length is not None:
343
+ if self.hyena_filter_groups > 1:
344
+ h = self.h.repeat_interleave(self.hidden_size // self.hyena_filter_groups, 0)
345
+ else:
346
+ h = self.h
347
+
348
+ y, fir_inner_state = self.engine.step_fir(
349
+ x1 * v,
350
+ inference_params.fir_inner_state_dict[self.layer_idx],
351
+ weight=h,
352
+ bias=self.D,
353
+ flip_filter=self.fir_inner_filter_length >= 128,
354
+ gated_bias=self.fir_inner_filter_length >= 128,
355
+ )
356
+ y = y * x2
357
+ inference_params.fir_inner_state_dict[self.layer_idx] = fir_inner_state
358
+ else:
359
+ y, iir_state = self.engine.step_iir(
360
+ x2,
361
+ x1,
362
+ v,
363
+ self.D,
364
+ self.residues,
365
+ self.log_poles,
366
+ inference_params.state_dict[self.layer_idx],
367
+ iir_groups=1,
368
+ )
369
+ inference_params.state_dict[self.layer_idx] = iir_state
370
+
371
+ y = y.to(dtype=self.data_dtype)
372
+ return y[:, None], inference_params
373
+
374
+ def update_time(self, L, device):
375
+ """
376
+ Set [0, 1, ..., L-1] where L is the length of the current batch of inputs.
377
+ If L is greater than the length of the previous batch, then the time vector is
378
+ reinitialized. Otherwise, the time vector is truncated from cache.
379
+ """
380
+ if self.t is None:
381
+ self.t = torch.arange(L, device=device)[None, None]
382
+ elif self.t.shape[-1] < L:
383
+ self.t = torch.arange(L, device=device)[None, None]
384
+ else:
385
+ self.t = self.t[..., :L]
386
+
387
+ def compute_filter(self, L, device):
388
+ self.update_time(L, device)
389
+ filter_dtype = torch.float32
390
+ residues, log_poles = (
391
+ self.residues.to(filter_dtype),
392
+ self.log_poles.to(filter_dtype),
393
+ )
394
+ h = (residues[..., None] * (log_poles * self.t).exp()).sum(1)[None] # B, D, L
395
+ return h, filter_dtype, log_poles, residues
396
+
397
+
398
+ class ParallelGatedConvBlock(nn.Module):
399
+ def __init__(self, config, layer_idx, hyena_filter_groups=None, fir_inner_filter_length=None) -> None:
400
+ super().__init__()
401
+ self.config = config
402
+ self.layer_idx = layer_idx
403
+ self.print_activations = config.get("print_activations", False)
404
+ self.ground_truth_activations_path = config.get("ground_truth_activations_path", None)
405
+ self.low_mem_mode = config.get("low_mem_mode", False)
406
+ self.fir_inner_filter_length = fir_inner_filter_length
407
+ self.hyena_filter_groups = hyena_filter_groups if hyena_filter_groups is not None else config.hidden_size
408
+ dtype = config.get("hyena_block_dtype", torch.bfloat16)
409
+ mlp_dtype = config.get("mlp_dtype", torch.bfloat16)
410
+ self.pre_norm, self.post_norm = (
411
+ RMSNorm(config).to(dtype=dtype),
412
+ RMSNorm(config).to(dtype=dtype),
413
+ )
414
+ self.filter = HyenaCascade(
415
+ config,
416
+ layer_idx,
417
+ hyena_filter_groups=self.hyena_filter_groups,
418
+ fir_inner_filter_length=fir_inner_filter_length,
419
+ ).to(dtype=dtype)
420
+
421
+ # For posterity/debugging: TELinear can be easily replaced by
422
+ # nn.Linear(config.hidden_size, 3 * config.hidden_size, bias=config.qkv_proj_bias).to(dtype=dtype)
423
+ # which sometimes is very useful when debugging FP8.
424
+ self.projections = TELinear(
425
+ config.hidden_size,
426
+ 3 * config.hidden_size,
427
+ bias=config.qkv_proj_bias,
428
+ init_method=torch.nn.init.xavier_uniform_,
429
+ use_fp8=config.get("use_fp8_input_projections", False),
430
+ )
431
+
432
+ self.out_filter_dense = nn.Linear(config.hidden_size, config.hidden_size, bias=config.hyena_out_proj_bias).to(
433
+ dtype
434
+ )
435
+ self.mlp = ParallelGatedMLP(config, layer_idx).to(dtype=mlp_dtype)
436
+
437
+ # self.proj_norm_fn = self.proj_norm
438
+ # self.res_mlp_norm_fn = self.res_mlp_norm
439
+
440
+ if self.config.get("compile", False):
441
+ self.proj_norm_fn = torch.compile(self.proj_norm, fullgraph=True, dynamic=False, mode="reduce-overhead")
442
+ self.res_mlp_norm_fn = torch.compile(
443
+ self.res_mlp_norm, fullgraph=True, dynamic=False, mode="reduce-overhead"
444
+ )
445
+
446
+ def pad_to_multiple(self, x, multiple=16):
447
+ """Pad input tensor to multiple of 16 only when FP8 is enabled"""
448
+ if not self.config.get("use_fp8_input_projections", False):
449
+ return x
450
+
451
+ batch_size, seq_len, hidden_dim = x.size()
452
+ pad_len = (multiple - (seq_len % multiple)) % multiple
453
+ if pad_len == 0:
454
+ return x
455
+ return F.pad(x, (0, 0, 0, pad_len))
456
+
457
+ def proj_norm(self, x):
458
+ if self.print_activations:
459
+ activations_logger.info(f"pre mixer norm: {x} {x.min()} {x.max()} {self.projections.__class__}")
460
+ activations_logger.info(
461
+ f"post mixer norm: {self.pre_norm(x)} {self.pre_norm(x).min()} {self.pre_norm(x).max()}"
462
+ )
463
+
464
+ if self.ground_truth_activations_path:
465
+ pre_norm_savanna = torch.load(
466
+ f"{self.ground_truth_activations_path}/pre_mixer_norm_{self.layer_idx}.pt"
467
+ )
468
+ post_norm_savanna = torch.load(
469
+ f"{self.ground_truth_activations_path}/post_mixer_norm_{self.layer_idx}.pt"
470
+ )
471
+
472
+ activation_diff = (x.squeeze() - pre_norm_savanna.squeeze()).abs()
473
+ activations_logger.info(
474
+ f"pre mixer norm activation_diff: {activation_diff.max()}, {activation_diff.mean()}"
475
+ )
476
+ activation_diff = (self.pre_norm(x).squeeze() - post_norm_savanna.squeeze()).abs()
477
+ activations_logger.info(
478
+ f"post mixer norm activation_diff: {activation_diff.max()}, {activation_diff.mean()}"
479
+ )
480
+ activations_logger.info(
481
+ f"pre norm scale: {self.pre_norm.scale}, {self.pre_norm.scale.min()}, {self.pre_norm.scale.max()}"
482
+ )
483
+
484
+ normalized = self.pre_norm(x)
485
+ normalized = self.pad_to_multiple(normalized)
486
+ with torch.cuda.device(x.device):
487
+ projected = self.projections(normalized)
488
+
489
+ if isinstance(projected, tuple):
490
+ projected = projected[0]
491
+
492
+ original_seq_len = x.size(1)
493
+ # Slice back to original sequence length if padding was added
494
+ if projected.size(1) > original_seq_len:
495
+ projected = projected[:, :original_seq_len, :]
496
+
497
+ return projected
498
+
499
+ def res_mlp_norm(self, x):
500
+ if self.print_activations:
501
+ activations_logger.info(f"pre mlp: {x} {x.min()} {x.max()} {self.mlp.__class__}")
502
+ activations_logger.info(
503
+ f"post mlp norm: {self.post_norm(x)} {self.post_norm(x).min()} {self.post_norm(x).max()}"
504
+ )
505
+ activations_logger.info(
506
+ f"post mlp: {self.mlp(self.post_norm(x))} {self.mlp(self.post_norm(x)).min()} {self.mlp(self.post_norm(x)).max()}"
507
+ )
508
+ if self.ground_truth_activations_path:
509
+ pre_mlp_savanna = torch.load(f"{self.ground_truth_activations_path}/pre_mlp_{self.layer_idx}.pt")
510
+ post_mlp_savanna = torch.load(f"{self.ground_truth_activations_path}/post_mlp_norm_{self.layer_idx}.pt")
511
+
512
+ activation_diff = (x.squeeze() - pre_mlp_savanna.squeeze()).abs()
513
+ activations_logger.info(f"pre mlp activation_diff: {activation_diff.max()}, {activation_diff.mean()}")
514
+ activation_diff = (self.post_norm(x).squeeze() - post_mlp_savanna.squeeze()).abs()
515
+ activations_logger.info(
516
+ f"post mlp norm activation_diff: {activation_diff.max()}, {activation_diff.mean()}"
517
+ )
518
+ return self.mlp(self.post_norm(x)) + x
519
+
520
+ def forward(self, u, inference_params=None, padding_mask=None, *args, **kwargs):
521
+ z = self.proj_norm(u)
522
+
523
+ if type(padding_mask) == torch.Tensor: # guard against bias
524
+ z = z * padding_mask[..., None]
525
+
526
+ if self.print_activations:
527
+ activations_logger.info(f"pre filter: {z} {z.min()} {z.max()} {self.filter.__class__}")
528
+ if self.ground_truth_activations_path:
529
+ z_savanna = torch.load(f"{self.ground_truth_activations_path}/pre_filter_{self.layer_idx}.pt")
530
+ activation_diff = (z - z_savanna.squeeze()).abs()
531
+ activations_logger.info(
532
+ f"pre filter activation_diff: {activation_diff.max()}, {activation_diff.mean()}"
533
+ )
534
+ z, inference_params = self.filter(z, inference_params=inference_params, padding_mask=padding_mask)
535
+
536
+ if self.print_activations:
537
+ activations_logger.info(f"post postgate: {z} {z.min()} {z.max()} {self.filter.__class__}")
538
+ activations_logger.info(
539
+ f"post out proj: {self.out_filter_dense(z)} {self.out_filter_dense(z).min()} {self.out_filter_dense(z).max()} {self.out_filter_dense.__class__}"
540
+ )
541
+ activations_logger.info(
542
+ f"post mixer dense and residual: {self.out_filter_dense(z) + u} {(self.out_filter_dense(z) + u).min()} {(self.out_filter_dense(z) + u).max()}"
543
+ )
544
+ activations_logger.info(
545
+ f"post mixer dense: {self.out_filter_dense(z)} {self.out_filter_dense(z).min()} {self.out_filter_dense(z).max()}"
546
+ )
547
+ activations_logger.info(f"post mixer: {z} {z.min()} {z.max()}")
548
+ if self.ground_truth_activations_path:
549
+ z_savanna = torch.load(f"{self.ground_truth_activations_path}/post_filter_{self.layer_idx}.pt")
550
+ activation_diff = (z - z_savanna.squeeze()).abs()
551
+ activations_logger.info(
552
+ f"post filter activation_diff: {activation_diff.max()}, {activation_diff.mean()}"
553
+ )
554
+
555
+ z_savanna = torch.load(f"{self.ground_truth_activations_path}/post_out_proj_{self.layer_idx}.pt")
556
+ z_ = F.linear(z, self.out_filter_dense.weight)
557
+ activation_diff = (z_ - z_savanna.squeeze()).abs()
558
+ activations_logger.info(
559
+ f"post out proj activation_diff: {activation_diff.max()}, {activation_diff.mean()}"
560
+ )
561
+
562
+ z_in = self.out_filter_dense(z) + u
563
+
564
+ # if self.layer_idx == 0:
565
+ # z_in = z_savanna.squeeze() + u + self.out_filter_dense.bias
566
+
567
+ if type(padding_mask) == torch.Tensor: # guard against bias
568
+ z_in = z_in * padding_mask[..., None]
569
+
570
+ y = self.res_mlp_norm(z_in)
571
+
572
+ return y, inference_params
573
+
574
+
575
+ def get_block(config, layer_idx, flash_fft=None):
576
+ if layer_idx in config.attn_layer_idxs:
577
+ return AttentionBlock(config, layer_idx)
578
+ elif layer_idx in config.hcl_layer_idxs:
579
+ block = ParallelGatedConvBlock(config, layer_idx)
580
+ if config.get("use_flashfft", "False"):
581
+ block.filter.fftconv_fn = flash_fft
582
+ return block
583
+ elif layer_idx in config.hcm_layer_idxs:
584
+ block = ParallelGatedConvBlock(
585
+ config,
586
+ layer_idx,
587
+ hyena_filter_groups=config.hcm_filter_groups,
588
+ fir_inner_filter_length=config.hcm_filter_length,
589
+ )
590
+ return block
591
+ elif layer_idx in config.hcs_layer_idxs:
592
+ block = ParallelGatedConvBlock(
593
+ config,
594
+ layer_idx,
595
+ hyena_filter_groups=config.hcs_filter_groups,
596
+ fir_inner_filter_length=config.hcs_filter_length,
597
+ )
598
+ return block
599
+ else:
600
+ raise NotImplementedError
601
+
602
+
603
+ class StripedHyena(nn.Module):
604
+ def __init__(self, config):
605
+ super().__init__()
606
+ fixup_te_workspace() # Workaround global cublas workspaces in TE
607
+
608
+ self.config = config
609
+ self.print_activations = config.get("print_activations", False)
610
+
611
+ if self.print_activations:
612
+ enable_activations_logging()
613
+ self.logger = logging.getLogger(self.__class__.__name__)
614
+
615
+ self.ground_truth_activations_path = config.get("ground_truth_activations_path", None)
616
+ self.logger.info(f"Initializing StripedHyena with config: {config}")
617
+
618
+ with torch.device("cuda:0" if torch.cuda.is_available() else "cpu"):
619
+ self.embedding_layer = VocabParallelEmbedding(config)
620
+
621
+ if config.get("use_flashfft", "True"):
622
+ try:
623
+ from flashfftconv import FlashFFTConv
624
+
625
+ self.flash_fft = FlashFFTConv(config.seqlen, dtype=torch.bfloat16)
626
+ except ImportError:
627
+ "flashfftconv not installed"
628
+ else:
629
+ self.flash_fft = None
630
+ if not self.config.get('evo2_style_activations', False):
631
+ self.logger.warning(
632
+ "⚠️ Not using Evo2 style activations ⚠️\n"
633
+ "⚠️ Set 'evo2_style_activations: True' in config if you are using Evo 2 checkpoints ⚠️"
634
+ )
635
+ self.logger.info(f"Initializing {config.num_layers} blocks...")
636
+ self.blocks = nn.ModuleList()
637
+ self.block_idx_to_device = {}
638
+
639
+ # Calculate layers per GPU
640
+ num_gpus = torch.cuda.device_count() if torch.cuda.is_available() else 1
641
+ layers_per_gpu = math.ceil(config.num_layers / num_gpus)
642
+ self.logger.info(f"Distributing across {num_gpus} GPUs, approximately {layers_per_gpu} layers per GPU")
643
+
644
+ for layer_idx in tqdm(range(config.num_layers)):
645
+ # Determine which GPU should handle this layer
646
+ device_idx = min(layer_idx // layers_per_gpu, num_gpus - 1)
647
+ device = f"cuda:{device_idx}" if torch.cuda.is_available() else "cpu"
648
+
649
+ with torch.device(device):
650
+ # TELinear uses `device="cuda"` device to allocate empty bias
651
+ # tensor. This makes sure that the empty tensor is allocated on the
652
+ # correct device. (torch.device(), unlike torch.cuda.device(),
653
+ # doesn't override current CUDA device.)
654
+ with torch.cuda.device(device):
655
+ block = get_block(config, layer_idx, flash_fft=self.flash_fft)
656
+ move_to_device(block, device)
657
+
658
+ self.blocks.append(block)
659
+ self.block_idx_to_device[layer_idx] = device
660
+ self.logger.info(f"Assigned {layer_idx=} to {device=}")
661
+ self.logger.info(
662
+ f"Parameter count for block {layer_idx}: {sum(p.numel() for p in self.blocks[-1].parameters())}"
663
+ )
664
+
665
+ with torch.device(self.block_idx_to_device[0]):
666
+ with torch.cuda.device(self.block_idx_to_device[0]):
667
+ self.norm = RMSNorm(config) if config.get("final_norm", True) else None
668
+ if config.tie_embeddings:
669
+ # Lambda is used so that callers go through forward(), which in
670
+ # turn is needed for PyTorch hooks to work properly.
671
+ self.unembed = Lambda(self.embedding_layer.unembed)
672
+ else:
673
+ if config.tie_embeddings:
674
+ # Technically we can support this mode, just need to
675
+ # copy tensors across GPUs then. But let's implement it
676
+ # once/if needed.
677
+ self.logger.info("Ignoring tie_embeddings for now.")
678
+ self.unembed = VocabParallelUnembedding(config)
679
+
680
+ self.logger.info("Initialized model")
681
+
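As a quick illustration of the block-to-GPU assignment above, a standalone sketch with made-up sizes (not part of the module):

import math

# e.g. 32 layers over 3 visible GPUs: ceil(32 / 3) = 11 layers per GPU, with the
# min(..., num_gpus - 1) clamp keeping any tail layers on the last device.
num_layers, num_gpus = 32, 3
layers_per_gpu = math.ceil(num_layers / num_gpus)
assignment = {i: f"cuda:{min(i // layers_per_gpu, num_gpus - 1)}" for i in range(num_layers)}
print(assignment)  # layers 0-10 -> cuda:0, 11-21 -> cuda:1, 22-31 -> cuda:2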
682
+ def forward(self, x, inference_params_dict=None, padding_mask=None):
683
+ L = x.shape[1]
684
+ if self.print_activations:
685
+ activations_logger.info(f"pre embedding: {x}, {x.min()}, {x.max()}")
686
+
687
+ x = self.embedding_layer(x)
688
+
689
+ if self.print_activations:
690
+ activations_logger.info(f"post embedding: {x}, {x.min()}, {x.max()}")
691
+
692
+ if inference_params_dict is not None:
693
+ x, inference_params_dict_out = self.stateful_forward(
694
+ x,
695
+ inference_params_dict=inference_params_dict,
696
+ )
697
+ else:
698
+ x, inference_params_dict_out = self.stateless_forward(x, padding_mask=padding_mask)
699
+
700
+ if self.print_activations:
701
+ activations_logger.info(f"pre norm: {x}, {x.min()}, {x.max()}")
702
+
703
+ # By convention, we return results on the first device
704
+ x = x.to(self.block_idx_to_device[0])
705
+ x = self.norm(x)
706
+
707
+ if self.print_activations:
708
+ activations_logger.info(f"post norm: {x}, {x.min()}, {x.max(), {self.norm.scale}}")
709
+
710
+ x = self.unembed(x)
711
+ return x, inference_params_dict_out
712
+
713
+ def block_idx_to_name(self, block_idx):
714
+ if block_idx in self.config.attn_layer_idxs:
715
+ return "mha"
716
+ elif block_idx in self.config.hcl_layer_idxs:
717
+ return "hcl"
718
+ elif block_idx in self.config.hcm_layer_idxs:
719
+ return "hcm"
720
+ elif block_idx in self.config.hcs_layer_idxs:
721
+ return "hcs"
722
+ else:
723
+ raise ValueError(f"Block index {block_idx} not found")
724
+
725
+ def cross_device_transfer(self, x, block_idx):
726
+ if self.block_idx_to_device[max(block_idx - 1, 0)] != self.block_idx_to_device[block_idx]:
727
+ x = x.to(self.block_idx_to_device[block_idx])
728
+ return x
729
+
730
+ def stateful_forward(self, x, inference_params_dict=None):
731
+ for block_idx, block in enumerate(self.blocks):
732
+ inference_params = inference_params_dict[self.block_idx_to_name(block_idx)]
733
+
734
+ if self.print_activations:
735
+ activations_logger.info(f"pre block {block_idx}: {x}, {x.min()}, {x.max()} {block.__class__}")
736
+ if self.ground_truth_activations_path:
737
+ x_savanna = torch.load(f"{self.ground_truth_activations_path}/pre_block_{block_idx}.pt")
738
+ activation_diff = (x - x_savanna.squeeze()).abs()
739
+ activations_logger.info(
740
+ f"pre block {block_idx} activation_diff: {activation_diff.max()}, {activation_diff.mean()}"
741
+ )
742
+
743
+ x = self.cross_device_transfer(x, block_idx)
744
+ x, _ = block(x, inference_params=inference_params)
745
+
746
+ if self.print_activations:
747
+ activations_logger.info(f"post block {block_idx}: {x}, {x.min()}, {x.max()}")
748
+ if self.ground_truth_activations_path:
749
+ x_savanna = torch.load(f"{self.ground_truth_activations_path}/post_block_{block_idx}.pt")
750
+ activation_diff = (x - x_savanna.squeeze()).abs()
751
+ activations_logger.info(
752
+ f"post block {block_idx} activation_diff: {activation_diff.max()}, {activation_diff.mean()}"
753
+ )
754
+
755
+ return x, inference_params_dict
756
+
757
+ def stateless_forward(self, x, padding_mask=None):
758
+ if type(padding_mask) == torch.Tensor:
759
+ x = x * padding_mask[..., None]
760
+
761
+ for block_idx, block in enumerate(self.blocks):
762
+ if self.print_activations:
763
+ activations_logger.info(f"pre block {block_idx}: {x}, {x.min()}, {x.max()} {block.__class__}")
764
+ if self.ground_truth_activations_path:
765
+ x_savanna = torch.load(f"{self.ground_truth_activations_path}/pre_block_{block_idx}.pt")
766
+ activation_diff = (x - x_savanna.squeeze()).abs()
767
+ activations_logger.info(
768
+ f"pre block {block_idx} activation_diff: {activation_diff.max()}, {activation_diff.mean()}"
769
+ )
770
+
771
+ x = self.cross_device_transfer(x, block_idx)
772
+ x, _ = block(x, inference_params=None, padding_mask=padding_mask)
773
+
774
+ if self.print_activations:
775
+ activations_logger.info(f"post block {block_idx}: {x}, {x.min()}, {x.max()}")
776
+ if self.ground_truth_activations_path:
777
+ x_savanna = torch.load(f"{self.ground_truth_activations_path}/post_block_{block_idx}.pt")
778
+ activation_diff = (x - x_savanna.squeeze()).abs()
779
+ activations_logger.info(
780
+ f"post block {block_idx} activation_diff: {activation_diff.max()}, {activation_diff.mean()}"
781
+ )
782
+
783
+ return x, None
784
+
785
+ def initialize_inference_params(self, max_seqlen=None):
786
+ ## Input seqlen takes priority over config!
787
+ ## WARNING: This avoids potential errors but means the model can be used beyond the length it was trained at
788
+ config_seqlen = self.config.get("max_seqlen", None)
789
+ if config_seqlen is None:
790
+ print("No max_seqlen found in config!!! using default value of 8192")
791
+ config_seqlen = 8192
792
+ new_max_seqlen = max_seqlen if max_seqlen is not None else config_seqlen
793
+ # self.config["max_seqlen"] = new_max_seqlen
794
+ ## Note: changing the stored config max_seqlen will change the max_seqlen used in flash attention, leading to minor logit differences
795
+ print(f"Initializing inference params with max_seqlen={new_max_seqlen}")
796
+
797
+ inference_params_dict = {
798
+ "mha": InferenceParams(
799
+ max_seqlen=new_max_seqlen,
800
+ max_batch_size=self.config.get("max_batch_size", 1),
801
+ seqlen_offset=0,
802
+ ),
803
+ "hcl": HyenaCascadeIIRInferenceParams(
804
+ fir_filter_length=self.config.short_filter_length,
805
+ state_dim=self.config.state_size,
806
+ seqlen_offset=0,
807
+ ),
808
+ "hcm": HyenaCascadeFIRInferenceParams(
809
+ fir_filter_length=self.config.short_filter_length,
810
+ fir_inner_filter_length=self.config.hcm_filter_length,
811
+ seqlen_offset=0,
812
+ ),
813
+ "hcs": HyenaCascadeFIRInferenceParams(
814
+ fir_filter_length=self.config.short_filter_length,
815
+ fir_inner_filter_length=self.config.hcs_filter_length,
816
+ seqlen_offset=0,
817
+ ),
818
+ }
819
+ return inference_params_dict
820
+
821
+ def precompute_filters(self, L, device):
822
+ for block_idx, block in enumerate(self.blocks):
823
+ if type(block) == ParallelGatedConvBlock:
824
+ if type(block.filter) == HyenaCascade:
825
+ L = block.filter.long_fir_threshold or L
826
+ print_rank_0(f"Precomputing filters, L={L}...")
827
+
828
+ filter_dtype = torch.float16 if L >= 2048 else torch.float32
829
+
830
+ block.filter._set_time(L, device)
831
+ residues, poles = (
832
+ block.filter.residues.to(torch.float16),
833
+ block.filter.poles.to(torch.float16),
834
+ )
835
+
836
+ block.filter.h = (residues * poles**block.filter.t).real.sum(1)[None]
837
+ block.filter.h = block.filter.h.to(dtype=filter_dtype)
838
+
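A simplified, self-contained sketch of the kernel materialization above, with made-up pole/residue values and complex tensors used directly (the module itself stores poles and residues as real view pairs): the long-convolution kernel is a sum of damped complex exponentials, h[t] = Re(sum_k residues_k * poles_k ** t).

import torch

t = torch.arange(16, dtype=torch.float32)[None, None, :]            # (1, 1, L)
poles = torch.tensor([0.95 + 0.05j, 0.80 - 0.10j])[None, :, None]   # (1, K, 1)
residues = torch.tensor([1.00 + 0.00j, 0.50 + 0.50j])[None, :, None]
# p**t computed as exp(t * log(p)) to keep the broadcasting explicit
h = (residues * torch.exp(t * torch.log(poles))).real.sum(1)         # (1, L)
print(h.shape, h[0, :4])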
839
+ def load_poles_residues(self, path):
840
+ "Load different poles and residues for each layer."
841
+ for block_idx, block in enumerate(self.blocks):
842
+ if type(block) == ParallelGatedConvBlock:
843
+ if type(block.filter) == HyenaCascade:
844
+ self.logger.info(f"Loading approximatepoles and residues for block {block_idx}")
845
+ poles = torch.load(path + f"/approx_poles_{block_idx+1}.pt", map_location="cpu")
846
+ poles = torch.view_as_real(poles)
847
+ residues = torch.load(path + f"/approx_residues_{block_idx+1}.pt", map_location="cpu")
848
+ residues = torch.view_as_real(residues)
849
+ poles = poles.permute(1, 0, 2).unsqueeze(-2)
850
+ residues = residues.permute(1, 0, 2).unsqueeze(-2)
851
+
852
+ block.filter.poles = nn.Parameter(poles)
853
+ block.filter.residues = nn.Parameter(residues)
854
+
855
+ def custom_load_state_dict(self, state_dict, strict=True):
856
+ """
857
+ Post-processes the state_dict to convert savanna checkpoints to vortex checkpoints.
858
+ """
859
+ self.logger.debug(f"Loading state dict: {state_dict}, (ignoring extra keys) with strict: {strict}")
860
+ model_dict = self.state_dict()
861
+
862
+ # Find keys that are in model_dict but not in state_dict
863
+ missing_in_state_dict = model_dict.keys() - state_dict.keys()
864
+ # Find keys that are in state_dict but not in model_dict
865
+ extra_in_state_dict = state_dict.keys() - model_dict.keys()
866
+
867
+ if missing_in_state_dict:
868
+ print(f"Keys missing in state_dict: {missing_in_state_dict}")
869
+ if extra_in_state_dict:
870
+ print(f"Extra keys in state_dict: {extra_in_state_dict}")
871
+
872
+ filtered_dict = {k: v for k, v in state_dict.items() if k in model_dict}
873
+
874
+ if all("._extra_state" in k for k in missing_in_state_dict):
875
+ self.logger.info("Checkpoint has no FP8 extra state, will be using initial state.")
876
+ for k in missing_in_state_dict:
877
+ filtered_dict[k] = None
878
+
879
+ self.load_state_dict(filtered_dict, strict=strict)
880
+ fixup_fp8_extra_states(self)
881
+
882
+ if self.config.get("column_split", True):
883
+ self.logger.info("Adjusting Wqkv for column split (permuting rows)")
884
+ for layer_idx, block in enumerate(self.blocks):
885
+ if type(block) == AttentionBlock:
886
+ target_device = block.inner_mha_cls.Wqkv.weight.device
887
+
888
+ Wqkv = state_dict[f"blocks.{layer_idx}.inner_mha_cls.Wqkv.weight"]
889
+ try:
890
+ bias = state_dict[f"blocks.{layer_idx}.inner_mha_cls.Wqkv.bias"]
891
+ except KeyError:
892
+ bias = None
893
+
894
+ size_att_head = block.hidden_size_per_attention_head
895
+
896
+ Wqkv = Wqkv.permute(1, 0)
897
+ Wqkv = Wqkv.reshape(block.hidden_size, block.num_attention_heads, 3, size_att_head)
898
+ Wq, Wk, Wv = Wqkv.unbind(dim=-2)
899
+ Wq = Wq.reshape(block.hidden_size, -1)
900
+ Wk = Wk.reshape(block.hidden_size, -1)
901
+ Wv = Wv.reshape(block.hidden_size, -1)
902
+ Wqkv = torch.cat([Wq, Wk, Wv], dim=-1)
903
+ Wqkv = Wqkv.permute(1, 0)
904
+
905
+ # Single device transfer at the end
906
+ block.inner_mha_cls.Wqkv.weight.data = Wqkv.to(target_device)
907
+
908
+ if bias is not None:
909
+ bias = bias.cpu() # Process on CPU
910
+ bias = bias.reshape(block.num_attention_heads, 3, size_att_head)
911
+ bias_q, bias_k, bias_v = bias.unbind(dim=-2)
912
+ bias_q = bias_q.reshape(block.hidden_size)
913
+ bias_k = bias_k.reshape(block.hidden_size)
914
+ bias_v = bias_v.reshape(block.hidden_size)
915
+ bias = torch.cat([bias_q, bias_k, bias_v], dim=0)
916
+ try:
917
+ block.inner_mha_cls.Wqkv.bias.data = bias.to(target_device)
918
+ except AttributeError:
919
+ pass
920
+
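A toy-sized sketch (hypothetical shapes; the real blocks use the checkpoint's hidden size and head count) of the Wqkv regrouping performed above: rows stored per head as interleaved (q, k, v) slices are rearranged into contiguous [Wq; Wk; Wv] blocks.

import torch

hidden_size, num_heads = 4, 2
head_dim = hidden_size // num_heads
Wqkv = torch.randn(3 * hidden_size, hidden_size)   # (out_features, in_features)

W = Wqkv.permute(1, 0).reshape(hidden_size, num_heads, 3, head_dim)
Wq, Wk, Wv = W.unbind(dim=-2)
W_split = torch.cat(
    [Wq.reshape(hidden_size, -1), Wk.reshape(hidden_size, -1), Wv.reshape(hidden_size, -1)],
    dim=-1,
).permute(1, 0)
print(W_split.shape)  # still (3 * hidden_size, hidden_size), rows grouped as q | k | v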
921
+ def to_bfloat16_except_pr_lc(self, to_float32=False):
922
+ """Convert all parameters to bfloat16 except for the poles and residues.
923
+
924
+ Particularly important for longer prompts.
925
+ """
926
+ excluded_shapes = [(4096, 1, 128)]
927
+ for k, p in self.named_parameters():
928
+ if "projections" not in k: # avoid TE linears
929
+ if "log_poles" not in k and "residues" not in k and p.shape not in excluded_shapes:
930
+ p.data = p.data.to(torch.bfloat16)
931
+ else:
932
+ if to_float32:
933
+ p.data = p.data.to(torch.float32)
934
+ for k, b in self.named_buffers():
935
+ if "inv_freq" in k:
936
+ if to_float32:
937
+ b.data = b.data.to(torch.float32)
positional_embeddings.py ADDED
@@ -0,0 +1,114 @@
1
+ # Copied verbatim from vortex
2
+ """
3
+ Armin Thomas, Jan 2023. Modified by Eric Nguyen.
4
+
5
+ Wrappers for linearly interpolated rope embeddings to use inside of MHA layers of Flash Attn.
6
+
7
+ """
8
+
9
+ import torch
10
+ from einops import rearrange
11
+ from .rotary import RotaryEmbedding
12
+
13
+
14
+ # simple wrapper for flash-attn RoPE with linear scaling:
15
+ class LinearlyScaledRotaryEmbedding(RotaryEmbedding):
16
+ def __init__(
17
+ self,
18
+ dim: int,
19
+ scaling_factor: float = 1.0,
20
+ base=10000.0,
21
+ interleaved=False,
22
+ scale_base=None,
23
+ pos_idx_in_fp32=True,
24
+ device=None,
25
+ ):
26
+ super().__init__(
27
+ dim=dim,
28
+ base=base,
29
+ interleaved=interleaved,
30
+ scale_base=scale_base,
31
+ pos_idx_in_fp32=pos_idx_in_fp32,
32
+ device=device,
33
+ )
34
+ self._linear_scaling_factor = scaling_factor
35
+
36
+ # adapted from: https://github.com/Dao-AILab/flash-attention/blob/43ceab630bc6c27712428da5a33fc9cb5c369d91/flash_attn/layers/rotary.py#L368
37
+ def _update_cos_sin_cache(self, seqlen, device=None, dtype=None):
38
+ # Reset the tables if the sequence length has changed,
39
+ # if we're on a new device (possibly due to tracing for instance),
40
+ # or if we're switching from inference mode to training
41
+ if (
42
+ seqlen > self._seq_len_cached
43
+ or self._cos_cached is None
44
+ or self._cos_cached.device != device
45
+ or self._cos_cached.dtype != dtype
46
+ or (self.training and self._cos_cached.is_inference())
47
+ ):
48
+ self._seq_len_cached = seqlen
49
+ # We want fp32 here, not self.inv_freq.dtype, since the model could be loaded in bf16
50
+ # And the output of arange can be quite large, so bf16 would lose a lot of precision.
51
+ # However, for compatibility reason, we add an option to use the dtype of self.inv_freq.
52
+ if self.pos_idx_in_fp32:
53
+ t = torch.arange(seqlen, device=device, dtype=torch.float32)
54
+ # linear scaling:
55
+ t = t / self._linear_scaling_factor
56
+ # We want fp32 here as well since inv_freq will be multiplied with t, and the output
57
+ # will be large. Having it in bf16 will lose a lot of precision and cause the
58
+ # cos & sin output to change significantly.
59
+ # We want to recompute self.inv_freq if it was not loaded in fp32
60
+ if self.inv_freq.dtype != torch.float32:
61
+ inv_freq = self._compute_inv_freq(device=device)
62
+ else:
63
+ inv_freq = self.inv_freq
64
+ else:
65
+ t = torch.arange(seqlen, device=device, dtype=self.inv_freq.dtype)
66
+ # linear scaling:
67
+ t = t / self._linear_scaling_factor
68
+ inv_freq = self.inv_freq
69
+ # Don't do einsum, it converts fp32 to fp16 under AMP
70
+ # freqs = torch.einsum("i,j->ij", t, self.inv_freq)
71
+ freqs = torch.outer(t, inv_freq)
72
+ if self.scale is None:
73
+ self._cos_cached = torch.cos(freqs).to(dtype)
74
+ self._sin_cached = torch.sin(freqs).to(dtype)
75
+ else:
76
+ power = (
77
+ torch.arange(seqlen, dtype=self.scale.dtype, device=self.scale.device) - seqlen // 2
78
+ ) / self.scale_base
79
+ scale = self.scale.to(device=power.device) ** rearrange(power, "s -> s 1")
80
+ # We want the multiplication by scale to happen in fp32
81
+ self._cos_cached = (torch.cos(freqs) * scale).to(dtype)
82
+ self._sin_cached = (torch.sin(freqs) * scale).to(dtype)
83
+ self._cos_k_cached = (torch.cos(freqs) / scale).to(dtype)
84
+ self._sin_k_cached = (torch.sin(freqs) / scale).to(dtype)
85
+
86
+
87
+ # swap out RoPE of existing mha:
88
+ def swap_mha_rope(
89
+ mha,
90
+ new_rope: torch.nn.Module = LinearlyScaledRotaryEmbedding,
91
+ kwargs_new_rope: dict = None,
92
+ ):
93
+ # determine mha dtype and device:
94
+ dtype = mha.Wq.weight.dtype if mha.cross_attn else mha.Wqkv.weight.dtype
95
+ device = mha.Wq.weight.device if mha.cross_attn else mha.Wqkv.weight.device
96
+ # determine RoPE settings:
97
+ kwargs_old_rope = dict(
98
+ dim=mha.rotary_emb.dim,
99
+ base=mha.rotary_emb.base,
100
+ interleaved=mha.rotary_emb.interleaved,
101
+ scale_base=mha.rotary_emb.scale_base,
102
+ pos_idx_in_fp32=mha.rotary_emb.pos_idx_in_fp32,
103
+ device=mha.rotary_emb.inv_freq.device,
104
+ )
105
+ # delete old RoPE:
106
+ del mha.rotary_emb
107
+ # create new RoPE:
108
+ kwargs_new_rope = kwargs_new_rope or {"scaling_factor": 1.0}
109
+ scaled_rope = new_rope(**kwargs_new_rope, **kwargs_old_rope).to(dtype)
110
+ # attach new RoPE to mha:
111
+ mha.rotary_emb = scaled_rope
112
+ # make sure the new RoPE is correctly registered:
113
+ assert isinstance(mha.rotary_emb, new_rope)
114
+ return mha
sample.py ADDED
@@ -0,0 +1,60 @@
1
+ # Copied verbatim from vortex
2
+ import torch
3
+
4
+
5
+ # https://github.com/NVIDIA/Megatron-LM/blob/0bb597b42c53355a567aba2a1357cc34b9d99ddd/megatron/text_generation/sampling.py
6
+ # https://github.com/huggingface/transformers/blob/a44985b41cfa2de48a5e1de7f1f93b7483da25d1/src/transformers/generation/logits_process.py#L231
7
+ def modify_logits_for_top_k_filtering(logits, top_k):
8
+ """Set the logits for none top-k values to -inf. Done in-place."""
9
+ indices_to_remove = logits < torch.topk(logits, top_k)[0][..., -1, None]
10
+ logits.masked_fill_(indices_to_remove, float("-Inf"))
11
+
12
+
13
+ # https://github.com/NVIDIA/Megatron-LM/blob/0bb597b42c53355a567aba2a1357cc34b9d99ddd/megatron/text_generation/sampling.py
14
+ # https://github.com/huggingface/transformers/blob/a44985b41cfa2de48a5e1de7f1f93b7483da25d1/src/transformers/generation/logits_process.py#L170
15
+ def modify_logits_for_top_p_filtering(logits, top_p):
16
+ """Set the logits for none top-p values to -inf. Done in-place."""
17
+ if top_p <= 0.0 or top_p >= 1.0:
18
+ return
19
+
20
+ # First sort and calculate cumulative sum of probabilities.
21
+ sorted_logits, sorted_indices = torch.sort(logits, descending=False)
22
+ cumulative_probs = sorted_logits.softmax(dim=-1).cumsum(dim=-1)
23
+ # Remove tokens with cumulative top_p above the threshold (token with 0 are kept)
24
+ sorted_indices_to_remove = cumulative_probs <= (1 - top_p)
25
+ # scatter sorted tensors to original indexing
26
+ indices_to_remove = sorted_indices_to_remove.scatter(1, sorted_indices, sorted_indices_to_remove)
27
+ logits.masked_fill_(indices_to_remove, float("-inf"))
28
+
29
+
30
+ # https://github.com/Dao-AILab/flash-attention/blob/main/flash_attn/utils/generation.py
31
+ def sample(logits, top_k=1, top_p=0.0, temperature=1.0):
32
+ """Sample from top-k logits.
33
+ Arguments:
34
+ logits: Tensor of shape (batch_size, vocab_size)
35
+ """
36
+ logits = torch.nan_to_num(logits)
37
+ logits = torch.where(logits == float("-inf"), 0, logits)
38
+ logits = torch.where(logits == float("inf"), 0, logits)
39
+
40
+ if top_k == 1: # Short-circuit for greedy decoding
41
+ return logits.argmax(dim=-1)
42
+ else:
43
+ if top_p > 0.0:
44
+ assert top_p <= 1.0, "top-p should be in (0, 1]."
45
+ if top_k > 0:
46
+ top_k = min(top_k, logits.size(-1)) # Safety check
47
+ logits_top, indices = torch.topk(logits, top_k, dim=-1)
48
+ if temperature != 1.0:
49
+ logits_top /= temperature
50
+ modify_logits_for_top_p_filtering(logits_top, top_p)
51
+
52
+ return indices[
53
+ torch.arange(indices.shape[0], device=indices.device),
54
+ torch.multinomial(torch.softmax(logits_top, dim=-1), num_samples=1).squeeze(dim=-1),
55
+ ]
56
+ else:
57
+ # Clone so that when we modify for top_p we don't change the original logits
58
+ logits_top = logits / temperature if temperature != 1.0 else logits.clone()
59
+ modify_logits_for_top_p_filtering(logits_top, top_p)
60
+ return torch.multinomial(torch.softmax(logits_top, dim=-1), num_samples=1).squeeze(dim=-1)
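A minimal usage sketch for sample() with made-up logits (top_k=1 short-circuits to argmax; otherwise top-k selection, temperature, then top-p filtering are applied before the multinomial draw):

import torch

logits = torch.tensor([[2.0, 1.0, 0.5, -1.0]])            # (batch_size=1, vocab_size=4)
greedy = sample(logits, top_k=1)                           # -> tensor([0])
stochastic = sample(logits, top_k=3, top_p=0.9, temperature=0.8)
print(greedy, stochastic)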
special_tokens_map.json ADDED
@@ -0,0 +1 @@
1
+ {}
utils.py ADDED
@@ -0,0 +1,251 @@
1
+ # Copied verbatim from vortex
2
+ import torch
3
+ import logging
4
+
5
+ log = logging.getLogger(__name__)
6
+
7
+
8
+ def get_dim_for_local_rank(dim: int, world_size: int, local_rank: int, multiple_of: int = 1) -> int:
9
+ """Get the dim for the local rank derived from splitting dim on world_size processes.
10
+
11
+ The split may not be even across the world_size processes.
12
+ """
13
+ multiple = dim // multiple_of
14
+ div = multiple // world_size
15
+ mod = multiple % world_size
16
+ local_multiple = div + int(local_rank < mod)
17
+ return local_multiple * multiple_of
18
+
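Worked example with made-up sizes of the uneven split: the first `mod` ranks absorb the remainder, and `multiple_of` scales the result back up.

# 10 units over 3 ranks -> [4, 3, 3]; with multiple_of=64, dim=640 -> [256, 192, 192]
print([get_dim_for_local_rank(10, 3, rank) for rank in range(3)])
print([get_dim_for_local_rank(640, 3, rank, multiple_of=64) for rank in range(3)])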
19
+
20
+ def grab_first_if_tuple(x):
21
+ if x.__class__.__name__ == "tuple":
22
+ return x[0]
23
+ else:
24
+ return x
25
+
26
+
27
+ def interleave(z_pre):
28
+ if len(z_pre.shape) == 3: # non-cached
29
+ x1 = z_pre[:, 0::3, :]
30
+ x2 = z_pre[:, 1::3, :]
31
+ v = z_pre[:, 2::3, :]
32
+ z_pre = torch.cat([x1, x2, v], dim=1)
33
+ return z_pre
34
+ else:
35
+ x1 = z_pre[..., 0::3]
36
+ x2 = z_pre[..., 1::3]
37
+ v = z_pre[..., 2::3]
38
+ z_pre = torch.concat([x1, x2, v], dim=-1)
39
+ return z_pre
40
+
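Toy example of interleave(): channels stored as repeating (x1, x2, v) triples are regrouped into three contiguous blocks along the channel dimension.

import torch

z = torch.arange(6).reshape(1, 6, 1)      # channel order: x1a, x2a, va, x1b, x2b, vb
print(interleave(z).squeeze())            # -> tensor([0, 3, 1, 4, 2, 5])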
41
+
42
+ def column_split(x, num_heads, head_size):
43
+ """Split a tensor with `num_heads` alongside the head dimension, instead of
44
+ across heads. Fixed to three (x2, x1, v) projections.
45
+ """
46
+ # FIXME: merge cases
47
+ if len(x.shape) == 2:
48
+ x_reshaped = x.reshape(
49
+ x.shape[0],
50
+ num_heads,
51
+ 3 * head_size,
52
+ )
53
+
54
+ x2, x1, v = (
55
+ x_reshaped[..., :head_size],
56
+ x_reshaped[..., head_size : 2 * head_size],
57
+ x_reshaped[..., 2 * head_size :],
58
+ )
59
+ x2, x1, v = (
60
+ x2.reshape(x2.shape[0], -1),
61
+ x1.reshape(x1.shape[0], -1),
62
+ v.reshape(v.shape[0], -1),
63
+ )
64
+ return x2, x1, v
65
+ else:
66
+ x = x.reshape(
67
+ x.shape[0],
68
+ num_heads,
69
+ 3 * head_size,
70
+ x.shape[2],
71
+ )
72
+ x2, x1, v = (
73
+ x[:, :, :head_size],
74
+ x[
75
+ :,
76
+ :,
77
+ head_size : 2 * head_size,
78
+ ],
79
+ x[:, :, 2 * head_size :],
80
+ )
81
+ x2, x1, v = (
82
+ x2.reshape(x2.shape[0], -1, x2.shape[-1]),
83
+ x1.reshape(x1.shape[0], -1, x1.shape[-1]),
84
+ v.reshape(v.shape[0], -1, v.shape[-1]),
85
+ )
86
+ return x2, x1, v
87
+
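Toy example of column_split() on a 2D (single-token, cached) input with 2 heads of size 2: each head's 3*head_size slab is carved into (x2, x1, v), then heads are flattened back together.

import torch

x = torch.arange(12.0).reshape(1, 12)
x2, x1, v = column_split(x, num_heads=2, head_size=2)
print(x2, x1, v)  # [[0, 1, 6, 7]], [[2, 3, 8, 9]], [[4, 5, 10, 11]]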
88
+
89
+ def load_checkpoint(model, checkpoint_path):
90
+ if checkpoint_path is None:
91
+ log.warning("Using random weights (dry-run)")
92
+ return
93
+ log.info(f"Loading {checkpoint_path}")
94
+
95
+ # We must allowlist BytesIO, as fp8-enabled checkpoints store this type
96
+ # in Transformer Engine layers' _extra_state keys. Otherwise, loading with
97
+ # weights_only=True will reject the checkpoint.
98
+ import io
99
+
100
+ torch.serialization.add_safe_globals([io.BytesIO])
101
+
102
+ with torch.inference_mode():
103
+ state = torch.load(
104
+ checkpoint_path,
105
+ # Make sure we override device location that is specified in the
106
+ # checkpoint dictionary (e.g. checkpoints may have "cuda:0"
107
+ # as a location for all layers, which then wouldn't work for
108
+ # multi-GPU case.)
109
+ map_location="cpu",
110
+ # This is an optimization: with that, we don't actually read
111
+ # whole checkpoints dictionary from disk to CPU memory in one
112
+ # go; instead, pytorch would only load relevant layers to CPU
113
+ # memory when we are about to copy them to GPU.
114
+ mmap=True,
115
+ # Make sure PyTorch is not issuing a warning regarding potential
116
+ # security issues.
117
+ weights_only=True,
118
+ )
119
+ model.to_bfloat16_except_pr_lc(to_float32=True)
120
+
121
+ model.custom_load_state_dict(state)
122
+
123
+ model.to_bfloat16_except_pr_lc()
124
+
125
+
126
+ def move_to_device(module, device):
127
+ """Recursively moves all parameters and buffers to the specified device."""
128
+ for child in module.children():
129
+ move_to_device(child, device)
130
+
131
+ for param in module.parameters(recurse=False):
132
+ if param.device != device:
133
+ param.data = param.data.to(device)
134
+
135
+ for buf in module.buffers(recurse=False):
136
+ if buf.device != device:
137
+ buf.data = buf.data.to(device)
138
+
139
+ module.to(device)
140
+
141
+
142
+ def fixup_fp8_extra_states(module):
143
+ """Recursively fixes device location of TE's Linear fp8 extra states."""
144
+ for child in module.children():
145
+ fixup_fp8_extra_states(child)
146
+
147
+ # TE Linear uses default "cuda" device to load extra state, which causes
148
+ # trouble when the layer is moved to another GPU. Instead, this is how
149
+ # TE Linear should load extra_state: using parameters' device.
150
+ torch_load = torch.load
151
+
152
+ def overridden_load(state, map_location):
153
+ device = next(module.parameters()).device
154
+ return torch_load(state, map_location=device)
155
+
156
+ if hasattr(module, "fp8_meta"):
157
+ log.debug(f"Reloading fp8 extra state to a proper device for {module}")
158
+ from unittest.mock import patch
159
+
160
+ with patch("torch.load", new=overriden_load):
161
+ module.set_extra_state(module.get_extra_state())
162
+
163
+
164
+ def fixup_te_workspace():
165
+ """TE uses single workspace tensor for all calls, disregarding that inputs
166
+ may be on separate GPUs. This patches TE's Linear module to use per-device
167
+ workspaces."""
168
+ from functools import lru_cache
169
+
170
+ @lru_cache
171
+ def te_cublas_get_workspace_per_device(device):
172
+ log.info(f"Fixup applied: Allocating cublas workspace for {device=}")
173
+ import transformer_engine.pytorch.module.base as tebase
174
+
175
+ with torch.cuda.device(device):
176
+ tebase._cublas_workspace = None # Force get_workspace() to reallocate tensor
177
+ return tebase.get_workspace()
178
+
179
+ def get_workspace():
180
+ return te_cublas_get_workspace_per_device(torch.cuda.current_device())
181
+
182
+ import transformer_engine.pytorch.module.linear as telinear
183
+
184
+ telinear.get_workspace = get_workspace
185
+
186
+
187
+ def get_init_from_string(init_str):
188
+ if type(init_str) == str:
189
+ if init_str == "torch.nn.init.zeros_":
190
+ return torch.nn.init.zeros_
191
+ elif init_str == "torch.nn.init.xavier_uniform_":
192
+ return torch.nn.init.xavier_uniform_
193
+ elif init_str == "torch.nn.init.xavier_normal_":
194
+ return torch.nn.init.xavier_normal_
195
+ else:
196
+ raise ValueError(f"Unrecognized init {init_str}")
197
+
198
+
199
+ def print_rank_0(message, debug=False, end="\n"):
200
+ """Print from rank 0 only."""
201
+ if torch.distributed.is_initialized():
202
+ if torch.distributed.get_rank() == 0:
203
+ print(message, flush=True, end=end)
204
+ else:
205
+ print(message, flush=True, end=end)
206
+
207
+
208
+ class dotdict(dict):
209
+ """dot.notation access to dictionary attributes"""
210
+
211
+ __getattr__ = dict.get
212
+ __setattr__ = dict.__setitem__
213
+ __delattr__ = dict.__delitem__
214
+
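Usage note for dotdict: attribute access falls through to the dict, and because __getattr__ is dict.get, missing keys return None instead of raising, which is what the config.get-style defaults above rely on.

cfg = dotdict({"num_layers": 2, "use_flashfft": False})
print(cfg.num_layers, cfg.get("max_seqlen", 8192), cfg.not_set)  # 2 8192 None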
215
+
216
+ def ensure_divisibility(numerator, denominator):
217
+ """Ensure that numerator is divisible by the denominator."""
218
+ assert numerator % denominator == 0, "{} is not divisible by {}".format(numerator, denominator)
219
+
220
+
221
+ def divide(numerator, denominator):
222
+ """Ensure that numerator is divisible by the denominator and return
223
+ the division value."""
224
+ ensure_divisibility(numerator, denominator)
225
+ return numerator // denominator
226
+
227
+
228
+ class Lambda(torch.nn.Module):
229
+ def __init__(self, func):
230
+ super().__init__()
231
+ self.func = func
232
+
233
+ def forward(self, x):
234
+ return self.func(x)
235
+
236
+
237
+ class VocabUtility:
238
+ """Split the vocabulary into `world_size` chunks amd return the
239
+ first and last index of the vocabulary belonging to the `rank`
240
+ partition. Note that indices are in [first, last)."""
241
+
242
+ @staticmethod
243
+ def vocab_range_from_per_partition_vocab_size(per_partition_vocab_size, rank, world_size):
244
+ index_f = rank * per_partition_vocab_size
245
+ index_l = index_f + per_partition_vocab_size
246
+ return index_f, index_l
247
+
248
+ @staticmethod
249
+ def vocab_range_from_global_vocab_size(global_vocab_size, rank, world_size):
250
+ per_partition_vocab_size = divide(global_vocab_size, world_size)
251
+ return VocabUtility.vocab_range_from_per_partition_vocab_size(per_partition_vocab_size, rank, world_size)
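Worked example with a hypothetical vocabulary size: a 512-token vocabulary over 4 ranks yields contiguous [first, last) slices of 128 ids each.

print([VocabUtility.vocab_range_from_global_vocab_size(512, rank, 4) for rank in range(4)])
# -> [(0, 128), (128, 256), (256, 384), (384, 512)]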