yagizdevre commited on
Commit
6ff2080
·
1 Parent(s): 9064a3d

added configs

Browse files
.gitattributes copy ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
__init__.py ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ from .configuration_minitransformer import MiniTransformerConfig
2
+ from .modeling_minitransformer import MiniTransformer
added_tokens.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ {
2
+ "<|endofprompt|>": 200018
3
+ }
attn.py ADDED
@@ -0,0 +1,191 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import math
2
+ import torch
3
+ from torch.nn.attention.flex_attention import flex_attention, create_block_mask
4
+ import torch.nn as nn
5
+ import torch.nn.functional as F
6
+ from .rotary_emb import apply_rotary_emb
7
+ from .utils import nearest_power_of_two
8
+
9
+ try:
10
+ from flash_attn import flash_attn_func as fa2
11
+ except ImportError as e:
12
+ print(
13
+ f"Unable to import Triton-based flash attention: {e}. No alternative currently available."
14
+ )
15
+ # TODO: Add FlexAttention + local attention mask when it's in stable release
16
+
17
class Attention(nn.Module):
    """Causal multi-head self-attention backed by FlashAttention-2 (`fa2`).

    Combines ALiBi positional slopes, sliding-window attention, and logit
    soft-capping. CUDA-only: the constructor asserts CUDA availability and
    the fa2 kernel has no CPU path.
    """

    def __init__(self, config):
        super(Attention, self).__init__()
        # config.torch_dtype may arrive as a string (e.g. "bfloat16") when the
        # config was deserialized from JSON.
        if isinstance(config.torch_dtype, str):
            torch_dtype = getattr(torch, config.torch_dtype)
        else:
            torch_dtype = config.torch_dtype
        assert torch.cuda.is_available(), "CUDA is required."
        assert config.n_embd % config.n_heads == 0
        self.n_heads = config.n_heads

        self.device = torch.device("cuda")
        self.bsz = config.bsz
        # Fused QKV projection: one matmul produces q, k and v.
        self.attn = nn.Linear(
            config.n_embd, 3 * config.n_embd, bias=config.bias, dtype=torch_dtype
        )
        self.o_proj = nn.Linear(
            config.n_embd, config.n_embd, bias=config.bias, dtype=torch_dtype
        )
        self.o_proj.SCALE_INIT = 1  # marker — presumably read by a weight-init scheme elsewhere; confirm
        self.dropout = config.dropout
        self.resid_dropout = nn.Dropout(self.dropout)
        self.alibi_slopes = self._get_alibi_slopes(self.n_heads)
        self.window_size = config.window_size
        self.softcap = config.softcap

    def _generate_slopes(self, n: int):
        # Geometric ALiBi slope schedule for n heads (closed form; n is
        # expected to be a power of two here).
        start = 2 ** (-(2 ** -(math.log2(n) - 3)))
        return [start * (start**i) for i in range(n)]

    def _get_alibi_slopes(self, n_heads: int, interpolation_factor: float = 0.25):
        """Return one float32 ALiBi slope per head, on self.device.

        Args:
            n_heads: Number of attention heads.
            interpolation_factor: Uniform down-scaling of all slopes
                (https://arxiv.org/pdf/2310.13017).
        """
        # If n_heads is a power of 2, generate slopes directly
        if math.log2(n_heads).is_integer():
            slopes = self._generate_slopes(n_heads)
        else:
            # Get slopes for the nearest power of two
            n = nearest_power_of_two(n_heads, round_up=False)
            slopes_power_of_two = self._generate_slopes(n)

            # Generate extra slopes for the remaining heads by taking every
            # other slope of the 2n schedule.
            extra_slopes = self._generate_slopes(2 * n)
            extra_slopes_trunc = extra_slopes[0::2][: n_heads - n]
            slopes = slopes_power_of_two + extra_slopes_trunc
        slopes = torch.tensor(slopes, device=self.device)
        slopes = slopes * interpolation_factor  # https://arxiv.org/pdf/2310.13017
        return slopes.to(torch.float32)  # Ensure slopes are in float32

    def forward(self, x):
        """Apply causal, windowed flash attention.

        Args:
            x: Input of shape (bsz, seq_len, n_embd).

        Returns:
            Tensor of the same shape as x.
        """
        bsz, seq_len, d_in = x.size()

        qkv = self.attn(x)
        q, k, v = torch.chunk(qkv, 3, dim=2)

        # fa2 consumes (batch, seq, heads, head_dim) — no transpose needed.
        q = q.view(bsz, seq_len, self.n_heads, d_in // self.n_heads)
        k = k.view(bsz, seq_len, self.n_heads, d_in // self.n_heads)
        v = v.view(bsz, seq_len, self.n_heads, d_in // self.n_heads)
        y = fa2(  # https://arxiv.org/pdf/2307.08691
            q,
            k,
            v,
            dropout_p=self.dropout if self.training else 0.0,
            causal=True,
            # (left, right) window: look back window_size tokens, none ahead.
            window_size=(self.window_size, 0),
            alibi_slopes=self.alibi_slopes,  # https://arxiv.org/pdf/2108.12409
            softcap=self.softcap,  # https://arxiv.org/pdf/2408.00118
        )
        y = y.contiguous().view(bsz, seq_len, d_in)
        y = self.resid_dropout(self.o_proj(y))
        return y
87
+
88
class AttentionSDPA(nn.Module):
    """Causal multi-head self-attention via F.scaled_dot_product_attention.

    Portable counterpart of `Attention`: same fused-QKV projections and
    dropout, but without the FlashAttention-only features (ALiBi slopes,
    sliding window, soft-capping). Runs on any device.
    """

    def __init__(self, config):
        # BUG FIX: was `super(Attention, self).__init__()`, which raises
        # TypeError because AttentionSDPA does not subclass Attention.
        super(AttentionSDPA, self).__init__()
        # config.torch_dtype may be a string (e.g. "bfloat16") from JSON.
        if isinstance(config.torch_dtype, str):
            torch_dtype = getattr(torch, config.torch_dtype)
        else:
            torch_dtype = config.torch_dtype
        assert config.n_embd % config.n_heads == 0
        self.n_heads = config.n_heads

        # SDPA does not require CUDA (the original comment acknowledged this),
        # so fall back to CPU instead of hard-asserting CUDA availability.
        self.device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
        self.bsz = config.bsz
        # Fused QKV projection.
        self.attn = nn.Linear(config.n_embd, 3 * config.n_embd, bias=config.bias, dtype=torch_dtype)
        self.o_proj = nn.Linear(config.n_embd, config.n_embd, bias=config.bias, dtype=torch_dtype)
        self.dropout = config.dropout
        self.resid_dropout = nn.Dropout(self.dropout)

    def forward(self, x):
        """Apply causal SDPA attention.

        Args:
            x: Input of shape (bsz, seq_len, n_embd).

        Returns:
            Tensor of the same shape as x.
        """
        bsz, seq_len, d_in = x.size()

        qkv = self.attn(x)
        q, k, v = torch.chunk(qkv, 3, dim=2)

        # Split heads: (B, L, D) -> (B, H, L, D/H), the layout SDPA expects.
        q = q.view(bsz, seq_len, self.n_heads, d_in // self.n_heads).transpose(1, 2)
        k = k.view(bsz, seq_len, self.n_heads, d_in // self.n_heads).transpose(1, 2)
        v = v.view(bsz, seq_len, self.n_heads, d_in // self.n_heads).transpose(1, 2)

        y = F.scaled_dot_product_attention(
            q, k, v,
            is_causal=True,
            dropout_p=self.dropout if self.training else 0.0
        )

        # Merge heads back: (B, H, L, D/H) -> (B, L, D).
        y = y.transpose(1, 2).contiguous().view(bsz, seq_len, d_in)

        y = self.resid_dropout(self.o_proj(y))
        return y
126
+
127
+
128
class FlexAttention(nn.Module):
    """
    Generalized multi-head attention built on torch FlexAttention.

    Supports arbitrary mask_mods (causal, sliding-window, ...) and optional
    score_mods, plus rotary positional embeddings. The block mask is
    precomputed once for config.seq_len, so inputs are expected to use that
    sequence length.
    """
    def __init__(self, config, mask_mod, score_mod=None):
        """
        Initializes the FlexAttention module.

        Args:
            config: Config providing dim, num_heads, seq_len, device.
            mask_mod (Callable): Mask to modify attention scores, e.g. causal.
            score_mod (Callable, optional): Score modifier, e.g. ALiBi bias.
        """
        super().__init__()
        self.dim, self.num_heads = config.dim, config.num_heads
        assert config.dim % config.num_heads == 0, f"dim ({self.dim}) must be divisible num_heads ({self.num_heads})"
        self.head_dim = config.dim // config.num_heads

        self.wq = nn.Linear(config.dim, config.dim)
        self.wk = nn.Linear(config.dim, config.dim)
        self.wv = nn.Linear(config.dim, config.dim)

        self.mask_mod = mask_mod
        self.score_mod = score_mod
        # Build the block mask once; B/H are None so it broadcasts over
        # batch and heads.
        self.block_mask = create_block_mask(
            mask_mod=self.mask_mod,
            B=None,  # Broadcast
            H=None,  # Broadcast
            Q_LEN=config.seq_len,
            KV_LEN=config.seq_len,
            device=config.device,
        )

        self.o_proj = nn.Linear(config.dim, config.dim)
        self.o_proj.SCALE_INIT = 1  # marker — presumably read by a weight-init scheme elsewhere; confirm

    def forward(
        self,
        x: torch.Tensor = None,
        q: torch.Tensor = None,
        k: torch.Tensor = None,
        v: torch.Tensor = None,
        freqs_cis: torch.Tensor = None,
    ) -> torch.Tensor:
        """Self-attention when `x` is given, cross-attention via explicit q/k/v.

        Returns:
            Tensor of shape (bsz, q_len, dim).
        """
        if x is not None:
            q = k = v = x
        if any(t is None for t in [q, k, v]):
            raise ValueError("Must provide either x for self-attention or q/k/v for cross-attention.")

        bsz, q_len, _ = q.shape
        _, k_len, _ = k.shape
        _, v_len, _ = v.shape

        # BUG FIX: reshaping a (B, L, dim) projection directly to
        # (B, H, L, head_dim) reinterprets memory and scrambles tokens across
        # heads. Split heads with view + transpose to get the (B, H, L, hd)
        # layout flex_attention expects.
        Q = self.wq(q).view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
        K = self.wk(k).view(bsz, k_len, self.num_heads, self.head_dim).transpose(1, 2)
        V = self.wv(v).view(bsz, v_len, self.num_heads, self.head_dim).transpose(1, 2)

        # NOTE(review): assumes apply_rotary_emb accepts the (B, H, L, head_dim)
        # layout the original code nominally passed — confirm against rotary_emb.py.
        Q, K = apply_rotary_emb(Q, K, freqs_cis=freqs_cis)

        output = flex_attention(Q, K, V, block_mask=self.block_mask, score_mod=self.score_mod)
        # Merge heads back: (B, H, L, head_dim) -> (B, L, dim).
        output = output.transpose(1, 2).contiguous().view(bsz, q_len, self.dim)
        output = self.o_proj(output)
        return output
attn_masks.py ADDED
@@ -0,0 +1,188 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ from torch.nn.attention.flex_attention import _mask_mod_signature
3
+
4
def causal_mask(
    batch_size: int,
    num_heads: int,
    q_idx: torch.Tensor,
    kv_idx: torch.Tensor
) -> torch.Tensor:
    """Mask mod for causal (autoregressive) attention.

    The batch and head arguments are part of the flex-attention mask_mod
    signature but are ignored: causality depends only on positions.

    Args:
        batch_size: Batch index/size (unused).
        num_heads: Head index/count (unused).
        q_idx: Query position indices.
        kv_idx: Key/value position indices.

    Returns:
        Boolean tensor that is True exactly where the key position does not
        lie in the query's future, i.e. kv_idx <= q_idx.
    """
    return kv_idx <= q_idx
26
+
27
+
28
+ def generate_sliding_window_mask(window_size: int, causal: bool = True) -> _mask_mod_signature:
29
+ """
30
+ Creates a sliding window mask function.
31
+
32
+ If `causal=True`, each query token at position i can attend only to tokens j
33
+ in [i - window_size, i].
34
+ If `causal=False`, each query token i can attend to any token j in
35
+ [i - window_size, i + window_size], i.e. a symmetric window of size `window_size`.
36
+
37
+ Args:
38
+ window_size (int): The maximum distance from i that i can attend to.
39
+ causal (bool): Whether to enforce causal ordering (i >= j). Defaults to True.
40
+
41
+ Returns:
42
+ _mask_mod_signature: A callable mask function that takes
43
+ (batch_size, num_heads, q_idx, kv_idx) and returns a boolean tensor
44
+ indicating allowed attention connections.
45
+ """
46
+ def sliding_window_mask(
47
+ batch_size: int,
48
+ num_heads: int,
49
+ q_idx: torch.Tensor,
50
+ kv_idx: torch.Tensor
51
+ ) -> torch.Tensor:
52
+ """
53
+ If causal is True:
54
+ within_window = (q_idx - kv_idx) <= window_size, and q_idx >= kv_idx.
55
+ If causal is False:
56
+ within_window = abs(q_idx - kv_idx) <= window_size.
57
+ """
58
+ if causal:
59
+ # standard "look back" window
60
+ distance = q_idx - kv_idx
61
+ within_window = (distance >= 0) & (distance <= window_size)
62
+ else:
63
+ # symmetrical window around i
64
+ distance = (q_idx - kv_idx).abs()
65
+ within_window = distance <= window_size
66
+
67
+ return within_window
68
+
69
+ name_ext = "causal" if causal else "noncausal"
70
+ sliding_window_mask.__name__ = f"sliding_window_{window_size}_{name_ext}"
71
+ return sliding_window_mask
72
+
73
+
74
+ def generate_dilated_sliding_window_mask(
75
+ window_size: int,
76
+ dilation: int = 2,
77
+ causal: bool = True
78
+ ) -> _mask_mod_signature:
79
+ """
80
+ Creates a dilated sliding window mask function.
81
+
82
+ If `causal=True`, each query token i can attend tokens j in [i - window_size, i]
83
+ such that (i - j) % dilation == 0.
84
+ If `causal=False`, each query token i can attend tokens j in [i - window_size,
85
+ i + window_size] for which |i - j| % dilation == 0.
86
+
87
+ Args:
88
+ window_size (int): The maximum distance from i to j (backwards if causal=True,
89
+ otherwise symmetric around i).
90
+ dilation (int): The stride for skipping positions.
91
+ causal (bool): Whether to enforce causal ordering (i >= j). Defaults to True.
92
+
93
+ Returns:
94
+ _mask_mod_signature: A callable mask function that takes
95
+ (batch_size, num_heads, q_idx, kv_idx) and returns a boolean tensor
96
+ indicating allowed attention connections.
97
+ """
98
+ def dilated_sliding_window_mask(
99
+ batch_size: int,
100
+ num_heads: int,
101
+ q_idx: torch.Tensor,
102
+ kv_idx: torch.Tensor
103
+ ) -> torch.Tensor:
104
+ """
105
+ If causal is True:
106
+ distance = q_idx - kv_idx
107
+ 0 <= distance <= window_size and distance % dilation == 0.
108
+ If causal is False:
109
+ distance = (q_idx - kv_idx).abs()
110
+ distance <= window_size and distance % dilation == 0.
111
+ """
112
+ if causal:
113
+ distance = q_idx - kv_idx
114
+ within_window = (distance >= 0) & (distance <= window_size)
115
+ else:
116
+ distance = (q_idx - kv_idx).abs()
117
+ within_window = distance <= window_size
118
+
119
+ meets_dilation = (distance % dilation) == 0
120
+ return within_window & meets_dilation
121
+
122
+ mode_str = "causal" if causal else "noncausal"
123
+ dilated_sliding_window_mask.__name__ = (
124
+ f"dilated_sliding_window_{window_size}_dilation_{dilation}_{mode_str}"
125
+ )
126
+ return dilated_sliding_window_mask
127
+
128
+
129
def main():
    """
    Demonstrates usage of each mask by printing attention grids. We include a few
    basic checks to ensure the masks behave as expected. We show both the causal
    and non-causal versions for the sliding window and dilated masks.
    """
    B, H = 1, 1
    Q_LEN, KV_LEN = 8, 8

    # coordinate grids: q_idx[i, j] == i and kv_idx[i, j] == j, so the mask
    # functions evaluate every (query, key) pair at once.
    q_idx = torch.arange(Q_LEN).unsqueeze(-1).expand(Q_LEN, KV_LEN)
    kv_idx = torch.arange(KV_LEN).unsqueeze(0).expand(Q_LEN, KV_LEN)

    print("= Causal Mask =")
    c_mask = causal_mask(B, H, q_idx, kv_idx)
    print(c_mask.int(), "\n")

    print("= Sliding Window (window_size=2, causal=True) =")
    sw_causal_fn = generate_sliding_window_mask(window_size=2, causal=True)
    sw_causal = sw_causal_fn(B, H, q_idx, kv_idx)
    print(sw_causal.int(), "\n")

    print("= Sliding Window (window_size=2, causal=False) =")
    sw_noncausal_fn = generate_sliding_window_mask(window_size=2, causal=False)
    sw_noncausal = sw_noncausal_fn(B, H, q_idx, kv_idx)
    print(sw_noncausal.int(), "\n")

    print("= Dilated Sliding Window (window_size=4, dilation=2, causal=True) =")
    ds_causal_fn = generate_dilated_sliding_window_mask(window_size=4, dilation=2, causal=True)
    ds_causal = ds_causal_fn(B, H, q_idx, kv_idx)
    print(ds_causal.int(), "\n")

    print("= Dilated Sliding Window (window_size=4, dilation=2, causal=False) =")
    ds_noncausal_fn = generate_dilated_sliding_window_mask(window_size=4, dilation=2, causal=False)
    ds_noncausal = ds_noncausal_fn(B, H, q_idx, kv_idx)
    print(ds_noncausal.int(), "\n")

    # Quick checks:
    # (1) Causal means no i < j
    assert torch.all(c_mask == (q_idx >= kv_idx)), "Causal mask mismatch!"
    # (2) For windowed masks with causal=True, check a random row
    i = 5
    row_sw = sw_causal[i]
    allowed_js = torch.where(row_sw)[0]
    if len(allowed_js) > 0:
        # difference i-j <= 2 (earliest allowed key is at most window_size back)
        assert (i - allowed_js.min()) <= 2, "Window mismatch for sliding_window_mask(causal=True)."

    # (3) Dilated mask with causal=True should skip every other position if dilation=2
    i = 6
    row_ds = ds_causal[i]
    allowed_js = torch.where(row_ds)[0]
    for j in allowed_js:
        diff = i - j
        assert diff % 2 == 0, f"Dilation mismatch: got diff={diff}."

    print("All checks passed.")

if __name__ == "__main__":
    main()
attn_mods.py ADDED
@@ -0,0 +1,127 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ from torch import Tensor
3
+ from torch.nn.attention.flex_attention import _score_mod_signature
4
+ from torch._inductor.lowering import make_pointwise, register_lowering
5
+
6
+ # Some internal torch.compile details
7
+ from torch._inductor.virtualized import ops
8
+ from functools import partial
9
+
10
+
11
@torch.library.custom_op("approx::tanh", mutates_args=())
def _tanh_approx(inp: Tensor) -> Tensor:
    # Eager/reference implementation: exact tanh. The approximate PTX kernel
    # is only substituted when compiled through Inductor (lowering below).
    return torch.tanh(inp)


@_tanh_approx.register_fake
def _(inp: torch.Tensor) -> torch.Tensor:
    # Fake (meta) implementation for tracing: same shape/dtype as exact tanh.
    return torch.tanh(inp)


def _tanh_approx_lowering(inp):
    # Inductor lowering: emit the fast `tanh.approx.f32` PTX instruction
    # instead of calling the eager op.
    fn = partial(ops.inline_asm_elementwise, asm="tanh.approx.f32 $0, $1;")
    return make_pointwise(fn)(inp)


register_lowering(torch.ops.approx.tanh)(_tanh_approx_lowering)


class _TanhApprox(torch.autograd.Function):
    # Autograd wrapper so the custom op is differentiable and vmap-able.

    @staticmethod
    def forward(x):
        return torch.ops.approx.tanh(x)

    @staticmethod
    def setup_context(ctx, inputs, output):
        (x,) = inputs
        result = output
        # Save the output: d/dx tanh(x) = 1 - tanh(x)^2, so the backward pass
        # only needs the forward result.
        ctx.save_for_backward(result)

    @staticmethod
    def backward(ctx, grad_output):
        (result,) = ctx.saved_tensors
        return grad_output * (1 - result * result)

    @staticmethod
    def vmap(info, in_dims, x):
        # Under vmap, fall back to exact tanh.
        return torch.tanh(x), 0


# Rebind the public name to the autograd-aware entry point.
_tanh_approx = _TanhApprox.apply
51
+
52
+
53
+ def generate_tanh_softcap(soft_cap: int, approx: bool = False) -> _score_mod_signature:
54
+ """Returns an tanh bias score_mod given the number of heads H
55
+
56
+ Args:
57
+ soft_cap: The soft cap value to use for normalizing logits
58
+ approx: Whether to use the `tanh.approx.` ptx instruction
59
+
60
+ Returns:
61
+ tanh_softcap: score_mod
62
+ """
63
+ tanh = _tanh_approx if approx else torch.tanh
64
+
65
+ def tanh_softcap(score, b, h, q_idx, kv_idx):
66
+ return soft_cap * tanh(score / soft_cap)
67
+
68
+ prefix = "tanh_softcap_approx" if approx else "tanh_softcap"
69
+ tanh_softcap.__name__ = f"{prefix}_{soft_cap}"
70
+
71
+ return tanh_softcap
72
+
73
+ def generate_alibi_bias(H: int) -> _score_mod_signature:
74
+ """Returns an alibi bias score_mod given the number of heads H
75
+
76
+ Args:
77
+ H: number of heads
78
+
79
+ Returns:
80
+ alibi_bias: alibi bias score_mod
81
+ """
82
+
83
+ def alibi_mod(score, b, h, q_idx, kv_idx):
84
+ scale = torch.exp2(-((h + 1) * 8.0 / H))
85
+ bias = (kv_idx - q_idx) * scale
86
+ return score + bias
87
+
88
+ return alibi_mod
89
+
90
+
91
+ def generate_tanh_softcap_alibi(H: int, soft_cap: float, approx: bool = False) -> _score_mod_signature:
92
+ """Returns a combined ALiBi and tanh softcapping score_mod.
93
+
94
+ Args:
95
+ H (int): number of heads for ALiBi scaling
96
+ soft_cap (float): the soft cap value for normalizing/logit clipping
97
+ approx (bool): Whether to use the 'tanh.approx' PTX-based approximation
98
+
99
+ Returns:
100
+ A combined score_mod function that first applies ALiBi,
101
+ then performs softcap + tanh (optionally approximate).
102
+ """
103
+ tanh_func = _tanh_approx if approx else torch.tanh
104
+
105
+ def alibi_tanh_softcap(score, b, h, q_idx, kv_idx):
106
+ # Compute ALiBi bias
107
+ scale = torch.exp2(-((h + 1) * 8.0 / H))
108
+ bias = (kv_idx - q_idx) * scale
109
+ score = score + bias
110
+
111
+ # Apply softcap
112
+ score = score / soft_cap
113
+
114
+ # Apply tanh
115
+ score = tanh_func(score)
116
+
117
+ # Rescale by soft_cap
118
+ score = score * soft_cap
119
+ return score
120
+
121
+ # Give the score_mod a unique name:
122
+ if approx:
123
+ alibi_tanh_softcap.__name__ = f"tanh_softcap_alibi_approx_{soft_cap}"
124
+ else:
125
+ alibi_tanh_softcap.__name__ = f"tanh_softcap_alibi_{soft_cap}"
126
+
127
+ return alibi_tanh_softcap
config.json ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model_type": "minitransformer",
3
+ "_name_or_path": "Transformer_500M",
4
+ "architectures": ["MiniTransformer"],
5
+ "dim": 896,
6
+ "num_heads": 8,
7
+ "num_layers": 12,
8
+ "seq_len": 8192,
9
+ "window_size": 8192,
10
+ "vocab_size": 200064,
11
+ "mlp_scale": 12,
12
+ "bias": false,
13
+ "dropout": 0.0,
14
+ "weight_tying": true,
15
+ "num_epochs": 1,
16
+ "global_bsz": 524288,
17
+ "bsz": 2,
18
+ "warmup_steps": 1907,
19
+ "eval_period": 50,
20
+ "save_period": 500,
21
+ "max_lr": 3.0e-4,
22
+ "min_lr": 3.0e-5,
23
+ "max_norm": 1.0,
24
+ "dilation": 1,
25
+ "fsdp": true,
26
+ "ddp": false,
27
+ "mixed_precision": true,
28
+ "torch_dtype": "bfloat16",
29
+ "cpu_offload": false,
30
+ "sharding_strategy": "full_shard",
31
+ "state_dict_type": "full",
32
+ "auto_wrap_policy": "partial",
33
+ "backward_prefetch": "backward_pre",
34
+ "forward_prefetch": false,
35
+ "sync_module_states": true,
36
+ "use_orig_params": true,
37
+ "device_id": null,
38
+ "precision": {
39
+ "param": "bfloat16",
40
+ "reduce": "bfloat16",
41
+ "buffer": "bfloat16"
42
+ },
43
+ "fsdp_modules": [
44
+ "AttentionLayer"
45
+ ],
46
+ "use_activation_checkpointing": true,
47
+ "softcap": 50.0,
48
+ "theta": 10000.0,
49
+ "use_alibi": false,
50
+ "torch_compile": false
51
+ }
configuration_minitransformer.py ADDED
@@ -0,0 +1,44 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ from transformers import PretrainedConfig, AutoConfig
3
+
4
class MiniTransformerConfig(PretrainedConfig):
    """Configuration for the MiniTransformer model (HF PretrainedConfig)."""

    model_type = "minitransformer"

    def __init__(
        self,
        bsz: int = 1,
        dim: int = 896,
        num_heads: int = 8,
        num_layers: int = 12,
        seq_len: int = 8192,
        window_size: int = 8192,
        vocab_size: int = 200064,
        mlp_scale: int = 12,
        bias: bool = False,
        dropout: float = 0.0,
        softcap: float = 50.0,
        theta: float = 10_000.0,
        use_alibi: bool = False,
        torch_dtype: torch.dtype = torch.bfloat16,
        device: torch.device = None,
        **kwargs,
    ):
        super().__init__(**kwargs)
        # Core architecture.
        self.bsz = bsz
        self.dim = dim
        self.hidden_size = dim  # HF-conventional alias for dim
        self.num_heads = num_heads
        self.num_layers = num_layers
        self.seq_len = seq_len
        self.window_size = window_size
        self.vocab_size = vocab_size
        # MLP sizing.
        self.mlp_scale = mlp_scale
        self.intermediate_size = dim * mlp_scale
        # Regularization and attention shaping.
        self.bias = bias
        self.dropout = dropout
        self.softcap = softcap
        self.theta = theta
        self.use_alibi = use_alibi
        # Runtime placement.
        self.torch_dtype = torch_dtype
        self.device = device or ('cuda' if torch.cuda.is_available() else 'cpu')  # Store as string
44
+
convolve.py ADDED
@@ -0,0 +1,84 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import torch.nn.functional as F
3
+
4
+ from .utils import nearest_power_of_two
5
+ from flashfftconv import FlashFFTConv
6
+
7
+
8
def convolve(u: torch.Tensor, v: torch.Tensor, n: int, use_approx: bool = True) -> tuple[torch.Tensor, torch.Tensor]:
    """FFT-based convolution of inputs u with filters v along the sequence dim.

    Computes the convolution (zero-padded to FFT length n) of u with v,
    together with a sign-modulated ("minus") variant; both are trimmed back
    to seq_len.

    Args:
        u: Input of shape (bsz, seq_len, d_in).
        v: Filter bank of shape (seq_len, d_out) when use_approx is True,
           else (seq_len, K).
        n: FFT length (>= seq_len).
        use_approx: Broadcast filters across channels instead of expanding
            the full (K, d_in) combination.

    Returns:
        (U_plus, U_minus) in u's ORIGINAL dtype: (bsz, seq_len, d_in) when
        use_approx, else (bsz, seq_len, K, d_in).
    """
    bsz, seq_len, d_in = u.shape
    # BUG FIX: remember the caller's dtype *before* the float32 cast below.
    # Previously `u.dtype` was read after rebinding u to float32, so outputs
    # were always float32 (sibling flash_convolve captures dtype correctly).
    dtype = u.dtype

    # Alternating-sign vector used to form the "minus" variant.
    sgn = torch.full((1, seq_len, 1), 1, device=u.device, dtype=torch.float32)
    sgn[:, 1::2] *= -1

    # Cast u and v to float32 for FFT accuracy/support.
    u = u.to(torch.float32)
    v = v.to(torch.float32)

    if use_approx:
        _, d_out = v.shape
        v = v.view(1, -1, d_out, 1)
    else:
        _, K = v.shape
        sgn = sgn.unsqueeze(-1)
        v = v.view(1, -1, K, 1, 1)
        u = u.view(bsz, -1, 1, d_in).expand(bsz, -1, K, d_in)

    v = torch.fft.rfft(v, n=n, dim=1)
    # Stack u with its sign-modulated copy so both convolutions share one FFT.
    U = torch.stack([u, u * sgn], dim=-1)
    U = torch.fft.rfft(U, n=n, dim=1)
    U_conv = torch.fft.irfft(v * U, n=n, dim=1)[:, :seq_len]
    U_plus, U_minus = torch.unbind(U_conv, dim=-1)
    # Undo the sign modulation on the "minus" branch.
    U_minus = U_minus * sgn

    # Convert back to the caller's original dtype.
    U_plus = U_plus.to(dtype)
    U_minus = U_minus.to(dtype)

    return U_plus, U_minus
39
+
40
def flash_convolve(
    u: torch.Tensor, v: torch.Tensor, flash_fft: FlashFFTConv, use_approx: bool = True,
) -> tuple[torch.Tensor, torch.Tensor]:
    """FlashFFTConv-backed counterpart of `convolve`.

    Args:
        u: Input of shape (bsz, seq_len, d_in).
        v: Filter bank of shape (seq_len, K) (K == d_out when use_approx).
        flash_fft: Pre-built FlashFFTConv instance — presumably constructed
            for `padded_len`; confirm against the caller.
        use_approx: Broadcast filters across channels instead of expanding
            every (filter, channel) combination.

    Returns:
        (U_plus, U_minus) in u's original dtype: (bsz, seq_len, d_in) when
        use_approx, else (bsz, seq_len, K, d_in).
    """
    dtype = u.dtype  # Store the original dtype
    u = u.to(torch.float32)
    v = v.to(torch.float32)

    bsz, seq_len, d_in = u.shape
    _, K = v.shape

    # Zero-pad the sequence up to the next power of two.
    padded_len = nearest_power_of_two(seq_len, round_up=True)
    pad_len = padded_len - seq_len

    # Alternating-sign vector for the "minus" variant (channels-first layout here).
    sgn = torch.full((1, 1, padded_len), 1, device=u.device, dtype=torch.float32)
    sgn[:, :, 1::2] = -1

    if use_approx:
        u_padded = F.pad(u.transpose(1, 2), (0, pad_len)).contiguous()
        v_padded = F.pad(v.transpose(0, 1), (0, pad_len)).contiguous()
        # Batch u and u*sgn together so a single flash_fft call computes both variants.
        u_conv = torch.stack([u_padded, u_padded * sgn], dim=0).reshape(2 * bsz, d_in, padded_len)
    else:
        # Expand each input channel against all K filters.
        u_k_padded = F.pad(u.transpose(1, 2), (0, pad_len)).repeat_interleave(K, dim=1).contiguous()
        v_padded = F.pad(v.transpose(0, 1), (0, pad_len)).repeat(d_in, 1).contiguous()
        u_conv = torch.stack([u_k_padded, u_k_padded * sgn], dim=0).reshape(2 * bsz, K * d_in, padded_len)

    U_conv = flash_fft(u_conv, v_padded)

    # Trim the output back to the original sequence length
    U_conv = U_conv[..., :seq_len]

    # First half of the stacked batch is the "plus" variant, second half "minus".
    u_plus, u_minus = torch.chunk(U_conv, 2, dim=0)

    if use_approx:
        # Undo the sign modulation, then restore (bsz, seq_len, d_in) layout.
        u_minus = u_minus * sgn[:, :, :seq_len]
        U_plus, U_minus = u_plus.transpose(1, 2), u_minus.transpose(1, 2)
    else:
        sgn = sgn[:, :, :seq_len].unsqueeze(-1).transpose(1, 2)
        U_plus = u_plus.view(bsz, d_in, K, seq_len).permute(0, 3, 2, 1).contiguous()
        U_minus = u_minus.view(bsz, d_in, K, seq_len).permute(0, 3, 2, 1).contiguous() * sgn

    # Convert back to original dtype
    U_plus = U_plus.to(dtype)
    U_minus = U_minus.to(dtype)

    return U_plus, U_minus
filters.py ADDED
@@ -0,0 +1,106 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import math
2
+
3
+ import numpy as np
4
+ import torch
5
+
6
+ from .utils import logger
7
+ from .utils import get_hankel
8
+
9
def get_spectral_filters(
    seq_len: int,
    K: int,
    use_hankel_L: bool = False,
    device: torch.device = None,
    dtype: torch.dtype = torch.bfloat16,
) -> torch.Tensor:
    """Return the top-K spectral filters of the Hankel matrix for seq_len.

    The eigendecomposition runs in float32 (low-precision dtypes are not
    suitable for eigh); the results are cast back to `dtype`.

    Args:
        seq_len: Sequence length / Hankel matrix size.
        K: Number of filters to keep.
        use_hankel_L: Use the Hankel_L variant of the matrix.
        device: Target device for the filters.
        dtype: Target dtype for the filters.
    """
    Z = get_hankel(seq_len, use_hankel_L, device=device, dtype=dtype)

    # Decompose in float32, then return to the requested dtype.
    eigvals, eigvecs = torch.linalg.eigh(Z.to(torch.float32))
    eigvals = eigvals.to(dtype=dtype)
    eigvecs = eigvecs.to(dtype=dtype)

    # eigh sorts eigenvalues ascending, so the last K columns are the top K.
    top_vals = eigvecs[:, -K:], eigvals[-K:]
    phi_k, sigma_k = top_vals[0], top_vals[1]

    # Scale each eigenvector by its eigenvalue^(1/4).
    filters = (phi_k * sigma_k ** 0.25).to(device=device, dtype=dtype)
    return filters
39
+
40
+
41
def compute_dimensions(n: int) -> tuple[int, int, int]:
    """Derive tensorization dimensions for a sequence length n.

    Rounds n (minus the 2 boundary entries) up to a perfect square so the
    filter bank can be expressed as a Kronecker product of two equal factors.

    Args:
        n: Sequence length; must be greater than 2.

    Returns:
        (T_prime, sqrt_T_prime, k_max): padded length with T_prime - 2 a
        perfect square, its square root, and the maximum usable filter count
        (equal to sqrt_T_prime).

    Raises:
        ValueError: If n <= 2.
    """
    if n <= 2:
        raise ValueError("n must be greater than 2")

    root = math.ceil(math.sqrt(n - 2))
    T_prime = root * root + 2
    sqrt_T_prime = math.ceil(math.sqrt(T_prime - 2))
    return T_prime, sqrt_T_prime, sqrt_T_prime
49
+
50
def get_tensorized_spectral_filters_explicit(n: int, k: int, device: torch.device) -> torch.Tensor:
    """Explicit (loop-based) construction of the tensorized spectral filters.

    Sums the Kronecker products of every pair of eigenvalue-scaled top-k
    eigenvectors of the Hankel matrix. Serves as a readable reference for
    `get_tensorized_spectral_filters`.

    Args:
        n: Sequence length.
        k: Requested number of filters (clamped to k_max).
        device: Computation device.

    Returns:
        1-D tensor of length sqrt_T_prime**2.
    """
    T_prime, sqrt_T_prime, k_max = compute_dimensions(n)
    k = min(k, k_max)

    Z = get_hankel(sqrt_T_prime).to(device)
    sigma, phi = torch.linalg.eigh(Z)
    # eigh returns eigenvalues ascending; keep the top-k.
    sigma_k = sigma[-k:]
    phi_k = phi[:, -k:]

    result = torch.zeros(sqrt_T_prime * sqrt_T_prime, device=device)

    for i in range(k):
        for j in range(k):
            # Scale each eigenvector by its eigenvalue^(1/4), then accumulate
            # the Kronecker product of the pair.
            phi_i = phi_k[:, i] * (sigma_k[i] ** 0.25)
            phi_j = phi_k[:, j] * (sigma_k[j] ** 0.25)
            kron = torch.kron(phi_i, phi_j)
            result += kron

    return result
69
+
70
+
71
def get_tensorized_spectral_filters(
    n: int = 8192,
    k: int = 24,
    use_hankel_L: bool = False,
    device: torch.device = None,
    dtype: torch.dtype = torch.bfloat16,
) -> torch.Tensor:
    """
    Compute tensorized spectral filters for given sequence length and filter count.

    The filter bank is the Kronecker product of two eigenvector banks, each
    scaled by eigenvalue**0.25.

    Args:
        n: Sequence length
        k: Number of filters (clamped to k_max from compute_dimensions)
        use_hankel_L: Hankel_main ⊗ Hankel_L? Default is Hankel_main ⊗ Hankel_main.
        device: Computation device
        dtype: Computation dtype
    """
    assert torch.cuda.is_available(), "CUDA is required."

    T_prime, sqrt_T_prime, k_max = compute_dimensions(n)
    k = min(k, k_max)

    Z = get_hankel(sqrt_T_prime)
    sigma, phi = torch.linalg.eigh(Z)
    # Top-k eigenvectors scaled by eigenvalue**0.25 (eigh sorts ascending).
    phi_i = phi[:, -k:] * sigma[-k:] ** 0.25

    if use_hankel_L:  # TODO: We may want to use Hankel_L above too if use_hankel_L is true, make another variable for this (mix != use_hankel_L)
        logger.info("Mixing Hankel_L with Hankel_main to generate tensorized filters.")
        Z_L = get_hankel(sqrt_T_prime, True)
        sigma_L, phi_L = torch.linalg.eigh(Z_L)
        phi_j = phi_L[:, -k:] * sigma_L[-k:] ** 0.25
    else:
        # Default: tensorize the main Hankel filters with themselves.
        phi_j = phi_i

    filters = torch.kron(phi_i, phi_j)
    return filters.to(device=device, dtype=dtype)
layers.py ADDED
@@ -0,0 +1,41 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import torch.nn as nn
3
+
4
+ from .attn import FlexAttention
5
+ from .modules import MLP
6
+ from .modules import Attention
7
+ try:
8
+ from liger_kernel.transformers.swiglu import LigerSwiGLUMLP as TritonMLP
9
+ triton_mlp = True
10
+ except ImportError as e:
11
+ print(
12
+ f"Unable to import Triton-based MLP: {e}. Falling back to vanilla SwiGLU MLP instead."
13
+ )
14
+ triton_mlp = False
15
+
16
+ try:
17
+ from liger_kernel.transformers.rms_norm import LigerRMSNorm as TritonNorm
18
+ triton_norm = True
19
+ except ImportError as e:
20
+ print(
21
+ f"Unable to import Triton-based RMSNorm: {e}. Falling back to PyTorch implementation."
22
+ )
23
+ from torch.nn import RMSNorm
24
+ triton_norm = False
25
+
26
class AttentionLayer(nn.Module):
    """Pre-norm transformer block: FlexAttention then a gated MLP, each
    wrapped in RMSNorm and a residual connection."""

    def __init__(self, config, mask_mod, score_mod=None) -> None:
        super(AttentionLayer, self).__init__()
        self.attn_norm = nn.RMSNorm(config.dim)
        self.attn = FlexAttention(
            config=config,
            mask_mod=mask_mod,
            score_mod=score_mod,
        )
        self.mlp_norm = nn.RMSNorm(config.dim)
        self.mlp = MLP(config)

    def forward(self, x: torch.Tensor, freqs_cis: torch.Tensor = None) -> torch.Tensor:
        """Apply the attention and MLP sublayers with pre-norm residuals."""
        attn_out = self.attn(self.attn_norm(x), freqs_cis=freqs_cis)
        hidden = x + attn_out
        mlp_out = self.mlp(self.mlp_norm(hidden))
        return hidden + mlp_out
merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
mlp.py ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch.nn as nn
2
+ from torch.nn import functional as F
3
+ import torch
4
class MLP(nn.Module):
    """Gated feed-forward network (GLU family, https://arxiv.org/pdf/2002.05202):
    a tanh-approximated GELU gate multiplied with an up projection, then
    projected back down and passed through dropout."""

    def __init__(self, config):
        super().__init__()
        self.hidden_size = config.dim
        self.intermediate_size = config.dim * config.mlp_scale
        # Construction order matters for RNG-reproducible initialization.
        self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=config.bias)
        self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=config.bias)
        self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=config.bias)
        self.dropout = nn.Dropout(config.dropout)

    def forward(self, x):
        """Return dropout(down_proj(gelu_tanh(gate_proj(x)) * up_proj(x)))."""
        activated_gate = F.gelu(self.gate_proj(x), approximate="tanh")
        fused = activated_gate * self.up_proj(x)
        return self.dropout(self.down_proj(fused))
modeling_minitransformer.py ADDED
@@ -0,0 +1,218 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import torch.nn as nn
3
+ import torch.nn.functional as F
4
+
5
+ from transformers import PreTrainedModel
6
+ from transformers.modeling_outputs import CausalLMOutput
7
+
8
+ from .modules import Attention
9
+ from .utils import nearest_power_of_two
10
+ from .layers import AttentionLayer
11
+ from .configuration_minitransformer import MiniTransformerConfig
12
+
13
+ from .attn_masks import causal_mask
14
+ from .attn_mods import generate_tanh_softcap
15
+ from .rotary_emb import precompute_freqs_cis
16
+
17
+ try:
18
+ from liger_kernel.transformers.rms_norm import LigerRMSNorm as TritonNorm
19
+ triton_norm = True
20
+ except ImportError as e:
21
+ print(
22
+ f"Unable to import Triton-based RMSNorm: {e}. Falling back to PyTorch implementation."
23
+ )
24
+ from torch.nn import RMSNorm
25
+ triton_norm = False
26
# Load the tokenizer
# NOTE(review): this runs at import time and may hit the network (a Hugging
# Face Hub download with trust_remote_code=True). It is currently used only
# for decoded debug output in generate() — confirm this module-level side
# effect is intended in a library file.
from transformers import AutoModelForCausalLM, AutoTokenizer
model_name = "Hazan-Lab/Transformer_500M"
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    trust_remote_code=True
)
34
+
35
class MiniTransformer(PreTrainedModel):
    """Decoder-only Transformer LM with rotary position embeddings and
    FlexAttention blocks, exposing an HF-style forward() and a naive
    sampling generate()."""

    config_class = MiniTransformerConfig

    def __init__(self, config) -> None:
        super(MiniTransformer, self).__init__(config)
        self.num_layers = config.num_layers
        # Bug fix: the original assert message referenced self.dim /
        # self.num_heads, which do not exist and would raise AttributeError
        # instead of the intended AssertionError.
        assert config.dim % config.num_heads == 0, (
            f"dim ({config.dim}) must be divisible by num_heads ({config.num_heads})"
        )
        self.head_dim = config.dim // config.num_heads
        logit_softcap = generate_tanh_softcap(soft_cap=config.softcap)

        # From pytorch/pytorch#123411, we set persistent=True for torch.compile and PP compatibility
        self.register_buffer("freqs_cis", precompute_freqs_cis(
            head_dim=self.head_dim,
            max_seq_len=config.seq_len,
            theta=config.theta,
        ), persistent=True)

        self.tok_emb = nn.Embedding(config.vocab_size, config.dim)
        self.dropout = nn.Dropout(config.dropout)

        self.layers = nn.ModuleList()
        for _ in range(self.num_layers):
            layer = AttentionLayer(config, mask_mod=causal_mask, score_mod=logit_softcap)
            self.layers.append(layer)

        self.norm = nn.RMSNorm(config.dim)
        self.lm_head = nn.Linear(config.dim, config.vocab_size, bias=config.bias)
        # self.tok_emb.weight = self.lm_head.weight  # (weight tying disabled)

        self.std = (config.dim) ** -0.5
        self.apply(self._init_weights)
        print("Model Parameter Count: %.2fM\n" % (self._get_num_params() / 1e6,))

    def forward(
        self,
        input_ids: torch.Tensor,
        labels: torch.Tensor = None,
        **kwargs
    ) -> CausalLMOutput:
        """Run the model on input_ids; when labels are given, also compute
        the shifted next-token cross-entropy loss.

        Args:
            input_ids: (batch, seq_len) token ids.
            labels: optional (batch, seq_len) targets for the LM loss.

        Returns:
            CausalLMOutput with logits (batch, seq_len, vocab_size) and
            optional loss.
        """
        # Compute embeddings
        tok_emb = self.tok_emb(input_ids)

        for layer in self.layers:
            tok_emb = layer(tok_emb, self.freqs_cis)

        # Normalize and project to vocabulary
        tok_emb = self.norm(tok_emb)
        logits = self.lm_head(tok_emb)

        loss = None
        if labels is not None:
            # Shift so that tokens predict the next token
            shift_logits = logits[..., :-1, :].contiguous()
            shift_labels = labels[..., 1:].contiguous()
            loss_fct = nn.CrossEntropyLoss()
            loss = loss_fct(
                shift_logits.view(-1, shift_logits.size(-1)),
                shift_labels.view(-1)
            )

        return CausalLMOutput(
            loss=loss,
            logits=logits,
        )

    def _get_num_params(self):
        """Count parameters, excluding positional embeddings (if present)
        and the duplicate of a tied embedding matrix."""
        n_params = sum(p.numel() for p in self.parameters())
        if hasattr(self, "pos_emb") and self.pos_emb is not None:
            n_params -= self.pos_emb.weight.numel()
        if self.tok_emb.weight is self.lm_head.weight:
            n_params -= self.tok_emb.weight.numel()
        return n_params

    def _init_weights(self, module):
        """Normal(0, std) init; Linears flagged with SCALE_INIT use a std
        scaled by (2 * num_layers) ** -0.5 for residual-path projections."""
        if isinstance(module, nn.Linear):
            std = self.std
            if hasattr(module, "SCALE_INIT"):
                # Bug fix: the original mutated self.std in place, so every
                # additional flagged module compounded the shrink factor.
                std = std * (2 * self.num_layers) ** -0.5
            torch.nn.init.normal_(module.weight, mean=0.0, std=std)
            if module.bias is not None:
                torch.nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Embedding):
            torch.nn.init.normal_(module.weight, mean=0.0, std=self.std)

    @staticmethod
    def top_k_top_p_filtering(
        logits: torch.Tensor,
        top_k: int = 50,
        top_p: float = 0.95,
        filter_value: float = float("-inf"),
    ):
        """
        Filters a distribution of logits using top-k and/or nucleus (top-p) filtering.

        Note: mutates `logits` in place and also returns it.
        """
        # top_k
        if top_k > 0:
            top_k = min(top_k, logits.size(-1))
            # Remove all logits that are not in the top k
            indices_to_remove = logits < torch.topk(logits, top_k, dim=-1).values[:, -1, None]
            logits[indices_to_remove] = filter_value

        # top_p (nucleus)
        if 0 < top_p < 1.0:
            sorted_logits, sorted_indices = torch.sort(logits, descending=True, dim=-1)
            cumulative_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1)

            # Remove tokens with cumulative probability above the threshold
            sorted_indices_to_remove = cumulative_probs > top_p
            # Shift the indices to the right to keep also the first token above the threshold
            sorted_indices_to_remove[:, 1:] = sorted_indices_to_remove[:, :-1].clone()
            sorted_indices_to_remove[:, 0] = False

            # Map the removal mask from sorted order back to vocabulary order.
            indices_to_remove = sorted_indices_to_remove.scatter(
                dim=1, index=sorted_indices, src=sorted_indices_to_remove
            )
            logits[indices_to_remove] = filter_value

        return logits

    def generate(
        self,
        input_ids: torch.LongTensor,
        max_new_tokens: int = 50,
        temperature: float = 0.5,
        top_k: int = 50,
        top_p: float = 0.95,
        eos_token_id: int = None,
        pad_token_id: int = 0,
        **kwargs
    ):
        """
        Naive token-by-token generation loop that uses top-k/top-p filtering and optional temperature.

        Args:
            input_ids (torch.LongTensor): shape (batch_size, sequence_length).
            max_new_tokens (int): max number of tokens to generate (beyond input_ids length).
            temperature (float): sampling temperature (>=0).
            top_k (int): Top-K sampling cutoff.
            top_p (float): Nucleus sampling cutoff.
            eos_token_id (int): If set, stop generation when this token is produced.
            pad_token_id (int): If set, can be used to pad sequences. (Not fully used here.)
            kwargs: Unused arguments (like num_beams) for compatibility.

        Returns:
            torch.LongTensor: shape (batch_size, sequence_length + generated_tokens).
        """
        # Bug fix: removed debug prints that decoded via a module-level
        # tokenizer downloaded at import time; generation itself is unchanged.
        generated_ids = input_ids.clone()

        for _ in range(max_new_tokens):
            # Forward pass to get logits for the last token
            outputs = self.forward(generated_ids)
            logits = outputs.logits[:, -1, :]  # shape: (batch_size, vocab_size)

            # Scale logits by temperature
            if temperature != 1.0:
                logits = logits / temperature

            # Filter logits using top-k and/or top-p
            logits = self.top_k_top_p_filtering(logits, top_k=top_k, top_p=top_p)

            # Convert to probabilities
            probabilities = F.softmax(logits, dim=-1)

            # Sample from the distribution
            next_token = torch.multinomial(probabilities, num_samples=1)  # (batch_size, 1)

            # Append next token
            generated_ids = torch.cat([generated_ids, next_token], dim=1)

            # Stop early only once every sequence in the batch emitted EOS.
            if eos_token_id is not None:
                if (next_token == eos_token_id).all():
                    break

        return generated_ids
modules.py ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ from .attn import Attention
2
+ from .attn import AttentionSDPA
3
+ from .mlp import MLP
4
+ from .stu import STU
5
+
6
+
rotary_emb.py ADDED
@@ -0,0 +1,99 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+
3
def precompute_freqs_cis(head_dim: int, max_seq_len: int, theta: float = 10000.0):
    """Precompute the complex rotary-embedding table.

    Returns a [max_seq_len, head_dim // 2] complex tensor whose (t, j)
    entry is exp(i * t / theta^(2j / head_dim)).
    """
    # Inverse frequency for each even dimension index.
    exponents = torch.arange(0, head_dim, 2).float() / head_dim
    inv_freq = 1.0 / (theta ** exponents)

    # One rotation angle per (position, frequency) pair.
    positions = torch.arange(max_seq_len, dtype=torch.float32)
    angles = torch.outer(positions, inv_freq)

    # Unit-magnitude complex exponentials e^{i * angle}.
    return torch.polar(torch.ones_like(angles), angles)
18
+
19
+
20
def reshape_for_broadcast(freqs_cis: torch.Tensor, x: torch.Tensor):
    """Slice the frequency table to x's sequence length and reshape it to
    [1, 1, seq_len, half_dim] so it broadcasts over batch and heads.

    x is [B, n_heads, seq_len, head_dim_as_complex]; freqs_cis is
    [max_seq_len, half_dim].
    """
    current_len = x.shape[2]
    table = freqs_cis[:current_len]  # trim to the current sequence length
    return table.view(1, 1, current_len, -1)
29
+
30
+
31
def apply_rotary_emb(
    xq: torch.Tensor,
    xk: torch.Tensor,
    freqs_cis: torch.Tensor,
) -> tuple[torch.Tensor, torch.Tensor]:
    """Rotate query/key vectors by the precomputed complex frequencies.

    The last dimension is viewed as head_dim // 2 complex numbers
    (adjacent pairs); multiplying by the unit-magnitude freqs_cis applies
    the rotary position rotation. Outputs keep the input dtypes.
    """

    def as_complex(t: torch.Tensor) -> torch.Tensor:
        # [..., head_dim] -> [..., head_dim // 2] complex.
        return torch.view_as_complex(t.float().reshape(*t.shape[:-1], -1, 2))

    q_complex = as_complex(xq)
    k_complex = as_complex(xk)

    # Shape the table to [1, 1, seq_len, half_dim] for broadcasting.
    table = reshape_for_broadcast(freqs_cis, q_complex)

    # Complex multiply == 2D rotation; convert back to interleaved reals.
    q_rot = torch.view_as_real(q_complex * table).reshape(*xq.shape)
    k_rot = torch.view_as_real(k_complex * table).reshape(*xk.shape)
    return q_rot.type_as(xq), k_rot.type_as(xk)
52
+
53
+
54
def main():
    """Self-tests for the rotary-embedding helpers.

    Bug fix: all precompute_freqs_cis calls passed `dim=...`, but the
    function's parameter is named `head_dim`, so every test raised
    TypeError before asserting anything.
    """
    import math
    from torch.testing import assert_close

    # Test 1: No rotation at position 0
    dim = 2
    freqs_cis = precompute_freqs_cis(head_dim=dim, max_seq_len=1, theta=1.0)
    xq = torch.tensor([[[[1.0, 0.0]]]])
    xq_out, _ = apply_rotary_emb(xq, xq.clone(), freqs_cis)
    assert_close(xq_out, xq, msg="Test 1 failed")
    print("Test 1 passed.")

    # Test 2: Verify rotation at positions [0..4] in 2D
    L = 5
    freqs_cis = precompute_freqs_cis(head_dim=dim, max_seq_len=L, theta=1.0)
    xq = torch.tensor([[[[1.0, 0.0] for _ in range(L)]]])
    xq_out, _ = apply_rotary_emb(xq, xq.clone(), freqs_cis)
    expected = torch.tensor([[[[math.cos(p), math.sin(p)] for p in range(L)]]])
    assert_close(xq_out, expected, rtol=1e-6, atol=1e-6, msg="Test 2 failed")
    print("Test 2 passed.")

    # Test 3: Higher dimension at position 0
    xq = torch.tensor([[[[1.0, 0.0, 1.0, 0.0]]]])
    freqs_cis = precompute_freqs_cis(head_dim=4, max_seq_len=1, theta=1.0)
    xq_out, _ = apply_rotary_emb(xq, xq.clone(), freqs_cis)
    assert_close(xq_out, xq, msg="Test 3 failed")
    print("Test 3 passed.")

    # Test 4: Random shape & norm checks (rotation preserves vector norms)
    torch.manual_seed(1337)
    B, H, L, D = 2, 3, 5, 8
    xq = torch.randn(B, H, L, D)
    xk = torch.randn(B, H, L, D)
    freqs_cis = precompute_freqs_cis(head_dim=D, max_seq_len=L, theta=1.0)
    xq_out, xk_out = apply_rotary_emb(xq, xk, freqs_cis)
    assert xq_out.shape == (B, H, L, D), "Test 4 Q shape failed"
    assert xk_out.shape == (B, H, L, D), "Test 4 K shape failed"
    for b in range(B):
        for h in range(H):
            for l in range(L):
                assert torch.allclose(xq[b, h, l].norm(), xq_out[b, h, l].norm(), atol=1e-5), "Test 4 Q norm failed"
                assert torch.allclose(xk[b, h, l].norm(), xk_out[b, h, l].norm(), atol=1e-5), "Test 4 K norm failed"
    print("Test 4 passed.\nAll tests passed successfully!")

if __name__ == "__main__":
    main()
special_tokens_map.json ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": {
3
+ "content": "<|endoftext|>",
4
+ "lstrip": false,
5
+ "normalized": false,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "eos_token": {
10
+ "content": "<|endoftext|>",
11
+ "lstrip": false,
12
+ "normalized": false,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "unk_token": {
17
+ "content": "<|endoftext|>",
18
+ "lstrip": false,
19
+ "normalized": false,
20
+ "rstrip": false,
21
+ "single_word": false
22
+ }
23
+ }
stu.py ADDED
@@ -0,0 +1,87 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import torch.nn as nn
3
+
4
+ from .convolve import convolve, flash_convolve
5
+
6
+ try:
7
+ from flashfftconv import FlashFFTConv
8
+
9
+ flash_fft_available = True
10
+ except ImportError as e:
11
+ print(
12
+ f"Unable to import FlashFFTConv: {e}. Falling back to PyTorch implementation."
13
+ )
14
+ flash_fft_available = False
15
+
16
+
17
class STU(nn.Module):
    """Spectral Transform Unit: convolves inputs with precomputed spectral
    filters `phi` and contracts with learned projection tensors.

    Two modes: `use_approx` projects inputs/filters first then convolves;
    otherwise it convolves raw inputs with phi and contracts afterwards.
    """

    def __init__(self, config, phi, n) -> None:
        super(STU, self).__init__()
        self.config = config
        # config.torch_dtype may be a string (e.g. "bfloat16") or a dtype.
        if isinstance(config.torch_dtype, str):
            torch_dtype = getattr(torch, config.torch_dtype)
        else:
            torch_dtype = config.torch_dtype
        self.phi = phi.to(device=config.device, dtype=torch_dtype)
        self.n = n
        self.K = config.num_eigh
        self.d_in = config.n_embd
        self.d_out = config.n_embd
        self.use_hankel_L = config.use_hankel_L
        self.use_approx = config.use_approx
        # Optional fused FFT convolution (bf16 only) when available.
        self.flash_fft = (
            FlashFFTConv(self.n, dtype=torch.bfloat16)
            if config.use_flash_fft and flash_fft_available
            else None
        )
        if self.use_approx:
            self.M_inputs = nn.Parameter(
                torch.empty(self.d_in, self.d_out, dtype=torch_dtype)
            )
            self.M_filters = nn.Parameter(
                torch.empty(self.K, self.d_in, dtype=torch_dtype)
            )
        else:
            self.M_phi_plus = nn.Parameter(
                torch.empty(self.K, self.d_in, self.d_out, dtype=torch_dtype)
            )
            if not self.use_hankel_L:
                self.M_phi_minus = nn.Parameter(
                    torch.empty(self.K, self.d_in, self.d_out, dtype=torch_dtype)
                )

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Apply the spectral transform to x (cast to the parameter dtype)."""
        # Bug fix: the original read self.M_inputs.dtype unconditionally,
        # but M_inputs only exists when use_approx is True — the non-approx
        # path crashed with AttributeError.
        dtype = self.M_inputs.dtype if self.use_approx else self.M_phi_plus.dtype
        x = x.to(dtype=dtype)
        if self.use_approx:
            # Contract inputs and filters over the K and d_in dimensions, then convolve
            x_proj = x @ self.M_inputs
            phi_proj = self.phi @ self.M_filters
            x_proj = x_proj.to(dtype=dtype)
            phi_proj = phi_proj.to(dtype=dtype)
            if self.flash_fft:
                spectral_plus, spectral_minus = flash_convolve(
                    x_proj, phi_proj, self.flash_fft, self.use_approx
                )
            else:
                spectral_plus, spectral_minus = convolve(
                    x_proj, phi_proj, self.n, self.use_approx
                )
        else:
            # Convolve inputs and filters first,
            if self.flash_fft:
                U_plus, U_minus = flash_convolve(
                    x, self.phi, self.flash_fft, self.use_approx
                )
            else:
                U_plus, U_minus = convolve(x, self.phi, self.n, self.use_approx)
            # Then, contract over the K and d_in dimensions
            spectral_plus = torch.tensordot(
                U_plus, self.M_phi_plus, dims=([2, 3], [0, 1])
            )
            if not self.use_hankel_L:
                spectral_minus = torch.tensordot(
                    U_minus, self.M_phi_minus, dims=([2, 3], [0, 1])
                )

        # With use_hankel_L, only the "plus" branch exists.
        return spectral_plus if self.use_hankel_L else spectral_plus + spectral_minus
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_prefix_space": false,
3
+ "added_tokens_decoder": {
4
+ "199999": {
5
+ "content": "<|endoftext|>",
6
+ "lstrip": false,
7
+ "normalized": false,
8
+ "rstrip": false,
9
+ "single_word": false,
10
+ "special": true
11
+ },
12
+ "200018": {
13
+ "content": "<|endofprompt|>",
14
+ "lstrip": false,
15
+ "normalized": false,
16
+ "rstrip": false,
17
+ "single_word": false,
18
+ "special": true
19
+ }
20
+ },
21
+ "bos_token": "<|endoftext|>",
22
+ "clean_up_tokenization_spaces": false,
23
+ "eos_token": "<|endoftext|>",
24
+ "model_max_length": 128000,
25
+ "tokenizer_class": "GPT2Tokenizer",
26
+ "unk_token": "<|endoftext|>"
27
+ }
utils.py ADDED
@@ -0,0 +1,105 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import math
2
+ import numpy as np
3
+ import torch
4
+
5
+ import logging
6
+ import os
7
+ import sys
8
+ from colorama import Fore, Style, init
9
+ from dotenv import load_dotenv
10
+
11
+ load_dotenv()
12
+ init(autoreset=True)
13
+
14
def nearest_power_of_two(x: int, round_up: bool = False) -> int:
    """Return the power of two at or below x (default), or at or above x
    when round_up is True. x must be a positive number."""
    exponent = math.ceil(math.log2(x)) if round_up else math.floor(math.log2(x))
    return 1 << exponent
+
19
def get_hankel(seq_len: int, use_hankel_L: bool = False, device: torch.device = None, dtype: torch.dtype = torch.float32) -> torch.Tensor:
    """Build the (seq_len x seq_len) spectral-filtering Hankel matrix.

    With 1-based indices i, j and s = i + j:
      - default:        Z[i, j] = 2 / (s^3 - s)
      - use_hankel_L:   Z[i, j] = ((-1)^(s-2) + 1) * 8 / ((s+3)(s-1)(s+1))

    Args:
        seq_len: matrix size.
        use_hankel_L: select the Hankel-L variant.
        device, dtype: placement of the returned tensor.
    """
    entries = torch.arange(1, seq_len + 1, dtype=dtype, device=device)
    i_plus_j = entries[:, None] + entries[None, :]

    if use_hankel_L:
        sgn = (-1.0) ** (i_plus_j - 2.0) + 1.0
        denom = (i_plus_j + 3.0) * (i_plus_j - 1.0) * (i_plus_j + 1.0)
        Z = sgn * (8.0 / denom)
    else:
        # Bug fix: the original `elif not use_hankel_L: ... else: raise`
        # made the raise branch unreachable; a plain else is equivalent.
        Z = 2.0 / (i_plus_j**3 - i_plus_j)

    return Z
+
34
+
35
class ColorFormatter(logging.Formatter):
    """
    A custom log formatter that applies color based on the log level using the Colorama library.

    Attributes:
        LOG_COLORS (dict): A dictionary mapping log levels to their corresponding color codes.
    """

    # Colors for each log level
    LOG_COLORS = {
        logging.DEBUG: Fore.LIGHTMAGENTA_EX + Style.BRIGHT,
        logging.INFO: Fore.CYAN,
        logging.WARNING: Fore.YELLOW + Style.BRIGHT,
        logging.ERROR: Fore.RED + Style.BRIGHT,
        logging.CRITICAL: Fore.RED + Style.BRIGHT + Style.NORMAL,
    }

    # Colors for other parts of the log message
    TIME_COLOR = Fore.GREEN
    FILE_COLOR = Fore.BLUE
    LEVEL_COLOR = Style.BRIGHT

    def __init__(self, fmt=None):
        super().__init__(fmt or "%(asctime)s - %(levelname)s - %(filename)s:%(lineno)d - %(message)s", "%Y-%m-%d %H:%M:%S")

    def format(self, record):
        """
        Formats a log record with the appropriate color based on the log level.

        Args:
            record (logging.LogRecord): The log record to format.

        Returns:
            str: The formatted log message with colors applied.
        """
        # Apply color based on the log level
        level_color = self.LOG_COLORS.get(record.levelno, Fore.WHITE)
        time_str = f"{self.TIME_COLOR}{self.formatTime(record)}{Style.RESET_ALL}"
        levelname_str = f"{level_color}{record.levelname}{Style.RESET_ALL}"
        file_info_str = f"{self.FILE_COLOR}{record.filename}:{record.lineno}{Style.RESET_ALL}"

        # Bug fix: use record.getMessage() instead of record.msg so that
        # %-style lazy arguments (logger.info("x=%s", x)) are interpolated;
        # record.msg is the raw, unformatted template.
        log_msg = f"{time_str} - {levelname_str} - {file_info_str} - {record.getMessage()}"
        return log_msg
79
+
80
def setup_logger():
    """
    Build a logger that writes color-formatted records to standard output.

    Uses ColorFormatter for level-based coloring. The level is DEBUG when
    the DEBUG environment variable is truthy ("true"/"1"/"t", any case),
    otherwise INFO.

    Returns:
        logging.Logger: configured logger for this module.
    """
    stdout_handler = logging.StreamHandler(sys.stdout)
    stdout_handler.setFormatter(ColorFormatter())

    log = logging.getLogger(__name__)
    debug_enabled = os.environ.get("DEBUG", "False").lower() in ("true", "1", "t")
    log.setLevel(logging.DEBUG if debug_enabled else logging.INFO)
    log.addHandler(stdout_handler)
    log.propagate = False  # Prevents multiple logging if re-initialized

    return log

logger = setup_logger()  # Initialize once to prevent multiple loggers
vocab.json ADDED
The diff for this file is too large to render. See raw diff