sochasticbackup committed
Commit ea3734f · 1 Parent(s): 2997d61

second init with torch

.gitattributes DELETED
@@ -1,35 +0,0 @@
- *.7z filter=lfs diff=lfs merge=lfs -text
- *.arrow filter=lfs diff=lfs merge=lfs -text
- *.bin filter=lfs diff=lfs merge=lfs -text
- *.bz2 filter=lfs diff=lfs merge=lfs -text
- *.ckpt filter=lfs diff=lfs merge=lfs -text
- *.ftz filter=lfs diff=lfs merge=lfs -text
- *.gz filter=lfs diff=lfs merge=lfs -text
- *.h5 filter=lfs diff=lfs merge=lfs -text
- *.joblib filter=lfs diff=lfs merge=lfs -text
- *.lfs.* filter=lfs diff=lfs merge=lfs -text
- *.mlmodel filter=lfs diff=lfs merge=lfs -text
- *.model filter=lfs diff=lfs merge=lfs -text
- *.msgpack filter=lfs diff=lfs merge=lfs -text
- *.npy filter=lfs diff=lfs merge=lfs -text
- *.npz filter=lfs diff=lfs merge=lfs -text
- *.onnx filter=lfs diff=lfs merge=lfs -text
- *.ot filter=lfs diff=lfs merge=lfs -text
- *.parquet filter=lfs diff=lfs merge=lfs -text
- *.pb filter=lfs diff=lfs merge=lfs -text
- *.pickle filter=lfs diff=lfs merge=lfs -text
- *.pkl filter=lfs diff=lfs merge=lfs -text
- *.pt filter=lfs diff=lfs merge=lfs -text
- *.pth filter=lfs diff=lfs merge=lfs -text
- *.rar filter=lfs diff=lfs merge=lfs -text
- *.safetensors filter=lfs diff=lfs merge=lfs -text
- saved_model/**/* filter=lfs diff=lfs merge=lfs -text
- *.tar.* filter=lfs diff=lfs merge=lfs -text
- *.tar filter=lfs diff=lfs merge=lfs -text
- *.tflite filter=lfs diff=lfs merge=lfs -text
- *.tgz filter=lfs diff=lfs merge=lfs -text
- *.wasm filter=lfs diff=lfs merge=lfs -text
- *.xz filter=lfs diff=lfs merge=lfs -text
- *.zip filter=lfs diff=lfs merge=lfs -text
- *.zst filter=lfs diff=lfs merge=lfs -text
- *tfevents* filter=lfs diff=lfs merge=lfs -text
.gitignore CHANGED
@@ -1,216 +1,46 @@
- # Byte-compiled / optimized / DLL files
__pycache__/
- *.py[codz]
*$py.class
-
- # C extensions
*.so
-
- # Distribution / packaging
.Python
- build/
- develop-eggs/
- dist/
- downloads/
- eggs/
- .eggs/
- lib/
- lib64/
- parts/
- sdist/
- var/
- wheels/
- share/python-wheels/
*.egg-info/
- .installed.cfg
- *.egg
- MANIFEST
-
- # PyInstaller
- # Usually these files are written by a python script from a template
- # before PyInstaller builds the exe, so as to inject date/other infos into it.
- *.manifest
- *.spec
-
- # Installer logs
- pip-log.txt
- pip-delete-this-directory.txt
-
- # Unit test / coverage reports
- htmlcov/
- .tox/
- .nox/
- .coverage
- .coverage.*
- .cache
- nosetests.xml
- coverage.xml
- *.cover
- *.py.cover
- .hypothesis/
- .pytest_cache/
- cover/
-
- # Translations
- *.mo
- *.pot
-
- # Django stuff:
- *.log
- local_settings.py
- db.sqlite3
- db.sqlite3-journal
-
- # Flask stuff:
- instance/
- .webassets-cache
-
- # Scrapy stuff:
- .scrapy
-
- # Sphinx documentation
- docs/_build/
-
- # PyBuilder
- .pybuilder/
- target/
-
- # Jupyter Notebook
- .ipynb_checkpoints
-
- # IPython
- profile_default/
- ipython_config.py
-
- # pyenv
- # For a library or package, you might want to ignore these files since the code is
- # intended to run in multiple environments; otherwise, check them in:
- # .python-version
-
- # pipenv
- # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
- # However, in case of collaboration, if having platform-specific dependencies or dependencies
- # having no cross-platform support, pipenv may install dependencies that don't work, or not
- # install all needed dependencies.
- # Pipfile.lock
-
- # UV
- # Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
- # This is especially recommended for binary packages to ensure reproducibility, and is more
- # commonly ignored for libraries.
- # uv.lock
-
- # poetry
- # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
- # This is especially recommended for binary packages to ensure reproducibility, and is more
- # commonly ignored for libraries.
- # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
- # poetry.lock
- # poetry.toml
-
- # pdm
- # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
- # pdm recommends including project-wide configuration in pdm.toml, but excluding .pdm-python.
- # https://pdm-project.org/en/latest/usage/project/#working-with-version-control
- # pdm.lock
- # pdm.toml
- .pdm-python
- .pdm-build/
-
- # pixi
- # Similar to Pipfile.lock, it is generally recommended to include pixi.lock in version control.
- # pixi.lock
- # Pixi creates a virtual environment in the .pixi directory, just like venv module creates one
- # in the .venv directory. It is recommended not to include this directory in version control.
- .pixi
-
- # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
- __pypackages__/
-
- # Celery stuff
- celerybeat-schedule
- celerybeat.pid
-
- # Redis
- *.rdb
- *.aof
- *.pid
-
- # RabbitMQ
- mnesia/
- rabbitmq/
- rabbitmq-data/
-
- # ActiveMQ
- activemq-data/
-
- # SageMath parsed files
- *.sage.py

- # Environments
- .env
- .envrc
- .venv
- env/
venv/
ENV/
- env.bak/
- venv.bak/
-
- # Spyder project settings
- .spyderproject
- .spyproject
-
- # Rope project settings
- .ropeproject
-
- # mkdocs documentation
- /site
-
- # mypy
- .mypy_cache/
- .dmypy.json
- dmypy.json

- # Pyre type checker
- .pyre/

- # pytype static type analyzer
- .pytype/

- # Cython debug symbols
- cython_debug/

- # PyCharm
- # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
- # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
- # and can be added to the global gitignore or merged into this file. For a more nuclear
- # option (not recommended) you can uncomment the following to ignore the entire idea folder.
- # .idea/

- # Abstra
- # Abstra is an AI-powered process automation framework.
- # Ignore directories containing user credentials, local state, and settings.
- # Learn more at https://abstra.io/docs
- .abstra/
-
- # Visual Studio Code
- # Visual Studio Code specific template is maintained in a separate VisualStudioCode.gitignore
- # that can be found at https://github.com/github/gitignore/blob/main/Global/VisualStudioCode.gitignore
- # and can be added to the global gitignore or merged into this file. However, if you prefer,
- # you could uncomment the following to ignore the entire vscode folder
- # .vscode/
-
- # Ruff stuff:
- .ruff_cache/
-
- # PyPI configuration file
- .pypirc
-
- # Marimo
- marimo/_static/
- marimo/_lsp/
- __marimo__/

- # Streamlit
- .streamlit/secrets.toml

+ # Python
__pycache__/
+ *.py[cod]
*$py.class
*.so
.Python
*.egg-info/
+ dist/
+ build/

+ # Virtual environments
venv/
+ env/
ENV/

+ # IDE
+ .vscode/
+ .idea/
+ *.swp
+ *.swo
+ *~

+ # OS
+ .DS_Store
+ Thumbs.db

+ # Gradio
+ gradio_cached_examples/
+ flagged/

+ # Model cache (these will download automatically on HF Spaces)
+ *.bin
+ *.safetensors
+ models/
+ checkpoints/

+ # Logs
+ *.log
+ DEPLOY.md
+ DEPLOYMENT_READY.md
+ README_TASK.md
+ SETUP_NOTES.md
+ verify_deployment.py
+ verify_deployment.py
+ install_stripedhyena.py
README.md CHANGED
@@ -8,7 +8,7 @@ sdk_version: 4.44.0
app_file: app.py
pinned: false
license: apache-2.0
- python_version: 3.11
+ python_version: 3.10
---

Check configuration
evo/configs/evo-1-8k-base_inference.yml CHANGED
@@ -2,8 +2,8 @@ vocab_size: 512
hidden_size: 4096
num_filters: 4096
max_sequence_len: 8192
- attn_layer_idxs: [8, 16, 24]
- hyena_layer_idxs: [0, 1, 2, 3, 4, 5, 6, 7, 9, 10, 11, 12, 13, 14, 15, 17, 18, 19, 20, 21, 22, 23, 25, 26, 27, 28, 29, 30, 31]
+ attn_layer_idxs: []
+ hyena_layer_idxs: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31]
num_layers: 32
short_filter_length: 3
num_attention_heads: 32
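
This config change drops the three attention layers and runs all 32 layers as Hyena blocks. A minimal sketch (assuming a checkout of this repo, with pyyaml from requirements.txt) of how these two lists drive block dispatch, mirroring get_block in stripedhyena/model.py added below:

import yaml

with open("evo/configs/evo-1-8k-base_inference.yml") as f:
    config = yaml.safe_load(f)

for layer_idx in range(config["num_layers"]):
    if layer_idx in config["attn_layer_idxs"]:
        kind = "attention"  # AttentionBlock in model.py
    elif layer_idx in config["hyena_layer_idxs"]:
        kind = "hyena"      # ParallelGatedConvBlock in model.py
    else:
        raise NotImplementedError
    print(layer_idx, kind)  # with this commit's config: all "hyena"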
requirements.txt CHANGED
@@ -1,7 +1,7 @@
- gradio==4.44.0
torch==2.1.0
numpy==1.24.3
transformers==4.36.0
einops==0.7.0
pyyaml==6.0.1
- git+https://github.com/togethercomputer/stripedhyena.git
+ tokenizers>=0.15.0
+ gradio==4.44.0
stripedhyena/__init__.py ADDED
File without changes
stripedhyena/cache.py ADDED
@@ -0,0 +1,46 @@
+ # Copyright (c) Together
+ # This software is distributed under the terms of the Apache License, Version 2.0
+ # Author: Michael Poli
+
+ from dataclasses import dataclass, field
+ from typing import Optional
+
+ from torch import Tensor
+
+
+ # https://github.com/Dao-AILab/flash-attention/blob/main/flash_attn/utils/generation.py
+ @dataclass
+ class InferenceParams:
+     """Inference parameters that are passed to the main model in order
+     to efficiently calculate and store the context during inference."""
+
+     max_seqlen: int
+     max_batch_size: int
+     seqlen_offset: int = 0
+     batch_size_offset: int = 0
+     key_value_memory_dict: dict = field(default_factory=dict)
+     lengths_per_sample: Optional[Tensor] = None
+
+     def reset(self, max_seqlen, max_batch_size):
+         self.max_seqlen = max_seqlen
+         self.max_batch_size = max_batch_size
+         self.seqlen_offset = 0
+         if self.lengths_per_sample is not None:
+             self.lengths_per_sample.zero_()
+
+
+ @dataclass
+ class RecurrentInferenceParams:
+     """Inference parameters passed to blocks with recurrent mode."""
+
+     fir_filter_length: int = 3
+     state_dim: int = 16
+     # seqlen_offset not used
+     seqlen_offset: int = 0
+     fir_state_dict: dict = field(default_factory=dict)
+     state_dict: dict = field(default_factory=dict)
+
+     def reset(self):
+         self.fir_filter_length = 3
+         self.state_dim = 16
+         self.seqlen_offset = 0
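
A short usage sketch of the two cache types above, following initialize_inference_params in stripedhyena/model.py added below; the sizes are illustrative:

from stripedhyena.cache import InferenceParams, RecurrentInferenceParams

# One cache per block family: attention blocks keep a KV cache and an offset,
# Hyena blocks keep per-layer FIR/IIR filter states in the two dicts.
inference_params_dict = {
    "mha": InferenceParams(max_seqlen=8192, max_batch_size=1),
    "hyena": RecurrentInferenceParams(fir_filter_length=3, state_dim=16),
}

# During decoding, the offsets advance one token per step (see generation.py below).
inference_params_dict["mha"].seqlen_offset += 1
inference_params_dict["hyena"].seqlen_offset += 1

# reset() restores the default lengths/offsets; the filter states themselves
# live in fir_state_dict and state_dict, keyed by layer index.
inference_params_dict["hyena"].reset()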
stripedhyena/engine.py ADDED
@@ -0,0 +1,388 @@
+ # Copyright (c) Together
+ # This software is distributed under the terms of the Apache License, Version 2.0
+ # Author: Michael Poli
+ import gc
+
+ import torch
+ import torch.nn as nn
+ import torch.nn.functional as F
+
+ try:
+     import conv1d_cpp
+ except ImportError:
+     pass
+ from stripedhyena.utils import column_split
+
+ IIR_PREFILL_MODES = [
+     "recurrence",
+     "modal-fft",
+     "hybrid-modal-recurrence",
+     "modal-scan",
+     "canonical-fft",
+     "iir-fir-caching",
+ ]
+
+
+ def canonicalize_modal_system(poles, residues):
+     """Canonicalize a modal system.
+
+     Args:
+         poles (Tensor): The poles of the system.
+         residues (Tensor): The residues of the system.
+
+     Returns:
+         Tuple[Tensor, Tensor]: The canonicalized poles and residues.
+     """
+     raise NotImplementedError
+
+
+ def list_tensors(idx):
+     for obj in gc.get_objects():
+         try:
+             if torch.is_tensor(obj) and isinstance(obj, torch.Tensor):
+                 # dump to log
+                 print(type(obj), obj.size())
+                 el = obj[0]
+                 with open(f"tensors_{idx}.txt", "a") as f:
+                     f.write(f"{type(obj)} {obj.size()} {el}\n")
+         except Exception:
+             pass
+
+
+ class HyenaInferenceEngine:
+     def __init__(
+         self,
+         fir_fn=None,
+         iir_prefill_style="modal-fft",
+         layer_idx=None,
+     ) -> None:
+         self.fir_fn = fir_fn
+         assert iir_prefill_style in IIR_PREFILL_MODES, f"iir_prefill_style must be one of {IIR_PREFILL_MODES}"
+         self.iir_prefill_style = iir_prefill_style
+         self.layer_idx = layer_idx
+         self.low_mem_mode = False
+
+     def parallel_fir(
+         self,
+         fir_fn,
+         u,
+         weight,
+         bias,
+         L,
+         fir_length=3,
+         inference_params=None,
+         prefill_mode=None,
+         padding_mask=None,
+     ):
+         """Compute the output state of the short convolutional (FIR) filter."""
+         # prepare input layout, dimensions and dispatch to fir kernel
+         if fir_fn != torch.nn.functional.conv1d:
+             z_pre = fir_fn(u)[:, :L]  # B, L, D
+             z_pre = z_pre.permute(0, 2, 1)
+         else:
+             u = u.permute(0, 2, 1)  # B, D, L
+             z_pre = fir_fn(
+                 u,
+                 weight,
+                 bias=None,  # don't pass it here, add manually instead! source of small error
+                 stride=1,
+                 padding=fir_length - 1,
+                 groups=u.shape[1],
+             )[..., :L]
+
+         # add manually instead! source of small error
+         z_pre = z_pre + bias[None, :, None]
+
+         # handle padding post fir, the only place with biases
+         if type(padding_mask) == torch.Tensor:
+             z_pre = z_pre * padding_mask[:, None]
+
+         if inference_params is not None:
+             # handle seqlen last and dim last cases for `u`
+             if fir_fn != torch.nn.functional.conv1d:
+                 fir_state = u[:, -fir_length + 1 :].permute(0, 2, 1)
+             else:
+                 fir_state = u[..., -fir_length + 1 :]
+         else:
+             fir_state = None
+
+         return z_pre, fir_state
+
+     def parallel_iir(
+         self,
+         z_pre,
+         h,
+         D,
+         L,
+         poles,
+         residues,
+         t,
+         dims,
+         layer_idx,
+         inference_params=None,
+         prefill_style="fft",
+         fftconv_fn=None,
+         padding_mask=None,
+         use_flashfft=False,
+         column_split_hyena=False,
+         long_fir_threshold=None,
+     ):
+         """Compute the output state of the long convolutional (IIR) filter."""
+         fft_size = 2 * L
+         hidden_size, num_attention_heads, hidden_size_per_attention_head, _, _ = dims
+         # Compatibility with training infra that column splits the projections
+         if column_split_hyena:
+             z = z_pre.reshape(
+                 z_pre.shape[0],
+                 num_attention_heads,
+                 3 * hidden_size_per_attention_head,
+                 z_pre.shape[2],
+             )
+             x2, x1, v = (
+                 z[:, :, :hidden_size_per_attention_head],
+                 z[
+                     :,
+                     :,
+                     hidden_size_per_attention_head : 2 * hidden_size_per_attention_head,
+                 ],
+                 z[:, :, 2 * hidden_size_per_attention_head :],
+             )
+             x2, x1, v = (
+                 x2.reshape(x2.shape[0], -1, x2.shape[-1]),
+                 x1.reshape(x1.shape[0], -1, x1.shape[-1]),
+                 v.reshape(v.shape[0], -1, v.shape[-1]),
+             )
+         else:
+             x2, x1, v = z_pre.split([hidden_size, hidden_size, hidden_size], dim=1)
+
+         x1v = x1 * v
+
+         if inference_params is not None and prefill_style == "recurrence":
+             y = self.prefill_via_direct_recurrence(
+                 inference_params=inference_params,
+                 x1v=x1v,
+                 L=L,
+                 poles=poles,
+                 residues=residues,
+             )
+
+         else:
+             if use_flashfft and (L % 2) == 0:  # only works with even L
+                 y = fftconv_fn(
+                     x1v.to(dtype=torch.bfloat16).contiguous(),
+                     h.to(dtype=torch.float32),
+                 )
+                 X_s = None
+
+             elif long_fir_threshold is None:
+                 H = torch.fft.rfft(h.to(dtype=torch.float32), n=fft_size) / fft_size
+                 X_s = torch.fft.fft(x1v.to(dtype=torch.float32), n=fft_size)
+                 X = X_s[..., : H.shape[-1]]
+                 if len(z_pre.shape) > 3:
+                     H = H.unsqueeze(1)
+                 y = torch.fft.irfft(X * H, n=fft_size, norm="forward")[..., :L]
+
+             else:
+                 assert h.shape[0] == 1, "batch size must be 1 for long_fir_threshold"
+                 h = h[0][:, None]  # rearrange to d, 1, l for depthwise conv1d
+                 h = h[..., :long_fir_threshold]
+                 y = F.conv1d(
+                     x1v,
+                     h.to(dtype=x1v.dtype),
+                     stride=1,
+                     groups=x1v.shape[1],
+                     padding=h.shape[-1] - 1,
+                 )[..., :L]
+
+         y = y.to(dtype=x1v.dtype)
+         y = (y + x1v * D.unsqueeze(-1)) * x2
+
+         if inference_params is not None:
+             if prefill_style == "fft":
+                 self.prefill_via_modal_fft(
+                     inference_params=inference_params,
+                     x1v=x1v,
+                     X_s=X_s,
+                     L=L,
+                     t=t,
+                     poles=poles,
+                     dims=dims,
+                     layer_idx=layer_idx,
+                     use_flashfft=use_flashfft,
+                     fftconv_fn=fftconv_fn,
+                 )
+
+             elif prefill_style == "recurrence":
+                 # recurrent prefill is done before
+                 pass
+             else:
+                 raise NotImplementedError
+             if self.low_mem_mode:
+                 # TODO: smarter gc
+                 del z_pre, x2, x1, v, x1v, h, poles, residues
+                 torch.cuda.empty_cache()
+
+         return y.permute(0, 2, 1)
+
+     def step_fir(self, u, fir_state, weight, bias=None):
+         """Step the FIR filter.
+
+         Note:
+             `fir_state` contains the last `short_filter_length - 1` elements of `u`: `u_{L-2}, u_{L-1}, ...`
+             We assume dimensions of `short_filter_weight` to be `[d, 1, short_filter_len]` (SISO / multi SISO layout).
+         """
+         h0, h = weight[..., 0, -1], weight[..., 0, :-1]
+         h0, h = h0[None], h[None]
+         y = h0 * u + torch.sum(fir_state * h, dim=-1) + bias
+
+         # update
+         fir_state = torch.roll(fir_state, -1, dims=2)
+         fir_state[..., -1] = u
+         return y, fir_state
+
+     def step_iir(self, x2, x1, v, D, residues, poles, iir_state, iir_groups=1):
+         x1v = x1 * v
+
+         residues, poles = (
+             torch.view_as_complex(residues.to(torch.float32)),
+             torch.view_as_complex(poles.to(torch.float32)),
+         )
+         # squeeze the dummy seqlen dimension
+         # D, state_dim, 1 -> 1, D, state_dim
+         residues, poles = residues[..., 0][None], poles[..., 0][None]
+         iir_state = poles * iir_state + x1v[..., None]
+
+         res_state = torch.sum(residues * iir_state, dim=-1).real
+
+         if iir_groups > 1:
+             raise NotImplementedError
+         y = x2 * (res_state + D * x1v)
+
+         return y, iir_state
+
+     def prefill_via_fir_caching(self, u, inference_params, L, *args, **kwargs):
+         """Turns the IIR filter into a FIR and uses a cache for decoding."""
+         raise NotImplementedError(":)")
+
+     def prefill_via_direct_recurrence(
+         self, inference_params, x1v, L, residues, poles, *args, **kwargs
+     ) -> torch.Tensor:
+         """
+         Compute the IIR state via explicit SSM recurrence (modal form).
+
+         This is the most memory efficient prefilling method for Hyena filters.
+
+         Note:
+             dtypes: [state: float32, poles: float32, x1v: bfloat16, output: bfloat16]
+         """
+         state_dim = poles.shape[1]
+         x1v_ = x1v[..., None, None]  # b, d, l, sdim, reim
+         x1v_ = x1v_.repeat(1, 1, 1, state_dim, 2)  # b, d, l, sdim, reim
+         x1v_[..., 1] = 0
+
+         state = 0 * x1v_[:, :, 0]
+         output = 0 * x1v_[:, :, :, 0, 0]  # b, d, l
+
+         # suppress dummy seqlen dimension
+         poles = poles[:, :, 0][None]
+         residues = residues[:, :, 0][None].repeat(x1v_.shape[0], 1, 1, 1)  # b, d, sdim, reim
+
+         # state: b, d, sdim, reim
+         # poles: 1, d, sdim, reim
+         # x1v_: b, d, l, sdim, reim
+         for i in range(L):
+             state[..., 0] = poles[..., 0] * state[..., 0] - poles[..., 1] * state[..., 1] + x1v_[:, :, i, :, 0]
+             state[..., 1] = poles[..., 0] * state[..., 1] + poles[..., 1] * state[..., 0] + x1v_[:, :, i, :, 1]
+             output[:, :, i] = torch.sum(residues * state, dim=-2)[..., 0]  # .real
+
+         inference_params.state_dict[self.layer_idx] = torch.view_as_complex(state.to(dtype=torch.float32))
+
+         return output
+
+     def prefill_via_hybrid_recurrence(self, inference_params, u, log_poles, x1v_f_a, L, *args, **kwargs):
+         """
+         Compute the IIR state via hybrid recurrence-convolution over blocks.
+         """
+         raise NotImplementedError(":)")
+
+     def prefill_via_scan(self, u, inference_params=None, *args, **kwargs):
+         raise NotImplementedError
+
+     def prefill_via_canonical_fft(self, u, inference_params=None, *args, **kwargs):
+         """
+         Compute the IIR state via a single FFT with the denominator of the SSM in companion form.
+
+         This is the most memory efficient "parallelized" prefilling method for Hyena.
+
+         From: https://arxiv.org/abs/2310.18780
+         """
+         raise NotImplementedError(":)")
+
+     def prefill_via_modal_fft(
+         self,
+         inference_params,
+         x1v,
+         L,
+         poles,
+         t,
+         dims,
+         layer_idx,
+         X_s=None,
+         use_flashfft=False,
+         fftconv_fn=None,
+         state_dtype=torch.complex64,
+         *args,
+         **kwargs,
+     ):
+         """
+         Compute the IIR state via a single FFT, using the poles of the SSM in modal form.
+         """
+         # When the model has a long convolution derived from a SSM in modal form and prefill_style is "fft",
+         # we split the filter into poles and residues and reuse FFT computation on the input.
+         # This optimization is currently not supported when using flashfftconv.
+         hidden_size, _, _, state_size, hyena_filter_groups = dims
+
+         if use_flashfft:
+             # using real states
+             poles = poles.squeeze().reshape(poles.shape[0], -1)[..., None]
+
+             state_s = poles**t
+             if hyena_filter_groups > 1:
+                 raise NotImplementedError
+
+             x1v = x1v[:, :, None].repeat(1, 1, 2 * state_size, 1)
+             x1v = x1v.reshape(x1v.shape[0], -1, x1v.shape[-1])
+             state_s = state_s[None]
+
+             state = fftconv_fn(
+                 x1v.contiguous(),
+                 state_s.to(dtype=torch.float32),
+             )
+             state = state[..., L - 1].reshape(x1v.shape[0], hidden_size, state_size, 2)
+             state = torch.view_as_complex(state.contiguous().to(dtype=torch.float32))
+             inference_params.state_dict[self.layer_idx] = state
+         else:
+             assert X_s is not None
+             bs = x1v.shape[0]
+             fft_size = 2 * L
+             poles = torch.view_as_complex(poles.to(torch.float32))
+             state_s = poles**t
+             state_S = torch.fft.fft(state_s, n=fft_size).repeat(bs, 1, 1, 1)  # B, D, state_dim, 2 * L
+             if hyena_filter_groups > 1:
+                 state_S = state_S.repeat_interleave(hidden_size // hyena_filter_groups, 1)
+             state = torch.fft.ifft(X_s[..., None, :] * state_S, n=fft_size)
+             inference_params.state_dict[layer_idx] = state[..., L - 1].to(dtype=state_dtype)
+
+     def _compute_state(self, log_poles, u, t, L, *args, **kwargs):
+         """
+         Compute the IIR state given an input `u` and log_poles of the modal system.
+         """
+         bs = u.shape[0]
+         fft_size = 2 * L
+         U = torch.fft.rfft(u.to(torch.float32), n=fft_size)
+         x = (log_poles * t).exp()
+         # [batch, hidden_size, state_dim, 2 * seqlen]
+         X = torch.fft.fft(x, n=fft_size).repeat(bs, 1, 1, 1)
+         state = torch.fft.ifft(U[..., None, :] * X, n=fft_size)[..., :L]
+         return state
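
As a sanity check on the FFT branch of parallel_iir above: the long filter is materialized from the modal parameters as h[t] = sum_s (r_s * p_s**t).real, and circular FFT convolution with 2*L zero padding equals causal convolution. A self-contained sketch with toy shapes (not the model's code path, just the identity it relies on):

import torch

torch.manual_seed(0)
B, D, L, S = 2, 4, 32, 8  # batch, channels, length, state dim (toy sizes)

# Modal parameterization: h[t] = sum_s (residues_s * poles_s**t).real
poles = 0.9 * torch.rand(D, S, 1) * torch.exp(1j * torch.rand(D, S, 1))
residues = torch.randn(D, S, 1, dtype=torch.cfloat)
t = torch.arange(L, dtype=torch.float32)[None, None]  # 1, 1, L
h = (residues * poles**t).real.sum(1)                 # D, L

x = torch.randn(B, D, L)

# FFT path (as in parallel_iir): pad to 2L so circular conv == linear conv
fft_size = 2 * L
y_fft = torch.fft.irfft(
    torch.fft.rfft(x, n=fft_size) * torch.fft.rfft(h, n=fft_size),
    n=fft_size,
)[..., :L]

# Reference: explicit causal convolution y[i] = sum_d h[d] * x[i - d]
y_ref = torch.zeros(B, D, L)
for i in range(L):
    for d in range(i + 1):
        y_ref[:, :, i] += h[:, d] * x[:, :, i - d]

assert torch.allclose(y_fft, y_ref, atol=1e-3, rtol=1e-3)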
stripedhyena/generation.py ADDED
@@ -0,0 +1,158 @@
+ # Copyright (c) Together
+ # This software is distributed under the terms of the Apache License, Version 2.0
+ # Author: Michael Poli
+
+ # Barebones generation class for standalone inference.
+
+ import torch
+
+ from stripedhyena.sample import sample
+ from stripedhyena.tokenizer import CharLevelTokenizer
+ from stripedhyena.utils import print_rank_0
+
+
+ class Generator:
+     def __init__(self, model, tokenizer, top_k=50, top_p=0.7, temperature=1):
+         self.model = model
+         self.tokenizer = tokenizer
+         self.top_k = top_k
+         self.top_p = top_p
+         self.temperature = temperature
+         self.untils = ["\n\n"]
+
+     def generate(
+         self,
+         device,
+         input_string=None,
+         input_ids=None,
+         num_tokens=32,
+         cached_generation=False,
+         print_generation=True,
+         verbose=False,
+         skip_special_tokens=False,
+         stop_at_eos=True,
+         max_seqlen=None,
+     ):
+         if isinstance(self.tokenizer.eos, int):
+             eos_token_ids = torch.LongTensor([self.tokenizer.eos]).to(device)
+         else:
+             # is a tensor
+             eos_token_ids = self.tokenizer.tokenize(self.tokenizer.eos).to(device)
+
+         if input_ids is None:
+             input = self.tokenizer.tokenize(input_string)
+             if isinstance(input, list):
+                 input = torch.LongTensor(input).unsqueeze(0).to(device)
+             # is a tensor
+             else:
+                 input = input.unsqueeze(0).to(device)
+         else:
+             input = input_ids
+         x = input
+
+         if max_seqlen is not None:
+             x = x[:, -max_seqlen:]
+
+         prompt_len = x.shape[-1]
+
+         num_tokens = int(num_tokens)
+         tot_length = prompt_len + num_tokens
+         batch_size = x.shape[0]
+
+         generation = torch.empty(
+             x.shape[0],
+             num_tokens,
+             dtype=torch.long,
+             device=x.device,
+         )
+
+         scores = torch.empty(
+             x.shape[0],
+             num_tokens,
+             self.tokenizer.vocab_size,
+             dtype=torch.float,
+             device=x.device,
+         )
+
+         if cached_generation:
+             inference_params_dict_out = self.model.initialize_inference_params()
+             inference_params_dict_out["mha"].max_batch_size = batch_size
+             inference_params_dict_out["hyena"].max_batch_size = batch_size
+         else:
+             inference_params_dict_out = None
+
+         if verbose:
+             mem_after_tok = torch.cuda.memory_allocated(device=x.device) / 1e9
+             print_rank_0(f"Memory after tokenization: {mem_after_tok} GB")
+             print_rank_0("Starting generation...")
+             if input_string is not None:
+                 print_rank_0("Prompt: " + input_string)
+             else:
+                 print_rank_0(f"Prompt ids: {input_ids} {input_ids.shape}")
+
+         for i in range(int(num_tokens)):
+             post_prefill = cached_generation and i > 0
+             # prefill then process only the last token
+             if post_prefill:
+                 x = x[:, -1:]
+                 seqlen_offset = inference_params_dict_out["mha"].seqlen_offset
+
+                 if seqlen_offset == 0:
+                     seqlen_offset = input.shape[-1]
+                     inference_params_dict_out["hyena"].seqlen_offset = seqlen_offset
+                     inference_params_dict_out["mha"].seqlen_offset = seqlen_offset
+                 else:
+                     inference_params_dict_out["mha"].seqlen_offset += 1
+                     inference_params_dict_out["hyena"].seqlen_offset += 1
+
+             # do forward pass with no gradient
+             with torch.no_grad():
+                 logits, inference_params_dict_out = self.model(
+                     x,
+                     inference_params_dict=inference_params_dict_out,
+                 )
+
+             last_logits = logits[:, -1]
+
+             new_idx = sample(
+                 last_logits,
+                 top_k=self.top_k,
+                 top_p=self.top_p,
+                 temperature=self.temperature,
+             )
+
+             if stop_at_eos and (generation[0, -2:] == eos_token_ids).all():
+                 print_rank_0("Stopping generation at EOS")
+
+             if print_generation and verbose and batch_size == 1:
+                 print_rank_0(
+                     f"{self.tokenizer.detokenize([new_idx.item()])}",
+                     end=" ",
+                 )
+
+             scores[:, i] = last_logits
+             generation[:, i] = new_idx
+
+             if post_prefill:
+                 x = new_idx[:, None]
+             else:
+                 x = torch.cat([x, new_idx[:, None]], dim=-1)
+
+         if verbose:
+             kwargs = {}
+             if not isinstance(self.tokenizer, CharLevelTokenizer):
+                 kwargs["skip_special_tokens"] = skip_special_tokens
+             y = self.tokenizer.detokenize_batch(generation[:, : i + 1], **kwargs)
+
+             for until in self.untils:
+                 if until in y:
+                     y = y.split(until)[0]
+                     break
+
+             print_rank_0(f"\nInput: {input_string}, Output: {y}")
+
+             mem_end = torch.cuda.memory_allocated(device=x.device) / 1e9
+             print_rank_0(f"Memory after generation: {mem_end} GB")
+
+         return generation[:, : i + 1], scores[:, : i + 1]
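
For orientation, a sketch of the cached-generation contract the loop above implements: prefill once on the full prompt, then feed only the newest token while advancing seqlen_offset. The model here is a stand-in that returns random logits (vocab size 512 matches the char-level config), not the real StripedHyena:

import torch

class DummyModel(torch.nn.Module):
    vocab_size = 512

    def forward(self, x, inference_params_dict=None):
        logits = torch.randn(x.shape[0], x.shape[1], self.vocab_size)
        return logits, inference_params_dict

model = DummyModel()
x = torch.randint(0, 512, (1, 16))          # token ids for a 16-token "prompt"
seqlen_offset = 0

for i in range(8):                           # decode 8 new tokens
    inp = x if i == 0 else x[:, -1:]         # prefill once, then one token a step
    logits, _ = model(inp)
    new_idx = logits[:, -1].argmax(dim=-1)   # greedy here; Generator uses sample()
    seqlen_offset = x.shape[-1] if i == 0 else seqlen_offset + 1
    x = torch.cat([x, new_idx[:, None]], dim=-1)

print(x.shape)  # torch.Size([1, 24])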
stripedhyena/layers.py ADDED
@@ -0,0 +1,146 @@
+ # Copyright (c) Together
+ # This software is distributed under the terms of the Apache License, Version 2.0
+ # Author: Michael Poli
+
+ import torch
+ import torch.nn as nn
+ import torch.nn.functional as F
+ from einops import rearrange
+ from torch import Tensor
+
+ from stripedhyena.utils import grab_first_if_tuple
+
+
+ class RMSNorm(torch.nn.Module):
+     def __init__(self, config):
+         super(RMSNorm, self).__init__()
+         self.eps, self.hidden_size = config.eps, config.hidden_size
+         self.scale = torch.nn.Parameter(torch.ones(self.hidden_size))
+         self.register_parameter("scale", self.scale)
+         self.scale = self.scale.to(config.params_dtype)
+         self.use_flash_rmsnorm = config.get("use_flash_rmsnorm", False)
+
+         if self.use_flash_rmsnorm:
+             from flash_attn.ops.rms_norm import rms_norm as rmsnorm_func
+
+             self.rmsnorm_func = rmsnorm_func
+
+     def forward(self, x):
+         if self.use_flash_rmsnorm:
+             return self.rmsnorm_func(x, self.scale, self.eps)
+         else:
+             y = x / (x.norm(2, dim=-1, keepdim=True) * self.hidden_size ** (-1.0 / 2) + self.eps)
+             return self.scale * y
+
+
+ class ParallelGatedMLP(nn.Module):
+     def __init__(
+         self,
+         config,
+     ):
+         super().__init__()
+
+         multiple_of = config.get("inner_size_multiple_of", 64)
+         self.act_type = config.get("mlp_activation", "silu")
+         if self.act_type == "gelu":
+             self.act = F.gelu
+         elif self.act_type == "silu":
+             self.act = F.silu
+         else:
+             raise NotImplementedError
+
+         self.multiple_of = multiple_of * config.model_parallel_size
+
+         inner_size = int(2 * config.hidden_size * 4 / 3)
+         inner_size = self.multiple_of * ((inner_size + self.multiple_of - 1) // self.multiple_of)
+         if config.get("inner_mlp_size", None) is not None:
+             inner_size = config.inner_mlp_size
+
+         self.l1 = nn.Linear(
+             in_features=config.hidden_size,
+             out_features=inner_size,
+             bias=False,
+         )
+         self.l2 = nn.Linear(
+             in_features=config.hidden_size,
+             out_features=inner_size,
+             bias=False,
+         )
+         self.l3 = nn.Linear(
+             in_features=inner_size,
+             out_features=config.hidden_size,
+             bias=False,
+         )
+
+     def forward(self, z):
+         z1, z2 = self.l1(z), self.l2(z)
+         z1, z2 = grab_first_if_tuple(z1), grab_first_if_tuple(z2)
+         y = self.l3(self.act(z1) * z2)
+         return grab_first_if_tuple(y)
+
+
+ class Embedding(nn.Module):
+     _train_dtype = "bf16"
+
+     def __init__(self, config):
+         super().__init__()
+         self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=0)
+
+     def embed(self, input_ids, position_ids=None, tokentype_ids=None):
+         embeddings = self.word_embeddings(input_ids)
+         return embeddings
+
+     def unembed(self, u):
+         weight = self.word_embeddings.weight
+         return torch.matmul(u, weight)
+
+
+ class VocabParallelEmbedding(nn.Embedding):
+     "Adapted from https://github.com/Dao-AILab/flash-attention/blob/main/flash_attn/modules/embedding.py"
+
+     def __init__(self, config):
+         vocab_size, process_group, padding_idx = (
+             config.vocab_size,
+             config.get("process_group", None),
+             config.get("padding_idx", None),
+         )
+         self.process_group = process_group
+         if process_group is not None:
+             world_size = torch.distributed.get_world_size(process_group)
+             if vocab_size % world_size != 0:
+                 raise ValueError(f"vocab_size ({vocab_size}) must be divisible by " f"world_size ({world_size})")
+             if world_size > 1 and padding_idx is not None:
+                 raise RuntimeError("ParallelEmbedding does not support padding_idx")
+         else:
+             world_size = 1
+         super().__init__(
+             vocab_size // world_size,
+             embedding_dim=config.hidden_size,
+             padding_idx=padding_idx,
+         )
+
+     def embed(self, input: Tensor) -> Tensor:
+         if self.process_group is None:
+             return self.forward(input)
+         else:
+             rank = torch.distributed.get_rank(self.process_group)
+             vocab_size = self.num_embeddings
+             vocab_start_index, vocab_end_index = (
+                 rank * vocab_size,
+                 (rank + 1) * vocab_size,
+             )
+             # Create a mask of valid vocab ids (1 means it needs to be masked).
+             input_ids_mask = (input < vocab_start_index) | (input >= vocab_end_index)
+             input = input - vocab_start_index
+             input[input_ids_mask] = 0
+             embeddings = self.forward(input)
+             embeddings[input_ids_mask] = 0.0
+             # Reduce to the global process group
+             torch.distributed.all_reduce(embeddings, group=self.process_group)
+             return embeddings
+
+     def unembed(self, u: Tensor) -> Tensor:
+         if self.process_group is None:
+             return u @ self.weight.T
+         else:
+             raise NotImplementedError
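
The fallback branch of RMSNorm.forward above divides by the L2 norm scaled by hidden_size ** -0.5, which (up to where eps is added) is the usual x / sqrt(mean(x**2)) RMS normalization. A quick numerical check of that identity, with eps dropped so the two forms match exactly:

import torch

d = 4096
x = torch.randn(2, 8, d)

# Branch from RMSNorm.forward above, eps omitted
y1 = x / (x.norm(2, dim=-1, keepdim=True) * d ** (-1.0 / 2))

# Textbook RMS normalization
y2 = x / torch.sqrt(x.pow(2).mean(dim=-1, keepdim=True))

assert torch.allclose(y1, y2, atol=1e-5)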
stripedhyena/model.py ADDED
@@ -0,0 +1,445 @@
+ # Copyright (c) Together
+ # This software is distributed under the terms of the Apache License, Version 2.0
+ # Author: Michael Poli
+ # Note: MP and PP utilities are removed for ease of use and editing.
+
+ import torch
+ import torch.nn as nn
+ import torch.nn.functional as F
+
+ from stripedhyena.cache import InferenceParams, RecurrentInferenceParams
+ from stripedhyena.engine import HyenaInferenceEngine
+ from stripedhyena.layers import ParallelGatedMLP, RMSNorm, VocabParallelEmbedding
+ from stripedhyena.utils import column_split, print_rank_0
+
+ try:
+     from flash_attn.modules.mha import MHA
+ except ImportError:
+     pass  # flash_attn not installed
+
+ try:
+     from stripedhyena.positional_embeddings import swap_mha_rope
+ except ImportError:
+     pass  # could not import swap_mha_rope from stripedhyena.positional_embeddings
+
+
+ class AttentionBlock(nn.Module):
+     def __init__(self, config, layer_idx) -> None:
+         super().__init__()
+         self.config = config
+         self.pre_norm, self.post_norm = RMSNorm(config), RMSNorm(config)
+         self.layer_idx = layer_idx
+         self.proj_groups = config.get("proj_groups", 1)
+         dtype = config.get("attn_block_dtype", torch.bfloat16)
+         mlp_dtype = config.get("mlp_dtype", torch.bfloat16)
+         self.num_attention_heads = config.num_attention_heads
+         self.hidden_size_per_attention_head = config.hidden_size // config.num_attention_heads
+
+         self.counter = 0
+         self.inner_mha_cls = MHA(
+             embed_dim=config.hidden_size,
+             num_heads=config.num_attention_heads,
+             num_heads_kv=config.num_attention_heads // self.proj_groups,
+             rotary_emb_dim=config.hidden_size // config.num_attention_heads,
+             qkv_proj_bias=config.get("qkv_proj_bias", True),
+             rotary_emb_base=config.get("rotary_emb_base", 10000),
+             causal=True,
+             layer_idx=layer_idx,
+             out_proj_bias=config.get("mha_out_proj_bias", True),
+             use_flash_attn=self.config.use_flash_attn,
+         ).to(dtype=dtype)
+
+         # check if using interpolated rotary pos emb from config, and swap the rope emb
+         if config.get("use_interpolated_rotary_pos_emb", False):
+             swap_mha_rope(
+                 mha=self.inner_mha_cls,
+                 kwargs_new_rope={"scaling_factor": config.get("rotary_emb_scaling_factor", 1.0)},
+             )
+
+         if self.config.get("smeared_gqa", False):
+             self.inner_mha_cls.num_heads_kv = self.inner_mha_cls.num_heads
+         self.inner_mha_cls.rotary_emb.register_buffer("inv_freq", self.inner_mha_cls.rotary_emb.inv_freq)
+
+         self.mlp = ParallelGatedMLP(config).to(dtype=mlp_dtype)
+
+     def forward(self, u, inference_params=None, padding_mask=None, *args, **kwargs):
+         if (
+             type(padding_mask) == torch.Tensor
+         ):  # workaround for masking bug in FA. This works because Wqkv does not have bias
+             # and attention scores will be also automatically zeroed.
+             u = u * padding_mask[..., None]
+
+         u = (
+             self.inner_mha_cls(
+                 self.pre_norm(u),
+                 inference_params=inference_params,
+             )
+             + u
+         )
+         if type(padding_mask) == torch.Tensor:  # guard against bias
+             u = u * padding_mask[..., None]
+         u = self.mlp(self.post_norm(u)) + u
+         return u, None
+
+
+ class ParallelHyenaFilter(nn.Module):
+     def __init__(self, config, layer_idx) -> None:
+         super().__init__()
+         self.config = config
+         self.layer_idx = layer_idx
+         self.hyena_filter_groups = config.get("hyena_filter_groups", self.config.hidden_size)
+
+         self.use_flashfft = config.get("use_flashfft", False)
+         self.state_size = config.state_size
+         self.hidden_size = config.hidden_size
+         self.num_filters = config.num_filters
+         self.inference_mode = config.get("inference_mode", True)
+         self.counter = 0
+         self.column_split_hyena = config.get("column_split_hyena", True)
+
+         assert self.hidden_size % self.num_filters == 0 and self.num_filters <= self.hidden_size
+
+         self.D = nn.Parameter(torch.zeros(self.hidden_size))
+
+         # attention heads are not used except to split post short_filter
+         # projections in the same way as the checkpoint
+         self.num_attention_heads = config.num_attention_heads
+         self.hidden_size_per_attention_head = self.hidden_size // self.num_attention_heads
+
+         # after preprocessing here we can save the new checkpoint
+         self.short_filter_length = config.short_filter_length
+         self.short_filter_weight = nn.Parameter(torch.randn(3 * config.hidden_size, 1, config.short_filter_length))
+         self.short_filter_bias = (
+             nn.Parameter(torch.randn(3 * config.hidden_size)) if config.short_filter_bias else None
+         )
+
+         self.engine = HyenaInferenceEngine(layer_idx=layer_idx)
+         self.use_flash_depthwise = config.get("use_flash_depthwise", False)
+         self.data_dtype = None
+
+         if self.use_flash_depthwise:
+             try:
+                 from flashfftconv import FlashDepthwiseConv1d
+
+                 self.fir_fn = FlashDepthwiseConv1d(
+                     channels=3 * self.hidden_size,
+                     kernel_size=self.short_filter_length,
+                     padding=self.short_filter_length - 1,
+                     weights=self.short_filter_weight,
+                     bias=self.short_filter_bias,
+                     device=None,
+                     dtype=self.config.get("depthwise_dtype", torch.bfloat16),
+                 )
+             except ImportError:
+                 # flashfftconv not installed; fall back to torch's depthwise conv1d
+                 self.fir_fn = F.conv1d
+         else:
+             self.fir_fn = F.conv1d
+
+         self.fftconv_fn = None
+         self.long_fir_threshold = config.get("long_fir_threshold", None)
+         if self.long_fir_threshold is not None:
+             assert self.use_flashfft is False, "long_fir_threshold not compatible with fused flashfft"
+
+         self.num_systems = self.hidden_size // self.hyena_filter_groups
+
+         poles = torch.randn(self.num_systems, self.state_size, 1, 2)
+
+         # TODO: bring over init from internals
+         poles[..., 0] = 1e-2 * torch.randn(self.num_systems, self.state_size, 1)
+         poles[..., 1] = 1e-3 * torch.randn(self.num_systems, self.state_size, 1)
+
+         self.poles = nn.Parameter(poles)
+
+         self.residues = nn.Parameter(torch.randn(self.num_systems, self.state_size, 1, 2))
+         self.h = None
+
+     def forward(self, u, inference_params=None, padding_mask=None, *args, **kwargs):
+         if inference_params is not None and self.layer_idx in inference_params.fir_state_dict.keys():
+             return self.sequential_forward(u, inference_params)
+         else:
+             return self.parallel_forward(u, inference_params, padding_mask)
+
+     def parallel_forward(self, u, inference_params=None, padding_mask=None):
+         L = u.shape[1]
+         z_pre, fir_state = self.engine.parallel_fir(
+             self.fir_fn,
+             u,
+             self.short_filter_weight,
+             self.short_filter_bias,
+             L,
+             fir_length=self.short_filter_length,
+             inference_params=inference_params,
+             padding_mask=padding_mask,
+         )
+         if inference_params:
+             inference_params.fir_state_dict[self.layer_idx] = fir_state
+
+         if self.h is None:
+             h, filter_dtype, poles, residues = self.compute_filter(L, u.device)
+         else:
+             h = self.h
+             filter_dtype = self.h.dtype
+
+         if self.hyena_filter_groups > 1:
+             h = h.repeat_interleave(self.hidden_size // self.hyena_filter_groups, 1)
+
+         # if inference_params is not None, we plan to perform generation:
+         # prefilling is handled by the engine.
+         dims = (
+             self.hidden_size,
+             self.num_attention_heads,
+             self.hidden_size_per_attention_head,
+             self.state_size,
+             self.hyena_filter_groups,
+         )
+         y = self.engine.parallel_iir(
+             z_pre,
+             h,
+             self.D,
+             L,
+             t=self.t,
+             poles=self.poles,
+             residues=self.residues,
+             dims=dims,
+             inference_params=inference_params,
+             layer_idx=self.layer_idx,
+             prefill_style=self.config.get("prefill_style", "fft"),
+             use_flashfft=self.use_flashfft,
+             fftconv_fn=self.fftconv_fn,
+             column_split_hyena=self.column_split_hyena,
+             long_fir_threshold=self.long_fir_threshold,
+             padding_mask=padding_mask,
+         )
+
+         return y, inference_params
+
+     def sequential_forward(self, u, inference_params):
+         if self.data_dtype is None:
+             self.data_dtype = u.dtype
+         if len(u.shape) > 2:
+             u = u[:, -1]
+
+         fir_state, iir_state = (
+             inference_params.fir_state_dict[self.layer_idx],
+             inference_params.state_dict[self.layer_idx],
+         )
+
+         z_pre, fir_state = self.engine.step_fir(
+             u, fir_state, weight=self.short_filter_weight, bias=self.short_filter_bias
+         )
+         x2, x1, v = (
+             column_split(z_pre, self.num_attention_heads, self.hidden_size_per_attention_head)
+             if self.column_split_hyena
+             else z_pre.split([self.hidden_size, self.hidden_size, self.hidden_size], dim=1)
+         )
+
+         y, iir_state = self.engine.step_iir(
+             x2,
+             x1,
+             v,
+             self.D,
+             self.residues,
+             self.poles,
+             iir_state,
+             iir_groups=self.hyena_filter_groups,
+         )
+
+         inference_params.fir_state_dict[self.layer_idx] = fir_state
+         inference_params.state_dict[self.layer_idx] = iir_state
+         y = y.to(dtype=self.data_dtype)
+         return y[:, None], inference_params
+
+     def update_time(self, L, device):
+         """
+         Set [0, 1, ..., L-1] where L is the length of the current batch of inputs.
+         If L is greater than the length of the previous batch, then the time vector is
+         reinitialized. Otherwise, the time vector is truncated from cache.
+         """
+         if not hasattr(self, "t"):
+             self.t = torch.arange(L, device=device)[None, None]
+         elif self.t.shape[-1] < L:
+             self.t = torch.arange(L, device=device)[None, None]
+         else:
+             self.t = self.t[..., :L]
+
+     def compute_filter(self, L, device):
+         self.update_time(L, device)
+         filter_dtype = torch.float32
+         residues, log_poles = (
+             torch.view_as_complex(self.residues.to(filter_dtype)),
+             torch.view_as_complex(self.poles.to(filter_dtype)).log(),
+         )
+         h = (residues * (log_poles * self.t).exp()).real.sum(1)[None]
+         return h, filter_dtype, log_poles, residues
+
+
+ class ParallelGatedConvBlock(nn.Module):
+     def __init__(self, config, layer_idx) -> None:
+         super().__init__()
+         self.config = config
+         self.layer_idx = layer_idx
+         self.low_mem_mode = config.get("low_mem_mode", False)
+         dtype = config.get("hyena_block_dtype", torch.float32)
+         mlp_dtype = config.get("mlp_dtype", torch.bfloat16)
+         self.pre_norm, self.post_norm = RMSNorm(config).to(dtype=dtype), RMSNorm(config).to(dtype=dtype)
+         self.filter = ParallelHyenaFilter(config, layer_idx).to(dtype=dtype)
+         self.projections = nn.Linear(config.hidden_size, 3 * config.hidden_size)
+         self.out_filter_dense = nn.Linear(config.hidden_size, config.hidden_size).to(dtype)
+         self.mlp = ParallelGatedMLP(config).to(dtype=mlp_dtype)
+
+         self.proj_norm_fn = self.proj_norm
+         self.res_mlp_norm_fn = self.res_mlp_norm
+
+         if self.config.get("compile", False):
+             self.proj_norm_fn = torch.compile(self.proj_norm, fullgraph=True, dynamic=False, mode="reduce-overhead")
+             self.res_mlp_norm_fn = torch.compile(
+                 self.res_mlp_norm, fullgraph=True, dynamic=False, mode="reduce-overhead"
+             )
+
+     def proj_norm(self, x):
+         return self.projections(self.pre_norm(x))
+
+     def res_mlp_norm(self, x):
+         return self.mlp(self.post_norm(x)) + x
+
+     def forward(self, u, inference_params=None, padding_mask=None, *args, **kwargs):
+         z = self.proj_norm_fn(u)
+
+         if type(padding_mask) == torch.Tensor:  # guard against bias
+             z = z * padding_mask[..., None]
+
+         z, inference_params = self.filter(z, inference_params=inference_params, padding_mask=padding_mask)
+
+         z_in = self.out_filter_dense(z) + u
+
+         if type(padding_mask) == torch.Tensor:  # guard against bias
+             z_in = z_in * padding_mask[..., None]
+
+         y = self.res_mlp_norm_fn(z_in)
+
+         return y, inference_params
+
+
+ def get_block(config, layer_idx, flash_fft=None):
+     if layer_idx in config.attn_layer_idxs:
+         return AttentionBlock(config, layer_idx)
+     elif layer_idx in config.hyena_layer_idxs:
+         block = ParallelGatedConvBlock(config, layer_idx)
+         if config.get("use_flashfft", False):
+             block.filter.fftconv_fn = flash_fft
+         return block
+     else:
+         raise NotImplementedError
+
+
+ class StripedHyena(nn.Module):
+     def __init__(self, config):
+         super().__init__()
+         self.config = config
+         self.embedding_layer = VocabParallelEmbedding(config)
+         self.norm = RMSNorm(config) if config.get("final_norm", True) else None
+         self.unembed = self.embedding_layer if config.tie_embeddings else VocabParallelEmbedding(config)
+
+         if config.get("use_flashfft", True):
+             try:
+                 from flashfftconv import FlashFFTConv
+
+                 self.flash_fft = FlashFFTConv(config.seqlen, dtype=torch.bfloat16)
+             except ImportError:
+                 # flashfftconv not installed
+                 self.flash_fft = None
+         else:
+             self.flash_fft = None
+
+         self.blocks = nn.ModuleList(
+             get_block(config, layer_idx, flash_fft=self.flash_fft) for layer_idx in range(config.num_layers)
+         )
+
+     def forward(self, x, inference_params_dict=None, padding_mask=None):
+         L = x.shape[1]
+         x = self.embedding_layer.embed(x)
+         if inference_params_dict is not None:
+             x, inference_params_dict_out = self.stateful_forward(
+                 x,
+                 inference_params_dict=inference_params_dict,
+             )
+         else:
+             x, inference_params_dict_out = self.stateless_forward(x, padding_mask=padding_mask)
+
+         x = self.norm(x)
+         x = self.unembed.unembed(x)
+         return x, inference_params_dict_out
+
+     def stateful_forward(self, x, inference_params_dict=None):
+         for block_idx, block in enumerate(self.blocks):
+             block_name = "mha" if block_idx in self.config.attn_layer_idxs else "hyena"
+             inference_params = inference_params_dict[block_name]
+             x, _ = block(x, inference_params=inference_params)
+
+         return x, inference_params_dict
+
+     def stateless_forward(self, x, padding_mask=None):
+         if type(padding_mask) == torch.Tensor:
+             x = x * padding_mask[..., None]
+
+         for _, block in enumerate(self.blocks):
+             x, _ = block(x, inference_params=None, padding_mask=padding_mask)
+         return x, None
+
+     def initialize_inference_params(self):
+         inference_params_dict = {
+             "mha": InferenceParams(
+                 max_seqlen=self.config.get("max_seqlen", 8192),
+                 max_batch_size=self.config.get("max_batch_size", 1),
+                 seqlen_offset=0,
+             ),
+             "hyena": RecurrentInferenceParams(
+                 fir_filter_length=self.config.short_filter_length,
+                 state_dim=self.config.state_size,
+                 seqlen_offset=0,
+             ),
+         }
+         return inference_params_dict
+
+     def precompute_filters(self, L, device):
+         for block_idx, block in enumerate(self.blocks):
+             if type(block) == ParallelGatedConvBlock:
+                 if type(block.filter) == ParallelHyenaFilter:
+                     L = block.filter.long_fir_threshold or L
+                     print_rank_0(f"Precomputing filters, L={L}...")
+
+                     filter_dtype = torch.float16 if L >= 2048 else torch.float32
+
+                     block.filter.update_time(L, device)
+                     residues, poles = (
+                         torch.view_as_complex(block.filter.residues.to(torch.float16)),
+                         torch.view_as_complex(block.filter.poles.to(torch.float16)),
+                     )
+
+                     block.filter.h = (residues * poles**block.filter.t).real.sum(1)[None]
+                     block.filter.h = block.filter.h.to(dtype=filter_dtype)
+
+     def load_poles_residues(self, path):
+         "Load different poles and residues for each layer."
+         for block_idx, block in enumerate(self.blocks):
+             if type(block) == ParallelGatedConvBlock:
+                 if type(block.filter) == ParallelHyenaFilter:
+                     print(f"Loading poles and residues for block {block_idx}")
+                     poles = torch.load(path + f"/approx_poles_{block_idx+1}.pt", map_location="cpu")
+                     poles = torch.view_as_real(poles)
+                     residues = torch.load(path + f"/approx_residues_{block_idx+1}.pt", map_location="cpu")
+                     residues = torch.view_as_real(residues)
+                     poles = poles.permute(1, 0, 2).unsqueeze(-2)
+                     residues = residues.permute(1, 0, 2).unsqueeze(-2)
+
+                     block.filter.poles = nn.Parameter(poles)
+                     block.filter.residues = nn.Parameter(residues)
+
+     def to_bfloat16_except_poles_residues(self):
+         """Convert all parameters to bfloat16 except for the poles and residues.
+
+         Particularly important for longer prompts.
+         """
+         for k, p in self.named_parameters():
+             if "poles" not in k and "residues" not in k:
+                 p.data = p.data.to(torch.bfloat16)
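
A minimal end-to-end sketch: building a tiny all-Hyena StripedHyena from a toy config and running one stateless forward pass. The Config class here is a stand-in for the repo's config object (which supports both attribute access and .get); the sizes are toy values, not the evo-1-8k settings:

import torch
from stripedhyena.model import StripedHyena

class Config(dict):
    __getattr__ = dict.__getitem__  # attribute access on top of dict's .get

config = Config(
    vocab_size=512, hidden_size=64, num_filters=64, num_layers=2,
    attn_layer_idxs=[], hyena_layer_idxs=[0, 1], num_attention_heads=4,
    short_filter_length=3, short_filter_bias=True, state_size=8,
    eps=1e-5, params_dtype=torch.float32, mlp_dtype=torch.float32,
    model_parallel_size=1, tie_embeddings=True,
    use_flashfft=False, use_flash_attn=False, use_flash_rmsnorm=False,
)

model = StripedHyena(config)
x = torch.randint(0, 512, (1, 16))  # batch of one 16-token sequence
logits, _ = model(x)                # stateless (non-cached) forward
print(logits.shape)                 # torch.Size([1, 16, 512])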
stripedhyena/positional_embeddings.py ADDED
@@ -0,0 +1,112 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Armin Thomas, Jan 2023. Modified by Eric Nguyen.
3
+
4
+ Wrappers for linearly interpolated rope embeddings to use inside of MHA layers of Flash Attn.
5
+
6
+ """
7
+
8
+ import copy
9
+
10
+ import torch
11
+ from einops import rearrange
12
+ from flash_attn.layers.rotary import RotaryEmbedding
13
+ from flash_attn.modules.mha import MHA
14
+
15
+
16
+ # simple wrapper for flash-attn RoPE with linear scaling:
17
+ class LinearlyScaledRotaryEmbedding(RotaryEmbedding):
18
+ def __init__(
19
+ self,
20
+ dim: int,
21
+ scaling_factor: float = 1.0,
22
+ base=10000.0,
23
+ interleaved=False,
24
+ scale_base=None,
25
+ pos_idx_in_fp32=True,
26
+ device=None,
27
+ ):
28
+ super().__init__(
29
+ dim=dim,
30
+ base=base,
31
+ interleaved=interleaved,
32
+ scale_base=scale_base,
33
+ pos_idx_in_fp32=pos_idx_in_fp32,
34
+ device=device,
35
+ )
36
+ self._linear_scaling_factor = scaling_factor
37
+
38
+ # adpated from: https://github.com/Dao-AILab/flash-attention/blob/43ceab630bc6c27712428da5a33fc9cb5c369d91/flash_attn/layers/rotary.py#L368
39
+ def _update_cos_sin_cache(self, seqlen, device=None, dtype=None):
40
+ # Reset the tables if the sequence length has changed,
41
+ # if we're on a new device (possibly due to tracing for instance),
42
+ # or if we're switching from inference mode to training
43
+ if (
44
+ seqlen > self._seq_len_cached
45
+ or self._cos_cached is None
46
+ or self._cos_cached.device != device
47
+ or self._cos_cached.dtype != dtype
48
+ or (self.training and self._cos_cached.is_inference())
49
+ ):
50
+ self._seq_len_cached = seqlen
51
+ # We want fp32 here, not self.inv_freq.dtype, since the model could be loaded in bf16
52
+ # And the output of arange can be quite large, so bf16 would lose a lot of precision.
53
+ # However, for compatibility reason, we add an option to use the dtype of self.inv_freq.
54
+ if self.pos_idx_in_fp32:
55
+ t = torch.arange(seqlen, device=device, dtype=torch.float32)
56
+ # linear scaling:
57
+ t = t / self._linear_scaling_factor
58
+ # We want fp32 here as well since inv_freq will be multiplied with t, and the output
59
+ # will be large. Having it in bf16 will lose a lot of precision and cause the
60
+ # cos & sin output to change significantly.
61
+ # We want to recompute self.inv_freq if it was not loaded in fp32
62
+ if self.inv_freq.dtype != torch.float32:
63
+ inv_freq = self._compute_inv_freq(device=device)
64
+ else:
65
+ inv_freq = self.inv_freq
66
+ else:
67
+ t = torch.arange(seqlen, device=device, dtype=self.inv_freq.dtype)
68
+ # linear scaling:
69
+ t = t / self._linear_scaling_factor
70
+ inv_freq = self.inv_freq
71
+ # Don't do einsum, it converts fp32 to fp16 under AMP
72
+ # freqs = torch.einsum("i,j->ij", t, self.inv_freq)
73
+ freqs = torch.outer(t, inv_freq)
74
+ if self.scale is None:
75
+ self._cos_cached = torch.cos(freqs).to(dtype)
76
+ self._sin_cached = torch.sin(freqs).to(dtype)
77
+ else:
78
+ power = (
79
+ torch.arange(seqlen, dtype=self.scale.dtype, device=self.scale.device) - seqlen // 2
80
+ ) / self.scale_base
81
+ scale = self.scale.to(device=power.device) ** rearrange(power, "s -> s 1")
82
+ # We want the multiplication by scale to happen in fp32
83
+ self._cos_cached = (torch.cos(freqs) * scale).to(dtype)
84
+ self._sin_cached = (torch.sin(freqs) * scale).to(dtype)
85
+ self._cos_k_cached = (torch.cos(freqs) / scale).to(dtype)
86
+ self._sin_k_cached = (torch.sin(freqs) / scale).to(dtype)
87
+
88
+
89
+ # swap out RoPE of existing mha:
90
+ def swap_mha_rope(mha, new_rope: torch.nn.Module = LinearlyScaledRotaryEmbedding, kwargs_new_rope: dict = None):
91
+ # determine mha dtype and device:
92
+ dtype = mha.Wq.weight.dtype if mha.cross_attn else mha.Wqkv.weight.dtype
93
+ device = mha.Wq.weight.device if mha.cross_attn else mha.Wqkv.weight.device
94
+ # determine RoPE settings:
95
+ kwargs_old_rope = dict(
96
+ dim=mha.rotary_emb.dim,
97
+ base=mha.rotary_emb.base,
98
+ interleaved=mha.rotary_emb.interleaved,
99
+ scale_base=mha.rotary_emb.scale_base,
100
+ pos_idx_in_fp32=mha.rotary_emb.pos_idx_in_fp32,
101
+ device=mha.rotary_emb.inv_freq.device,
102
+ )
103
+ # delete old RoPE:
104
+ del mha.rotary_emb
105
+ # create new RoPE:
106
+ kwargs_new_rope = kwargs_new_rope or {"scaling_factor": 1.0}
107
+ scaled_rope = new_rope(**kwargs_new_rope, **kwargs_old_rope).to(dtype)
108
+ # attach new RoPE to mha:
109
+ mha.rotary_emb = scaled_rope
110
+ # make sure the new RoPE is correctly registered:
111
+ assert isinstance(mha.rotary_emb, new_rope)
112
+ return mha
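A hedged usage sketch for the helper above, assuming flash-attn is installed; the layer sizes and the 4x factor are illustrative placeholders, and the exact MHA constructor arguments may differ across flash-attn versions:

# Build a rotary-equipped MHA block, then swap its RoPE for the linearly scaled variant.
mha = MHA(embed_dim=1024, num_heads=16, rotary_emb_dim=64)
mha = swap_mha_rope(mha, kwargs_new_rope={"scaling_factor": 4.0})
assert isinstance(mha.rotary_emb, LinearlyScaledRotaryEmbedding)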
stripedhyena/sample.py ADDED
@@ -0,0 +1,59 @@
1
+ import torch
2
+
3
+
4
+ # https://github.com/NVIDIA/Megatron-LM/blob/0bb597b42c53355a567aba2a1357cc34b9d99ddd/megatron/text_generation/sampling.py
5
+ # https://github.com/huggingface/transformers/blob/a44985b41cfa2de48a5e1de7f1f93b7483da25d1/src/transformers/generation/logits_process.py#L231
6
+ def modify_logits_for_top_k_filtering(logits, top_k):
7
+ """Set the logits for none top-k values to -inf. Done in-place."""
8
+ indices_to_remove = logits < torch.topk(logits, top_k)[0][..., -1, None]
9
+ logits.masked_fill_(indices_to_remove, float("-Inf"))
10
+
11
+
12
+ # https://github.com/NVIDIA/Megatron-LM/blob/0bb597b42c53355a567aba2a1357cc34b9d99ddd/megatron/text_generation/sampling.py
13
+ # https://github.com/huggingface/transformers/blob/a44985b41cfa2de48a5e1de7f1f93b7483da25d1/src/transformers/generation/logits_process.py#L170
14
+ def modify_logits_for_top_p_filtering(logits, top_p):
15
+ """Set the logits for none top-p values to -inf. Done in-place."""
16
+ if top_p <= 0.0 or top_p >= 1.0:
17
+ return
18
+
19
+ # First sort and calculate cumulative sum of probabilities.
20
+ sorted_logits, sorted_indices = torch.sort(logits, descending=False)
21
+ cumulative_probs = sorted_logits.softmax(dim=-1).cumsum(dim=-1)
22
+ # Remove the lowest-probability tokens whose cumulative mass falls below 1 - top_p (the top token is always kept)
23
+ sorted_indices_to_remove = cumulative_probs <= (1 - top_p)
24
+ # scatter sorted tensors to original indexing
25
+ indices_to_remove = sorted_indices_to_remove.scatter(1, sorted_indices, sorted_indices_to_remove)
26
+ logits.masked_fill_(indices_to_remove, float("-inf"))
27
+
28
+
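As a quick sanity check of the in-place filter (the logit values are chosen for illustration):

# With top_p=0.9, the lowest-probability tokens whose combined mass stays under
# 1 - 0.9 = 0.1 are set to -inf; here only the least likely token is dropped.
logits = torch.tensor([[1.0, 2.0, 3.0, 4.0]])
modify_logits_for_top_p_filtering(logits, top_p=0.9)
# logits[0, 0] is now -inf, the other three entries are untouched.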
29
+ # https://github.com/Dao-AILab/flash-attention/blob/main/flash_attn/utils/generation.py
30
+ def sample(logits, top_k=1, top_p=0.0, temperature=1.0):
31
+ """Sample from top-k logits.
32
+ Arguments:
33
+ logits: Tensor of shape (batch_size, vocab_size)
34
+ """
35
+ logits = torch.nan_to_num(logits)
36
+ logits = torch.where(logits == float("-inf"), 0, logits)
37
+ logits = torch.where(logits == float("inf"), 0, logits)
38
+
39
+ if top_k == 1: # Short-circuit for greedy decoding
40
+ return logits.argmax(dim=-1)
41
+ else:
42
+ if top_p > 0.0:
43
+ assert top_p <= 1.0, "top-p should be in (0, 1]."
44
+ if top_k > 0:
45
+ top_k = min(top_k, logits.size(-1)) # Safety check
46
+ logits_top, indices = torch.topk(logits, top_k, dim=-1)
47
+ if temperature != 1.0:
48
+ logits_top /= temperature
49
+ modify_logits_for_top_p_filtering(logits_top, top_p)
50
+
51
+ return indices[
52
+ torch.arange(indices.shape[0], device=indices.device),
53
+ torch.multinomial(torch.softmax(logits_top, dim=-1), num_samples=1).squeeze(dim=-1),
54
+ ]
55
+ else:
56
+ # Clone so that when we modify for top_p we don't change the original logits
57
+ logits_top = logits / temperature if temperature != 1.0 else logits.clone()
58
+ modify_logits_for_top_p_filtering(logits_top, top_p)
59
+ return torch.multinomial(torch.softmax(logits_top, dim=-1), num_samples=1).squeeze(dim=-1)
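A short usage sketch for `sample`; the random logits below simply stand in for real model output:

torch.manual_seed(0)
logits = torch.randn(2, 32000)                    # (batch_size, vocab_size)
greedy = sample(logits)                           # top_k=1 short-circuits to argmax
nucleus = sample(logits, top_k=50, top_p=0.95, temperature=0.8)
print(greedy.shape, nucleus.shape)                # torch.Size([2]) torch.Size([2])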
stripedhyena/tokenizer.py ADDED
@@ -0,0 +1,184 @@
1
+ # based on https://github.com/EleutherAI/gpt-neox/blob/main/megatron/tokenizer/tokenizer.py
2
+ import json
3
+ import pathlib
4
+ from abc import ABC, abstractmethod
5
+ from typing import List, Union
6
+
7
+ import numpy as np
8
+ import torch
9
+ import tqdm
10
+ from tokenizers import Tokenizer
11
+
12
+
13
+ class HFAutoTokenizer:
14
+ def __init__(self, vocab_file):
15
+ self.tokenizer = Tokenizer.from_file(vocab_file)
16
+ self.eos = "</s>"
17
+ self.bos = "<s>"
18
+ self.eos_id = self.tokenize(self.eos)
19
+ self.bos_id = self.tokenize(self.bos)
20
+ self.vsize = 32000
21
+
22
+ def encode_to_list(self, text):
23
+ return self.tokenizer.encode(text, add_special_tokens=False)
24
+
25
+ def tokenize_file(self, input_file, output_file, verbose=False):
26
+ if verbose:
27
+ print(f"Tokenizing file: {input_file}")
28
+
29
+ if pathlib.Path(output_file).exists():
30
+ print(f"Output file {output_file} already exists, skipping")
31
+ return
32
+ with open(input_file, "r") as fin, open(output_file, "w") as fout:
33
+ for line in tqdm.tqdm(fin):
34
+ if verbose:
35
+ print(f"Tokenizing line: {line[-200:]}")
36
+ data = json.loads(line.strip())
37
+ if "text" not in data.keys():
38
+ break
39
+ tokenized_data = self.tokenize(data["text"])
40
+ fout.write(json.dumps({"tokens": tokenized_data}) + "\n")
41
+
42
+ def tokenize(self, text: str, *args, **kwargs):
43
+ ids = self.tokenizer.encode(text)
44
+ if type(ids) == list:
45
+ return torch.tensor(ids)
46
+ else:
47
+ return torch.tensor(ids.ids)
48
+
49
+ def tokenize_batch(self, text_batch):
50
+ return self.tokenizer.encode_batch(text_batch)
51
+
52
+ def detokenize(self, token_ids, skip_special_tokens=False):
53
+ return self.tokenizer.decode(token_ids, skip_special_tokens=skip_special_tokens)
54
+
55
+ def detokenize_batch(self, token_ids_batch, skip_special_tokens=False):
56
+ out = []
57
+ for token_ids in token_ids_batch:
58
+ out.append(
59
+ self.detokenize(
60
+ [t.item() for t in token_ids],
61
+ skip_special_tokens=skip_special_tokens,
62
+ )
63
+ )
64
+ return out
65
+
66
+ @property
67
+ def eod(self):
68
+ return self.eod_id
69
+
70
+ @property
71
+ def vocab_size(self):
72
+ return 32000
73
+
74
+
75
+ class AbstractTokenizer(ABC):
76
+ """Abstract class for tokenizer."""
77
+
78
+ def __init__(self, name):
79
+ self.name = name
80
+ super().__init__()
81
+
82
+ @property
83
+ @abstractmethod
84
+ def vocab_size(self):
85
+ pass
86
+
87
+ @property
88
+ @abstractmethod
89
+ def vocab(self):
90
+ """Dictionary from vocab text token to id token."""
91
+ pass
92
+
93
+ @property
94
+ @abstractmethod
95
+ def inv_vocab(self):
96
+ """Dictionary from vocab id token to text token."""
97
+ pass
98
+
99
+ @abstractmethod
100
+ def tokenize(self, text):
101
+ pass
102
+
103
+ def detokenize(self, token_ids):
104
+ raise NotImplementedError("detokenizer is not implemented for {} " "tokenizer".format(self.name))
105
+
106
+ @property
107
+ def cls(self):
108
+ raise NotImplementedError("CLS is not provided for {} " "tokenizer".format(self.name))
109
+
110
+ @property
111
+ def sep(self):
112
+ raise NotImplementedError("SEP is not provided for {} " "tokenizer".format(self.name))
113
+
114
+ @property
115
+ def pad(self):
116
+ raise NotImplementedError("PAD is not provided for {} " "tokenizer".format(self.name))
117
+
118
+ @property
119
+ def eod(self):
120
+ raise NotImplementedError("EOD is not provided for {} " "tokenizer".format(self.name))
121
+
122
+ @property
123
+ def mask(self):
124
+ raise NotImplementedError("MASK is not provided for {} " "tokenizer".format(self.name))
125
+
126
+
127
+ class CharLevelTokenizer(AbstractTokenizer):
128
+ """Character Level Tokenizer"""
129
+
130
+ def __init__(self, vocab_size):
131
+ name = "CharLevelTokenizer"
132
+ super().__init__(name)
133
+ self._vocab_size = vocab_size
134
+ self.eod_id = 0
135
+ self.eos_id = 0
136
+ self.pad_id = 1
137
+
138
+ def clamp(self, n):
139
+ return max(32, min(n, self.vocab_size))
140
+
141
+ @property
142
+ def vocab_size(self):
143
+ return self._vocab_size
144
+
145
+ @property
146
+ def vocab(self):
147
+ raise NotImplementedError
148
+
149
+ @property
150
+ def inv_vocab(self):
151
+ raise NotImplementedError
152
+
153
+ def decode_token(self, token: int):
154
+ return str(chr(self.clamp(token)))
155
+
156
+ def tokenize(self, text: str):
157
+ return list(np.frombuffer(text.encode(), dtype=np.uint8))
158
+
159
+ def tokenize_batch(self, text_batch: Union[List[str], str]):
160
+ if isinstance(text_batch, list):
161
+ return [self.tokenize(s) for s in text_batch]
162
+ else:
163
+ return self.tokenize(text_batch)
164
+
165
+ def detokenize(self, token_ids):
166
+ return "".join(list(map(self.decode_token, token_ids)))
167
+
168
+ def detokenize_batch(self, token_ids: Union[List[str], str]):
169
+ if isinstance(token_ids, list):
170
+ return [self.detokenize(s) for s in token_ids]
171
+ # if tensor, convert to list first
172
+ elif isinstance(token_ids, torch.Tensor):
173
+ return [self.detokenize(s) for s in token_ids.tolist()]
174
+ else:
175
+ return self.detokenize(token_ids)
176
+
177
+ @property
178
+ def eod(self):
179
+ return self.eod_id
180
+
181
+ # duplicate to support both names, eos and eod
182
+ @property
183
+ def eos(self):
184
+ return self.eod_id
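For reference, a byte-level round trip through the char tokenizer; the vocab size of 512 is arbitrary:

tok = CharLevelTokenizer(vocab_size=512)
ids = tok.tokenize("ACGT")             # UTF-8 byte values: 65, 67, 71, 84
assert tok.detokenize(ids) == "ACGT"   # detokenize maps the bytes back to characters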
stripedhyena/utils.py ADDED
@@ -0,0 +1,94 @@
1
+ import torch
2
+
3
+
4
+ def grab_first_if_tuple(x):
5
+ if x.__class__.__name__ == "tuple":
6
+ return x[0]
7
+ else:
8
+ return x
9
+
10
+
11
+ def column_split(x, num_heads, head_size):
12
+ """Split a tensor with `num_heads` alongside the head dimension, instead of
13
+ across heads. Fixed to three projections
14
+ """
15
+
16
+ x_reshaped = x.reshape(
17
+ x.shape[0],
18
+ num_heads,
19
+ 3 * head_size,
20
+ )
21
+
22
+ x2, x1, v = (
23
+ x_reshaped[:, :, :head_size],
24
+ x_reshaped[
25
+ :,
26
+ :,
27
+ head_size : 2 * head_size,
28
+ ],
29
+ x_reshaped[:, :, 2 * head_size :],
30
+ )
31
+ x2, x1, v = (
32
+ x2.reshape(x2.shape[0], -1),
33
+ x1.reshape(x1.shape[0], -1),
34
+ v.reshape(v.shape[0], -1),
35
+ )
36
+ return x2, x1, v
37
+
38
+
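A quick shape check for the head-wise split above (the sizes are illustrative):

x = torch.randn(2, 96)                     # 4 heads, 3 fused projections of head_size 8
x2, x1, v = column_split(x, num_heads=4, head_size=8)
print(x2.shape, x1.shape, v.shape)         # each torch.Size([2, 32])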
39
+ def get_init_from_string(init_str):
40
+ if type(init_str) == str:
41
+ if init_str == "torch.nn.init.zeros_":
42
+ return torch.nn.init.zeros_
43
+ elif init_str == "torch.nn.init.xavier_uniform_":
44
+ return torch.nn.init.xavier_uniform_
45
+ elif init_str == "torch.nn.init.xavier_normal_":
46
+ return torch.nn.init.xavier_normal_
47
+ else:
48
+ raise ValueError(f"Unrecognized init {init_str}")
49
+
50
+
51
+ def print_rank_0(message, debug=False, end="\n"):
52
+ """Print from rank 0 only."""
53
+ if torch.distributed.is_initialized():
54
+ if torch.distributed.get_rank() == 0:
55
+ print(message, flush=True, end=end)
56
+ else:
57
+ print(message, flush=True, end=end)
58
+
59
+
60
+ class dotdict(dict):
61
+ """dot.notation access to dictionary attributes"""
62
+
63
+ __getattr__ = dict.get
64
+ __setattr__ = dict.__setitem__
65
+ __delattr__ = dict.__delitem__
66
+
67
+
68
+ def ensure_divisibility(numerator, denominator):
69
+ """Ensure that numerator is divisible by the denominator."""
70
+ assert numerator % denominator == 0, "{} is not divisible by {}".format(numerator, denominator)
71
+
72
+
73
+ def divide(numerator, denominator):
74
+ """Ensure that numerator is divisible by the denominator and return
75
+ the division value."""
76
+ ensure_divisibility(numerator, denominator)
77
+ return numerator // denominator
78
+
79
+
80
+ class VocabUtility:
81
+ """Split the vocabulary into `world_size` chunks amd return the
82
+ first and last index of the vocabulary belonging to the `rank`
83
+ partition. Note that indices are in [first, last)."""
84
+
85
+ @staticmethod
86
+ def vocab_range_from_per_partition_vocab_size(per_partition_vocab_size, rank, world_size):
87
+ index_f = rank * per_partition_vocab_size
88
+ index_l = index_f + per_partition_vocab_size
89
+ return index_f, index_l
90
+
91
+ @staticmethod
92
+ def vocab_range_from_global_vocab_size(global_vocab_size, rank, world_size):
93
+ per_partition_vocab_size = divide(global_vocab_size, world_size)
94
+ return VocabUtility.vocab_range_from_per_partition_vocab_size(per_partition_vocab_size, rank, world_size)
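For example, splitting a 32000-token vocabulary across 4 ranks gives each rank a contiguous 8000-index slice:

first, last = VocabUtility.vocab_range_from_global_vocab_size(32000, rank=1, world_size=4)
assert (first, last) == (8000, 16000)      # rank 1 owns indices [8000, 16000)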