saumyap29 committed on
Commit c9f87fa · 1 Parent(s): ecf18ad

initial commit

This view is limited to 50 files because it contains too many changes. See raw diff
Files changed (50)
  1. .DS_Store +0 -0
  2. .gradio/certificate.pem +31 -0
  3. README.md +1 -2
  4. app.py +192 -0
  5. pretrained/.gitignore +0 -0
  6. pretrained/tokenizer/dac/dac_44.1kHz_7.7kbps.pt +3 -0
  7. pretrained/tria/small_musdb_moises_2b/80000/extras.pt +3 -0
  8. pretrained/tria/small_musdb_moises_2b/80000/model.pt +3 -0
  9. pretrained/tria/small_musdb_moises_2b/best/extras.pt +3 -0
  10. pretrained/tria/small_musdb_moises_2b/best/model.pt +3 -0
  11. requirements.txt +11 -0
  12. tria/__init__.py +6 -0
  13. tria/__pycache__/__init__.cpython-310.pyc +0 -0
  14. tria/__pycache__/constants.cpython-310.pyc +0 -0
  15. tria/__pycache__/features.cpython-310.pyc +0 -0
  16. tria/__pycache__/util.cpython-310.pyc +0 -0
  17. tria/constants.py +11 -0
  18. tria/data/__init__.py +0 -0
  19. tria/data/dataset.py +280 -0
  20. tria/data/preprocess.py +124 -0
  21. tria/features.py +187 -0
  22. tria/model/__init__.py +1 -0
  23. tria/model/__pycache__/__init__.cpython-310.pyc +0 -0
  24. tria/model/__pycache__/mask.cpython-310.pyc +0 -0
  25. tria/model/__pycache__/sample.cpython-310.pyc +0 -0
  26. tria/model/__pycache__/tria.cpython-310.pyc +0 -0
  27. tria/model/mask.py +263 -0
  28. tria/model/sample.py +168 -0
  29. tria/model/tria.py +344 -0
  30. tria/nn/__init__.py +0 -0
  31. tria/nn/__pycache__/__init__.cpython-310.pyc +0 -0
  32. tria/nn/__pycache__/attention.cpython-310.pyc +0 -0
  33. tria/nn/__pycache__/norm.cpython-310.pyc +0 -0
  34. tria/nn/__pycache__/pos_enc.cpython-310.pyc +0 -0
  35. tria/nn/__pycache__/transformer.cpython-310.pyc +0 -0
  36. tria/nn/attention.py +280 -0
  37. tria/nn/norm.py +53 -0
  38. tria/nn/pos_enc.py +101 -0
  39. tria/nn/transformer.py +259 -0
  40. tria/pipelines/__init__.py +0 -0
  41. tria/pipelines/__pycache__/__init__.cpython-310.pyc +0 -0
  42. tria/pipelines/tokenizer/__init__.py +2 -0
  43. tria/pipelines/tokenizer/__pycache__/__init__.cpython-310.pyc +0 -0
  44. tria/pipelines/tokenizer/__pycache__/tokenizer.cpython-310.pyc +0 -0
  45. tria/pipelines/tokenizer/dac/LICENSE +21 -0
  46. tria/pipelines/tokenizer/dac/__init__.py +1 -0
  47. tria/pipelines/tokenizer/dac/__pycache__/__init__.cpython-310.pyc +0 -0
  48. tria/pipelines/tokenizer/dac/__pycache__/dac.cpython-310.pyc +0 -0
  49. tria/pipelines/tokenizer/dac/__pycache__/modules.cpython-310.pyc +0 -0
  50. tria/pipelines/tokenizer/dac/dac.py +203 -0
.DS_Store ADDED
Binary file (6.15 kB).
 
.gradio/certificate.pem ADDED
@@ -0,0 +1,31 @@
+-----BEGIN CERTIFICATE-----
+MIIFazCCA1OgAwIBAgIRAIIQz7DSQONZRGPgu2OCiwAwDQYJKoZIhvcNAQELBQAw
+TzELMAkGA1UEBhMCVVMxKTAnBgNVBAoTIEludGVybmV0IFNlY3VyaXR5IFJlc2Vh
+cmNoIEdyb3VwMRUwEwYDVQQDEwxJU1JHIFJvb3QgWDEwHhcNMTUwNjA0MTEwNDM4
+WhcNMzUwNjA0MTEwNDM4WjBPMQswCQYDVQQGEwJVUzEpMCcGA1UEChMgSW50ZXJu
+ZXQgU2VjdXJpdHkgUmVzZWFyY2ggR3JvdXAxFTATBgNVBAMTDElTUkcgUm9vdCBY
+MTCCAiIwDQYJKoZIhvcNAQEBBQADggIPADCCAgoCggIBAK3oJHP0FDfzm54rVygc
+h77ct984kIxuPOZXoHj3dcKi/vVqbvYATyjb3miGbESTtrFj/RQSa78f0uoxmyF+
+0TM8ukj13Xnfs7j/EvEhmkvBioZxaUpmZmyPfjxwv60pIgbz5MDmgK7iS4+3mX6U
+A5/TR5d8mUgjU+g4rk8Kb4Mu0UlXjIB0ttov0DiNewNwIRt18jA8+o+u3dpjq+sW
+T8KOEUt+zwvo/7V3LvSye0rgTBIlDHCNAymg4VMk7BPZ7hm/ELNKjD+Jo2FR3qyH
+B5T0Y3HsLuJvW5iB4YlcNHlsdu87kGJ55tukmi8mxdAQ4Q7e2RCOFvu396j3x+UC
+B5iPNgiV5+I3lg02dZ77DnKxHZu8A/lJBdiB3QW0KtZB6awBdpUKD9jf1b0SHzUv
+KBds0pjBqAlkd25HN7rOrFleaJ1/ctaJxQZBKT5ZPt0m9STJEadao0xAH0ahmbWn
+OlFuhjuefXKnEgV4We0+UXgVCwOPjdAvBbI+e0ocS3MFEvzG6uBQE3xDk3SzynTn
+jh8BCNAw1FtxNrQHusEwMFxIt4I7mKZ9YIqioymCzLq9gwQbooMDQaHWBfEbwrbw
+qHyGO0aoSCqI3Haadr8faqU9GY/rOPNk3sgrDQoo//fb4hVC1CLQJ13hef4Y53CI
+rU7m2Ys6xt0nUW7/vGT1M0NPAgMBAAGjQjBAMA4GA1UdDwEB/wQEAwIBBjAPBgNV
+HRMBAf8EBTADAQH/MB0GA1UdDgQWBBR5tFnme7bl5AFzgAiIyBpY9umbbjANBgkq
+hkiG9w0BAQsFAAOCAgEAVR9YqbyyqFDQDLHYGmkgJykIrGF1XIpu+ILlaS/V9lZL
+ubhzEFnTIZd+50xx+7LSYK05qAvqFyFWhfFQDlnrzuBZ6brJFe+GnY+EgPbk6ZGQ
+3BebYhtF8GaV0nxvwuo77x/Py9auJ/GpsMiu/X1+mvoiBOv/2X/qkSsisRcOj/KK
+NFtY2PwByVS5uCbMiogziUwthDyC3+6WVwW6LLv3xLfHTjuCvjHIInNzktHCgKQ5
+ORAzI4JMPJ+GslWYHb4phowim57iaztXOoJwTdwJx4nLCgdNbOhdjsnvzqvHu7Ur
+TkXWStAmzOVyyghqpZXjFaH3pO3JLF+l+/+sKAIuvtd7u+Nxe5AW0wdeRlN8NwdC
+jNPElpzVmbUq4JUagEiuTDkHzsxHpFKVK7q4+63SM1N95R1NbdWhscdCb+ZAJzVc
+oyi3B43njTOQ5yOf+1CceWxG1bQVs5ZufpsMljq4Ui0/1lvh+wjChP4kqKOJ2qxq
+4RgqsahDYVvTH9w7jXbyLeiNdd8XM2w9U/t7y0Ff/9yi0GE44Za4rF2LN9d11TPA
+mRGunUHBcnWEvgJBQl9nJEiU0Zsnvgc/ubhPgXRR4Xq37Z0j4r7g1SgEEzwxA57d
+emyPxgcYxn/eR44/KJ4EBs+lVDR3veyJm+kXQ99b21/+jh5Xos1AnX5iItreGCc=
+-----END CERTIFICATE-----
README.md CHANGED
@@ -8,6 +8,5 @@ sdk_version: 5.49.1
 app_file: app.py
 pinned: false
 license: mit
+short_description: Audio Prompted Drums Generation
 ---
-
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

app.py ADDED
@@ -0,0 +1,192 @@
+import spaces
+import gradio as gr
+import torch
+from pathlib import Path
+from audiotools import AudioSignal
+from tria.model.tria import TRIA
+from tria.pipelines.tokenizer import Tokenizer
+from tria.features import rhythm_features
+from functools import partial
+from pyharp.core import ModelCard, build_endpoint
+from pyharp.media.audio import load_audio, save_audio
+from pyharp.labels import LabelList
+
+# Global Config
+DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+N_OUTPUTS = 3
+
+# Model Zoo
+MODEL_ZOO = {
+    "small_musdb_moises_2b": {
+        "checkpoint": "pretrained/tria/small_musdb_moises_2b/80000/model.pt",
+        "model_cfg": {
+            "codebook_size": 1024,
+            "n_codebooks": 9,
+            "n_channels": 512,
+            "n_feats": 2,
+            "n_heads": 8,
+            "n_layers": 12,
+            "mult": 4,
+            "p_dropout": 0.0,
+            "bias": True,
+            "max_len": 1000,
+            "pos_enc": "rope",
+            "qk_norm": True,
+            "use_sdpa": True,
+            "interp": "nearest",
+            "share_emb": True,
+        },
+        "tokenizer_cfg": {"name": "dac"},
+        "feature_cfg": {
+            "sample_rate": 16_000,
+            "n_bands": 2,
+            "n_mels": 40,
+            "window_length": 384,
+            "hop_length": 192,
+            "quantization_levels": 5,
+            "slow_ma_ms": 200,
+            "post_smooth_ms": 100,
+            "legacy_normalize": False,
+            "clamp_max": 50.0,
+            "normalize_quantile": 0.98,
+        },
+        "infer_cfg": {
+            "top_p": 0.95,
+            "top_k": None,
+            "temp": 1.0,
+            "mask_temp": 10.5,
+            "iterations": [8, 8, 8, 8, 4, 4, 4, 4, 4],
+            "guidance_scale": 2.0,
+            "causal_bias": 1.0,
+        },
+        "max_duration": 6.0,
+    },
+}
+
+# Loaded model cache
+LOADED = dict(name=None, model=None, tokenizer=None, feature_fn=None, infer_cfg=None, sample_rate=None, max_duration=None)
+
+# Model loading
+def load_model_by_name(name: str):
+    """Load a TRIA model by name (cached)."""
+    if LOADED["name"] == name and LOADED["model"] is not None:
+        return LOADED["model"]
+
+    cfg = MODEL_ZOO[name]
+    model = TRIA(**cfg["model_cfg"])
+    sd = torch.load(cfg["checkpoint"], map_location="cpu")
+    model.load_state_dict(sd, strict=True)
+    model.to(DEVICE).eval()
+
+    tokenizer = Tokenizer(**cfg["tokenizer_cfg"]).to(DEVICE)
+    feat_fn = partial(rhythm_features, **cfg.get("feature_cfg", {}))
+
+    LOADED.update(
+        dict(
+            name=name,
+            model=model,
+            tokenizer=tokenizer,
+            feature_fn=feat_fn,
+            infer_cfg=cfg["infer_cfg"],
+            sample_rate=tokenizer.sample_rate,
+            max_duration=cfg["max_duration"],
+        )
+    )
+    return model
+
+
+# Inference logic
+@spaces.GPU
+@torch.inference_mode()
+def generate_audio(model_name, timbre_path, rhythm_path, cfg_scale, top_p, mask_temperature, seed):
+    model = load_model_by_name(model_name)
+    tokenizer = LOADED["tokenizer"]
+    feat_fn = LOADED["feature_fn"]
+    sample_rate = LOADED["sample_rate"]
+    infer_cfg = LOADED["infer_cfg"]
+
+    timbre_sig = load_audio(timbre_path).resample(sample_rate)
+    rhythm_sig = load_audio(rhythm_path).resample(sample_rate)
+    timbre_sig.ensure_max_of_audio()
+    rhythm_sig.ensure_max_of_audio()
+
+    prefix_dur = int(LOADED["max_duration"] / 3)
+    timbre_tokens = tokenizer.encode(timbre_sig)
+    rhythm_tokens = tokenizer.encode(rhythm_sig)
+    tokens = torch.cat([timbre_tokens.tokens, rhythm_tokens.tokens], dim=-1)
+    n_batch, n_codebooks, n_frames = tokens.shape
+    prefix_frames = timbre_tokens.tokens.shape[-1]
+
+    feats = feat_fn(rhythm_sig)
+    feats = torch.nn.functional.interpolate(feats, n_frames - prefix_frames, mode=model.interp)
+    full_feats = torch.zeros(n_batch, feats.shape[1], n_frames, device=DEVICE)
+    full_feats[..., prefix_frames:] = feats
+
+    prefix_mask = torch.arange(n_frames, device=DEVICE)[None, :].repeat(n_batch, 1) < prefix_frames
+    buffer_mask = prefix_mask[:, None, :].repeat(1, n_codebooks, 1)
+    feats_mask = ~prefix_mask
+
+    outputs = []
+    for i in range(N_OUTPUTS):
+        torch.manual_seed(seed + i)
+        gen = model.inference(
+            tokens.clone().to(DEVICE),
+            full_feats.to(DEVICE),
+            buffer_mask.clone().to(DEVICE),
+            feats_mask.to(DEVICE),
+            top_p=float(top_p),
+            mask_temp=float(mask_temperature),
+            iterations=infer_cfg["iterations"],
+            guidance_scale=float(cfg_scale),
+        )[..., prefix_frames:]
+
+        rhythm_tokens.tokens = gen
+        out_sig = tokenizer.decode(rhythm_tokens)
+        out_sig.ensure_max_of_audio()
+        output_path = f"tria_out_{i+1}.wav"
+        save_audio(out_sig, output_path)
+        path_i = output_path
+        outputs.append(str(path_i))
+    return tuple(outputs)
+
+
+# PyHARP Metadata
+model_card = ModelCard(
+    name="TRIA: The Rhythm In Anything",
+    description=(
+        "Transform your rhythmic ideas into full drum performances. TRIA takes two short audio prompts: \n "
+        "a Rhythm Prompt (tapping, beatboxing, or a percussion gesture) "
+        "and a Timbre Prompt (an example drum sound or kit recording). \n "
+        "It generates 3 drum arrangements that match your groove and chosen timbre."
+    ),
+    author="Patrick O'Reilly, Julia Barnett, Hugo Flores García, Annie Chu, Nathan Pruyne, Prem Seetharaman, Bryan Pardo",
+    tags=["tria", "rhythm-generation", "pyharp"],
+)
+
+
+# Gradio and PyHARP Endpoint
+with gr.Blocks(title="TRIA") as demo:
+    timbre_in = gr.Audio(type="filepath", label="Timbre Prompt").harp_required(True)
+    rhythm_in = gr.Audio(type="filepath", label="Rhythm Prompt").harp_required(True)
+
+    model_names = list(MODEL_ZOO.keys())
+    model_dropdown = gr.Dropdown(choices=model_names, value=model_names[0], label="Model")
+
+    with gr.Row():
+        cfg_scale = gr.Slider(0.0, 10.0, value=2.0, step=0.1, label="CFG Scale")
+        top_p = gr.Slider(0.0, 1.0, value=0.95, step=0.01, label="Top P")
+        mask_temperature = gr.Slider(0.0, 20.0, value=10.5, step=0.1, label="Mask Temperature")
+        seed = gr.Slider(0, 1000, value=0, step=1, label="Random Seed")
+
+    out1 = gr.Audio(type="filepath", label="Generated #1")
+    out2 = gr.Audio(type="filepath", label="Generated #2")
+    out3 = gr.Audio(type="filepath", label="Generated #3")
+
+    app = build_endpoint(
+        model_card=model_card,
+        input_components=[model_dropdown, timbre_in, rhythm_in, cfg_scale, top_p, mask_temperature, seed],
+        output_components=[out1, out2, out3],
+        process_fn=generate_audio,
+    )
+
+demo.queue().launch(share=True, show_error=True)
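
Editorial note: importing app.py runs the Gradio launch at the bottom of the module, so for a quick local check it is easier to repeat the loading steps directly. A minimal sketch, using only the MODEL_ZOO configuration and checkpoint path shown above:

import torch
from tria.model.tria import TRIA

# Instantiate with the "small_musdb_moises_2b" model_cfg from MODEL_ZOO
model = TRIA(
    codebook_size=1024, n_codebooks=9, n_channels=512, n_feats=2,
    n_heads=8, n_layers=12, mult=4, p_dropout=0.0, bias=True,
    max_len=1000, pos_enc="rope", qk_norm=True, use_sdpa=True,
    interp="nearest", share_emb=True,
)
sd = torch.load("pretrained/tria/small_musdb_moises_2b/80000/model.pt", map_location="cpu")
model.load_state_dict(sd, strict=True)
model.eval()  # TRIA.inference() asserts the model is not in training mode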
pretrained/.gitignore ADDED
File without changes
pretrained/tokenizer/dac/dac_44.1kHz_7.7kbps.pt ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9ffa16e9cd52d67dadef026823403481930942f3fead32f44b75c4b60627246a
+size 306721572
pretrained/tria/small_musdb_moises_2b/80000/extras.pt ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1e18d9b8dbf5c5ff0d86aaf04d2af014960d97eeb396f7743e7595692ee31b68
+size 344556763
pretrained/tria/small_musdb_moises_2b/80000/model.pt ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4e20c3850253ba7fb267440573137f4b6099cad1e437fcfd574b84d60138155c
+size 172260091
pretrained/tria/small_musdb_moises_2b/best/extras.pt ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1e18d9b8dbf5c5ff0d86aaf04d2af014960d97eeb396f7743e7595692ee31b68
+size 344556763
pretrained/tria/small_musdb_moises_2b/best/model.pt ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4e20c3850253ba7fb267440573137f4b6099cad1e437fcfd574b84d60138155c
+size 172260091
requirements.txt ADDED
@@ -0,0 +1,11 @@
+torch==2.9.0
+torchaudio==2.9.0
+numpy
+argbind
+descript-audiotools>=0.9.2
+pyharp>=1.7.8
+gradio>=4.42.0
+librosa
+soundfile
+tqdm
+
tria/__init__.py ADDED
@@ -0,0 +1,6 @@
+__version__ = "0.0.1"
+
+from . import constants
+from . import util
+from . import features
+from . import transforms
tria/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (294 Bytes).
 
tria/__pycache__/constants.cpython-310.pyc ADDED
Binary file (465 Bytes).
 
tria/__pycache__/features.cpython-310.pyc ADDED
Binary file (4.52 kB).
 
tria/__pycache__/util.cpython-310.pyc ADDED
Binary file (6.11 kB).
 
tria/constants.py ADDED
@@ -0,0 +1,11 @@
+from pathlib import Path
+
+MANIFESTS_DIR = Path(__file__).parent.parent / "manifests"
+DATA_DIR = Path(__file__).parent.parent / "data"
+PRETRAINED_DIR = Path(__file__).parent.parent / "pretrained"
+ASSETS_DIR = Path(__file__).parent.parent / "assets"
+
+
+STEMS = ["drums", "bass", "vocals", "other", "mixture"]
+SAMPLE_RATE = 44_100
+DURATION = 6.0
tria/data/__init__.py ADDED
File without changes
tria/data/dataset.py ADDED
@@ -0,0 +1,280 @@
+import csv
+from pathlib import Path
+from typing import Callable
+from typing import Dict
+from typing import List
+from typing import Optional
+from typing import Union
+
+import numpy as np
+import soundfile as sf
+from audiotools import AudioSignal
+from audiotools.core.util import random_state
+from torch.utils.data import Dataset
+
+from ..constants import DURATION
+from ..constants import SAMPLE_RATE
+from ..constants import STEMS
+from ..util import collate
+from ..util import get_info
+from ..util import load_audio
+from ..util import rms_salience
+
+################################################################################
+# Dataset for loading aligned excerpts across stem classes
+################################################################################
+
+
+class StemDataset(Dataset):
+    """
+    Load aligned excerpts from specified stem classes given paths in one or more
+    CSV manifests. Based on `audiotools.data.datasets.AudioDataset`.
+
+    Parameters
+    ----------
+    sources : Union[str, Path, List[Union[str, Path]]]
+        CSV manifest(s) with columns for each requested stem.
+    stems : List[str]
+        Column names to load, e.g. ["mixture", "drums", "bass", "vocals"].
+        The **first** stem is used for salience unless `salience_on` is set.
+    sample_rate : int
+    duration : float
+    n_examples : int
+    num_channels : int
+    relative_path : str
+        Prepended to relative CSV paths.
+    strict : bool
+        Drop rows with missing stems (True) vs. fill with silence (False).
+    with_replacement : bool
+        Sampling strategy for rows.
+    shuffle_state : int
+        Seed for deterministic per-index RNG.
+    loudness_cutoff : Optional[float]
+        dB LUFS cutoff; if None, take random excerpt (still shared across stems).
+    salience_num_tries : int
+        Max tries for salient excerpt search (see `AudioSignal.salient_excerpt`).
+    salience_on : Optional[str]
+        Which stem to use for salience. Defaults to first of `stems`.
+    """
+
+    def __init__(
+        self,
+        stems: List[str] = STEMS,
+        sample_rate: int = SAMPLE_RATE,
+        duration: float = DURATION,
+        sources: Union[str, Path, List[Union[str, Path]]] = None,
+        source_weights: Optional[List[float]] = None,
+        n_examples: int = 1000,
+        num_channels: int = 1,
+        relative_path: str = "",
+        strict: bool = True,
+        with_replacement: bool = True,
+        shuffle_state: int = 0,
+        loudness_cutoff: Optional[float] = -40.0,
+        salience_num_tries: int = 8,
+        salience_on: Optional[str] = None,
+    ):
+        super().__init__()
+
+        assert sources is not None
+        assert len(stems) >= 1
+
+        self.stems = list(stems)
+        self.sample_rate = int(sample_rate)
+        self.duration = float(duration)
+        self.num_channels = int(num_channels)
+        self.relative_path = Path(relative_path)
+        self.strict = strict
+        self.with_replacement = with_replacement
+        self.length = int(n_examples)
+        self.shuffle_state = int(shuffle_state)
+
+        self.loudness_cutoff = loudness_cutoff
+        self.salience_num_tries = int(salience_num_tries)
+        self.salience_on = salience_on or self.stems[0]
+        if self.salience_on not in self.stems:
+            raise ValueError(
+                f"`salience_on` ('{self.salience_on}') must be one of {self.stems}"
+            )
+
+        # Read manifests
+        csv_paths = [sources] if isinstance(sources, (str, Path)) else list(sources)
+        self.source_rows: List[List[Dict]] = []
+        kept_mask: List[bool] = []
+        kept_csvs: List[Path] = []
+
+        for cpath in csv_paths:
+            # Read rows for source
+            cpath = Path(cpath)
+            raw_rows = []
+            with open(cpath, "r") as f:
+                reader = csv.DictReader(f)
+                for row in reader:
+                    entry = {"__manifest__": str(cpath)}
+                    stem_paths = {}
+                    for s in self.stems:
+                        raw = (row.get(s) or "").strip()
+                        stem_paths[s] = str(self._resolve_path(raw)) if raw else ""
+                    entry["paths"] = stem_paths
+                    extra = {k: v for k, v in row.items() if k not in self.stems}
+                    if extra:
+                        entry["meta"] = extra
+                    raw_rows.append(entry)
+
+            # Filter rows for source
+            filtered = []
+            for r in raw_rows:
+                missing = [
+                    s for s, p in r["paths"].items() if not p or not Path(p).is_file()
+                ]
+                if self.strict and missing:
+                    continue
+
+                min_dur = np.inf
+                any_valid = False
+                for s, p in r["paths"].items():
+                    if p and Path(p).is_file():
+                        any_valid = True
+                        try:
+                            total_sec = float(sf.info(p).duration)
+                            min_dur = min(min_dur, float(total_sec))
+                        except Exception:
+                            if self.strict:
+                                min_dur = -np.inf
+                                break
+                if not any_valid or not np.isfinite(min_dur):
+                    continue
+                if min_dur < self.duration and self.strict:
+                    continue
+
+                r["min_duration"] = min_dur if np.isfinite(min_dur) else 0.0
+                filtered.append(r)
+
+            if len(filtered) > 0:
+                self.source_rows.append(filtered)
+                kept_mask.append(True)
+                kept_csvs.append(cpath)
+            else:
+                kept_mask.append(False)
+
+        if len(self.source_rows) == 0:
+            raise RuntimeError(
+                "StemDataset: no valid rows after filtering in any source."
+            )
+
+        self.csv_paths = kept_csvs
+
+        lengths = [len(lst) for lst in self.source_rows]
+        self._source_offsets = np.cumsum([0] + lengths[:-1])  # for global idx
+        self._n_rows = int(sum(lengths))
+
+        # Weights over non-empty sources
+        if source_weights is None:
+            self._weights = None
+        else:
+            if len(source_weights) != len(csv_paths):
+                raise ValueError(
+                    f"source_weights must match number of sources ({len(csv_paths)}), "
+                    f"got {len(source_weights)}"
+                )
+            w = np.asarray(source_weights, dtype=float)
+            # Keep only weights for sources that survived filtering
+            w = w[np.array(kept_mask, dtype=bool)]
+            w = np.clip(w, 0, None)
+            if not np.any(w > 0):
+                w = np.ones_like(w)
+            self._weights = (w / w.sum()).tolist()
+
+    def _resolve_path(self, p: Union[str, Path]) -> Path:
+        p = Path(p).expanduser()
+        if not p.is_absolute():
+            p = (self.relative_path / p).expanduser()
+        return p
+
+    def _pick_row(self, state: np.random.RandomState):
+        # Sample a non-empty source
+        sidx = int(state.choice(len(self.source_rows), p=self._weights))
+        n_in_source = len(self.source_rows[sidx])
+        item_idx = int(state.randint(n_in_source))
+        row = self.source_rows[sidx][item_idx]
+
+        # Map to a global idx for metadata
+        ridx_global = int(self._source_offsets[sidx] + item_idx)
+        return ridx_global, row
+
+    def __len__(self):
+        return self.length
+
+    def __getitem__(self, idx: int):
+        state = random_state((self.shuffle_state + int(idx)) & 0x7FFFFFFF)
+        ridx, row = self._pick_row(state)
+
+        primary = self.salience_on
+        p0 = row["paths"].get(primary, "")
+
+        offset = 0.0
+        primary_sig = None
+        if p0 and Path(p0).is_file():
+            if self.loudness_cutoff is None or not self.salience_num_tries:
+                try:
+                    total_sec, _sr = get_info(p0)
+                except Exception:
+                    total_sec = 0.0
+                max_off = max(0.0, total_sec - self.duration)
+                offset = float(state.rand() * max_off) if max_off > 0 else 0.0
+            else:
+                offset = rms_salience(
+                    p0,
+                    duration=self.duration,
+                    cutoff_db=float(self.loudness_cutoff),
+                    num_tries=int(self.salience_num_tries),
+                    state=state,
+                )
+            primary_sig = load_audio(p0, offset=offset, duration=self.duration)
+        else:
+            offset = 0.0
+
+        item: Dict[str, Dict] = {}
+        for s in self.stems:
+            p = row["paths"][s]
+            exists = bool(p) and Path(p).is_file()
+
+            if s == primary and primary_sig is not None:
+                sig = primary_sig.clone()  # reuse window we already loaded
+            elif exists:
+                sig = load_audio(
+                    p, offset=offset, duration=self.duration
+                )  # windowed load
+            else:
+                sig = AudioSignal.zeros(
+                    self.duration, self.sample_rate, self.num_channels
+                )
+
+            # Channel formatting
+            if self.num_channels == 1:
+                sig = sig.to_mono()
+            elif self.num_channels != sig.num_channels:
+                assert sig.num_channels == 1
+                sig.audio_data = sig.audio_data.repeat(1, self.num_channels, 1)
+
+            # Resample/pad to target SR and exact duration
+            sig = sig.resample(self.sample_rate)
+            if sig.duration < self.duration:
+                sig = sig.zero_pad_to(int(self.duration * self.sample_rate))
+
+            # Metadata
+            sig.metadata["path"] = p
+            sig.metadata["offset"] = offset
+            sig.metadata["source_row"] = ridx
+            if "meta" in row:
+                for k, v in row["meta"].items():
+                    sig.metadata[k] = v
+
+            item[s] = {"signal": sig, "path": p}
+
+        item["idx"] = idx
+        return item
+
+    @staticmethod
+    def collate(list_of_dicts: Union[list, dict], n_splits: int = None):
+        return collate(list_of_dicts, n_splits=n_splits)
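
Editorial note: a minimal sketch of driving StemDataset from a manifest. The CSV path and chosen stem columns are illustrative assumptions, and batching relies on the collate helper the class re-exports:

from torch.utils.data import DataLoader
from tria.data.dataset import StemDataset

dataset = StemDataset(
    stems=["mixture", "drums"],       # salience defaults to the first stem
    sources=["manifests/train.csv"],  # hypothetical manifest with these columns
    duration=6.0,
    n_examples=1000,
    loudness_cutoff=-40.0,
)
loader = DataLoader(dataset, batch_size=4, collate_fn=StemDataset.collate)
item = dataset[0]
print(item["drums"]["signal"])  # 6-second AudioSignal excerpt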
tria/data/preprocess.py ADDED
@@ -0,0 +1,124 @@
+import csv
+import os
+from pathlib import Path
+from typing import Callable, Dict, Tuple, Union, Optional, Any
+from rich.progress import track
+
+import numpy as np
+
+from audiotools.core.util import random_state
+from ..util import ensure_dir
+
+SplitType = Union[Tuple[float, float, float], Callable[[Path], str]]
+
+
+def create_manifests(
+    data_dir: Union[str, Path],
+    ext: str,
+    output_dir: Union[str, Path],
+    split: SplitType,
+    attributes: Dict[str, Callable[[Path], Any]],
+    seed: Optional[int] = 0,
+) -> Dict[str, Path]:
+    """
+    Create CSV manifests for an audio dataset.
+
+    Parameters
+    ----------
+    data_dir : str
+        Dataset root directory to search recursively for files
+    ext : str
+        Audio file extension
+    output_dir : str
+        Directory to which to write manifests
+    split : SplitType
+        Either a 3-tuple containing (train, val, test) proportions summing to 1
+        or a Callable that returns "train", "val", or "test" given a filepath
+    attributes : dict
+        Dictionary mapping column names to Callables for extracting values
+        given filepaths; for example {'path': lambda p: str(p)}
+    seed : int
+        Random seed
+    """
+    data_dir = Path(data_dir)
+    output_dir = Path(output_dir)
+    ensure_dir(output_dir)
+
+    all_files = sorted(
+        [p for p in data_dir.rglob(f"*{ext}") if p.is_file()],
+        key=lambda p: str(p).lower(),
+    )
+
+    splits = {"train": [], "val": [], "test": []}
+
+    # Callable split: apply given function to file paths to obtain train/val/test
+    # assignments
+    if callable(split):
+        for p in all_files:
+            s = split(p)
+            if s not in splits:
+                raise ValueError(
+                    f"Split function must return one of "
+                    f"{list(splits.keys())}, got {s!r} for {p}"
+                )
+            splits[s].append(p)
+
+    # Proportional split: randomly shuffle files and split according to given
+    # values
+    else:
+        if not (isinstance(split, tuple) and len(split) == 3):
+            raise ValueError("Split proportions tuple must have length 3")
+        p_train, p_val, p_test = split
+        total = float(p_train + p_val + p_test)
+        if not np.isclose(total, 1.0, atol=1e-6):
+            raise ValueError(f"Split proportions must sum to 1.0 (got {total}).")
+
+        rs = random_state(seed)
+        idx = np.array(rs.permutation(len(all_files)))
+        n = len(idx)
+        n_train = int(np.floor(p_train * n))
+        n_val = int(np.floor(p_val * n))
+        n_test = n - n_train - n_val
+
+        train_idx = idx[:n_train]
+        val_idx = idx[n_train:n_train + n_val]
+        test_idx = idx[n_train + n_val:]
+
+        for i in train_idx:
+            splits["train"].append(all_files[int(i)])
+        for i in val_idx:
+            splits["val"].append(all_files[int(i)])
+        for i in test_idx:
+            splits["test"].append(all_files[int(i)])
+
+    columns = list(attributes.keys())
+
+    # Write CSVs
+    out_paths: Dict[str, Path] = {}
+    for s in ("train", "val", "test"):
+        out_csv = output_dir / f"{s}.csv"
+        out_paths[s] = out_csv
+
+        with out_csv.open("w", newline="") as f:
+            writer = csv.DictWriter(f, fieldnames=columns)
+            writer.writeheader()
+
+            for p in track(
+                splits[s],
+                description=f"Writing {s}.csv",
+                total=len(splits[s])
+            ):
+
+                try:
+                    row = {}
+                    for col, fn in attributes.items():
+                        row[col] = fn(p)
+                    writer.writerow(row)
+                except Exception as e:
+                    print(
+                        f"Error at path {p}:\n"
+                        f"{e}\n"
+                        f"Skipping..."
+                    )
+
+    return out_paths
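
Editorial note: a minimal sketch of create_manifests with a proportional split. The data root, extension, and single-column attributes dict are illustrative assumptions:

from tria.data.preprocess import create_manifests

paths = create_manifests(
    data_dir="data/drum_stems",              # hypothetical dataset root
    ext=".wav",
    output_dir="manifests/drum_stems",
    split=(0.8, 0.1, 0.1),                   # train/val/test, must sum to 1
    attributes={"drums": lambda p: str(p)},  # one manifest column per attribute
)
print(paths["train"])  # manifests/drum_stems/train.csv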
tria/features.py ADDED
@@ -0,0 +1,187 @@
+import torch
+from audiotools import AudioSignal
+
+
+################################################################################
+# Utilities for extracting rhythm feature representations
+################################################################################
+
+
+def _moving_average(x: torch.Tensor, window_length: int):
+    """
+    Smooth features with a moving average over frames.
+
+    Parameters
+    ----------
+    x : torch.Tensor
+        Shape (n_batch, n_feats, n_frames)
+    window_length : int
+        Smoothing window length
+    """
+    if window_length <= 1:
+        return x
+    n_feats = x.shape[1]
+    kernel = torch.ones(
+        (n_feats, 1, window_length),
+        device=x.device, dtype=x.dtype
+    ) / window_length
+
+    pad_left = (window_length - 1) // 2
+    pad_right = window_length // 2
+    x_pad = torch.nn.functional.pad(x, (pad_left, pad_right), mode="reflect")
+
+    # Smooth separately over feature channels
+    return torch.nn.functional.conv1d(x_pad, kernel, groups=n_feats)
+
+
+# The 'original' TRIA features can be recovered using:
+# * `slow_ma_ms` = None
+# * `post_smooth_ms` = None
+# * `legacy_normalize` = True
+def rhythm_features(
+    signal: AudioSignal,
+    sample_rate: int = 44_100,
+    n_bands: int = 2,
+    n_mels: int = 80,
+    window_length: int = 1024,
+    hop_length: int = 512,
+    normalize_quantile: float = 0.98,
+    quantization_levels: int = 33,
+    clamp_max: float = 50.0,
+    eps: float = 1e-8,
+    slow_ma_ms: float = 100.0,
+    post_smooth_ms: float = 10.0,
+    legacy_normalize: bool = False,
+):
+    """
+    Extract multi-band 'rhythm' features from audio by adaptively splitting the
+    spectrogram along the frequency axis and applying normalization,
+    quantization, and smoothing / sparsity filtering.
+
+    Parameters
+    ----------
+    signal : AudioSignal
+        Audio from which to extract features
+    sample_rate : int
+        Sample rate at which to extract features
+    n_bands : int
+        Number of frequency bands into which to adaptively divide spectrogram
+    n_mels : int
+        Number of base mel frequency bins in spectrogram
+    window_length : int
+        Spectrogram window length
+    hop_length : int
+        Spectrogram hop length
+    normalize_quantile : float
+        Optionally normalize each band relative to top-p largest magnitude
+        rather than absolute max
+    quantization_levels : int
+        Number of bins into which feature magnitudes are quantized
+    clamp_max : float
+        Maximum allowed spectrogram magnitude
+    eps : float
+        For numerical stability
+    slow_ma_ms : float
+        Smoothing filter length in milliseconds for transient emphasis (smoothed
+        features are subtracted)
+    post_smooth_ms : float
+        Smoothing filter length in milliseconds for transient smoothing
+    legacy_normalize : bool
+        If `True`, use mean/std and sigmoid normalization as described in the
+        original TRIA paper
+    """
+
+    assert n_bands >= 1
+    assert quantization_levels >= 2
+
+    # Loudness normalization
+    signal = signal.clone().to_mono().resample(sample_rate).normalize(-16.)
+    signal.ensure_max_of_audio()
+
+    # Clamped mel spectrogram
+    mel = signal.mel_spectrogram(
+        n_mels=n_mels,
+        hop_length=hop_length,
+        window_length=window_length,
+    ).mean(1)  # (n_batch, n_mels, n_frames)
+    mel = torch.clamp(mel, 0.0, clamp_max)
+
+    n_batch, _, n_frames = mel.shape
+
+    if legacy_normalize:
+        # Original normalization: divide by number of mels
+        mel = mel / n_mels
+    else:
+        # Compress logarithmically
+        mel = torch.log1p(mel) / torch.log1p(torch.tensor(clamp_max, device=mel.device, dtype=mel.dtype))
+
+    # Split spectrogram into bands adaptively
+    energy_per_bin = mel.mean(dim=-1)  # (n_batch, n_mels)
+    cum = energy_per_bin.cumsum(dim=1)  # (n_batch, n_mels)
+    total = cum[:, -1:]  # (n_batch, 1)
+
+    if n_bands == 1:
+        bands = mel.sum(dim=1, keepdim=True)  # (n_batch, 1, n_frames)
+    else:
+        targets = torch.linspace(
+            1.0 / n_bands, (n_bands - 1) / n_bands, n_bands - 1,
+            device=mel.device, dtype=mel.dtype
+        )[None, :] * total  # (n_batch, n_bands-1)
+
+        edges = torch.searchsorted(cum, targets, right=False)  # (n_batch, n_bands-1)
+
+        cuts = torch.cat(
+            [
+                torch.zeros(n_batch, 1, dtype=torch.long, device=mel.device),
+                edges + 1,
+                torch.full((n_batch, 1), mel.size(1), dtype=torch.long, device=mel.device),
+            ],
+            dim=1
+        )  # (n_batch, n_bands+1)
+
+        prefix = mel.cumsum(dim=1)  # (n_batch, n_mels, n_frames)
+        prefix_pad = torch.cat(
+            [torch.zeros(n_batch, 1, n_frames, device=mel.device, dtype=mel.dtype), prefix],
+            dim=1
+        )
+
+        a_idx = cuts[:, :-1].unsqueeze(-1).expand(n_batch, n_bands, n_frames)
+        b_idx = cuts[:, 1:].unsqueeze(-1).expand(n_batch, n_bands, n_frames)
+        bands = prefix_pad.gather(1, b_idx) - prefix_pad.gather(1, a_idx)  # (n_batch, n_bands, n_frames)
+
+    # Emphasize transients by subtracting smoothed features
+    transient = bands.clone()
+    to_frames = lambda ms: max(1, int(round((ms / 1000.0) * sample_rate / hop_length)))
+
+    if slow_ma_ms is not None:
+        slow_win = to_frames(slow_ma_ms)
+        bands_slow = _moving_average(bands, slow_win)  # (n_batch, n_bands, n_frames)
+        transient = torch.relu(bands - bands_slow)
+
+    # Apply additional smoothing to transients
+    if post_smooth_ms is not None:
+        ps_win = to_frames(post_smooth_ms)
+        if ps_win > 1:
+            transient = _moving_average(transient, ps_win)
+
+    # Normalize features across time per band
+    if legacy_normalize:
+        # Original normalization (mean/std with sigmoid compression)
+        mean = transient.mean(dim=-1, keepdim=True)
+        std = transient.std(dim=-1, keepdim=True).clamp_min(eps)
+        transient = torch.sigmoid((transient - mean) / std)
+
+    else:
+        # Quantile-based normalization
+        q = torch.quantile(
+            transient.clamp_min(0.0),
+            q=normalize_quantile,
+            dim=-1,
+            keepdim=True
+        ).clamp_min(eps)
+        transient = (transient / q).clamp(0.0, 1.0)
+
+    # Quantize feature intensities into bins to ensure a tight information
+    # bottleneck
+    steps = quantization_levels - 1
+    return torch.round(transient * steps) / steps
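
Editorial note: a minimal sketch of feature extraction, mirroring the feature_cfg used in app.py above; the input path is an illustrative assumption:

from audiotools import AudioSignal
from tria.features import rhythm_features

sig = AudioSignal("examples/taps.wav")  # hypothetical rhythm prompt
feats = rhythm_features(
    sig,
    sample_rate=16_000,
    n_bands=2,
    n_mels=40,
    window_length=384,
    hop_length=192,
    quantization_levels=5,  # magnitudes snapped to 5 discrete levels
)
print(feats.shape)  # (n_batch, n_bands, n_frames)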
tria/model/__init__.py ADDED
@@ -0,0 +1 @@
+from .tria import TRIA
tria/model/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (182 Bytes).
 
tria/model/__pycache__/mask.cpython-310.pyc ADDED
Binary file (5.87 kB).
 
tria/model/__pycache__/sample.cpython-310.pyc ADDED
Binary file (4.68 kB).
 
tria/model/__pycache__/tria.cpython-310.pyc ADDED
Binary file (7.21 kB).
 
tria/model/mask.py ADDED
@@ -0,0 +1,263 @@
+from typing import Iterable
+from typing import Union
+
+import torch
+from audiotools.core.util import random_state
+
+################################################################################
+# Utilities for masked language modeling
+################################################################################
+
+
+def cosine_schedule(t: torch.Tensor) -> torch.Tensor:
+    """
+    Map timestep in [0, 1] to masking ratio in (0, 1] via the cosine schedule
+    proposed by Chang et al. in "MaskGIT: Masked Generative Image
+    Transformer" (2022).
+
+    Parameters
+    ----------
+    t : torch.Tensor
+        Timestep in [0, 1]
+
+    Returns
+    -------
+    torch.Tensor
+        Mask proportion in (0, 1]
+    """
+    return (t * torch.pi / 2).cos().clamp(1e-10, 1.0)
+
+
+def format_seed(seed):
+    if isinstance(seed, (int, float)):
+        seed = [seed]
+    elif isinstance(seed, torch.Tensor):
+        seed = seed.tolist()
+    elif isinstance(seed, Iterable):
+        pass
+    else:
+        raise ValueError(f"Invalid random seed of type {type(seed)}")
+
+    return [random_state(s) for s in seed]
+
+
+def get_span_mask(
+    tokens: torch.Tensor,
+    min_prop: float,
+    max_prop: float,
+    seed: Union[int, Iterable[int]],
+) -> torch.Tensor:
+    """
+    Mask a random span of consecutive frames across all codebooks, varying
+    across the batch.
+
+    Parameters
+    ----------
+    tokens : torch.Tensor
+        Tokens to be masked, shape (n_batch, n_codebooks, n_frames)
+    min_prop : float
+        Minimum proportion of frames to mask
+    max_prop : float
+        Maximum proportion of frames to mask
+    seed : Union[int, Iterable[int]]
+        One or more random seeds to determine masks
+
+    Returns
+    -------
+    torch.Tensor
+        Mask of shape (n_batch, n_frames)
+    """
+    assert min_prop >= 0.0
+    assert max_prop <= 1.0
+
+    n_batch, n_codebooks, n_frames = tokens.shape
+
+    states = format_seed(seed)
+    assert len(states) == n_batch
+
+    mask = torch.ones(
+        n_batch,
+        n_frames,
+        device=tokens.device,
+        dtype=torch.bool,
+    )  # (n_batch, n_frames)
+
+    for i, s in enumerate(states):
+        prop = s.uniform(min_prop, max_prop) if min_prop < max_prop else min_prop
+
+        if prop >= 1.0:
+            mask[i] = False
+        else:
+            span = int(prop * n_frames)
+            st = s.randint(0, max(n_frames - span, 1))
+            mask[i, st : st + span] = False
+
+    return mask
+
+
+def get_current_codebook_mask(
+    tokens: torch.Tensor, codebooks: torch.Tensor
+) -> torch.Tensor:
+    """
+    Given tokens and a batch of selected codebooks, select only the current
+    codebook (masking all codebooks "above" and "below" it).
+
+    Parameters
+    ----------
+    tokens : torch.Tensor
+        Tokens to be masked, shape (n_batch, n_codebooks, n_frames)
+    codebooks : torch.Tensor
+        Selected codebooks for which to construct the indicator, shape
+        (n_batch,)
+
+    Returns
+    -------
+    torch.Tensor
+        Mask of shape (n_batch, n_codebooks)
+    """
+
+    n_batch, n_codebooks, n_frames = tokens.shape
+
+    assert codebooks.ndim == 1
+    assert codebooks.shape[0] in [1, n_batch]
+    codebooks = codebooks.repeat(n_batch // codebooks.shape[0])
+
+    mask = (
+        torch.arange(
+            n_codebooks,
+            dtype=codebooks.dtype,
+            device=codebooks.device,
+        )[None, :]
+        == codebooks[:, None]
+    )  # (n_batch, n_codebooks)
+
+    return mask
+
+
+def get_next_codebooks_mask(
+    tokens: torch.Tensor, codebooks: torch.Tensor
+) -> torch.Tensor:
+    """
+    Given tokens and a batch of selected codebooks, mask all codebooks "above"
+    the selected codebooks.
+
+    Parameters
+    ----------
+    tokens : torch.Tensor
+        Tokens to be masked, shape (n_batch, n_codebooks, n_frames)
+    codebooks : torch.Tensor
+        Selected codebooks "above" which tokens should be masked, shape
+        (n_batch,)
+
+    Returns
+    -------
+    torch.Tensor
+        Mask of shape (n_batch, n_codebooks)
+    """
+
+    n_batch, n_codebooks, n_frames = tokens.shape
+
+    assert codebooks.ndim == 1
+    assert codebooks.shape[0] in [1, n_batch]
+    codebooks = codebooks.repeat(n_batch // codebooks.shape[0])
+
+    mask = (
+        torch.arange(
+            n_codebooks,
+            dtype=codebooks.dtype,
+            device=codebooks.device,
+        )[None, :]
+        <= codebooks[:, None]
+    )  # (n_batch, n_codebooks)
+
+    return mask
+
+
+def get_random_mask(
+    tokens: torch.Tensor,
+    prop: Union[float, Iterable[float]],
+    seed: Union[int, Iterable[int]],
+) -> torch.Tensor:
+    """
+    Parameters
+    ----------
+    tokens : torch.Tensor
+        Tokens to be masked, shape (n_batch, n_codebooks, n_frames)
+    prop : Union[float, Iterable[float]]
+        Proportion of tokens to be masked, shape (n_batch,)
+    seed : Union[int, Iterable[int]]
+        One or more random seeds to determine masks
+
+    Returns
+    -------
+    torch.Tensor
+        Random mask of shape (n_batch, n_codebooks, n_frames)
+    """
+    n_batch, n_codebooks, n_frames = tokens.shape
+
+    if isinstance(prop, torch.Tensor):
+        prop = prop.tolist()
+    assert len(prop) == n_batch
+
+    states = format_seed(seed)
+    assert len(states) == n_batch
+
+    mask = torch.ones(
+        n_batch,
+        n_codebooks,
+        n_frames,
+        device=tokens.device,
+        dtype=torch.bool,
+    )  # (n_batch, n_codebooks, n_frames)
+
+    for i, (s, p) in enumerate(zip(states, prop)):
+        mask[i] = torch.from_numpy(s.rand(n_codebooks, n_frames)).to(mask.device) > p
+
+    return mask
+
+
+def combine_masks(
+    mask_span: torch.Tensor,
+    mask_current_codebook: torch.Tensor,
+    mask_next_codebooks: torch.Tensor,
+    mask_random: torch.Tensor,
+    leak: bool = False,
+) -> torch.Tensor:
+    """
+    Combine sampled masks to allow for application to a token buffer.
+
+    Parameters
+    ----------
+    mask_span : torch.Tensor
+        Shape (n_batch, n_frames)
+    mask_current_codebook : torch.Tensor
+        Shape (n_batch, n_codebooks)
+    mask_next_codebooks : torch.Tensor
+        Shape (n_batch, n_codebooks)
+    mask_random : torch.Tensor
+        Shape (n_batch, n_codebooks, n_frames)
+
+    Returns
+    -------
+    torch.Tensor
+        Combined mask, shape (n_batch, n_codebooks, n_frames)
+
+    """
+
+    mask_current_level = mask_current_codebook[:, :, None] & (~mask_random)
+
+    if leak:
+        # Allow leakage from "higher" codebooks inside masked span
+        higher = (~mask_next_codebooks[:, :, None]) & (~mask_random)
+    else:
+        # Strictly mask "higher" codebooks inside masked span
+        higher = ~mask_next_codebooks[:, :, None]
+
+    # Inside span, unmask everything except "higher" codebooks and masked
+    # positions in current codebook
+    mask = ~(higher | mask_current_level)
+
+    # Outside span, fully unmask
+    mask = mask | mask_span[:, None, :]
+
+    return mask
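
Editorial note: a minimal sketch combining the masking utilities above on a toy token buffer (all values illustrative):

import torch
from tria.model.mask import (
    combine_masks, cosine_schedule, get_current_codebook_mask,
    get_next_codebooks_mask, get_random_mask, get_span_mask,
)

tokens = torch.zeros(2, 9, 100, dtype=torch.long)  # (n_batch, n_codebooks, n_frames)
codebooks = torch.tensor([0, 0])                   # predict codebook 0 for both items
prop = float(cosine_schedule(torch.tensor(0.5)))   # masking ratio at mid-schedule

mask = combine_masks(
    mask_span=get_span_mask(tokens, 0.25, 0.75, seed=[0, 1]),
    mask_current_codebook=get_current_codebook_mask(tokens, codebooks),
    mask_next_codebooks=get_next_codebooks_mask(tokens, codebooks),
    mask_random=get_random_mask(tokens, prop=[prop, prop], seed=[0, 1]),
)
print(mask.shape)  # (2, 9, 100); True marks unmasked positions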
tria/model/sample.py ADDED
@@ -0,0 +1,168 @@
+import math
+import torch
+import torch.nn.functional as F
+
+from typing import Iterable, Union, Optional
+import numpy as np
+from numpy.random import RandomState
+
+from .mask import cosine_schedule, format_seed
+
+################################################################################
+# Utilities for sampling from trained TRIA model
+################################################################################
+
+
+def top_p_top_k(
+    logits: torch.Tensor,
+    top_p: float = None,
+    top_k: int = None,
+):
+    """
+    Adapted from `vampnet.modules.transformer.sample_from_logits` by Hugo Flores
+    Garcia. See: https://github.com/hugofloresgarcia/vampnet/
+
+    Parameters
+    ----------
+    logits : torch.Tensor
+        Shape (..., n_classes)
+    """
+    logits = logits.clone()
+    n_classes = logits.shape[-1]
+
+    # Mask logits outside top-k by setting to -inf
+    if top_k is not None and 0 < top_k < n_classes:
+        thresh = logits.topk(top_k, dim=-1).values[..., -1:]  # (..., 1)
+        logits[logits < thresh] = float("-inf")
+
+    # Mask logits outside top-p by setting to -inf
+    if top_p is not None and 0.0 < top_p < 1.0:
+        # Sort descending
+        sorted_logits, sorted_idx = logits.sort(dim=-1, descending=True)  # (..., n_classes)
+        sorted_probs = F.softmax(sorted_logits, dim=-1)  # (..., n_classes)
+        cumsum = sorted_probs.cumsum(dim=-1)  # (..., n_classes)
+
+        # Keep at least one logit
+        to_remove = cumsum > top_p
+        to_remove[..., 0] = False
+        remove_idx = torch.zeros_like(to_remove).scatter(-1, sorted_idx, to_remove)
+        logits[remove_idx] = float("-inf")
+
+    return logits
+
+
+def sample(
+    logits: torch.Tensor,
+    temp: float,
+    argmax: bool = False,
+):
+    """
+    Adapted from `vampnet.modules.transformer.sample_from_logits` by Hugo Flores
+    Garcia. See: https://github.com/hugofloresgarcia/vampnet/
+
+    Parameters
+    ----------
+    logits : torch.Tensor
+        Shape (..., n_classes)
+
+    Returns
+    -------
+    torch.Tensor
+        Sampled tokens, shape of `logits` with trailing `n_classes` dimension
+        removed
+    torch.Tensor
+        Probabilities of sampled tokens, shape of `logits` with trailing
+        `n_classes` dimension removed
+    """
+    if temp <= 0:
+        argmax = True
+        temp = 1.0
+
+    if argmax:
+        sampled = logits.argmax(dim=-1)
+        probs = F.softmax(
+            logits, dim=-1
+        ).take_along_dim(sampled.unsqueeze(-1), dim=-1).squeeze(-1)
+        return sampled, probs
+
+    probs = F.softmax(logits / temp, dim=-1)
+    flat = probs.reshape(-1, probs.shape[-1])
+    draws = torch.multinomial(flat, 1).squeeze(-1)
+    sampled = draws.view(*probs.shape[:-1])
+    chosen = probs.take_along_dim(sampled.unsqueeze(-1), dim=-1).squeeze(-1)
+    return sampled, chosen
+
+
+def mask_by_confidence(
+    probs: torch.Tensor,
+    n: torch.Tensor,
+    temp: float,
+    causal_bias: float,
+    state: Iterable[RandomState],
+    eligible: Optional[torch.Tensor] = None,
+):
+    """
+    Re-mask predicted tokens in a single codebook such that `n` previously-
+    masked tokens are left unmasked, using confidence (probability assigned to
+    tokens during sampling) to select which tokens remain. This confidence can
+    be mediated by random noise and a bias to unmask early (leftward) positions
+    first.
+
+    Parameters
+    ----------
+    probs : torch.Tensor
+        Probabilities assigned to sampled tokens, shape (n_batch, n_frames)
+    n : torch.Tensor
+        Target number of unmasked tokens, shape (n_batch,)
+    temp : float
+        Mask temperature, corresponding to randomness in unmasking process
+    causal_bias : float
+        Bias towards unmasking early (leftward) token positions first; typically
+        in (0, 1]. Note that large values of `temp` can effectively "wash out"
+        this causal bias
+    state : Iterable[RandomState]
+        Random seeds for reproducibility
+    eligible : torch.Tensor
+        Optional indicator for positions eligible for unmasking, shape
+        (n_batch, n_frames)
+    """
+
+    n_batch, n_frames = probs.shape
+    device = probs.device
+
+    if eligible is None:
+        eligible = torch.isfinite(probs) & (probs > 0)
+    else:
+        eligible = eligible.to(torch.bool)
+
+    # Masked token count and target
+    n_masked = eligible.long().sum(dim=-1)
+    n_unmask = (n_masked - n).clamp_min(0)
+
+    # Gumbel noise to introduce randomness into unmasking
+    u = torch.stack([
+        torch.from_numpy(s.uniform(1e-6, 1 - 1e-6, n_frames)) for s in state
+    ], dim=0).to(probs)
+    gumbel = -torch.log(-torch.log(u))
+
+    # Log-confidences + noise
+    s = probs.clamp_min(1e-12)
+    confs = torch.log(s) + temp * gumbel
+
+    # Optional causal bias in log-domain
+    if causal_bias > 0:
+        frame_relpos = (1 - (torch.arange(n_frames, device=device, dtype=confs.dtype) + 1) / n_frames).view(1, -1)
+        confs = confs + causal_bias * frame_relpos
+
+    # Only eligible positions can be chosen
+    confs_masked = confs.masked_fill(~eligible, float("-inf"))
+    sorted_vals, sorted_idx = confs_masked.sort(dim=-1, descending=True)
+    rank = torch.arange(n_frames, device=device).view(1, n_frames).expand_as(confs_masked)
+    k = n_unmask.view(n_batch, 1)
+    pick_sorted = rank < k
+    pick = torch.zeros_like(pick_sorted, dtype=torch.bool).scatter(-1, sorted_idx, pick_sorted)
+
+    # Return tokens_mask semantics (True = unmasked/keep)
+    mask = ~(eligible & (~pick))
+    return mask
+
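
Editorial note: a minimal sketch of the logit filtering and sampling path above, with random logits standing in for model output:

import torch
from tria.model.sample import sample, top_p_top_k

logits = torch.randn(2, 100, 1024)          # (n_batch, n_frames, codebook_size)
filtered = top_p_top_k(logits, top_p=0.95)  # nucleus filtering; -inf outside top-p
tokens, confs = sample(filtered, temp=1.0)  # draw tokens and their probabilities
print(tokens.shape, confs.shape)            # torch.Size([2, 100]) twice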
tria/model/tria.py ADDED
@@ -0,0 +1,344 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+
3
+ from typing import Optional, Union, Iterable
4
+
5
+ from ..nn.transformer import Transformer
6
+ from .mask import cosine_schedule, format_seed
7
+ from .sample import mask_by_confidence, top_p_top_k, sample
8
+
9
+ ################################################################################
10
+ # TRIA masked language model
11
+ ################################################################################
12
+
13
+
14
+ class TRIA(torch.nn.Module):
15
+
16
+ def __init__(
17
+ self,
18
+ codebook_size: int = 1024,
19
+ n_codebooks: int = 9,
20
+ n_feats: int = 2,
21
+ n_channels: int = 512,
22
+ n_heads: int = 8,
23
+ n_layers: int = 12,
24
+ mult: int = 4,
25
+ p_dropout: float = 0.0,
26
+ p_token_dropout: float = 0.0,
27
+ bias: bool = False,
28
+ max_len: int = 8192,
29
+ pos_enc: Optional[str] = "rope",
30
+ qk_norm: bool = True,
31
+ use_sdpa: bool = True,
32
+ interp: str = "nearest",
33
+ share_emb: bool = True,
34
+ ):
35
+ super().__init__()
36
+
37
+ assert interp in ["nearest", "linear"]
38
+
39
+ self.adapter = torch.nn.Linear(n_feats, n_channels, bias=bias)
40
+ self.in_proj = torch.nn.Linear(2 * n_channels, n_channels, bias=bias)
41
+
42
+ self.backbone = Transformer(
43
+ n_channels=n_channels,
44
+ n_heads=n_heads,
45
+ n_layers=n_layers,
46
+ mult=mult,
47
+ p_dropout=p_dropout,
48
+ bias=False,
49
+ max_len=max_len,
50
+ pos_enc_self_attn=pos_enc,
51
+ qk_norm=qk_norm,
52
+ use_sdpa=use_sdpa,
53
+ )
54
+
55
+ self.tokens_emb = torch.nn.Embedding(codebook_size * n_codebooks, n_channels)
56
+ self.head = torch.nn.Linear(n_channels, codebook_size * n_codebooks, bias=False) # No bias on head, to allow weight-sharing
57
+ if share_emb:
58
+ self.tokens_emb.weight = self.head.weight
59
+
60
+ # Masked token embedding
61
+ self.tokens_mask_emb = torch.nn.Parameter(torch.zeros(n_channels))
62
+
63
+ # Attributes
64
+ self.p_token_dropout = p_token_dropout
65
+ self.codebook_size = codebook_size
66
+ self.n_codebooks = n_codebooks
67
+ self.n_feats = n_feats
68
+ self.n_channels = n_channels
69
+ self.n_layers = n_layers
70
+ self.interp = interp
71
+
72
+ def forward(
73
+ self,
74
+ tokens: torch.Tensor,
75
+ feats: torch.Tensor,
76
+ codebook: torch.Tensor,
77
+ tokens_mask: torch.Tensor,
78
+ feats_mask: torch.Tensor,
79
+ ) -> torch.Tensor:
80
+ """
81
+ Parameters
82
+ ----------
83
+ tokens : torch.Tensor
84
+ Acoustic tokens, fully or partially masked; shape
85
+ (n_batch, n_codebooks, n_frames)
86
+ feats : torch.Tensor
87
+ Aligned features to guide generation; shape (n_batch, n_feats, n_frames)
88
+ codebook : torch.Tensor
89
+ Codebook in which to predict masked tokens; shape (n_batch,)
90
+ tokens_mask : torch.Tensor
91
+ Boolean tensor indicating umasked token positions (True where
92
+ unmasked, False where masked); shape (n_batch, n_codebooks, n_frames)
93
+ feats_mask : torch.Tensor
94
+ """
95
+
96
+ assert tokens.ndim == 3 # (n_batch, n_codebooks, n_frames)
97
+ assert feats.ndim == 3 # (n_batch, n_feats, n_frames')
98
+ assert tokens_mask.ndim == 3 # (n_batch, n_codebooks, n_frames)
99
+ assert feats_mask.ndim == 2 # (n_batch, n_frames')
100
+ assert tokens.shape[1] == self.n_codebooks
101
+
102
+ n_batch, n_codebooks, n_frames = tokens.shape
103
+
104
+ # Interpolate features and mask to tokens resulution
105
+ feats = torch.nn.functional.interpolate(feats, n_frames, mode=self.interp)
106
+ feats_mask = torch.nn.functional.interpolate(
107
+ feats_mask[:, None, :].float(), n_frames, mode="nearest")
108
+
109
+ # Adapt features
110
+ feats = self.adapter(feats.transpose(1, 2)) # (n_batch, n_frames, n_channels)
111
+
112
+ # Embed tokens
113
+ codebook_offsets = torch.arange(
114
+ n_codebooks, dtype=tokens.dtype, device=tokens.device
115
+ ).reshape(1, -1, 1) * self.codebook_size # (1, n_codebooks, 1)
116
+ tokens = tokens + codebook_offsets # (n_batch, n_codebooks, n_frames)
117
+ tokens_emb = self.tokens_emb(tokens) # (n_batch, n_codebooks, n_frames, n_channels)
118
+
119
+ # Zero masked token embeddings
120
+ tokens_emb = tokens_emb * tokens_mask.unsqueeze(-1).float()
121
+
122
+ # Apply learned embedding to masked token positions in current codebook
123
+ mask_pos = torch.arange(
124
+ n_codebooks, dtype=tokens.dtype, device=tokens.device
125
+ )[None, :] == codebook[:, None] # (n_batch, n_codebooks)
126
+ mask_pos = torch.logical_and(mask_pos.unsqueeze(-1), ~tokens_mask) # (n_batch, n_codebooks, n_frames)
127
+
128
+ tokens_emb = tokens_emb + (
129
+ mask_pos.unsqueeze(-1).float()
130
+ ) * self.tokens_mask_emb.reshape(1, 1, 1, -1) # (n_batch, n_codebooks, n_frames, n_channels)
131
+
132
+ # Token dropout (encourage attention to unmasked frames)
133
+ if self.training and self.p_token_dropout > 0.0:
134
+
135
+ # Apply dropout within masked frames and "below" current codebook
136
+ below = torch.arange(
137
+ n_codebooks, device=tokens.device
138
+ )[None, :, None] < codebook[:, None, None] # (n_batch, n_codebooks, 1)
139
+ eligible = below & feats_mask.bool() # (n_batch, n_codebooks, n_frames)
140
+ drop = (
141
+ torch.rand(
142
+ n_batch, 1, n_frames, 1, device=tokens.device
143
+ ) < self.p_token_dropout) & eligible[..., None]
144
+ tokens_emb = tokens_emb.masked_fill(drop, 0.0)
145
+
146
+ # Zero "ignored" features
147
+ feats = feats * feats_mask.transpose(1, 2)
148
+
149
+ # Sum embedded tokens across codebooks
150
+ tokens_emb = tokens_emb.sum(dim=1) # (n_batch, n_frames, n_channels)
151
+
152
+ # Sum embedded tokens and adapted features
153
+ x = torch.cat([feats, tokens_emb], dim=-1) # (n_batch, n_frames, 2 * n_channels)
154
+ x = self.in_proj(x) # (n_batch, n_frames, n_channels)
155
+
156
+ # Process with transformer
157
+ x = self.backbone(x=x) # (n_batch, n_frames, n_channels)
158
+
159
+ # Predict token logits
160
+ logits = self.head(x) # (n_batch, n_frames, n_codebooks * codebook_size)
161
+ logits = logits.reshape(
162
+ n_batch, n_frames, n_codebooks, self.codebook_size
163
+ ).permute(0, 2, 1, 3) # (n_batch, n_codebooks, n_frames, codebook_size)
164
+
165
+ return logits
166
+
167
+ @torch.inference_mode()
168
+ def inference(
169
+ self,
170
+ tokens: torch.Tensor,
171
+ feats: torch.Tensor,
172
+ tokens_mask: torch.Tensor,
173
+ feats_mask: torch.Tensor,
174
+ top_p: Union[float, Iterable[float]] = 1.0,
175
+ top_k: Union[int, Iterable[int]] = None,
176
+ temp: Union[float, Iterable[float]] = 1.0,
177
+ mask_temp: Union[float, Iterable[float]] = 10.5,
178
+ iterations: Union[int, Iterable[int]] = 8,
179
+ guidance_scale: Union[float, Iterable[float]] = None,
180
+ causal_bias: Union[float, Iterable[float]] = None,
181
+ seed: Union[int, Iterable[int]] = None,
182
+ ):
183
+
184
+ assert not self.training
185
+ device = next(self.parameters()).device
186
+
187
+ # Avoid overwriting
188
+ tokens = tokens.clone().to(device)
189
+ tokens_mask = tokens_mask.clone().to(device)
190
+
191
+ assert tokens.ndim == 3
192
+ n_batch, n_codebooks, n_frames = tokens.shape
193
+
194
+ assert feats.ndim == 3
195
+ _, n_feats, _ = feats.shape
196
+
197
+ assert n_codebooks == self.n_codebooks
198
+ assert n_feats == self.n_feats
199
+
200
+ # Interpolate features to token resolution
201
+ feats = torch.nn.functional.interpolate(
202
+ feats.to(device), n_frames, mode=self.interp,
203
+ )
204
+ feats_mask = torch.nn.functional.interpolate(
205
+ feats_mask.unsqueeze(1).float().to(device), n_frames, mode="nearest",
206
+ ).squeeze(1).to(feats_mask.dtype)
207
+
208
+ # Account for per-codebook args
209
+ def _to_codebooks(v):
210
+ if isinstance(v, torch.Tensor):
211
+ v = v.tolist()
212
+ elif isinstance(v, Iterable):
213
+ pass
214
+ else:
215
+ v = [v]
216
+
217
+ if len(v) == n_codebooks:
218
+ return v
219
+ elif len(v) == 1:
220
+ return v * n_codebooks
221
+ else:
222
+ raise ValueError(
223
+ f"Sampling parameters must be scalars, "
224
+ f"length-1 iterable, or length-n_codebooks ({n_codebooks})"
225
+ )
226
+
227
+ # Construct `n_codebooks` state lists of length `n_batch` each
228
+ seed = seed or 0
229
+ if not isinstance(seed, Iterable):
230
+ seed = [seed]
231
+ assert len(seed) in [1, n_batch]
232
+ seed = seed * (n_batch // len(seed))
233
+ state = [format_seed([s + 10007 * cb for s in seed]) for cb in range(n_codebooks)]
234
+
235
+ top_p, top_k = _to_codebooks(top_p), _to_codebooks(top_k)
236
+ temp, mask_temp = _to_codebooks(temp), _to_codebooks(mask_temp)
237
+ iterations = _to_codebooks(iterations)
238
+ guidance_scale = _to_codebooks(guidance_scale)
239
+ causal_bias = _to_codebooks(causal_bias)
240
+
241
+ # Track initial masked token counts
242
+ n_masked_init = (~tokens_mask).long().sum(dim=-1) # (n_batch, n_codebooks)
243
+
244
+ # Generate one codebook at a time
245
+ for codebook_idx, (
246
+ _state, _top_p, _top_k, _temp, _mask_temp,
247
+ _iterations, _guidance_scale, _causal_bias,
248
+ ) in enumerate(zip(
249
+ state, top_p, top_k, temp, mask_temp,
250
+ iterations, guidance_scale, causal_bias,
251
+ )):
252
+ _causal_bias = _causal_bias or 0.
253
+ assert 0. <= _causal_bias
254
+
255
+ _temp = _temp or 1.0
256
+ assert 0. < _temp
257
+
258
+ _mask_temp = _mask_temp or 0.0
259
+ assert 0. <= _mask_temp
260
+
261
+ _iterations = max(_iterations or 1, 1)
262
+
263
+ for _iter in range(_iterations):
264
+
265
+ # CFG on features by masking
266
+ if _guidance_scale:
267
+ tokens_cfg = torch.cat([tokens, tokens], dim=0)
268
+ tokens_mask_cfg = torch.cat([tokens_mask, tokens_mask], dim=0)
269
+
270
+ feats_cfg = torch.cat([feats, feats], dim=0)
271
+ feats_mask_cfg = torch.cat([feats_mask, torch.zeros_like(feats_mask)], dim=0)
272
+
273
+ logits_cond, logits_uncond = self.forward(
274
+ tokens_cfg,
275
+ feats_cfg,
276
+ torch.full(
277
+ (tokens_cfg.shape[0],),
278
+ codebook_idx,
279
+ dtype=torch.long,
280
+ device=device,
281
+ ),
282
+ tokens_mask_cfg,
283
+ feats_mask_cfg,
284
+ ).chunk(2, dim=0) # (n_batch, n_codebooks, n_frames, codebook_size) x2
285
+
286
+ logits = logits_uncond + _guidance_scale * (logits_cond - logits_uncond) # (n_batch, n_codebooks, n_frames, codebook_size)
287
+
288
+ else:
289
+ logits = self.forward(
290
+ tokens,
291
+ feats,
292
+ torch.full(
293
+ (tokens.shape[0],),
294
+ codebook_idx,
295
+ dtype=torch.long,
296
+ device=device,
297
+ ),
298
+ tokens_mask,
299
+ feats_mask,
300
+ ) # (n_batch, n_codebooks, n_frames, codebook_size)
301
+
302
+ # Truncate logits and sample tokens at masked positions
303
+ logits = top_p_top_k(
304
+ logits[:, codebook_idx:codebook_idx+1, ...], _top_p, _top_k
305
+ ) # (n_batch, 1, n_frames, codebook_size)
306
+ sampled, probs = sample(
307
+ logits, _temp, argmax=(_iter==_iterations-1),
308
+ ) # (n_batch, 1, n_frames) x2
309
+ write_idx = ~(tokens_mask[:, codebook_idx, :]) # (n_batch, n_frames)
310
+ tokens[:, codebook_idx, :][write_idx] = sampled[:, 0, :][write_idx]
311
+
312
+ # Compute implied generation timestep and corresponding target mask
313
+ # ratio
314
+ t = (_iter + 1) / _iterations
315
+ tgt_p_mask = cosine_schedule(torch.tensor([t]*n_batch, device=device)) # (n_batch,)
316
+
317
+ # Compute target and actual number of masked positions in current
318
+ # codebook
319
+ tgt_n_masked = torch.floor(tgt_p_mask * n_masked_init[:, codebook_idx]).long() # (n_batch,)
320
+ n_masked = write_idx.long().sum(dim=-1) # (n_batch,)
321
+
322
+ # Before the final iteration, unmask at least one token per step but
323
+ # always leave at least one token masked
324
+ if _iter < _iterations - 1:
325
+ tgt_n_masked = torch.minimum(n_masked - 1, tgt_n_masked).clamp_min(1)
326
+
327
+ # Select which tokens to unmask via confidence (assigned probability),
328
+ # mediated by causal bias and random noise
329
+ _probs = torch.full_like(probs[:, 0, :], torch.inf) # (n_batch, n_frames)
330
+ _probs[write_idx] = probs[:, 0, :][write_idx]
331
+ tokens_mask[:, codebook_idx, :] = mask_by_confidence(
332
+ probs=_probs,
333
+ n=tgt_n_masked,
334
+ temp=_mask_temp * (1 - t), # Mask temperature annealing
335
+ causal_bias=_causal_bias or 0.0,
336
+ state=_state,
337
+ eligible=write_idx,
338
+ )
339
+
340
+ # Re-apply span and codebook masks
341
+ tokens_mask = ~torch.logical_and(~tokens_mask, feats_mask.unsqueeze(1))
342
+ tokens_mask[:, :codebook_idx, :] = True
343
+
344
+ return tokens
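A minimal usage sketch for the iterative `inference` method above (illustrative only, not part of the commit). It assumes a trained instance `model` of this class with `n_codebooks=9`; every name and shape here is hypothetical.

import torch

# Hypothetical setup: 9 codebooks, 100 token frames, batch of 1
n_batch, n_codebooks, n_frames = 1, 9, 100

tokens = torch.zeros(n_batch, n_codebooks, n_frames, dtype=torch.long)
# False = masked (to be generated); mask everything to generate the full span
tokens_mask = torch.zeros(n_batch, n_codebooks, n_frames, dtype=torch.bool)
feats = torch.randn(n_batch, model.n_feats, n_frames)         # conditioning features
feats_mask = torch.ones(n_batch, n_frames, dtype=torch.bool)  # all frames conditioned

model.eval()
out = model.inference(
    tokens, feats, tokens_mask, feats_mask,
    top_p=0.95,           # scalar, broadcast to every codebook
    temp=[1.0] * 9,       # or one value per codebook
    iterations=8,         # refinement steps per codebook
    guidance_scale=3.0,   # classifier-free guidance over features
    seed=0,
)
print(out.shape)  # (1, 9, 100)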
tria/nn/__init__.py ADDED
File without changes
tria/nn/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (146 Bytes). View file
 
tria/nn/__pycache__/attention.cpython-310.pyc ADDED
Binary file (6.67 kB). View file
 
tria/nn/__pycache__/norm.cpython-310.pyc ADDED
Binary file (2.25 kB). View file
 
tria/nn/__pycache__/pos_enc.cpython-310.pyc ADDED
Binary file (2.87 kB). View file
 
tria/nn/__pycache__/transformer.cpython-310.pyc ADDED
Binary file (6.7 kB). View file
 
tria/nn/attention.py ADDED
@@ -0,0 +1,280 @@
1
+ import math
2
+ from typing import Optional
3
+ from typing import Tuple
4
+
5
+ import torch
6
+ import torch.nn as nn
7
+ import torch.nn.functional as F
8
+
9
+ from .norm import QKNorm
10
+ from .pos_enc import apply_rope
11
+ from .pos_enc import apply_sinusoidal
12
+ from .pos_enc import build_rope_cache
13
+ from .pos_enc import build_sinusoidal_cache
14
+
15
+ ################################################################################
16
+ # Multihead attention operation
17
+ ################################################################################
18
+
19
+
20
+ def ensure_masks(
21
+ n_batch: int,
22
+ seq_len_q: int,
23
+ seq_len_k: int,
24
+ device,
25
+ mask_q: Optional[torch.Tensor],
26
+ mask_k: Optional[torch.Tensor],
27
+ ) -> Tuple[torch.Tensor, torch.Tensor]:
28
+ """
29
+ Parameters
30
+ ----------
31
+ n_batch : int
32
+ seq_len_q : int
33
+ seq_len_k : int
34
+ mask_q : torch.Tensor
35
+ Shape (n_batch, seq_len_q)
36
+ mask_k : torch.Tensor
37
+ Shape (n_batch, seq_len_k)
38
+ """
39
+ if mask_q is None:
40
+ mask_q = torch.ones(n_batch, seq_len_q, dtype=torch.bool, device=device)
41
+ if mask_k is None:
42
+ mask_k = torch.ones(n_batch, seq_len_k, dtype=torch.bool, device=device)
43
+ return mask_q, mask_k
44
+
45
+
46
+ def make_attn_mask(
47
+ mask_q: torch.Tensor,
48
+ mask_k: torch.Tensor,
49
+ dtype,
50
+ ) -> torch.Tensor:
51
+ """
52
+ Use "key padding mask" convention to prevent empty rows in attention score
53
+ matrix (and thus softmax issues).
54
+
55
+ Parameters
56
+ ----------
57
+ mask_q : torch.Tensor
58
+ Query sequence mask, shape (n_batch, seq_len_q)
59
+ mask_k : torch.Tensor
60
+ Key sequence mask, shape (n_batch, seq_len_k)
61
+
62
+ Returns
63
+ -------
64
+ torch.Tensor
65
+ Additive attention mask for scaled_dot_product_attention, shape
66
+ (n_batch, 1, seq_len_q, seq_len_k)
67
+ """
68
+ n_batch, seq_len_q = mask_q.shape
69
+ seq_len_k = mask_k.shape[1]
70
+
71
+ exclude = (
72
+ (~mask_k)[:, None, :].expand(n_batch, seq_len_q, seq_len_k).unsqueeze(1)
73
+ ) # (n_batch, 1, seq_len_q, seq_len_k)
74
+ mask = exclude.to(dtype=dtype).masked_fill(exclude, float("-inf"))
75
+
76
+ return mask # (n_batch, 1, seq_len_q, seq_len_k)
77
+
78
+
79
+ def sdpa_with_fallback(
80
+ q: torch.Tensor,
81
+ k: torch.Tensor,
82
+ v: torch.Tensor,
83
+ attn_mask: Optional[torch.Tensor],
84
+ p_dropout: float,
85
+ training: bool,
86
+ use_sdpa: bool = True,
87
+ ) -> torch.Tensor:
88
+ """
89
+ Optionally use PyTorch scaled_dot_product_attention (SDPA), which picks
90
+ efficient attention implementations (e.g. flash attention) if available
91
+
92
+ Parameters
93
+ ----------
94
+ q : torch.Tensor
95
+ Query, shape (n_batch, n_heads, seq_len_q, head_channels)
96
+ k : torch.Tensor
97
+ Key, shape (n_batch, n_heads, seq_len_k, head_channels)
98
+ v : torch.Tensor
99
+ Value, shape (n_batch, n_heads, seq_len_k, head_channels)
100
+ attn_mask : torch.Tensor
101
+ Additive attention mask (0 or -inf), shape (n_batch, 1, seq_len_q, seq_len_k)
102
+
103
+ Returns
104
+ -------
105
+ torch.Tensor
106
+ Shape (n_batch, n_heads, seq_len_q, head_channels)
107
+ """
108
+
109
+ n_batch, n_heads, seq_len_q, head_channels = q.shape
110
+ seq_len_k = k.shape[2]
111
+
112
+ if use_sdpa and q.is_cuda:
113
+ if attn_mask is not None and (
114
+ (attn_mask.dtype == torch.bool and attn_mask.all())
115
+ or (attn_mask.dtype != torch.bool and not attn_mask.ne(0).any())
116
+ ):
117
+ attn_mask = None
118
+
119
+ out = F.scaled_dot_product_attention(
120
+ q,
121
+ k,
122
+ v,
123
+ attn_mask=attn_mask,
124
+ dropout_p=p_dropout if training else 0.0,
125
+ is_causal=False,
126
+ )
127
+ return out
128
+
129
+ # Fallback
130
+ scale = 1.0 / math.sqrt(head_channels)
131
+ scores = torch.einsum("bhtd,bhsd->bhts", q, k) * scale
132
+ if attn_mask is not None:
133
+ scores = scores + attn_mask # Additive mask
134
+ attn = scores.softmax(dim=-1)
135
+ if training and p_dropout > 0.0:
136
+ attn = F.dropout(attn, p=p_dropout)
137
+ out = torch.einsum("bhts,bhsd->bhtd", attn, v)
138
+ return out
139
+
140
+
141
+ class MultiheadAttention(nn.Module):
142
+ def __init__(
143
+ self,
144
+ n_channels: int,
145
+ n_heads: int,
146
+ p_dropout: float = 0.0,
147
+ bias: bool = True,
148
+ max_len: int = 8192,
149
+ pos_enc: Optional[str] = "rope",
150
+ qk_norm: bool = True,
151
+ use_sdpa: bool = True,
152
+ ):
153
+ super().__init__()
154
+ assert n_channels % n_heads == 0, "`n_channels` must be divisible by `n_heads`"
155
+ assert pos_enc in ("rope", "absolute", "none", None)
156
+
157
+ self.n_channels = n_channels
158
+ self.n_heads = n_heads
159
+ self.head_channels = n_channels // n_heads
160
+ self.p_dropout = p_dropout
161
+ self.pos_enc = pos_enc
162
+ self.max_len = max_len
163
+ self.use_sdpa = use_sdpa
164
+
165
+ self.q_proj = nn.Linear(n_channels, n_channels, bias=bias)
166
+ self.k_proj = nn.Linear(n_channels, n_channels, bias=bias)
167
+ self.v_proj = nn.Linear(n_channels, n_channels, bias=bias)
168
+ self.o_proj = nn.Linear(n_channels, n_channels, bias=bias)
169
+
170
+ self.o_dropout = nn.Dropout(p_dropout)
171
+
172
+ self.qk_norm = QKNorm(self.head_channels) if qk_norm else None
173
+ self.pos_cache = None
174
+
175
+ def _maybe_build_pos_cache(self, device, dtype):
176
+ if self.pos_enc in [None, "none"] or self.pos_cache is not None:
177
+ return
178
+ if self.pos_enc == "absolute":
179
+ self.pos_cache = build_sinusoidal_cache(
180
+ self.max_len, self.head_channels, device, dtype=torch.float32
181
+ )
182
+ elif self.pos_enc == "rope":
183
+ cos, sin = build_rope_cache(
184
+ self.max_len, self.head_channels, device, dtype=torch.float32
185
+ )
186
+ self.pos_cache = (cos, sin)
187
+
188
+ def forward(
189
+ self,
190
+ q: torch.Tensor,
191
+ k: torch.Tensor,
192
+ v: torch.Tensor,
193
+ mask_q: Optional[torch.Tensor] = None,
194
+ mask_k: Optional[torch.Tensor] = None,
195
+ attn_mask: Optional[torch.Tensor] = None,
196
+ ) -> torch.Tensor:
197
+ """
198
+ Parameters
199
+ ----------
200
+ q : torch.Tensor
201
+ Query, shape (n_batch, seq_len_q, n_channels)
202
+ k : torch.Tensor
203
+ Key, shape (n_batch, seq_len_k, n_channels)
204
+ v : torch.Tensor
205
+ Value, shape (n_batch, seq_len_k, n_channels)
206
+ mask_q : torch.Tensor
207
+ Boolean mask, `True` for valid positions; shape (n_batch, seq_len_q)
208
+ mask_k : torch.Tensor
209
+ Boolean mask, `True` for valid positions; shape (n_batch, seq_len_k)
210
+ attn_mask : torch.Tensor
211
+ Additive (0, -inf) mask; shape (n_batch, 1, seq_len_q, seq_len_k)
212
+ """
213
+
214
+ n_batch, seq_len_q, _ = q.shape
215
+ seq_len_k = k.shape[1]
216
+ device, dtype = q.device, q.dtype
217
+
218
+ # Projections (n_batch, seq_len, n_channels) -> (n_batch, n_heads, seq_len, head_channels)
219
+ q = (
220
+ self.q_proj(q)
221
+ .view(n_batch, seq_len_q, self.n_heads, self.head_channels)
222
+ .transpose(1, 2)
223
+ )
224
+ k = (
225
+ self.k_proj(k)
226
+ .view(n_batch, seq_len_k, self.n_heads, self.head_channels)
227
+ .transpose(1, 2)
228
+ )
229
+ v = (
230
+ self.v_proj(v)
231
+ .view(n_batch, seq_len_k, self.n_heads, self.head_channels)
232
+ .transpose(1, 2)
233
+ )
234
+
235
+ # Positional encoding
236
+ self._maybe_build_pos_cache(device=device, dtype=dtype)
237
+ if self.pos_enc == "absolute":
238
+ cache = self.pos_cache # (max_seq_len, head_channels)
239
+ q = apply_sinusoidal(q, cache)
240
+ k = apply_sinusoidal(k, cache)
241
+ elif self.pos_enc == "rope":
242
+ cos, sin = self.pos_cache # (max_seq_len, head_channels/2)
243
+ q = apply_rope(q, cos, sin)
244
+ k = apply_rope(k, cos, sin)
245
+
246
+ # QK-Norm
247
+ if self.qk_norm is not None:
248
+ q, k = self.qk_norm(q, k)
249
+
250
+ # Masks
251
+ mask_q, mask_k = ensure_masks(
252
+ n_batch, seq_len_q, seq_len_k, device, mask_q, mask_k
253
+ )
254
+ pad_mask = make_attn_mask(
255
+ mask_q, mask_k, dtype
256
+ ) # (n_batch, 1, seq_len_q, seq_len_k)
257
+
258
+ if attn_mask is not None:
259
+ pad_mask = pad_mask + attn_mask
260
+
261
+ # Attention
262
+ y = sdpa_with_fallback(
263
+ q,
264
+ k,
265
+ v,
266
+ attn_mask=pad_mask,
267
+ p_dropout=self.p_dropout,
268
+ training=self.training,
269
+ use_sdpa=self.use_sdpa,
270
+ ) # (n_batch, n_heads, seq_len_q, head_channels)
271
+
272
+ y = y.transpose(1, 2).contiguous().view(n_batch, seq_len_q, self.n_channels)
273
+ y = self.o_proj(y) # (n_batch, seq_len_q, n_channels)
274
+ y = self.o_dropout(y)
275
+
276
+ # Mask outputs
277
+ if mask_q is not None:
278
+ with torch.no_grad():
279
+ y.masked_fill_(~mask_q[:, :, None], 0.0)
280
+ return y
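A small self-contained check of the attention module above (illustrative, not part of the commit). On CPU the manual fallback path runs; padded key positions receive -inf scores and padded query rows are zeroed on output.

import torch
from tria.nn.attention import MultiheadAttention

attn = MultiheadAttention(n_channels=64, n_heads=4, pos_enc="rope")
attn.eval()

x = torch.randn(2, 10, 64)
mask = torch.tensor([[True] * 10, [True] * 6 + [False] * 4])  # second sequence padded

with torch.no_grad():
    y = attn(x, x, x, mask_q=mask, mask_k=mask)
print(y.shape)                      # (2, 10, 64)
print(y[1, 6:].abs().max().item())  # 0.0, padded outputs are zeroed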
tria/nn/norm.py ADDED
@@ -0,0 +1,53 @@
1
+ from typing import Tuple
2
+
3
+ import torch
4
+ import torch.nn as nn
5
+
6
+ ################################################################################
7
+ # Normalization layers
8
+ ################################################################################
9
+
10
+
11
+ class RMSNorm(nn.Module):
12
+ def __init__(self, n_channels: int, eps: float = 1e-6):
13
+ super().__init__()
14
+ self.eps = eps
15
+ self.weight = nn.Parameter(torch.ones(n_channels))
16
+
17
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
18
+ """
19
+ Normalize over final dimension
20
+ """
21
+ rms = x.pow(2).mean(dim=-1, keepdim=True).add(self.eps).rsqrt()
22
+ return self.weight * x * rms # Broadcast targets final dimension
23
+
24
+
25
+ class QKNorm(nn.Module):
26
+ """
27
+ RMS-normalize query and key across channel dimension with a learnable gain.
28
+ Applied per-head, per-position.
29
+ """
30
+
31
+ def __init__(self, head_channels: int, eps: float = 1e-6):
32
+ super().__init__()
33
+ self.eps = eps
34
+ self.g_q = nn.Parameter(torch.ones(head_channels))
35
+ self.g_k = nn.Parameter(torch.ones(head_channels))
36
+
37
+ def forward(
38
+ self, q: torch.Tensor, k: torch.Tensor
39
+ ) -> Tuple[torch.Tensor, torch.Tensor]:
40
+ """
41
+ Parameters
42
+ ----------
43
+ q : torch.Tensor
44
+ Query, shape (n_batch, n_heads, seq_len_q, head_channels)
45
+ k : torch.Tensor
46
+ Key, shape (n_batch, n_heads, seq_len_k, head_channels)
47
+ """
48
+
49
+ def _rmsnorm(x, g):
50
+ rms = x.pow(2).mean(dim=-1, keepdim=True).add(self.eps).rsqrt()
51
+ return x * rms * g # Broadcast targets final dimension
52
+
53
+ return _rmsnorm(q, self.g_q), _rmsnorm(k, self.g_k)
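A quick illustrative check (not part of the commit): with the default unit gain, RMSNorm rescales every position to unit root-mean-square.

import torch
from tria.nn.norm import RMSNorm

norm = RMSNorm(n_channels=16)
x = torch.randn(4, 10, 16) * 5.0
y = norm(x)

rms = y.pow(2).mean(dim=-1).sqrt()
print(rms.min().item(), rms.max().item())  # both close to 1.0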
tria/nn/pos_enc.py ADDED
@@ -0,0 +1,101 @@
1
+ import torch
2
+
3
+ ################################################################################
4
+ # Utilities for positional encoding
5
+ ################################################################################
6
+
7
+
8
+ def build_sinusoidal_cache(seq_len: int, n_channels: int, device, dtype):
9
+ """
10
+ Returns
11
+ -------
12
+ torch.Tensor
13
+ Cache, shape (seq_len, n_channels)
14
+ """
15
+ assert n_channels % 2 == 0
16
+ pos = torch.arange(seq_len, device=device, dtype=dtype).unsqueeze(1) # (seq_len, 1)
17
+ i = torch.arange(n_channels // 2, device=device, dtype=dtype).unsqueeze(
18
+ 0
19
+ ) # (1, n_channels/2)
20
+ inv_freq = 1.0 / (10000 ** (i / (n_channels // 2)))
21
+ ang = pos * inv_freq # (seq_len, n_channels/2)
22
+ emb = torch.cat([torch.sin(ang), torch.cos(ang)], dim=1) # (seq_len, n_channels)
23
+ return emb
24
+
25
+
26
+ def apply_sinusoidal(x: torch.Tensor, cache: torch.Tensor) -> torch.Tensor:
27
+ """
28
+ Parameters
29
+ ----------
30
+ x : torch.Tensor
31
+ Shape (n_batch, n_heads, seq_len, head_channels) or (n_batch, seq_len, n_channels)
32
+ cache: torch.Tensor
33
+ Shape (seq_len, n_channels)
34
+
35
+ Returns
36
+ -------
37
+ torch.Tensor
38
+ Shape (n_batch, n_heads, seq_len, head_channels) or (n_batch, seq_len, n_channels)
39
+ """
40
+ if x.ndim == 4:
41
+ n_batch, n_heads, seq_len, head_channels = x.shape
42
+ return x + cache.to(x.dtype)[None, None, :seq_len, :head_channels]
43
+ elif x.ndim == 3:
44
+ n_batch, seq_len, n_channels = x.shape
45
+ return x + cache.to(x.dtype)[None, :seq_len, :n_channels]
46
+ else:
47
+ raise ValueError(
48
+ f"Invalid input shape {tuple(x.shape)}; "
49
+ f"expected (n_batch, [n_heads], seq_len, n_channels)"
50
+ )
51
+
52
+
53
+ def build_rope_cache(
54
+ seq_len: int, n_channels: int, device, dtype, base: float = 10000.0
55
+ ):
56
+ """
57
+ Returns
58
+ -------
59
+ torch.Tensor, torch.Tensor
60
+ Caches, shape (seq_len, n_channels/2)
61
+ """
62
+ assert n_channels % 2 == 0
63
+ theta = 1.0 / (
64
+ base
65
+ ** (torch.arange(0, n_channels, 2, device=device, dtype=dtype) / n_channels)
66
+ )
67
+ seq = torch.arange(seq_len, device=device, dtype=dtype)
68
+ freqs = torch.einsum("t,d->td", seq, theta) # (seq_len, n_channels/2)
69
+ return torch.cos(freqs), torch.sin(
70
+ freqs
71
+ ) # (seq_len, n_channels/2), (seq_len, n_channels/2)
72
+
73
+
74
+ def apply_rope(
75
+ q_or_k: torch.Tensor, cos: torch.Tensor, sin: torch.Tensor
76
+ ) -> torch.Tensor:
77
+ """
78
+ Parameters
79
+ ----------
80
+ q_or_k : torch.Tensor
81
+ Shape (n_batch, n_heads, seq_len, head_channels) where head_channels even
82
+ cos : torch.Tensor
83
+ Shape (seq_len, head_channels/2)
84
+ sin : torch.Tensor
85
+ Shape (seq_len, head_channels/2)
86
+
87
+ Returns
88
+ -------
89
+ torch.Tensor
90
+ Shape (n_batch, n_heads, seq_len, head_channels)
91
+ """
92
+ n_batch, n_heads, seq_len, head_channels = q_or_k.shape
93
+ q = q_or_k.reshape(n_batch, n_heads, seq_len, head_channels // 2, 2)
94
+ q1, q2 = q[..., 0], q[..., 1] # (n_batch, n_heads, seq_len, head_channels / 2)
95
+ c = cos[:seq_len].to(q_or_k.dtype)[None, None, :, :]
96
+ s = sin[:seq_len].to(q_or_k.dtype)[None, None, :, :]
97
+ out1 = q1 * c - q2 * s
98
+ out2 = q1 * s + q2 * c
99
+ return torch.stack([out1, out2], dim=-1).reshape(
100
+ n_batch, n_heads, seq_len, head_channels
101
+ )
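An illustrative property check for the RoPE helpers above (not part of the commit): rotating each (even, odd) channel pair preserves per-position norms, so only relative phase between positions changes.

import torch
from tria.nn.pos_enc import build_rope_cache, apply_rope

cos, sin = build_rope_cache(seq_len=32, n_channels=8, device="cpu", dtype=torch.float32)
q = torch.randn(1, 2, 32, 8)  # (n_batch, n_heads, seq_len, head_channels)
q_rot = apply_rope(q, cos, sin)

# Rotation is norm-preserving at every position
print(torch.allclose(q.norm(dim=-1), q_rot.norm(dim=-1), atol=1e-5))  # True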
tria/nn/transformer.py ADDED
@@ -0,0 +1,259 @@
1
+ from typing import Optional
2
+ from typing import Tuple
3
+
4
+ import torch
5
+ import torch.nn as nn
6
+
7
+ from .attention import MultiheadAttention
8
+ from .norm import RMSNorm
9
+
10
+ ################################################################################
11
+ # Transformer
12
+ ################################################################################
13
+
14
+
15
+ def lengths_to_mask(
16
+ lengths: torch.Tensor, max_len: Optional[int] = None
17
+ ) -> torch.Tensor:
18
+ """
19
+ Parameters
20
+ ----------
21
+ lengths : torch.Tensor
22
+ Shape (n_batch,)
23
+ max_len : int
24
+ """
25
+ if max_len is None:
26
+ max_len = int(lengths.amax())
27
+ rng = torch.arange(max_len, device=lengths.device)
28
+ return rng[None, :] < lengths[:, None] # (n_batch, max_len)
29
+
30
+
31
+ class MLP(nn.Module):
32
+ def __init__(
33
+ self, n_channels: int, mult: int = 4, p_dropout: float = 0.1, bias: bool = True
34
+ ):
35
+ super().__init__()
36
+
37
+ self.mlp = nn.Sequential(
38
+ nn.Linear(n_channels, n_channels * mult, bias=bias),
39
+ nn.GELU(),
40
+ nn.Linear(n_channels * mult, n_channels, bias=bias),
41
+ nn.Dropout(p_dropout),
42
+ )
43
+
44
+ def forward(self, x: torch.Tensor):
45
+ assert x.ndim == 3 # (n_batch, seq_len, n_channels)
46
+ return self.mlp(x) # (n_batch, seq_len, n_channels)
47
+
48
+
49
+ class TransformerBlock(nn.Module):
50
+ def __init__(
51
+ self,
52
+ n_channels: int,
53
+ n_heads: int,
54
+ mult: int = 4,
55
+ p_dropout: float = 0.0,
56
+ bias: bool = True,
57
+ max_len: int = 8192,
58
+ pos_enc_self_attn: Optional[str] = "rope",
59
+ pos_enc_cross_attn: Optional[str] = "absolute",
60
+ qk_norm: bool = True,
61
+ use_sdpa: bool = True,
62
+ cross_attn: bool = False,
63
+ norm: str = "layer",
64
+ ):
65
+ super().__init__()
66
+
67
+ assert norm in ["layer", "rms", "none", None]
68
+ if norm == "rms":
69
+ norm_cls = RMSNorm
70
+ elif norm == "layer":
71
+ norm_cls = nn.LayerNorm
72
+ else:
73
+ norm_cls = nn.Identity
74
+
75
+ self.norm_1 = norm_cls(n_channels)
76
+ self.self_attn = MultiheadAttention(
77
+ n_channels=n_channels,
78
+ n_heads=n_heads,
79
+ p_dropout=p_dropout,
80
+ bias=bias,
81
+ max_len=max_len,
82
+ pos_enc=pos_enc_self_attn,
83
+ qk_norm=qk_norm,
84
+ use_sdpa=use_sdpa,
85
+ )
86
+
87
+ self.cross_attn = cross_attn
88
+ if cross_attn:
89
+ self.norm_x = norm_cls(n_channels)
90
+ self.norm_c = norm_cls(n_channels)
91
+ self.cross = MultiheadAttention(
92
+ n_channels=n_channels,
93
+ n_heads=n_heads,
94
+ p_dropout=p_dropout,
95
+ bias=bias,
96
+ max_len=max_len,
97
+ pos_enc=pos_enc_cross_attn,
98
+ qk_norm=qk_norm,
99
+ use_sdpa=use_sdpa,
100
+ )
101
+
102
+ self.norm_2 = norm_cls(n_channels)
103
+ self.mlp = MLP(n_channels=n_channels, mult=mult, p_dropout=p_dropout, bias=bias)
104
+
105
+ def forward(
106
+ self,
107
+ x: torch.Tensor,
108
+ c: Optional[torch.Tensor] = None,
109
+ mask_x: Optional[torch.Tensor] = None,
110
+ mask_c: Optional[torch.Tensor] = None,
111
+ ) -> torch.Tensor:
112
+ """
113
+ Parameters
114
+ ----------
115
+ x : torch.Tensor
116
+ Input sequence, shape (n_batch, seq_len_x, n_channels)
117
+ c : torch.Tensor
118
+ Conditioning sequence, shape (n_batch, seq_len_c, n_channels)
119
+ mask_x : torch.Tensor
120
+ Boolean mask indicating valid positions in input sequence, shape
121
+ (n_batch, seq_len_x)
122
+ mask_c : torch.Tensor
123
+ Boolean mask indicating valid positions in conditioning sequence,
124
+ shape (n_batch, seq_len_c)
125
+ """
126
+
127
+ if self.cross_attn:
128
+ assert c is not None
129
+
130
+ # Self-attention
131
+ y = self.norm_1(x)
132
+ y = self.self_attn(y, y, y, mask_q=mask_x, mask_k=mask_x)
133
+ x = x + y
134
+
135
+ # Cross-attention
136
+ if self.cross_attn and c is not None:
137
+ q = self.norm_x(x)
138
+ k = self.norm_c(c)
139
+ v = k
140
+ y = self.cross(q, k, v, mask_q=mask_x, mask_k=mask_c)
141
+ x = x + y
142
+
143
+ # MLP
144
+ y = self.norm_2(x)
145
+ y = self.mlp(y)
146
+ x = x + y
147
+
148
+ # Zero invalid outputs
149
+ if mask_x is not None:
150
+ with torch.no_grad():
151
+ x.masked_fill_(~mask_x[:, :, None], 0.0)
152
+
153
+ return x
154
+
155
+
156
+ class Transformer(nn.Module):
157
+ def __init__(
158
+ self,
159
+ n_channels: int,
160
+ n_heads: int,
161
+ n_layers: int,
162
+ mult: int,
163
+ p_dropout: float = 0.0,
164
+ bias: bool = True,
165
+ max_len: int = 8192,
166
+ pos_enc_self_attn: Optional[str] = "rope",
167
+ pos_enc_cross_attn: Optional[str] = "absolute",
168
+ qk_norm: bool = True,
169
+ use_sdpa: bool = True,
170
+ cross_attn: bool = False,
171
+ ):
172
+ super().__init__()
173
+ self.layers = nn.ModuleList(
174
+ [
175
+ TransformerBlock(
176
+ n_channels=n_channels,
177
+ n_heads=n_heads,
178
+ mult=mult,
179
+ p_dropout=p_dropout,
180
+ bias=bias,
181
+ max_len=max_len,
182
+ pos_enc_self_attn=pos_enc_self_attn,
183
+ pos_enc_cross_attn=pos_enc_cross_attn,
184
+ qk_norm=qk_norm,
185
+ use_sdpa=use_sdpa,
186
+ cross_attn=cross_attn,
187
+ )
188
+ for _ in range(n_layers)
189
+ ]
190
+ )
191
+ self.n_channels = n_channels
192
+ self.max_len = max_len
193
+ self.pos_enc_self_attn = pos_enc_self_attn
194
+ self.pos_enc_cross_attn = pos_enc_cross_attn
195
+
196
+ @torch.no_grad()
197
+ def _masks_from_lengths(
198
+ self,
199
+ mask_x: Optional[torch.Tensor],
200
+ mask_c: Optional[torch.Tensor],
201
+ lengths_x: Optional[torch.Tensor],
202
+ lengths_c: Optional[torch.Tensor],
203
+ seq_len_x: int,
204
+ seq_len_c: Optional[int],
205
+ device,
206
+ ) -> Tuple[Optional[torch.Tensor], Optional[torch.Tensor]]:
207
+ if mask_x is None and lengths_x is not None:
208
+ mask_x = lengths_to_mask(lengths_x.to(device), seq_len_x)
209
+ if mask_c is None and lengths_c is not None:
210
+ assert seq_len_c is not None
211
+ mask_c = lengths_to_mask(lengths_c.to(device), seq_len_c)
212
+ if mask_x is not None:
213
+ mask_x = mask_x.bool()
214
+ if mask_c is not None:
215
+ mask_c = mask_c.bool()
216
+ return mask_x, mask_c
217
+
218
+ def forward(
219
+ self,
220
+ x: torch.Tensor,
221
+ c: Optional[torch.Tensor] = None,
222
+ mask_x: Optional[torch.Tensor] = None,
223
+ mask_c: Optional[torch.Tensor] = None,
224
+ lengths_x: Optional[torch.Tensor] = None,
225
+ lengths_c: Optional[torch.Tensor] = None,
226
+ ) -> torch.Tensor:
227
+ """
228
+ Parameters
229
+ ----------
230
+ x : torch.Tensor
231
+ Input sequence, shape (n_batch, seq_len_x, n_channels)
232
+ c : torch.Tensor
233
+ Conditioning sequence, shape (n_batch, seq_len_c, n_channels)
234
+ mask_x : torch.Tensor
235
+ Boolean mask indicating valid positions in input sequence, shape
236
+ (n_batch, seq_len_x)
237
+ mask_c : torch.Tensor
238
+ Boolean mask indicating valid positions in conditioning sequence,
239
+ shape (n_batch, seq_len_c)
240
+ lengths_x : torch.Tensor
241
+ Valid lengths of input sequences, shape (n_batch,)
242
+ lengths_c : torch.Tensor
243
+ Valid lengths of conditioning sequences, shape (n_batch,)
244
+ """
245
+
246
+ assert x.ndim == 3
247
+ n_batch, seq_len_x, n_channels = x.shape
248
+ assert n_channels == self.n_channels
249
+ seq_len_c = c.shape[1] if c is not None else None
250
+
251
+ # Create valid masks from lengths if necessary
252
+ mask_x, mask_c = self._masks_from_lengths(
253
+ mask_x, mask_c, lengths_x, lengths_c, seq_len_x, seq_len_c, x.device
254
+ )
255
+
256
+ for block in self.layers:
257
+ x = block(x=x, c=c, mask_x=mask_x, mask_c=mask_c)
258
+
259
+ return x
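A minimal usage sketch for the Transformer wrapper (illustrative, not part of the commit), deriving the padding mask from `lengths_x`:

import torch
from tria.nn.transformer import Transformer

backbone = Transformer(n_channels=64, n_heads=4, n_layers=2, mult=4)
backbone.eval()

x = torch.randn(2, 12, 64)
lengths = torch.tensor([12, 7])  # second sequence has 5 padded frames

with torch.no_grad():
    y = backbone(x=x, lengths_x=lengths)
print(y.shape)               # (2, 12, 64)
print(y[1, 7:].abs().sum())  # tensor(0.), padded positions zeroed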
tria/pipelines/__init__.py ADDED
File without changes
tria/pipelines/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (153 Bytes). View file
 
tria/pipelines/tokenizer/__init__.py ADDED
@@ -0,0 +1,2 @@
1
+ from .tokenizer import Tokenizer
2
+ from .tokenizer import TokenSequence
tria/pipelines/tokenizer/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (242 Bytes). View file
 
tria/pipelines/tokenizer/__pycache__/tokenizer.cpython-310.pyc ADDED
Binary file (4.87 kB). View file
 
tria/pipelines/tokenizer/dac/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2023-present, Descript
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
tria/pipelines/tokenizer/dac/__init__.py ADDED
@@ -0,0 +1 @@
1
+ from .dac import DAC
tria/pipelines/tokenizer/dac/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (198 Bytes). View file
 
tria/pipelines/tokenizer/dac/__pycache__/dac.cpython-310.pyc ADDED
Binary file (5.77 kB). View file
 
tria/pipelines/tokenizer/dac/__pycache__/modules.cpython-310.pyc ADDED
Binary file (4.04 kB). View file
 
tria/pipelines/tokenizer/dac/dac.py ADDED
@@ -0,0 +1,203 @@
1
+ import math
2
+ from typing import List
3
+ from typing import Union
4
+
5
+ import numpy as np
6
+ import torch
7
+ from torch import nn
8
+
9
+ from .modules import Decoder
10
+ from .modules import Encoder
11
+ from .modules import init_weights
12
+ from .nn.quantize import ResidualVectorQuantize
13
+
14
+ ################################################################################
15
+ # Descript Audio Codec (DAC)
16
+ ################################################################################
17
+
18
+
19
+ class DAC(torch.nn.Module):
20
+ """
21
+ Descript Audio Codec (DAC) proposed by Kumar et al. in "High-Fidelity Audio
22
+ Compression with Improved RVQGAN" (2023). Code adapted from:
23
+ https://github.com/descriptinc/descript-audio-codec
24
+ """
25
+
26
+ def __init__(
27
+ self,
28
+ sample_rate: int = 44_100,
29
+ encoder_dim: int = 64,
30
+ encoder_rates: List[int] = (2, 4, 8, 8),
31
+ latent_dim: int = None,
32
+ decoder_dim: int = 1536,
33
+ decoder_rates: List[int] = (8, 8, 4, 2),
34
+ n_codebooks: int = 9,
35
+ codebook_size: int = 1024,
36
+ codebook_dim: Union[int, list] = 8,
37
+ quantizer_dropout: bool = False,
38
+ ):
39
+ super().__init__()
40
+
41
+ self.encoder_dim = encoder_dim
42
+ self.encoder_rates = encoder_rates
43
+ self.decoder_dim = decoder_dim
44
+ self.decoder_rates = decoder_rates
45
+ self.sample_rate = sample_rate
46
+
47
+ if latent_dim is None:
48
+ latent_dim = encoder_dim * (2 ** len(encoder_rates))
49
+ self.latent_dim = latent_dim
50
+
51
+ self.hop_length = np.prod(encoder_rates)
52
+
53
+ self.encoder = Encoder(encoder_dim, encoder_rates, latent_dim)
54
+
55
+ self.n_codebooks = n_codebooks
56
+ self.codebook_size = codebook_size
57
+ self.codebook_dim = codebook_dim
58
+ self.quantizer = ResidualVectorQuantize(
59
+ input_dim=latent_dim,
60
+ n_codebooks=n_codebooks,
61
+ codebook_size=codebook_size,
62
+ codebook_dim=codebook_dim,
63
+ quantizer_dropout=quantizer_dropout,
64
+ )
65
+
66
+ self.decoder = Decoder(
67
+ latent_dim,
68
+ decoder_dim,
69
+ decoder_rates,
70
+ )
71
+ self.apply(init_weights)
72
+
73
+ self.delay = self.get_delay()
74
+
75
+ # As long as we don't run chunked/segmented encoding and decoding,
76
+ # we can keep padding on
77
+ self.padding = True
78
+
79
+ @property
80
+ def padding(self):
81
+ if not hasattr(self, "_padding"):
82
+ self._padding = True
83
+ return self._padding
84
+
85
+ @padding.setter
86
+ def padding(self, value: bool):
87
+ assert isinstance(value, bool)
88
+
89
+ layers = [
90
+ l for l in self.modules() if isinstance(l, (nn.Conv1d, nn.ConvTranspose1d))
91
+ ]
92
+
93
+ for layer in layers:
94
+ if value:
95
+ if hasattr(layer, "original_padding"):
96
+ layer.padding = layer.original_padding
97
+ else:
98
+ layer.original_padding = layer.padding
99
+ layer.padding = tuple(0 for _ in range(len(layer.padding)))
100
+
101
+ self._padding = value
102
+
103
+ def get_delay(self):
104
+ # Any number works here; delay is invariant to input length
105
+ l_out = self.get_output_length(0)
106
+ L = l_out
107
+
108
+ layers = []
109
+ for layer in self.modules():
110
+ if isinstance(layer, (nn.Conv1d, nn.ConvTranspose1d)):
111
+ layers.append(layer)
112
+
113
+ for layer in reversed(layers):
114
+ d = layer.dilation[0]
115
+ k = layer.kernel_size[0]
116
+ s = layer.stride[0]
117
+
118
+ if isinstance(layer, nn.ConvTranspose1d):
119
+ L = ((L - d * (k - 1) - 1) / s) + 1
120
+ elif isinstance(layer, nn.Conv1d):
121
+ L = (L - 1) * s + d * (k - 1) + 1
122
+
123
+ L = math.ceil(L)
124
+
125
+ l_in = L
126
+
127
+ return (l_in - l_out) // 2
128
+
129
+ def get_output_length(self, input_length: int):
130
+ L = input_length
131
+ # Calculate output length
132
+ for layer in self.modules():
133
+ if isinstance(layer, (nn.Conv1d, nn.ConvTranspose1d)):
134
+ d = layer.dilation[0]
135
+ k = layer.kernel_size[0]
136
+ s = layer.stride[0]
137
+
138
+ if isinstance(layer, nn.Conv1d):
139
+ L = ((L - d * (k - 1) - 1) / s) + 1
140
+ elif isinstance(layer, nn.ConvTranspose1d):
141
+ L = (L - 1) * s + d * (k - 1) + 1
142
+
143
+ L = math.floor(L)
144
+ return L
145
+
146
+ def encode(
147
+ self,
148
+ audio_data: torch.Tensor,
149
+ ):
150
+ """
151
+ Encode given audio data and return quantized latent codes.
152
+
153
+ Parameters
154
+ ----------
155
+ audio_data : torch.Tensor
156
+ Audio data to encode, shape (batch_size, 1, n_samples)
157
+
158
+ Returns
159
+ -------
160
+ codes:
161
+ Codebook indices across all quantizer levels, shape
162
+ (n_batch, n_quantizers, n_frames)
163
+ z_O: torch.Tensor
164
+ Quantized output obtained by summing projected quantized residuals
165
+ (z_o) over all quantizer levels, shape (n_batch, latent_dim, n_frames)
166
+ z_i: torch.Tensor
167
+ Continuous representation of inputs projected into codebook space,
168
+ shape (n_batch, n_quantizers, codebook_dim, n_frames). Note that
169
+ each quantizer level represents a predicted residual.
170
+ z_q: torch.Tensor
171
+ Quantized representation of input in codebook space, shape
172
+ (n_batch, n_quantizers, codebook_dim, n_frames). Note that each
173
+ quantizer level represents a quantized predicted residual.
174
+ z_o: torch.Tensor
175
+ Continuous representation of quantized input, projected back into
176
+ latent space, shape (n_batch, n_quantizers, latent_dim, n_frames).
177
+ Note that each quantizer level represents a projected quantized
178
+ predicted residual.
179
+ """
180
+ # Predict continuous latents
181
+ z = self.encoder(audio_data) # (n_batch, latent_dim, n_frames)
182
+ return *self.quantizer(z, n_quantizers=None), z
183
+
184
+ def decode(
185
+ self,
186
+ codes: torch.Tensor,
187
+ ):
188
+ """
189
+ Decode given quantized latent codes and return audio data
190
+
191
+ Parameters
192
+ ----------
193
+ codes : torch.Tensor
194
+ Quantized latent codes, shape (n_batch, n_quantizers, n_frames)
195
+
196
+ Returns
197
+ -------
198
+ torch.Tensor
199
+ Decoded audio data, shape (n_batch, 1, n_samples)
200
+ """
201
+ z_O = self.quantizer.from_codes(codes) # (n_batch, latent_dim, n_frames)
202
+ recons = self.decoder(z_O) # (n_batch, 1, n_samples)
203
+ return recons
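A shape-level sketch of the codec round trip (illustrative, not part of the commit). With the default encoder strides (2, 4, 8, 8) the hop length is 512 samples, so one second at 44.1 kHz yields roughly 86 token frames.

import torch
from tria.pipelines.tokenizer.dac import DAC

dac = DAC()  # defaults: 44.1 kHz, 9 codebooks, hop length 512
dac.eval()

audio = torch.randn(1, 1, 44_100)  # one second of audio
with torch.no_grad():
    codes, *_ = dac.encode(audio)  # codebook indices first, per the docstring above
    recons = dac.decode(codes)

print(codes.shape)   # roughly (1, 9, 86)
print(recons.shape)  # (1, 1, ~44_100)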