andy hickl committed on
Commit ed99557 · 1 Parent(s): bfcda41

Uploaded from GitHub
LICENSE ADDED
@@ -0,0 +1,23 @@
+ Permission is hereby granted, free of charge, to any
+ person obtaining a copy of this software and associated
+ documentation files (the "Software"), to deal in the
+ Software without restriction, including without
+ limitation the rights to use, copy, modify, merge,
+ publish, distribute, sublicense, and/or sell copies of
+ the Software, and to permit persons to whom the Software
+ is furnished to do so, subject to the following
+ conditions:
+
+ The above copyright notice and this permission notice
+ shall be included in all copies or substantial portions
+ of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF
+ ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED
+ TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A
+ PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT
+ SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR
+ IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ DEALINGS IN THE SOFTWARE.
LICENSE.audiocraft ADDED
@@ -0,0 +1,21 @@
+ MIT License
+
+ Copyright (c) Meta Platforms, Inc. and affiliates.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in all
+ copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
MANIFEST.in ADDED
@@ -0,0 +1,5 @@
+ include LICENSE*
+ include *.md
+ include *.cfg
+ include requirements.txt
+ include moshi/py.typed
README.md ADDED
@@ -0,0 +1,151 @@
+ # Moshi - PyTorch
+
+ See the [top-level README.md][main_repo] for more information on Moshi.
+
+ [Moshi][moshi] is a speech-text foundation model and full-duplex spoken dialogue framework.
+ It uses [Mimi][moshi], a state-of-the-art streaming neural audio codec. Mimi operates at 12.5 Hz and compresses
+ 24 kHz audio down to 1.1 kbps in a fully streaming manner (latency of 80 ms, the frame size), yet outperforms existing non-streaming codecs.
+
+ This is the PyTorch implementation for Moshi and Mimi.
+
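The 1.1 kbps figure follows directly from the frame rate and the codebook configuration. As a quick sanity check (the 8 codebooks match the `mimi.set_num_codebooks(8)` call later in this README; the codebook size of 2048 entries is an assumption):

```python
import math

frame_rate = 12.5        # token frames per second
num_codebooks = 8        # active codebooks, as used with Moshi
cardinality = 2048       # assumed entries per codebook, i.e. 11 bits each

# bits/second = frames/second * codebooks/frame * bits/codebook
bits_per_second = frame_rate * num_codebooks * math.log2(cardinality)
print(bits_per_second / 1000, "kbps")  # 1.1 kbps
```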
+
+ ## Requirements
+
+ You will need at least Python 3.10. We keep a minimal set of dependencies for the current project.
+ It has been tested with PyTorch 2.2 and 2.4. If you need a specific CUDA version, please make sure
+ to have PyTorch properly installed before installing Moshi.
+
+ ```bash
+ pip install moshi  # moshi PyTorch, from PyPI
+ # Or the bleeding-edge version for Moshi
+ pip install -e "git+https://git@github.com/kyutai-labs/moshi#egg=moshi&subdirectory=moshi"
+ ```
+
+ While we hope that the present codebase will work on Windows, we do not provide official support for it.
+ At the moment, we do not support quantization for the PyTorch version, so you will need a GPU with a significant amount of memory (24 GB).
+
+
+ ## Usage
+
+ This package provides a streaming version of the audio tokenizer (Mimi) and of the language model (Moshi).
+
+ To run in interactive mode, you need to start a server which will
+ run the model; you can then use either the web UI or a command-line client.
+
+ Start the server with:
+ ```bash
+ python -m moshi.server [--gradio-tunnel]
+ ```
+
+ Then access the web UI at [localhost:8998](http://localhost:8998). If your GPU is on a remote machine
+ with no direct access, `--gradio-tunnel` will create a tunnel with a URL accessible from anywhere.
+ Keep in mind that this tunnel goes through the US and can add significant latency (up to 500 ms from Europe).
+ You can use `--gradio-tunnel-token` to set a fixed secret token and reuse the same address over time.
+ Alternatively, you might want to use SSH to redirect your connection.
+
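For instance, a local port forward makes the remote server reachable as if it ran locally (`user` and `remote-gpu-host` are placeholders):

```shell
# Forward local port 8998 to port 8998 on the machine running moshi.server.
ssh -N -L 8998:localhost:8998 user@remote-gpu-host
# Then open http://localhost:8998 in your local browser.
```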
+ You can use `--hf-repo` to select a different pretrained model by setting the proper Hugging Face repository.
+ See [the model list](https://github.com/kyutai-labs/moshi?tab=readme-ov-file#models) for a reference of the available models.
+
+ Accessing a server that is not localhost via http may cause issues with using
+ the microphone in the web UI (some browsers only allow this over https).
+
+ A local client is also available, as
+ ```bash
+ python -m moshi.client [--url URL_TO_GRADIO]
+ ```
+ Note, however, that unlike the web browser, this client is bare-bones: it does not perform any echo cancellation,
+ nor does it try to compensate for a growing lag by skipping frames.
+
+
+ ## API
+
+ You can use Mimi and Moshi programmatically as follows:
+ ```python
+ from huggingface_hub import hf_hub_download
+ import torch
+
+ from moshi.models import loaders, LMGen
+
+ mimi_weight = hf_hub_download(loaders.DEFAULT_REPO, loaders.MIMI_NAME)
+ mimi = loaders.get_mimi(mimi_weight, device='cpu')
+ mimi.set_num_codebooks(8)  # up to 32 for mimi, but limited to 8 for moshi.
+
+ wav = torch.randn(1, 1, 24000 * 10)  # should be [B, C=1, T]
+ with torch.no_grad():
+     codes = mimi.encode(wav)  # [B, K = 8, T]
+     decoded = mimi.decode(codes)
+
+     # Supports streaming too.
+     frame_size = int(mimi.sample_rate / mimi.frame_rate)
+     all_codes = []
+     with mimi.streaming(batch_size=1):
+         for offset in range(0, wav.shape[-1], frame_size):
+             frame = wav[:, :, offset: offset + frame_size]
+             codes = mimi.encode(frame)
+             assert codes.shape[-1] == 1, codes.shape
+             all_codes.append(codes)
+
+ # Now if you have a GPU around.
+ mimi.cuda()
+ moshi_weight = hf_hub_download(loaders.DEFAULT_REPO, loaders.MOSHI_NAME)
+ moshi = loaders.get_moshi_lm(moshi_weight, device='cuda')
+ lm_gen = LMGen(moshi, temp=0.8, temp_text=0.7)  # this handles sampling params etc.
+ out_wav_chunks = []
+ # Now we will stream over both Moshi I/O, and decode on the fly with Mimi.
+ with torch.no_grad(), lm_gen.streaming(1), mimi.streaming(1):
+     for idx, code in enumerate(all_codes):
+         tokens_out = lm_gen.step(code.cuda())
+         # tokens_out is [B, 1 + 8, 1]: tokens_out[:, 0] is the text token,
+         # tokens_out[:, 1:] are the audio tokens.
+         if tokens_out is not None:
+             wav_chunk = mimi.decode(tokens_out[:, 1:])
+             out_wav_chunks.append(wav_chunk)
+         print(idx, end='\r')
+ out_wav = torch.cat(out_wav_chunks, dim=-1)
+ ```
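The chunking arithmetic in the streaming loop can be checked without loading any model; the 24 kHz sample rate and 12.5 Hz frame rate come from the introduction:

```python
# Streaming chunk arithmetic for Mimi (no model needed; figures from the text).
sample_rate = 24_000   # Hz, input audio
frame_rate = 12.5      # Hz, one column of codes per frame
frame_size = int(sample_rate / frame_rate)   # samples fed per streaming step

num_samples = sample_rate * 10               # the 10-second example above
num_steps = num_samples // frame_size        # streaming iterations

print(frame_size, num_steps)  # 1920 125
```

So each `mimi.encode(frame)` call above consumes 1920 samples (80 ms) and produces exactly one timestep of codes.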
+
+ ## Development
+
+ If you wish to install from a clone of this repository, for instance to further develop Moshi, you can do the following:
+ ```bash
+ # From the current folder (e.g. `moshi/`)
+ pip install -e '.[dev]'
+ pre-commit install
+ ```
+
+ Once locally installed, Mimi can be tested with the following commands, from **the root** of the repository:
+ ```bash
+ wget https://github.com/metavoiceio/metavoice-src/raw/main/assets/bria.mp3
+ python scripts/mimi_streaming_test.py
+ ```
+
+ Similarly, Moshi can be tested (with a GPU) with
+ ```bash
+ python scripts/moshi_benchmark.py
+ ```
+
+
+ ## License
+
+ The present code is provided under the MIT license.
+ Note that parts of this code are based on [AudioCraft](https://github.com/facebookresearch/audiocraft), released under
+ the MIT license.
+
+ ## Citation
+
+ If you use either Mimi or Moshi, please cite the following paper:
+
+ ```
+ @techreport{kyutai2024moshi,
+     author = {Alexandre D\'efossez and Laurent Mazar\'e and Manu Orsini and Am\'elie Royer and
+               Patrick P\'erez and Herv\'e J\'egou and Edouard Grave and Neil Zeghidour},
+     title = {Moshi: a speech-text foundation model for real-time dialogue},
+     institution = {Kyutai},
+     year = {2024},
+     month = {September},
+     url = {http://kyutai.org/Moshi.pdf},
+ }
+ ```
+
+ [moshi]: https://kyutai.org/Moshi.pdf
+ [main_repo]: https://github.com/kyutai-labs/moshi
moshi/__init__.py ADDED
@@ -0,0 +1,18 @@
+ # Copyright (c) Kyutai, all rights reserved.
+ # This source code is licensed under the license found in the
+ # LICENSE file in the root directory of this source tree.
+
+ """
+ moshi is the inference codebase for Kyutai audio generation models.
+
+ The code has been adapted from Audiocraft, see LICENSE.audiocraft.
+ Copyright (c) Meta Platforms, Inc. and affiliates.
+ """
+
+ # flake8: noqa
+ from . import utils
+ from . import modules
+ from . import models
+ from . import quantization
+
+ __version__ = "0.1.0"
moshi/client.py ADDED
@@ -0,0 +1,196 @@
+ # Copyright (c) Kyutai, all rights reserved.
+ # This source code is licensed under the license found in the
+ # LICENSE file in the root directory of this source tree.
+ """Client for the Moshi server."""
+
+ import argparse
+ import asyncio
+ import queue
+ import sys
+
+ import aiohttp
+ import numpy as np
+ import sphn
+ import sounddevice as sd
+
+ from .client_utils import AnyPrinter, Printer, RawPrinter
+
+
+ class Connection:
+     def __init__(
+         self,
+         printer: AnyPrinter,
+         websocket: aiohttp.ClientWebSocketResponse,
+         sample_rate: float = 24000,
+         channels: int = 1,
+         frame_size: int = 1920,
+     ) -> None:
+         self.printer = printer
+         self.websocket = websocket
+         self.sample_rate = sample_rate
+         self.frame_size = frame_size
+         self.channels = channels
+
+         self._done = False
+         self._in_stream = sd.InputStream(
+             samplerate=sample_rate,
+             channels=channels,
+             blocksize=self.frame_size,
+             callback=self._on_audio_input,
+         )
+
+         self._out_stream = sd.OutputStream(
+             samplerate=sample_rate,
+             channels=channels,
+             blocksize=frame_size,
+             callback=self._on_audio_output,
+         )
+         self._opus_writer = sphn.OpusStreamWriter(sample_rate)
+         self._opus_reader = sphn.OpusStreamReader(sample_rate)
+         self._output_queue = queue.Queue()
+
+     async def _queue_loop(self) -> None:
+         while True:
+             if self._done:
+                 return
+             await asyncio.sleep(0.001)
+             msg = self._opus_writer.read_bytes()
+             if len(msg) > 0:
+                 try:
+                     await self.websocket.send_bytes(b"\x01" + msg)
+                 except Exception as e:
+                     print(e)
+                     self._lost_connection()
+                     return
+
+     async def _decoder_loop(self) -> None:
+         all_pcm_data = None
+         while True:
+             if self._done:
+                 return
+             await asyncio.sleep(0.001)
+             pcm = self._opus_reader.read_pcm()
+             if all_pcm_data is None:
+                 all_pcm_data = pcm
+             else:
+                 all_pcm_data = np.concatenate((all_pcm_data, pcm))
+             while all_pcm_data.shape[-1] >= self.frame_size:
+                 self._output_queue.put(all_pcm_data[: self.frame_size])
+                 all_pcm_data = np.array(all_pcm_data[self.frame_size :])
+
+     async def _recv_loop(self) -> None:
+         try:
+             async for message in self.websocket:
+                 if message.type == aiohttp.WSMsgType.CLOSED:
+                     self.printer.log("info", "Connection closed")
+                     break
+                 elif message.type == aiohttp.WSMsgType.ERROR:
+                     self.printer.log("error", f"{self.websocket.exception()}")
+                     break
+                 elif message.type != aiohttp.WSMsgType.BINARY:
+                     self.printer.log("error", f"received from server: {message.type}")
+                     continue
+                 message = message.data
+                 if not isinstance(message, bytes):
+                     self.printer.log(
+                         "warning", f"unsupported message type {type(message)}"
+                     )
+                     continue
+                 if len(message) == 0:
+                     self.printer.log("warning", "empty message")
+                     continue
+                 kind = message[0]
+                 if kind == 1:  # audio
+                     payload = message[1:]
+                     self._opus_reader.append_bytes(payload)
+                     self.printer.print_pending()
+                 elif kind == 2:  # text
+                     payload = message[1:]
+                     self.printer.print_token(payload.decode())
+                 else:
+                     self.printer.log("warning", f"unknown message kind {kind}")
+         except Exception as e:
+             print(e)
+             self._lost_connection()
+             return
+
+     def _lost_connection(self) -> None:
+         if not self._done:
+             self.printer.log("error", "Lost connection with the server!")
+             self._done = True
+
+     def _on_audio_input(self, in_data, frames, time_, status) -> None:
+         assert in_data.shape == (self.frame_size, self.channels), in_data.shape
+         self._opus_writer.append_pcm(in_data[:, 0])
+
+     def _on_audio_output(self, out_data, frames, time_, status) -> None:
+         assert out_data.shape == (self.frame_size, self.channels), out_data.shape
+         try:
+             pcm_data = self._output_queue.get(block=False)
+             # TODO: handle other shapes by using some form of fifo/ring buffer.
+             assert pcm_data.shape == (self.frame_size,), pcm_data.shape
+             out_data[:, 0] = pcm_data
+         except queue.Empty:
+             out_data.fill(0)
+             self.printer.print_lag()
+
+     async def run(self) -> None:
+         with self._in_stream, self._out_stream:
+             await asyncio.gather(
+                 self._recv_loop(), self._decoder_loop(), self._queue_loop()
+             )
+
+
+ async def run(printer: AnyPrinter, args):
+     if args.url is None:
+         proto = "ws"
+         if args.https:
+             proto += "s"
+         uri = f"{proto}://{args.host}:{args.port}/api/chat"
+     else:
+         proto = "wss"
+         if '://' in args.url:
+             proto, without_proto = args.url.split('://', 1)
+             if proto in ['ws', 'http']:
+                 proto = "ws"
+             elif proto in ['wss', 'https']:
+                 proto = "wss"
+             else:
+                 printer.log("error", f"The provided URL {args.url} seems to contain a protocol but it is unknown.")
+                 sys.exit(1)
+         else:
+             without_proto = args.url
+         uri = f"{proto}://{without_proto}/api/chat"
+
+     printer.log("info", f"Connecting to {uri}.")
+     async with aiohttp.ClientSession() as session:
+         async with session.ws_connect(uri) as ws:
+             printer.log("info", "connected!")
+             printer.print_header()
+             connection = Connection(printer, ws)
+             await connection.run()
+
+
+ def main():
+     parser = argparse.ArgumentParser("client_opus")
+     parser.add_argument("--host", default="localhost", type=str, help="Hostname to connect to.")
+     parser.add_argument("--port", default=8998, type=int, help="Port to connect to.")
+     parser.add_argument("--https", action='store_true',
+                         help="Set this flag for using a https connection.")
+     parser.add_argument("--url", type=str, help='Directly provides a URL, e.g. to a gradio tunnel.')
+     args = parser.parse_args()
+     printer: AnyPrinter
+
+     if sys.stdout.isatty():
+         printer = Printer()
+     else:
+         printer = RawPrinter()
+     try:
+         asyncio.run(run(printer, args))
+     except KeyboardInterrupt:
+         printer.log("warning", "Interrupting, exiting connection.")
+     printer.log("info", "All done!")
+
+
+ if __name__ == "__main__":
+     main()
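The binary framing used over the websocket in client.py is a single kind byte followed by the payload (1 = Opus audio, 2 = UTF-8 text, as seen in `_queue_loop` and `_recv_loop`). A minimal standalone sketch of that framing, independent of the client code:

```python
# Minimal sketch of the one-byte message framing used by the Moshi client:
# kind byte 1 carries an Opus audio payload, kind byte 2 a UTF-8 text payload.

AUDIO, TEXT = 1, 2

def frame(kind: int, payload: bytes) -> bytes:
    # Prepend the kind byte, mirroring b"\x01" + msg in Connection._queue_loop.
    return bytes([kind]) + payload

def parse(message: bytes) -> tuple[int, bytes]:
    # Split a received message back into (kind, payload), as _recv_loop does.
    if not message:
        raise ValueError("empty message")
    return message[0], message[1:]

kind, payload = parse(frame(TEXT, "hello".encode()))
print(kind, payload.decode())  # 2 hello
```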
moshi/client_utils.py ADDED
@@ -0,0 +1,211 @@
+ # Copyright (c) Kyutai, all rights reserved.
+ # This source code is licensed under the license found in the
+ # LICENSE file in the root directory of this source tree.
+ """Utilities for the command line client, in particular for handling interactions with the terminal.
+ """
+
+ from dataclasses import dataclass
+ import sys
+
+
+ def colorize(text, color):
+     code = f"\033[{color}m"
+     restore = "\033[0m"
+     return "".join([code, text, restore])
+
+
+ def make_log(level: str, msg: str) -> str:
+     if level == "warning":
+         prefix = colorize("[Warn]", "1;31")
+     elif level == "info":
+         prefix = colorize("[Info]", "1;34")
+     elif level == "error":
+         prefix = colorize("[Err ]", "1;31")
+     else:
+         raise ValueError(f"Unknown level {level}")
+     return prefix + " " + msg
+
+
+ class RawPrinter:
+     def __init__(self, stream=sys.stdout, err_stream=sys.stderr):
+         self.stream = stream
+         self.err_stream = err_stream
+
+     def print_header(self):
+         pass
+
+     def print_token(self, token: str):
+         self.stream.write(token)
+         self.stream.flush()
+
+     def log(self, level: str, msg: str):
+         print(f"{level.capitalize()}: {msg}", file=self.err_stream)
+
+     def print_lag(self):
+         self.err_stream.write(colorize(" [LAG]", "31"))
+         self.err_stream.flush()
+
+     def print_pending(self):
+         pass
+
+
+ @dataclass
+ class LineEntry:
+     msg: str
+     color: str | None = None
+
+     def render(self):
+         if self.color is None:
+             return self.msg
+         else:
+             return colorize(self.msg, self.color)
+
+     def __len__(self):
+         return len(self.msg)
+
+
+ class Line:
+     def __init__(self, stream):
+         self.stream = stream
+         self._line: list[LineEntry] = []
+         self._has_padding: bool = False
+         self._max_line_length = 0
+
+     def __bool__(self):
+         return bool(self._line)
+
+     def __len__(self):
+         return sum(len(entry) for entry in self._line)
+
+     def add(self, msg: str, color: str | None = None) -> int:
+         entry = LineEntry(msg, color)
+         return self._add(entry)
+
+     def _add(self, entry: LineEntry) -> int:
+         if self._has_padding:
+             self.erase(count=0)
+         self._line.append(entry)
+         self.stream.write(entry.render())
+         self._max_line_length = max(self._max_line_length, len(self))
+         return len(entry)
+
+     def erase(self, count: int = 1):
+         if count:
+             entries = list(self._line[:-count])
+         else:
+             entries = list(self._line)
+         self._line.clear()
+         self.stream.write("\r")
+         for entry in entries:
+             self._line.append(entry)
+             self.stream.write(entry.render())
+
+         self._has_padding = False
+
+     def newline(self):
+         missing = self._max_line_length - len(self)
+         if missing > 0:
+             self.stream.write(" " * missing)
+         self.stream.write("\n")
+         self._line.clear()
+         self._max_line_length = 0
+         self._has_padding = False
+
+     def flush(self):
+         missing = self._max_line_length - len(self)
+         if missing > 0:
+             self.stream.write(" " * missing)
+             self._has_padding = True
+         self.stream.flush()
+
+
+ class Printer:
+     def __init__(self, max_cols: int = 80, stream=sys.stdout, err_stream=sys.stderr):
+         self.max_cols = max_cols
+         self.line = Line(stream)
+         self.stream = stream
+         self.err_stream = err_stream
+         self._pending_count = 0
+         self._pending_printed = False
+
+     def print_header(self):
+         self.line.add(" " + "-" * (self.max_cols) + " ")
+         self.line.newline()
+         self.line.flush()
+         self.line.add("| ")
+
+     def _remove_pending(self) -> bool:
+         if self._pending_printed:
+             self._pending_printed = False
+             self.line.erase(1)
+             return True
+         return False
+
+     def print_token(self, token: str, color: str | None = None):
+         self._remove_pending()
+         remaining = self.max_cols - len(self.line)
+         if len(token) <= remaining:
+             self.line.add(token, color)
+         else:
+             end = " " * remaining + " |"
+             if token.startswith(" "):
+                 token = token.lstrip()
+                 self.line.add(end)
+                 self.line.newline()
+                 self.line.add("| ")
+                 self.line.add(token, color)
+             else:
+                 assert color is None
+                 erase_count = None
+                 cumulated = ""
+                 for idx, entry in enumerate(self.line._line[::-1]):
+                     if entry.color:
+                         # probably a LAG message
+                         erase_count = idx
+                         break
+                     if entry.msg.startswith(" "):
+                         erase_count = idx + 1
+                         cumulated = entry.msg + cumulated
+                         break
+                 if erase_count is not None:
+                     if erase_count > 0:
+                         self.line.erase(erase_count)
+                     remaining = self.max_cols - len(self.line)
+                     end = " " * remaining + " |"
+                     self.line.add(end)
+                     self.line.newline()
+                     self.line.add("| ")
+                     token = cumulated.lstrip() + token
+                     self.line.add(token)
+                 else:
+                     self.line.add(token[:remaining])
+                     self.line.add(" |")
+                     self.line.newline()
+                     self.line.add("| ")
+                     self.line.add(token[remaining:])
+         self.line.flush()
+
+     def log(self, level: str, msg: str):
+         msg = make_log(level, msg)
+         self._remove_pending()
+         if self.line:
+             self.line.newline()
+             self.line.flush()
+         print(msg, file=self.err_stream)
+         self.err_stream.flush()
+
+     def print_lag(self):
+         self.print_token(" [LAG]", "31")
+
+     def print_pending(self):
+         chars = ["|", "/", "-", "\\"]
+         count = int(self._pending_count / 5)
+         char = chars[count % len(chars)]
+         colors = ["32", "33", "31"]
+         self._remove_pending()
+         self.line.add(char, colors[count % len(colors)])
+         self._pending_printed = True
+         self._pending_count += 1
+
+
+ AnyPrinter = Printer | RawPrinter
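The `colorize` helper above simply wraps text in ANSI SGR escape sequences: `"\033[<code>m"` starts a style and `"\033[0m"` resets it. A standalone re-implementation showing the exact bytes produced:

```python
# Standalone sketch of the ANSI coloring used by client_utils.colorize:
# "\033[<code>m" begins an SGR style; "\033[0m" resets the terminal style.

def colorize(text: str, color: str) -> str:
    return f"\033[{color}m{text}\033[0m"

sample = colorize("[Warn]", "1;31")  # bold red, as used by make_log
print(repr(sample))  # '\x1b[1;31m[Warn]\x1b[0m'
```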
moshi/models/__init__.py ADDED
@@ -0,0 +1,14 @@
+ # Copyright (c) Kyutai, all rights reserved.
+ # This source code is licensed under the license found in the
+ # LICENSE file in the root directory of this source tree.
+ """
+ Models for Moshi: the Mimi compression model and the Moshi language model.
+ """
+
+ # flake8: noqa
+ from .compression import (
+     CompressionModel,
+     MimiModel,
+ )
+ from .lm import LMModel, LMGen
+ from .loaders import get_mimi, get_moshi_lm
moshi/models/compression.py ADDED
@@ -0,0 +1,474 @@
+ # Copyright (c) Kyutai, all rights reserved.
+ # This source code is licensed under the license found in the
+ # LICENSE file in the root directory of this source tree.
+
+ # Part of this file is adapted from encodec.py in https://github.com/facebookresearch/audiocraft
+ # released under the following license.
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
+ # All rights reserved.
+ #
+ # This source code is licensed under the license found in the
+ # LICENSE file in the root directory of this source tree.
+ """Compression models, or wrappers around existing models. In particular, provides the implementation
+ for Mimi. Also defines the main interface that a model must follow to be usable as an audio tokenizer.
+ """
+
+ from abc import abstractmethod
+ from contextlib import nullcontext
+ from dataclasses import dataclass
+ import logging
+ import typing as tp
+
+ import torch
+ from torch import nn
+
+
+ from ..quantization import (
+     QuantizedResult,
+     BaseQuantizer,
+     SplitResidualVectorQuantizer,
+     ResidualVectorQuantizer,
+ )
+ from ..modules.resample import ConvDownsample1d, ConvTrUpsample1d
+ from ..modules.streaming import StreamingModule, State
+ from ..utils.compile import no_compile, CUDAGraphed
+
+
+ logger = logging.getLogger()
+
+
+ class CompressionModel(StreamingModule[State]):
+     """Base API for all compression models that aim at being used as audio tokenizers
+     with a language model.
+     """
+
+     @abstractmethod
+     def forward(self, x: torch.Tensor) -> QuantizedResult: ...
+
+     @abstractmethod
+     def encode(self, x: torch.Tensor) -> torch.Tensor:
+         """See `MimiModel.encode`."""
+         ...
+
+     @abstractmethod
+     def decode(self, codes: torch.Tensor) -> torch.Tensor:
+         """See `MimiModel.decode`."""
+         ...
+
+     @abstractmethod
+     def decode_latent(self, codes: torch.Tensor) -> torch.Tensor:
+         """Decode from the discrete codes to continuous latent space."""
+         ...
+
+     @property
+     @abstractmethod
+     def channels(self) -> int: ...
+
+     @property
+     @abstractmethod
+     def frame_rate(self) -> float: ...
+
+     @property
+     @abstractmethod
+     def sample_rate(self) -> int: ...
+
+     @property
+     @abstractmethod
+     def cardinality(self) -> int: ...
+
+     @property
+     @abstractmethod
+     def num_codebooks(self) -> int: ...
+
+     @property
+     @abstractmethod
+     def total_codebooks(self) -> int: ...
+
+     @abstractmethod
+     def set_num_codebooks(self, n: int):
+         """Set the active number of codebooks used by the quantizer."""
+         ...
+
+
+ @dataclass
+ class _MimiState:
+     graphed_tr_enc: CUDAGraphed | None
+     graphed_tr_dec: CUDAGraphed | None
+
+     def reset(self):
+         pass
+
+
+ class MimiModel(CompressionModel[_MimiState]):
+     """Mimi model operating on the raw waveform.
+
+     Args:
+         encoder (nn.Module): Encoder network.
+         decoder (nn.Module): Decoder network.
+         quantizer (qt.BaseQuantizer): Quantizer network.
+         frame_rate (float): Final frame rate of the quantized representation.
+         encoder_frame_rate (float): Frame rate of the encoder model. Note that if `frame_rate != encoder_frame_rate`,
+             the latent will be resampled linearly to match the desired `frame_rate` before and after quantization.
+         sample_rate (int): Audio sample rate.
+         channels (int): Number of audio channels.
+         causal (bool): Whether to use a causal version of the model.
+         encoder_transformer (nn.Module or None): Optional transformer for the encoder.
+         decoder_transformer (nn.Module or None): Optional transformer for the decoder.
+         resample_method (str): Method to use for resampling the latent space before the quantizer.
+         upsample_channel_wise_bug (bool): Controls whether the upsampling is channel wise.
+             Defaults to True to reproduce a bug in the original implementation.
+         freeze_encoder (bool): Whether to freeze the encoder weights.
+         freeze_quantizer (bool): Whether to freeze the quantizer weights.
+         freeze_quantizer_level (int): If positive, freeze the quantizer up to this level.
+         torch_compile_encoder_decoder (bool): If True, uses torch.compile on the encoder / decoder.
+             Deactivated by default for training as this is incompatible at the moment with weight norm.
+             See https://github.com/pytorch/pytorch/issues/121902.
+             Also this seems to work well with PyTorch 2.2.0, but completely fails with 2.4.0.
+     """
+
+     def __init__(
+         self,
+         encoder: nn.Module,
+         decoder: nn.Module,
+         quantizer: BaseQuantizer,
+         frame_rate: float,
+         encoder_frame_rate: float,
+         sample_rate: int,
+         channels: int,
+         causal: bool = False,
+         encoder_transformer: tp.Optional[nn.Module] = None,
+         decoder_transformer: tp.Optional[nn.Module] = None,
+         resample_method: str = "interpolate",
+         upsample_channel_wise_bug: bool = True,
+         freeze_encoder: bool = False,
+         freeze_quantizer: bool = False,
+         freeze_quantizer_level: int = -1,
+         torch_compile_encoder_decoder: bool = False,
+     ):
+         super().__init__()
+         self.encoder = encoder
+         self.decoder = decoder
+         self.encoder_transformer = encoder_transformer
+         self.decoder_transformer = decoder_transformer
+         self.quantizer = quantizer
+         self._frame_rate = frame_rate
+         self._sample_rate = sample_rate
+         self._channels = channels
+         self.encoder_frame_rate = encoder_frame_rate
+         self.torch_compile_encoder_decoder = torch_compile_encoder_decoder
+
+         if freeze_encoder:
+             for p in self.encoder.parameters():
+                 p.requires_grad = False
+             if self.encoder_transformer is not None:
+                 for p in self.encoder_transformer.parameters():
+                     p.requires_grad = False
+             for name, p in self.quantizer.named_parameters():
+                 if name.endswith("input_proj.weight"):
+                     p.requires_grad = False
+         if freeze_quantizer:
+             self.quantizer.ema_frozen_(True)
+         self.freeze_quantizer = freeze_quantizer
+         self.freeze_quantizer_level = (
+             freeze_quantizer_level
+             if freeze_quantizer_level > 0
+             else self.quantizer.num_codebooks
+         )
+
+         # We will need the dimension for the resampling. In general the encoder will be a SeanetEncoder
+         # which exposes a `dimension` attribute.
+         dimension = encoder.dimension
+         assert isinstance(
+             dimension, int
+         ), f"Dimension should be int, got {dimension} of type {type(dimension)}."
+         self.dimension = dimension
+
+         assert resample_method in [
+             "interpolate",
+             "conv",
+             "avg_pool",
+         ], f"Invalid resample_method {resample_method}"
+         self.resample_method = resample_method
+         if encoder_frame_rate != frame_rate:
+             assert not (
+                 causal and resample_method == "interpolate"
+             ), "Cannot interpolate with causal model."
+             if resample_method in ["conv", "avg_pool"]:
+                 assert (
+                     self.encoder_frame_rate > self.frame_rate
+                 ), "Cannot upsample with conv."
+                 downsample_stride = self.encoder_frame_rate / self.frame_rate
+                 assert downsample_stride == int(
+                     downsample_stride
+                 ), f"Only integer strides are supported, got {downsample_stride}"
+                 learnt = resample_method == "conv"
+                 self.downsample = ConvDownsample1d(
+                     int(downsample_stride),
+                     dimension=dimension,
+                     learnt=learnt,
+                     causal=causal,
+                 )
+                 if freeze_encoder:
+                     for p in self.downsample.parameters():
+                         p.requires_grad = False
+                 self.upsample = ConvTrUpsample1d(
+                     int(downsample_stride),
+                     dimension=dimension,
+                     learnt=learnt,
+                     causal=causal,
+                     channel_wise=upsample_channel_wise_bug,
+                 )
+
+     def _init_streaming_state(self, batch_size: int) -> _MimiState:
+         device = next(self.parameters()).device
+         disable = device.type != 'cuda'
+         graphed_tr_dec = None
+         graphed_tr_enc = None
+         if self.encoder_transformer is not None:
+             graphed_tr_enc = CUDAGraphed(self.encoder_transformer, disable=disable)
+         if self.decoder_transformer is not None:
+             graphed_tr_dec = CUDAGraphed(self.decoder_transformer, disable=disable)
+         return _MimiState(graphed_tr_enc, graphed_tr_dec)
+
+     @property
+     def channels(self) -> int:
+         return self._channels
+
+     @property
+     def frame_rate(self) -> float:
+         return self._frame_rate
+
+     @property
+     def sample_rate(self) -> int:
+         return self._sample_rate
+
+     @property
+     def total_codebooks(self):
+         """Total number of quantizer codebooks available."""
+         return self.quantizer.total_codebooks
+
+     @property
+     def num_codebooks(self):
+         """Active number of codebooks used by the quantizer."""
+         return self.quantizer.num_codebooks
+
+     def set_num_codebooks(self, n: int):
+         """Set the active number of codebooks used by the quantizer."""
+         self.quantizer.set_num_codebooks(n)
+
+     @property
+     def cardinality(self):
+         """Cardinality of each codebook."""
+         return self.quantizer.cardinality
+
+     def _to_framerate(self, x: torch.Tensor):
+         # Convert from the encoder frame rate to the overall framerate.
+         _, _, length = x.shape
+         frame_rate = self.encoder_frame_rate
268
+ new_frame_rate = self.frame_rate
269
+ if frame_rate == new_frame_rate:
270
+ return x
271
+ if self.resample_method == "interpolate":
272
+ target_length = int(length * new_frame_rate / frame_rate)
273
+ return nn.functional.interpolate(x, size=target_length, mode="linear")
274
+ else:
275
+ return self.downsample(x)
276
+
277
+ def _to_encoder_framerate(self, x: torch.Tensor):
278
+ # Convert from overall framerate to the encoder frame rate.
279
+ _, _, length = x.shape
280
+ frame_rate = self.encoder_frame_rate
281
+ new_frame_rate = self.frame_rate
282
+ if frame_rate == new_frame_rate:
283
+ return x
284
+ if self.resample_method == "interpolate":
285
+ target_length = int(length * new_frame_rate / frame_rate)
286
+ return nn.functional.interpolate(x, size=target_length, mode="linear")
287
+ else:
288
+ return self.upsample(x)
289
+
290
+ @property
291
+ def _context_for_encoder_decoder(self):
292
+ if self.torch_compile_encoder_decoder:
293
+ return nullcontext()
294
+ else:
295
+ return no_compile()
296
+
297
+ def forward(self, x: torch.Tensor) -> QuantizedResult:
298
+ assert x.dim() == 3
299
+ length = x.shape[-1]
300
+ extra_metrics: tp.Dict[str, torch.Tensor] = {}
301
+
302
+ if self.freeze_quantizer:
303
+ if isinstance(self.quantizer, SplitResidualVectorQuantizer):
304
+ self.quantizer.rvq_first.eval()
305
+ for i in range(
306
+ self.freeze_quantizer_level - self.quantizer.n_q_semantic
307
+ ):
308
+ self.quantizer.rvq_rest.vq.layers[i].eval()
309
+ elif isinstance(self.quantizer, ResidualVectorQuantizer):
310
+ for i in range(self.freeze_quantizer_level):
311
+ self.quantizer.vq.layers[i].eval()
312
+ else:
313
+ raise ValueError(f"Unsupported quantizer type {type(self.quantizer)}")
314
+
315
+ with self._context_for_encoder_decoder:
316
+ emb = self.encoder(x)
317
+ if self.encoder_transformer is not None:
318
+ (emb,) = self.encoder_transformer(emb)
319
+ emb = self._to_framerate(emb)
320
+ expected_length = self.frame_rate * length / self.sample_rate
321
+ # Checking that we have the proper length given the advertised frame rate.
322
+ assert abs(emb.shape[-1] - expected_length) < 1, (
323
+ emb.shape[-1],
324
+ expected_length,
325
+ )
326
+
327
+ q_res = self.quantizer(emb, self.frame_rate)
328
+ emb = q_res.x
329
+ emb = self._to_encoder_framerate(emb)
330
+ if self.decoder_transformer is not None:
331
+ (emb,) = self.decoder_transformer(emb)
332
+
333
+ with self._context_for_encoder_decoder:
334
+ out = self.decoder(emb)
335
+
336
+ # remove extra padding added by the encoder and decoder
337
+ assert out.shape[-1] >= length, (out.shape[-1], length)
338
+ out = out[..., :length]
339
+
340
+ q_res.x = out
341
+ q_res.metrics.update(extra_metrics)
342
+ return q_res
343
+
344
+ def _encode_to_unquantized_latent(self, x: torch.Tensor) -> torch.Tensor:
345
+ """Projects a batch of waveforms to unquantized latent space.
346
+
347
+ Args:
348
+ x (torch.Tensor): Float tensor of shape [B, C, T].
349
+
350
+ Returns:
351
+ Unquantized embeddings.
352
+ """
353
+ assert (
354
+ x.dim() == 3
355
+ ), f"CompressionModel._encode_to_unquantized_latent expects audio of shape [B, C, T] but got {x.shape}"
356
+ state = self._streaming_state
357
+ with self._context_for_encoder_decoder:
358
+ emb = self.encoder(x)
359
+ if self.encoder_transformer is not None:
360
+ if state is None:
361
+ (emb,) = self.encoder_transformer(emb)
362
+ else:
363
+ assert state.graphed_tr_enc is not None
364
+ (emb,) = state.graphed_tr_enc(emb)
365
+ emb = self._to_framerate(emb)
366
+ return emb
367
+
368
+ def encode(self, x: torch.Tensor) -> torch.Tensor:
369
+ """Encode the given input tensor to quantized representation.
370
+
371
+ Args:
372
+ x (torch.Tensor): Float tensor of shape [B, C, T]
373
+
374
+ Returns:
375
+ codes (torch.Tensor): an int tensor of shape [B, K, T]
376
+ with K the number of codebooks used and T the timestep.
377
+ """
378
+ emb = self._encode_to_unquantized_latent(x)
379
+ codes = self.quantizer.encode(emb)
380
+ return codes
381
+
382
+ def encode_to_latent(self, x: torch.Tensor, quantize: bool = True) -> torch.Tensor:
383
+ """Projects a batch of waveforms to latent space.
384
+
385
+ Args:
386
+ x (torch.Tensor): Float tensor of shape [B, C, T].
387
+
388
+ Returns:
389
+ Embeddings, either quantized or not.
390
+ """
391
+ emb = self._encode_to_unquantized_latent(x)
392
+ if not quantize:
393
+ return emb
394
+ else:
395
+ codes = self.quantizer.encode(emb)
396
+ return self.decode_latent(codes)
397
+
398
+ def decode(self, codes: torch.Tensor):
399
+ """Decode the given codes to a reconstructed representation.
400
+
401
+ Args:
402
+ codes (torch.Tensor): Int tensor of shape [B, K, T]
403
+
404
+ Returns:
405
+ out (torch.Tensor): Float tensor of shape [B, C, T], the reconstructed audio.
406
+ """
407
+ state = self._streaming_state
408
+ emb = self.decode_latent(codes)
409
+ emb = self._to_encoder_framerate(emb)
410
+ if self.decoder_transformer is not None:
411
+ if state is None:
412
+ (emb,) = self.decoder_transformer(emb)
413
+ else:
414
+ assert state.graphed_tr_dec is not None
415
+ (emb,) = state.graphed_tr_dec(emb)
416
+ with self._context_for_encoder_decoder:
417
+ out = self.decoder(emb)
418
+ # out contains extra padding added by the encoder and decoder
419
+ return out
420
+
421
+ def decode_latent(self, codes: torch.Tensor) -> torch.Tensor:
422
+ """Decode from the discrete codes to continuous latent space."""
423
+ return self.quantizer.decode(codes)
424
+
425
+
426
+ class WrapperCompressionModel(CompressionModel[State]):
427
+ """Base API for CompressionModel wrappers that do not depend on external frameworks."""
428
+
429
+ def __init__(self, model: CompressionModel):
430
+ super().__init__()
431
+ self.model = model
432
+
433
+ def forward(self, x: torch.Tensor) -> QuantizedResult:
434
+ return self.model.forward(x)
435
+
436
+ def encode(self, x: torch.Tensor) -> torch.Tensor:
437
+ return self.model.encode(x)
438
+
439
+ def decode(self, codes: torch.Tensor) -> torch.Tensor:
440
+ return self.model.decode(codes)
441
+
442
+ def decode_latent(self, codes: torch.Tensor) -> torch.Tensor:
443
+ return self.model.decode_latent(codes)
444
+
445
+ def set_num_codebooks(self, n: int):
446
+ self.model.set_num_codebooks(n)
447
+
448
+ @property
449
+ def quantizer(self):
450
+ return self.model.quantizer
451
+
452
+ @property
453
+ def channels(self) -> int:
454
+ return self.model.channels
455
+
456
+ @property
457
+ def frame_rate(self) -> float:
458
+ return self.model.frame_rate
459
+
460
+ @property
461
+ def sample_rate(self) -> int:
462
+ return self.model.sample_rate
463
+
464
+ @property
465
+ def cardinality(self) -> int:
466
+ return self.model.cardinality
467
+
468
+ @property
469
+ def num_codebooks(self) -> int:
470
+ return self.model.num_codebooks
471
+
472
+ @property
473
+ def total_codebooks(self) -> int:
474
+ return self.model.total_codebooks
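`MimiModel` resamples the encoder output from `encoder_frame_rate` down to the overall `frame_rate`, and the constructor asserts that the ratio between the two is an integer stride. A minimal stdlib sketch of that arithmetic, assuming the 24 kHz sample rate and SEANet ratios `[8, 6, 5, 4]` that appear in `loaders.py` later in this commit:

```python
# Sketch of the stride check in MimiModel.__init__ (illustrative values taken
# from loaders.py in this commit; no torch needed for the arithmetic itself).
SAMPLE_RATE = 24000
FRAME_RATE = 12.5  # overall frame rate of the codec

hop_length = 8 * 6 * 5 * 4                      # product of the SEANet ratios: 960 samples/frame
encoder_frame_rate = SAMPLE_RATE / hop_length   # 25.0 Hz

downsample_stride = encoder_frame_rate / FRAME_RATE
# Same assertion as the constructor: only integer strides are supported.
assert downsample_stride == int(downsample_stride), "Only integer strides are supported"
print(int(downsample_stride))  # stride passed to ConvDownsample1d
```

With these values the stride is 2: the convolutional resampler halves the 25 Hz encoder sequence to the 12.5 Hz frame rate the quantizer operates at.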
moshi/models/lm.py ADDED
@@ -0,0 +1,488 @@
+ # Copyright (c) Kyutai, all rights reserved.
+ # This source code is licensed under the license found in the
+ # LICENSE file in the root directory of this source tree.
+
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
+ # All rights reserved.
+ #
+ # This source code is licensed under the license found in the
+ # LICENSE file in the root directory of this source tree.
+
+ from dataclasses import dataclass
+ from functools import partial
+ import logging
+ import typing as tp
+
+ import torch
+ from torch import nn
+
+ from ..utils.sampling import sample_token
+ from ..utils.compile import CUDAGraphed
+ from ..modules.streaming import StreamingContainer, StreamingModule
+ from ..modules.transformer import (
+     StreamingTransformer,
+     create_norm_fn,
+ )
+
+
+ logger = logging.getLogger(__name__)
+
+
+ class ScaledEmbedding(nn.Embedding):
+     """Boost learning rate for embeddings (with `scale`).
+
+     Args:
+         norm (bool): if True, uses a layer norm after the embedding.
+         zero_idx (int): special value indicating that the output should be exactly 0.
+     """
+
+     def __init__(self, *args, norm: bool = False, zero_idx: int = -1, **kwargs):
+         super().__init__(*args, **kwargs)
+         self.norm = None
+         if norm:
+             self.norm = create_norm_fn("layer_norm", self.embedding_dim)
+         assert zero_idx < 0, "Please use negative values for the zero_idx."
+         self.zero_idx = zero_idx
+
+     def forward(self, input, *args, **kwargs):
+         is_zero = input == self.zero_idx
+         zero = torch.zeros(1, dtype=input.dtype, device=input.device)
+         input = input.clamp(min=0)
+         y = super().forward(input, *args, **kwargs)
+         if self.norm is not None:
+             y = self.norm(y)
+         y = torch.where(is_zero[..., None], zero, y)
+         return y
+
+
+ class LMModel(StreamingContainer):
+     """Transformer-based language model on multiple streams of codes.
+
+     Args:
+         n_q (int): Number of parallel streams to model as input.
+         dep_q (int): Number of parallel streams to model in the depformer.
+         card (int): Cardinality, vocabulary size.
+         text_card (int): Cardinality of the text vocabulary.
+         dim (int): Dimension of the transformer encoder.
+         num_heads (int): Number of heads for the transformer encoder.
+         hidden_scale (int): Scale for the hidden feed forward dimension of the transformer encoder.
+         norm (str): Normalization method.
+         norm_emb (bool): Whether to normalize embeddings.
+         bias_proj (bool): Use bias for output projections.
+         depformer_*: params used for the Depformer Transformer, all the others will be shared.
+         depformer_multi_linear (bool): if True, uses one linear layer per codebook to project the
+             output of the main transformer to the Depformer latent space.
+         depformer_dim_feedforward (int | list[int] | None): If None, defaults to hidden_scale * depformer_dim.
+         existing_text_padding_id (int, optional): if provided, use this existing token id for text
+             padding; otherwise a new padding token is appended to the text vocabulary.
+         same_initial (bool): if True, uses the same initial tokens for both text and audio mode.
+         **kwargs: Additional parameters for the transformer encoder.
+     """
+
+     def __init__(
+         self,
+         delays: tp.List[int] = [0],
+         n_q: int = 8,
+         dep_q: int = 8,
+         card: int = 1024,
+         text_card: int = 32000,
+         dim: int = 128,
+         num_heads: int = 8,
+         hidden_scale: int = 4,
+         norm: str = "layer_norm",
+         norm_emb: bool = False,
+         bias_proj: bool = False,
+         depformer_dim: int = 256,
+         depformer_dim_feedforward: int | list[int] | None = None,
+         depformer_multi_linear: bool = False,
+         depformer_weights_per_step: bool = False,
+         depformer_pos_emb: str = "sin",
+         existing_text_padding_id: tp.Optional[int] = None,
+         context: tp.Optional[int] = None,
+         device=None,
+         dtype=None,
+         **kwargs,
+     ):
+         super().__init__()
+         self.n_q = n_q
+         self.dep_q = dep_q
+         self.card = card
+         self.text_card = text_card
+         assert len(delays) == self.num_codebooks, "unexpected number of delays"
+         self.delays = delays
+         self.dim = dim
+         self.existing_text_padding_id = existing_text_padding_id
+         self.context = context
+         kwargs["context"] = context
+         EmbeddingFactory = partial(
+             ScaledEmbedding,
+             norm=norm_emb,
+             device=device,
+             dtype=dtype,
+             zero_idx=self.zero_token_id,
+         )
+         self.emb = nn.ModuleList(
+             [EmbeddingFactory(self.card + 1, dim) for _ in range(n_q)]
+         )
+         # Text card + padding token (if not in the original tokenizer).
+         extra_text = self.existing_text_padding_id is None
+         # Unlike for audio, here we authorize the model to output the special token.
+         self.text_emb = EmbeddingFactory(text_card + 1, dim)
+         self.text_linear = nn.Linear(dim, text_card + extra_text, bias=bias_proj)
+         depformer_prefix = "depformer_"
+         main_kwargs = {
+             k: v for k, v in kwargs.items() if not k.startswith(depformer_prefix)
+         }
+         self.transformer = StreamingTransformer(
+             d_model=dim,
+             num_heads=num_heads,
+             dim_feedforward=int(hidden_scale * dim),
+             norm=norm,
+             device=device,
+             dtype=dtype,
+             **main_kwargs,
+         )
+         self.out_norm = create_norm_fn(norm, dim)
+         self.depformer_multi_linear = depformer_multi_linear
+         kwargs_dep = main_kwargs.copy()
+         kwargs_dep.update(
+             {
+                 k.removeprefix(depformer_prefix): v
+                 for k, v in kwargs.items()
+                 if k.startswith(depformer_prefix)
+             }
+         )
+         kwargs_dep["positional_embedding"] = depformer_pos_emb
+         kwargs_dep["context"] = None
+         if depformer_weights_per_step:
+             kwargs_dep["weights_per_step"] = dep_q
+         if depformer_multi_linear:
+             # One linear layer per codebook to project different information from the main model.
+             self.depformer_in = nn.ModuleList(
+                 [nn.Linear(dim, depformer_dim, bias=False) for _ in range(dep_q)]
+             )
+         else:
+             self.depformer_in = nn.ModuleList(
+                 [nn.Linear(dim, depformer_dim, bias=False)]
+             )
+         # Only using up to dep_q - 1 because the last codebook is never an input to Depformer.
+         self.depformer_emb = nn.ModuleList(
+             [EmbeddingFactory(self.card + 1, depformer_dim) for _ in range(dep_q - 1)]
+         )
+         self.depformer_text_emb = EmbeddingFactory(text_card + 1, depformer_dim)
+         if depformer_dim_feedforward is None:
+             depformer_dim_feedforward = int(hidden_scale * depformer_dim)
+         self.depformer = StreamingTransformer(
+             d_model=depformer_dim,
+             dim_feedforward=depformer_dim_feedforward,
+             norm=norm,
+             device=device,
+             dtype=dtype,
+             **kwargs_dep,
+         )
+         self.depformer.set_streaming_propagate(False)
+         dim = depformer_dim  # we will directly apply the next linears to the output of the Depformer.
+
+         self.linears = nn.ModuleList(
+             [nn.Linear(dim, self.card, bias=bias_proj) for _ in range(dep_q)]
+         )
+
+     @property
+     def initial_token_id(self) -> int:
+         """Token id for the start of sequence (audio)."""
+         return self.card
+
+     @property
+     def text_initial_token_id(self) -> int:
+         """Token id for the start of sequence (text)."""
+         return self.text_card
+
+     @property
+     def text_padding_token_id(self) -> int:
+         """Token id for text padding."""
+         if self.existing_text_padding_id is None:
+             return self.text_card
+         else:
+             return self.existing_text_padding_id
+
+     @property
+     def end_of_text_padding_id(self) -> int:
+         """Token id for optionally marking the last padding step for a word."""
+         return 0
+
+     @property
+     def zero_token_id(self) -> int:
+         """Special value in the input tokens, indicating that no sampling should
+         happen for that value, and no input should be given to the model."""
+         return -1
+
+     @property
+     def ungenerated_token_id(self) -> int:
+         """Special value that can be provided in the prompt to indicate that this specific
+         value should be predicted and sampled. This allows for partial teacher forcing, by generating
+         one modality, with the other one fixed.
+         """
+         return -2
+
+     @property
+     def device(self):
+         first_param = next(iter(self.parameters()))
+         return first_param.device
+
+     @property
+     def num_codebooks(self) -> int:
+         return self.n_q + 1
+
+     @property
+     def num_audio_codebooks(self) -> int:
+         return self.n_q
+
+     @property
+     def audio_offset(self) -> int:
+         return 1
+
+     def _get_initial_token(self) -> torch.Tensor:
+         # Returns the initial token that will be fed to the model to predict the very first timestep.
+         # The output shape will be [B, K, 1].
+         device = next(iter(self.parameters())).device
+         zero = torch.full(
+             [1, 1, 1], self.zero_token_id, device=device, dtype=torch.long
+         )
+         special = torch.full_like(zero, self.initial_token_id)
+
+         text_special = torch.full_like(zero, self.text_initial_token_id)
+         audio_token = special
+         text_token = text_special
+         audio_token = audio_token.expand(-1, self.num_audio_codebooks, -1)
+         token = torch.cat([text_token, audio_token], dim=1)
+         return token
+
+     def forward_text(
+         self,
+         sequence: torch.Tensor,
+     ) -> tuple[torch.Tensor, torch.Tensor]:
+         B, K, S = sequence.shape
+         assert (
+             K == self.num_codebooks
+         ), f"Sequence shape {sequence.shape} must match the number of codebooks."
+         input_sequence = sequence
+         input_ = None
+         for cb_index in range(self.num_audio_codebooks):
+             audio_emb = self.emb[cb_index](
+                 input_sequence[:, cb_index + self.audio_offset]
+             )
+             input_ = audio_emb if input_ is None else input_ + audio_emb
+         text_emb = self.text_emb(input_sequence[:, 0])
+         input_ = text_emb if input_ is None else input_ + text_emb
+         transformer_out = self.transformer(input_)
+
+         if self.out_norm:
+             transformer_out = self.out_norm(transformer_out)
+         assert isinstance(transformer_out, torch.Tensor)
+         text_logits = self.text_linear(transformer_out)
+         text_logits = text_logits[:, None]
+         return transformer_out, text_logits
+
+     def forward_depformer(
+         self,
+         depformer_cb_index: int,
+         sequence: torch.Tensor,
+         transformer_out: torch.Tensor,
+     ) -> torch.Tensor:
+         B, K, S = sequence.shape
+         assert (
+             K == 1
+         ), f"Codebooks for Depformer streaming should be passed 1 by 1, got {K}."
+         assert (
+             S == 1
+         ), f"Steps for Depformer streaming should be passed 1 by 1, got {S}."
+         assert (
+             transformer_out.shape[1] == 1
+         ), "Transformer out should be for a single step."
+         last_token_input: tp.Optional[torch.Tensor] = None
+         depformer_input = transformer_out
+         if self.depformer_multi_linear:
+             depformer_input = self.depformer_in[depformer_cb_index](depformer_input)
+         else:
+             depformer_input = self.depformer_in[0](depformer_input)
+         if depformer_cb_index == 0:
+             last_token_input = self.depformer_text_emb(sequence[:, 0])
+         else:
+             last_token_input = self.depformer_emb[depformer_cb_index - 1](
+                 sequence[:, 0]
+             )
+         depformer_input = depformer_input + last_token_input
+         assert depformer_input.shape[1] == 1
+         # depformer_input is [B, 1, depformer_dim].
+         # The streaming state of the depformer ensures that the proper layer is run.
+         dep_output = self.depformer(depformer_input)
+         logits = self.linears[depformer_cb_index](dep_output)
+         logits = logits[:, None]
+         assert logits.dim() == 4, logits.shape  # [B, Ka, S, card]
+         return logits
+
+
+ @dataclass
+ class _LMGenState:
+     cache: torch.Tensor
+     initial: torch.Tensor
+     graphed_main: CUDAGraphed
+     graphed_depth: CUDAGraphed
+     offset: int = 0
+
+     def reset(self):
+         self.offset = 0
+
+
+ class LMGen(StreamingModule[_LMGenState]):
+     def __init__(
+         self,
+         lm_model: LMModel,
+         use_sampling: bool = True,
+         temp: float = 0.8,
+         temp_text: float = 0.7,
+         top_k: int = 250,
+         top_k_text: int = 25,
+         check: bool = False,
+     ):
+         assert not lm_model.training, "generation shouldn't be used in training mode."
+         super().__init__()
+
+         self.lm_model = lm_model
+         self.use_sampling = use_sampling
+         self.temp = temp
+         self.temp_text = temp_text
+         self.top_k = top_k
+         self.top_k_text = top_k_text
+         self.check = check
+         self.max_delay = max(
+             lm_model.delays
+         )  # with delays, we need to generate a few more time steps.
+         self.delays_cuda = torch.tensor(
+             lm_model.delays, device=lm_model.device, dtype=torch.long
+         )
+
+     def _init_streaming_state(self, batch_size: int) -> _LMGenState:
+         lm_model = self.lm_model
+         initial = lm_model._get_initial_token()
+         cache = torch.full(
+             (batch_size, self.lm_model.num_codebooks, self.max_delay + 2),
+             lm_model.ungenerated_token_id,
+             device=lm_model.device,
+             dtype=torch.long,
+         )
+
+         disable = lm_model.device.type != 'cuda'
+         graphed_main = CUDAGraphed(lm_model.forward_text, disable=disable)
+         graphed_depth = CUDAGraphed(self.depformer_step, disable=disable)
+
+         return _LMGenState(cache, initial, graphed_main, graphed_depth)
+
+     @torch.no_grad()
+     def step(self, input_tokens: torch.Tensor) -> torch.Tensor | None:
+         state = self._streaming_state
+         if state is None:
+             raise RuntimeError(
+                 "You should wrap those calls with a `with lm_gen.streaming(): ...`."
+             )
+         lm_model = self.lm_model
+
+         assert input_tokens.dim() == 3, "Shape should be [B, K, T]."
+         B, Ki, S = input_tokens.shape
+         assert S == 1, "Only support being given steps one by one."
+         needed_tokens = lm_model.num_codebooks - lm_model.dep_q - 1
+         assert (
+             Ki == needed_tokens
+         ), f"We expect {needed_tokens} tokens from the user stream, got {Ki}."
+
+         CT = state.cache.shape[2]
+
+         for q_other in range(input_tokens.shape[1]):
+             k = lm_model.dep_q + 1 + q_other
+             delay = lm_model.delays[k]
+             write_position = (state.offset + delay) % CT
+             state.cache[:, k, write_position : write_position + 1] = input_tokens[
+                 :, q_other
+             ]
+
+         position = state.offset % CT
+         for k, delay in enumerate(lm_model.delays):
+             # Only for the very beginning, we extend the initial token for the acoustic
+             # tokens that are delayed, and thus have no good value to take.
+             if state.offset <= delay:
+                 state.cache[:, k, position] = state.initial[:, k, 0]
+         input_ = state.cache[:, :, position : position + 1]
+
+         if self.check:
+             # Check that we are not feeding in any value that is not generated yet.
+             assert not (input_ == lm_model.ungenerated_token_id).any(), (
+                 state.offset,
+                 input_,
+             )
+             assert (input_[:, lm_model.audio_offset :] <= lm_model.card).all(), input_
+             assert (input_[:, :1] <= lm_model.text_card).all()
+
+         transformer_out, text_logits = state.graphed_main(input_)
+         # Shape of text_logits should be [B, K_text=1, T=1, Card_text].
+         text_token = sample_token(
+             text_logits.float(),
+             self.use_sampling,
+             self.temp_text,
+             self.top_k_text,
+         )
+         assert text_token.dim() == 3, text_token.shape
+         assert text_token.shape[2] == 1
+         assert text_token.shape[1] == 1, "Only one text stream supported."
+         text_token = text_token[:, 0, 0]  # shape is [B]
+         audio_tokens = state.graphed_depth(text_token, transformer_out)
+
+         # Ensure we don't overwrite prompt tokens; we only write over ungenerated tokens.
+         state.offset += 1
+         position = state.offset % CT
+         state.cache[:, 0, position] = text_token
+         state.cache[:, 1 : lm_model.dep_q + 1, position] = audio_tokens
+
+         if state.offset <= self.max_delay:
+             return None
+         B = state.cache.shape[0]
+         gen_delays_cuda = self.delays_cuda[: lm_model.dep_q + 1]
+         index = (
+             ((state.offset - self.max_delay + gen_delays_cuda) % CT)
+             .view(1, -1, 1)
+             .expand(B, -1, 1)
+         )
+         out = state.cache.gather(dim=2, index=index)
+         return out
+
+     def depformer_step(
+         self,
+         text_token: torch.Tensor,
+         transformer_out: torch.Tensor,
+     ) -> torch.Tensor:
+         (B,) = text_token.shape
+         prev_token = text_token
+         lm_model = self.lm_model
+         depformer_tokens: list[torch.Tensor] = []
+         assert not lm_model.depformer.is_streaming
+         with lm_model.depformer.streaming(B):
+             for cb_index in range(lm_model.dep_q):
+                 input_ = prev_token[:, None, None]
+                 logits = lm_model.forward_depformer(cb_index, input_, transformer_out)
+                 next_token = sample_token(
+                     logits.float(),
+                     self.use_sampling,
+                     self.temp,
+                     self.top_k,
+                 )
+                 assert next_token.shape == (B, 1, 1)
+                 next_token = next_token[:, 0, 0]  # shape is [B]
+                 depformer_tokens.append(next_token)
+                 prev_token = next_token
+
+         assert len(depformer_tokens) == lm_model.dep_q, (
+             len(depformer_tokens),
+             lm_model.dep_q,
+         )
+         out = torch.stack(depformer_tokens, dim=1)
+         assert out.shape == (B, lm_model.dep_q), out.shape
+         return out
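`LMGen.step` applies the per-codebook delays with a small ring buffer: stream `k` is written at `(offset + delay_k) % CT` and the time-aligned output frame is read back at `(offset - max_delay + delay_k) % CT`, where `CT = max_delay + 2`. A plain-Python sketch of that index arithmetic, with illustrative values (two streams with delays `[0, 1]`, tokens replaced by `(stream, step)` tuples; the real code writes tensors and splits user/model streams):

```python
# Sketch of the delayed ring-buffer indexing in LMGen.step (illustrative only).
delays = [0, 1]                         # e.g. a semantic stream and a delayed acoustic stream
max_delay = max(delays)
CT = max_delay + 2                      # cache length along the time axis, as in _init_streaming_state
cache = [[None] * CT for _ in delays]   # cache[k][t]: one row per stream

outputs = []
for offset in range(6):
    for k, delay in enumerate(delays):
        # Write the token for logical step `offset` of stream k, shifted by its delay.
        cache[k][(offset + delay) % CT] = (k, offset)
    if offset >= max_delay:
        # Read one time-aligned frame: every stream contributes the same logical step.
        frame = [cache[k][(offset - max_delay + delay) % CT]
                 for k, delay in enumerate(delays)]
        outputs.append(frame)

# Each frame pairs tokens from the same logical step, max_delay steps behind the writes.
print(outputs[0])  # [(0, 0), (1, 0)]
```

This is why `step` returns `None` for the first `max_delay` calls: the aligned frame only becomes available once every delayed stream has been written.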
moshi/models/loaders.py ADDED
@@ -0,0 +1,159 @@
1
+ # Copyright (c) Kyutai, all rights reserved.
2
+ # This source code is licensed under the license found in the
3
+ # LICENSE file in the root directory of this source tree.
4
+ """Retrieves the pretrained models for Moshi and Mimi."""
5
+ from pathlib import Path
6
+
7
+ from safetensors.torch import load_model
8
+ import torch
9
+
10
+ from .compression import MimiModel
11
+ from .lm import LMModel
12
+ from ..modules import SEANetEncoder, SEANetDecoder, transformer
13
+ from ..quantization import SplitResidualVectorQuantizer
14
+
15
+ SAMPLE_RATE = 24000
16
+ FRAME_RATE = 12.5
17
+
18
+ TEXT_TOKENIZER_NAME = 'tokenizer_spm_32k_3.model'
19
+ MOSHI_NAME = 'model.safetensors'
20
+ MIMI_NAME = 'tokenizer-e351c8d8-checkpoint125.safetensors'
21
+ DEFAULT_REPO = 'kyutai/moshiko-pytorch-bf16'
22
+
23
+
24
+ _seanet_kwargs = {
25
+ "channels": 1,
26
+ "dimension": 512,
27
+ "causal": True,
28
+ "n_filters": 64,
29
+ "n_residual_layers": 1,
30
+ "activation": "ELU",
31
+ "compress": 2,
32
+ "dilation_base": 2,
33
+ "disable_norm_outer_blocks": 0,
34
+ "kernel_size": 7,
35
+ "residual_kernel_size": 3,
36
+ "last_kernel_size": 3,
37
+ # We train using weight_norm but then the weights are pre-processed for inference so
38
+ # that we can use a normal convolution.
39
+ "norm": "none",
40
+ "pad_mode": "constant",
41
+ "ratios": [8, 6, 5, 4],
42
+ "true_skip": True,
43
+ }
44
+ _quantizer_kwargs = {
45
+ "dimension": 256,
46
+ "n_q": 32,
47
+ "bins": 2048,
48
+ "input_dimension": _seanet_kwargs["dimension"],
49
+ "output_dimension": _seanet_kwargs["dimension"],
50
+ }
51
+ _transformer_kwargs = {
52
+ "d_model": _seanet_kwargs["dimension"],
53
+ "num_heads": 8,
54
+ "num_layers": 8,
55
+ "causal": True,
56
+ "layer_scale": 0.01,
57
+ "context": 250,
58
+ "conv_layout": True,
59
+ "max_period": 10000,
60
+ "gating": "none",
61
+ "norm": "layer_norm",
62
+ "positional_embedding": "rope",
63
+ "dim_feedforward": 2048,
64
+ "input_dimension": _seanet_kwargs["dimension"],
65
+ "output_dimensions": [_seanet_kwargs["dimension"]],
66
+ }
67
+
68
+ _lm_kwargs = {
+     "dim": 4096,
+     "text_card": 32000,
+     "existing_text_padding_id": 3,
+     "n_q": 16,
+     "dep_q": 8,
+     "card": _quantizer_kwargs["bins"],
+     "num_heads": 32,
+     "num_layers": 32,
+     "hidden_scale": 4.125,
+     "causal": True,
+     "layer_scale": None,
+     "context": 3000,
+     "max_period": 10000,
+     "gating": "silu",
+     "norm": "rms_norm_f32",
+     "positional_embedding": "rope",
+     "depformer_dim": 1024,
+     "depformer_dim_feedforward": int(4.125 * 1024),
+     "depformer_num_heads": 16,
+     "depformer_num_layers": 6,
+     "depformer_causal": True,
+     "depformer_layer_scale": None,
+     "depformer_multi_linear": True,
+     "depformer_context": 8,
+     "depformer_max_period": 10000,
+     "depformer_gating": "silu",
+     "depformer_pos_emb": "none",
+     "depformer_weights_per_step": True,
+     "delays": [0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1],
+ }
+ 
+ 
+ def _is_safetensors(path: Path | str) -> bool:
+     return Path(path).suffix in (".safetensors", ".sft", ".sfts")
+ 
+ 
+ def get_mimi(filename: str | Path,
+              device: torch.device | str = 'cpu') -> MimiModel:
+     """Return a pretrained Mimi model."""
+     encoder = SEANetEncoder(**_seanet_kwargs)
+     decoder = SEANetDecoder(**_seanet_kwargs)
+     encoder_transformer = transformer.ProjectedTransformer(
+         device=device, **_transformer_kwargs
+     )
+     decoder_transformer = transformer.ProjectedTransformer(
+         device=device, **_transformer_kwargs
+     )
+     quantizer = SplitResidualVectorQuantizer(
+         **_quantizer_kwargs,
+     )
+     model = MimiModel(
+         encoder,
+         decoder,
+         quantizer,
+         channels=1,
+         sample_rate=SAMPLE_RATE,
+         frame_rate=FRAME_RATE,
+         encoder_frame_rate=SAMPLE_RATE / encoder.hop_length,
+         causal=True,
+         resample_method="conv",
+         encoder_transformer=encoder_transformer,
+         decoder_transformer=decoder_transformer,
+     ).to(device=device)
+     model.eval()
+     if _is_safetensors(filename):
+         load_model(model, filename)
+     else:
+         pkg = torch.load(filename, "cpu")
+         model.load_state_dict(pkg["model"])
+     model.set_num_codebooks(8)
+     return model
+ 
+ 
+ def get_moshi_lm(filename: str | Path,
+                  device: torch.device | str = 'cpu') -> LMModel:
+     dtype = torch.bfloat16
+     model = LMModel(
+         device=device,
+         dtype=dtype,
+         **_lm_kwargs,
+     ).to(device=device, dtype=dtype)
+     model.eval()
+     if _is_safetensors(filename):
+         load_model(model, filename)
+     else:
+         pkg = torch.load(
+             filename,
+             "cpu",
+         )
+         model.load_state_dict(pkg["fsdp_best_state"]["model"])
+     return model
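The `_is_safetensors` helper above dispatches purely on the file suffix: safetensors checkpoints are loaded with `load_model`, anything else is treated as a `torch.save` package. A minimal standalone sketch of the same check, runnable without torch or any weights (`is_safetensors` here is a local copy, not the module's private helper):

```python
from pathlib import Path


def is_safetensors(path) -> bool:
    # Same suffix test as `_is_safetensors` above: safetensors checkpoints
    # use one of these extensions; anything else falls back to torch.load.
    return Path(path).suffix in (".safetensors", ".sft", ".sfts")


print(is_safetensors("weights/model.safetensors"))  # True
print(is_safetensors("weights/model.th"))           # False
```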
moshi/modules/__init__.py ADDED
@@ -0,0 +1,23 @@
+ # Copyright (c) Kyutai, all rights reserved.
+ # This source code is licensed under the license found in the
+ # LICENSE file in the root directory of this source tree.
+ 
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
+ # All rights reserved.
+ #
+ # This source code is licensed under the license found in the
+ # LICENSE file in the root directory of this source tree.
+ """Modules used for building the models."""
+ 
+ # flake8: noqa
+ from .conv import (
+     NormConv1d,
+     NormConvTranspose1d,
+     StreamingConv1d,
+     StreamingConvTranspose1d,
+     pad_for_conv1d,
+     pad1d,
+     unpad1d,
+ )
+ from .seanet import SEANetEncoder, SEANetDecoder
+ from .transformer import StreamingTransformer
moshi/modules/conv.py ADDED
@@ -0,0 +1,329 @@
+ # Copyright (c) Kyutai, all rights reserved.
+ # This source code is licensed under the license found in the
+ # LICENSE file in the root directory of this source tree.
+ 
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
+ # All rights reserved.
+ #
+ # This source code is licensed under the license found in the
+ # LICENSE file in the root directory of this source tree.
+ 
+ from dataclasses import dataclass
+ import math
+ import typing as tp
+ import warnings
+ 
+ import torch
+ from torch import nn
+ from torch.nn import functional as F
+ from torch.nn.utils import weight_norm
+ 
+ from .streaming import RawStreamingConv1d, RawStreamingConvTranspose1d, StreamingModule
+ 
+ 
+ CONV_NORMALIZATIONS = frozenset(["none", "weight_norm"])
+ 
+ 
+ class TransposedLayerNorm(nn.Module):
+     """LayerNorm for [B, C, T] inputs."""
+ 
+     def __init__(self, **kwargs):
+         super().__init__()
+         self.layer_norm = nn.LayerNorm(**kwargs)
+ 
+     def forward(self, x):
+         x = x.transpose(1, 2)
+         x = self.layer_norm(x)
+         return x.transpose(1, 2)
+ 
+ 
+ def apply_parametrization_norm(module: nn.Module, norm: str = "none"):
+     assert norm in CONV_NORMALIZATIONS
+     if norm == "weight_norm":
+         return weight_norm(module)
+     else:
+         # We already checked that `norm` is in CONV_NORMALIZATIONS,
+         # so any other choice doesn't need reparametrization.
+         return module
+ 
+ 
+ def get_extra_padding_for_conv1d(
+     x: torch.Tensor, kernel_size: int, stride: int, padding_total: int = 0
+ ) -> int:
+     """See `pad_for_conv1d`."""
+     length = x.shape[-1]
+     n_frames = (length - kernel_size + padding_total) / stride + 1
+     ideal_length = (math.ceil(n_frames) - 1) * stride + (kernel_size - padding_total)
+     return ideal_length - length
+ 
+ 
+ def pad_for_conv1d(
+     x: torch.Tensor, kernel_size: int, stride: int, padding_total: int = 0
+ ):
+     """Pad for a convolution to make sure that the last window is full.
+     Extra padding is added at the end. This is required to ensure that we can rebuild
+     an output of the same length, as otherwise, even with padding, some time steps
+     might get removed.
+     For instance, with total padding = 4, kernel size = 4, stride = 2:
+         0 0 1 2 3 4 5 0 0   # (0s are padding)
+         1   2   3           # (output frames of a convolution, last 0 is never used)
+         0 0 1 2 3 4 5 0     # (output of tr. conv., but pos. 5 is going to get removed as padding)
+             1 2 3 4         # once the padding is removed, we are missing one time step!
+     """
+     extra_padding = get_extra_padding_for_conv1d(x, kernel_size, stride, padding_total)
+     return F.pad(x, (0, extra_padding))
+ 
+ 
+ def pad1d(
+     x: torch.Tensor,
+     paddings: tp.Tuple[int, int],
+     mode: str = "constant",
+     value: float = 0.0,
+ ):
+     """Tiny wrapper around F.pad, just to allow for reflect padding on small inputs.
+     If this is the case, we insert extra 0 padding to the right before the reflection happens.
+     """
+     length = x.shape[-1]
+     padding_left, padding_right = paddings
+     assert padding_left >= 0 and padding_right >= 0, (padding_left, padding_right)
+     if mode == "reflect":
+         max_pad = max(padding_left, padding_right)
+         extra_pad = 0
+         if length <= max_pad:
+             extra_pad = max_pad - length + 1
+             x = F.pad(x, (0, extra_pad))
+         padded = F.pad(x, paddings, mode, value)
+         end = padded.shape[-1] - extra_pad
+         return padded[..., :end]
+     else:
+         return F.pad(x, paddings, mode, value)
+ 
+ 
+ def unpad1d(x: torch.Tensor, paddings: tp.Tuple[int, int]):
+     """Remove padding from x, properly handling zero padding. Only for 1d!"""
+     padding_left, padding_right = paddings
+     assert padding_left >= 0 and padding_right >= 0, (padding_left, padding_right)
+     assert (padding_left + padding_right) <= x.shape[-1]
+     end = x.shape[-1] - padding_right
+     return x[..., padding_left:end]
+ 
+ 
+ class NormConv1d(nn.Module):
+     """Wrapper around Conv1d and normalization applied to this conv
+     to provide a uniform interface across normalization approaches.
+     """
+ 
+     def __init__(
+         self,
+         *args,
+         causal: bool = False,
+         norm: str = "none",
+         norm_kwargs: tp.Dict[str, tp.Any] = {},
+         **kwargs,
+     ):
+         super().__init__()
+         self.conv = apply_parametrization_norm(
+             RawStreamingConv1d(*args, **kwargs), norm
+         )
+         self.norm_type = norm
+ 
+     def forward(self, x):
+         x = self.conv(x)
+         return x
+ 
+ 
+ class NormConvTranspose1d(nn.Module):
+     """Wrapper around ConvTranspose1d and normalization applied to this conv
+     to provide a uniform interface across normalization approaches.
+     """
+ 
+     def __init__(
+         self,
+         *args,
+         causal: bool = False,
+         norm: str = "none",
+         norm_kwargs: tp.Dict[str, tp.Any] = {},
+         **kwargs,
+     ):
+         super().__init__()
+         self.convtr = apply_parametrization_norm(
+             RawStreamingConvTranspose1d(*args, **kwargs), norm
+         )
+         self.norm_type = norm
+ 
+     def forward(self, x):
+         x = self.convtr(x)
+         return x
+ 
+ 
+ @dataclass
+ class _StreamingConv1dState:
+     padding_to_add: int
+     original_padding_to_add: int
+ 
+     def reset(self):
+         self.padding_to_add = self.original_padding_to_add
+ 
+ 
+ class StreamingConv1d(StreamingModule[_StreamingConv1dState]):
+     """Conv1d with some builtin handling of asymmetric or causal padding
+     and normalization.
+     """
+ 
+     def __init__(
+         self,
+         in_channels: int,
+         out_channels: int,
+         kernel_size: int,
+         stride: int = 1,
+         dilation: int = 1,
+         groups: int = 1,
+         bias: bool = True,
+         causal: bool = False,
+         norm: str = "none",
+         norm_kwargs: tp.Dict[str, tp.Any] = {},
+         pad_mode: str = "reflect",
+     ):
+         super().__init__()
+         # Warn the user on an unusual combination of dilation and stride.
+         if stride > 1 and dilation > 1:
+             warnings.warn(
+                 "StreamingConv1d has been initialized with stride > 1 and dilation > 1"
+                 f" (kernel_size={kernel_size} stride={stride}, dilation={dilation})."
+             )
+         self.conv = NormConv1d(
+             in_channels,
+             out_channels,
+             kernel_size,
+             stride,
+             dilation=dilation,
+             groups=groups,
+             bias=bias,
+             causal=causal,
+             norm=norm,
+             norm_kwargs=norm_kwargs,
+         )
+         self.causal = causal
+         self.pad_mode = pad_mode
+ 
+     @property
+     def _stride(self) -> int:
+         return self.conv.conv.stride[0]
+ 
+     @property
+     def _kernel_size(self) -> int:
+         return self.conv.conv.kernel_size[0]
+ 
+     @property
+     def _effective_kernel_size(self) -> int:
+         dilation = self.conv.conv.dilation[0]
+         return (
+             self._kernel_size - 1
+         ) * dilation + 1  # effective kernel size with dilations
+ 
+     @property
+     def _padding_total(self) -> int:
+         return self._effective_kernel_size - self._stride
+ 
+     def _init_streaming_state(self, batch_size: int) -> _StreamingConv1dState:
+         assert self.causal, "streaming is only supported for causal convs"
+         return _StreamingConv1dState(self._padding_total, self._padding_total)
+ 
+     def forward(self, x):
+         B, C, T = x.shape
+         padding_total = self._padding_total
+         extra_padding = get_extra_padding_for_conv1d(
+             x, self._effective_kernel_size, self._stride, padding_total
+         )
+         state = self._streaming_state
+         if state is None:
+             if self.causal:
+                 # Left padding for causal
+                 x = pad1d(x, (padding_total, extra_padding), mode=self.pad_mode)
+             else:
+                 # Asymmetric padding required for odd strides
+                 padding_right = padding_total // 2
+                 padding_left = padding_total - padding_right
+                 x = pad1d(
+                     x, (padding_left, padding_right + extra_padding), mode=self.pad_mode
+                 )
+         else:
+             if state.padding_to_add > 0 and x.shape[-1] > 0:
+                 x = pad1d(x, (state.padding_to_add, 0), mode=self.pad_mode)
+                 state.padding_to_add = 0
+         return self.conv(x)
+ 
+ 
+ @dataclass
+ class _StreamingConvTr1dState:
+     def reset(self):
+         pass
+ 
+ 
+ class StreamingConvTranspose1d(StreamingModule[_StreamingConvTr1dState]):
+     """ConvTranspose1d with some builtin handling of asymmetric or causal padding
+     and normalization.
+     """
+ 
+     def __init__(
+         self,
+         in_channels: int,
+         out_channels: int,
+         kernel_size: int,
+         stride: int = 1,
+         groups: int = 1,
+         bias: bool = True,
+         causal: bool = False,
+         norm: str = "none",
+         trim_right_ratio: float = 1.0,
+         norm_kwargs: tp.Dict[str, tp.Any] = {},
+     ):
+         super().__init__()
+         self.convtr = NormConvTranspose1d(
+             in_channels,
+             out_channels,
+             kernel_size,
+             stride,
+             groups=groups,
+             bias=bias,
+             causal=causal,
+             norm=norm,
+             norm_kwargs=norm_kwargs,
+         )
+         self.causal = causal
+         self.trim_right_ratio = trim_right_ratio
+         assert (
+             self.causal or self.trim_right_ratio == 1.0
+         ), "`trim_right_ratio` != 1.0 only makes sense for causal convolutions"
+         assert self.trim_right_ratio >= 0.0 and self.trim_right_ratio <= 1.0
+ 
+     def _init_streaming_state(self, batch_size: int) -> _StreamingConvTr1dState:
+         assert self.causal, "streaming is only supported for causal convtrs"
+         return _StreamingConvTr1dState()
+ 
+     def forward(self, x):
+         kernel_size = self.convtr.convtr.kernel_size[0]
+         stride = self.convtr.convtr.stride[0]
+         padding_total = kernel_size - stride
+ 
+         y = self.convtr(x)
+ 
+         if not self.is_streaming:
+             # We will only trim fixed padding. Extra padding from `pad_for_conv1d` would be
+             # removed at the very end, when keeping only the right length for the output,
+             # as removing it here would require also passing the length at the matching layer
+             # in the encoder.
+             if self.causal:
+                 # Trim the padding on the right according to the specified ratio;
+                 # if trim_right_ratio = 1.0, trim everything from the right.
+                 padding_right = math.ceil(padding_total * self.trim_right_ratio)
+                 padding_left = padding_total - padding_right
+                 y = unpad1d(y, (padding_left, padding_right))
+             else:
+                 # Asymmetric padding required for odd strides
+                 padding_right = padding_total // 2
+                 padding_left = padding_total - padding_right
+                 y = unpad1d(y, (padding_left, padding_right))
+         return y
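The frame arithmetic in `get_extra_padding_for_conv1d` can be checked without torch. A pure-Python sketch of the same formula, operating on a plain length instead of a tensor (`extra_padding_for_conv1d` is a local mirror, not the module's function):

```python
import math


def extra_padding_for_conv1d(length: int, kernel_size: int, stride: int,
                             padding_total: int = 0) -> int:
    # Mirror of `get_extra_padding_for_conv1d`: how much right padding is
    # needed so that the last convolution window is full.
    n_frames = (length - kernel_size + padding_total) / stride + 1
    ideal_length = (math.ceil(n_frames) - 1) * stride + (kernel_size - padding_total)
    return ideal_length - length


# The docstring example above: total padding = 4, kernel size = 4, stride = 2,
# input of 5 samples -> one extra sample of right padding is needed.
print(extra_padding_for_conv1d(5, 4, 2, 4))  # 1
```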
moshi/modules/gating.py ADDED
@@ -0,0 +1,82 @@
+ # Copyright (c) Kyutai, all rights reserved.
+ # This source code is licensed under the license found in the
+ # LICENSE file in the root directory of this source tree.
+ 
+ import torch
+ from torch import nn
+ from torch.nn import functional as F
+ 
+ from ..utils.compile import torch_compile_lazy
+ 
+ 
+ @torch_compile_lazy
+ def gating_forward_kernel(
+     weight_in: torch.Tensor, weight_out: torch.Tensor, activation, x: torch.Tensor
+ ):
+     x = F.linear(x, weight_in)
+     B, T, _ = x.shape
+     x = x.view(B, T, 2, -1)
+     x = activation(x[..., 0, :]) * x[..., 1, :]
+     x = F.linear(x, weight_out)
+     return x
+ 
+ 
+ class ActivationGating(nn.Module):
+     """
+     Gating FFN layer, using the given activation.
+     Args:
+         dim (int): dimension of the input and output of the transformer.
+         activation (any callable Tensor to Tensor): activation function to use.
+         **factory_kwargs: other kwargs passed to the linear layer, in particular device and dtype.
+     """
+ 
+     _fsdp_final = True
+ 
+     def __init__(self, dim: int, dim_feedforward: int, activation, **factory_kwargs):
+         super().__init__()
+         # We should have 8 d^2 params, instead we will have
+         # 2 * h * d + h * d = 3 h * d = 8 d^2
+         # so h = 8 d / 3, but following Hervé's advice we use 21 / 8 as an approximation.
+         if dim_feedforward == 4 * dim:
+             hidden = (21 * dim) // 8
+         else:
+             hidden = (2 * dim_feedforward) // 3
+         self.linear_in = nn.Linear(dim, 2 * hidden, bias=False, **factory_kwargs)
+         self.linear_out = nn.Linear(hidden, dim, bias=False, **factory_kwargs)
+         self.activation = activation
+ 
+     def forward(self, x: torch.Tensor):
+         return gating_forward_kernel(
+             self.linear_in.weight, self.linear_out.weight, self.activation, x
+         )
+ 
+ 
+ def _get_activation(name: str):
+     if name in ["sigmoid", "tanh", "relu"]:
+         return getattr(torch, name)
+     elif name in ["leaky_relu", "elu", "gelu", "silu", "mish", "softsign"]:
+         return getattr(torch.nn.functional, name)
+     elif name == "identity":
+         return torch.nn.Identity()
+     else:
+         raise ValueError(f"Unknown activation {name}")
+ 
+ 
+ def _make_gating(
+     name: str, dim: int, dim_feedforward: int, **factory_kwargs
+ ) -> nn.Module:
+     return ActivationGating(
+         dim, dim_feedforward, _get_activation(name), **factory_kwargs
+     )
+ 
+ 
+ def make_gating(
+     name: str, dim: int, dim_feedforward: int, **factory_kwargs
+ ) -> nn.Module:
+     gating = _make_gating(name, dim, dim_feedforward, **factory_kwargs)
+     max_params = 2 * dim * dim_feedforward
+     params = sum(p.numel() for p in gating.parameters())
+     assert (
+         params <= max_params
+     ), f"{name} gating has {params} params, max is {max_params}"
+     return gating
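The hidden-size choice in `ActivationGating.__init__` can be verified with plain arithmetic: the gated FFN has three weight matrices (`2*h*d` for `linear_in`, `h*d` for `linear_out`, so `3*h*d` parameters total), and `h` is picked to stay within the `2 * dim * dim_feedforward` budget asserted in `make_gating`. A minimal sketch of that arithmetic (`gating_hidden` is a local mirror of the branch above):

```python
def gating_hidden(dim: int, dim_feedforward: int) -> int:
    # Mirror of the hidden-size selection in ActivationGating.__init__.
    if dim_feedforward == 4 * dim:
        return (21 * dim) // 8
    return (2 * dim_feedforward) // 3


dim = 4096                         # the "dim" used in _lm_kwargs above
h = gating_hidden(dim, 4 * dim)
params = 3 * h * dim               # linear_in (2*h*d) + linear_out (h*d)
budget = 2 * dim * (4 * dim)       # the `max_params` bound in make_gating
print(h, params <= budget)  # 10752 True
```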
moshi/modules/resample.py ADDED
@@ -0,0 +1,119 @@
+ # Copyright (c) Kyutai, all rights reserved.
+ # This source code is licensed under the license found in the
+ # LICENSE file in the root directory of this source tree.
+ 
+ import typing as tp
+ 
+ from einops import rearrange
+ import torch
+ from torch import nn
+ 
+ from .conv import StreamingConv1d, StreamingConvTranspose1d
+ 
+ 
+ class ConvDownsample1d(nn.Module):
+     """
+     Downsampling by some integer amount `stride` using convolutions
+     with a kernel size of twice the stride.
+     If `causal` is True, the output uses a causal convolution.
+     """
+ 
+     def __init__(
+         self,
+         stride: int,
+         dimension: tp.Optional[int] = None,
+         causal: bool = False,
+         learnt: bool = False,
+         channel_wise: bool = False,
+     ):
+         super().__init__()
+         self.learnt = learnt
+         self.channel_wise = channel_wise
+         groups = 1
+         if learnt:
+             assert dimension is not None, "Dimension required for learnt convolutions."
+             in_channels = dimension
+             out_channels = dimension
+             if channel_wise:
+                 groups = dimension
+         else:
+             in_channels = 1
+             out_channels = 1
+ 
+         self.conv = StreamingConv1d(
+             in_channels,
+             out_channels,
+             kernel_size=2 * stride,
+             stride=stride,
+             causal=causal,
+             groups=groups,
+             bias=False,
+             pad_mode="replicate",
+         )
+         if not learnt:
+             actual_conv = self.conv.conv.conv
+             actual_conv.weight.requires_grad_(False)
+             actual_conv.weight.data.fill_(1.0 / (2 * stride))
+ 
+     def forward(self, x: torch.Tensor):
+         batch_size = len(x)
+         if not self.learnt:
+             x = rearrange(x, "b c t -> (b c) () t")
+         y = self.conv(x)
+         if not self.learnt:
+             y = rearrange(y, "(b c) () t -> b c t", b=batch_size)
+         return y
+ 
+ 
+ class ConvTrUpsample1d(nn.Module):
+     """
+     Upsample by some integer amount `stride` using transposed convolutions.
+     """
+ 
+     def __init__(
+         self,
+         stride: int,
+         dimension: tp.Optional[int] = None,
+         causal: bool = False,
+         learnt: bool = False,
+         channel_wise: bool = False,
+     ):
+         super().__init__()
+         self.learnt = learnt
+         self.channel_wise = channel_wise
+         groups = 1
+         if learnt:
+             assert dimension is not None, "Dimension required for learnt convolutions."
+             in_channels = dimension
+             out_channels = dimension
+             if channel_wise:
+                 groups = dimension
+         else:
+             in_channels = 1
+             out_channels = 1
+ 
+         self.convtr = StreamingConvTranspose1d(
+             in_channels,
+             out_channels,
+             kernel_size=2 * stride,
+             stride=stride,
+             causal=causal,
+             groups=groups,
+             bias=False,
+         )
+         if not learnt:
+             actual_convtr = self.convtr.convtr.convtr
+             actual_convtr.weight.requires_grad_(False)
+             actual_convtr.weight.data.fill_(1.0)
+ 
+     def forward(self, x: torch.Tensor):
+         batch_size = len(x)
+         if not self.learnt:
+             x = rearrange(x, "b c t -> (b c) () t")
+         y = self.convtr(x)
+         if not self.learnt:
+             x_for_normalization = torch.ones_like(x[:1])
+             normalization = self.convtr(x_for_normalization)
+             y = y / normalization
+             y = rearrange(y, "(b c) () t -> b c t", b=batch_size)
+         return y
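With `learnt=False`, `ConvDownsample1d` freezes the kernel at `1.0 / (2 * stride)`, so each output frame is the mean of a window of `2 * stride` input samples. A pure-Python sketch of that behaviour on a flat signal, ignoring the causal padding that `StreamingConv1d` adds (`avg_downsample` is a hypothetical illustration, not part of the module):

```python
def avg_downsample(xs, stride):
    # Each output is the mean of a 2*stride window, hopping by `stride` --
    # what the frozen 1/(2*stride) kernel computes on the interior samples.
    k = 2 * stride
    return [sum(xs[i:i + k]) / k for i in range(0, len(xs) - k + 1, stride)]


print(avg_downsample([1, 1, 3, 3, 5, 5, 7, 7], 2))  # [2.0, 4.0, 6.0]
```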
moshi/modules/rope.py ADDED
@@ -0,0 +1,90 @@
+ # Copyright (c) Kyutai, all rights reserved.
+ # This source code is licensed under the license found in the
+ # LICENSE file in the root directory of this source tree.
+ 
+ import math
+ 
+ import torch
+ from torch import nn
+ 
+ from ..utils.compile import torch_compile_lazy
+ 
+ 
+ @torch_compile_lazy
+ def apply_rope(
+     q: torch.Tensor,
+     k: torch.Tensor,
+     offset: torch.Tensor,
+     max_period: float = 10_000,
+     time_before_heads: bool = False,
+ ):
+     """
+     Args:
+         q (torch.Tensor): queries, shape `[B, T, H, D]`.
+         k (torch.Tensor): keys, shape `[B, T, H, D]`.
+         offset (torch.Tensor): current offset, e.g. when streaming.
+         max_period (float): maximum period for the cos and sin.
+         time_before_heads (bool): if True, the expected shape is [B, T, H, D], else [B, H, T, D].
+     """
+ 
+     if time_before_heads:
+         B, T, H, D = q.shape
+     else:
+         B, H, T, D = q.shape
+     assert k.shape == q.shape
+     assert D > 0
+     assert D % 2 == 0
+     assert max_period > 0
+ 
+     ds = torch.arange(D // 2, device=q.device, dtype=torch.float32)
+     freqs = torch.exp(ds * (-math.log(max_period) * 2 / D))
+     ts = offset.float() + torch.arange(T, device=q.device, dtype=torch.float32)
+     if time_before_heads:
+         ts = ts.view(-1, 1, 1)
+     else:
+         ts = ts.view(1, -1, 1)
+ 
+     dims = q.shape[:-1]
+     q = q.view(*dims, D // 2, 2)
+     k = k.view(*dims, D // 2, 2)
+ 
+     # Convention: the `r` suffix is the real part, `i` the imaginary part.
+     qr = q[..., 0].float()
+     qi = q[..., 1].float()
+ 
+     kr = k[..., 0].float()
+     ki = k[..., 1].float()
+ 
+     rotr = torch.cos(freqs * ts)
+     roti = torch.sin(freqs * ts)
+     qor = qr * rotr - qi * roti
+     qoi = qr * roti + qi * rotr
+ 
+     kor = kr * rotr - ki * roti
+     koi = kr * roti + ki * rotr
+ 
+     dtype = q.dtype
+     qo = torch.stack([qor.to(dtype), qoi.to(dtype)], dim=-1)
+     ko = torch.stack([kor.to(dtype), koi.to(dtype)], dim=-1)
+ 
+     return qo.view(*dims, D), ko.view(*dims, D)
+ 
+ 
+ class RotaryEmbedding(nn.Module):
+     """Rotary positional embedding (RoPE) from [Su et al 2022](https://arxiv.org/abs/2104.09864).
+ 
+     Args:
+         max_period (float): Maximum period of the rotation frequencies.
+     """
+ 
+     def __init__(self, max_period: float = 10000.0):
+         super().__init__()
+         self.max_period = max_period
+ 
+     def forward(
+         self,
+         q: torch.Tensor,
+         k: torch.Tensor,
+         offset: torch.Tensor,
+         time_before_heads: bool = False,
+     ):
+         """Apply the rope rotation to the query and key tensors."""
+         return apply_rope(q, k, offset, self.max_period, time_before_heads)
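`apply_rope` rotates each (real, imaginary) channel pair by an angle `freqs * t`, which is exactly complex multiplication by `exp(i * freq * t)`; in particular the rotation never changes a pair's magnitude. A plain-Python sketch of one channel pair (a hypothetical 1-D toy, not the tensor kernel above):

```python
import cmath
import math


def rope_rotate(pair, t, d_index, dim, max_period=10000.0):
    # One channel pair (q_r, q_i) rotated by angle freq * t, where
    # freq = max_period ** (-2 * d_index / dim), matching `freqs` above.
    freq = math.exp(-math.log(max_period) * 2 * d_index / dim)
    z = complex(*pair) * cmath.exp(1j * freq * t)
    return (z.real, z.imag)


qr, qi = rope_rotate((1.0, 0.0), t=3, d_index=0, dim=64)
# Rotation preserves the magnitude of the pair.
print(round(math.hypot(qr, qi), 6))  # 1.0
```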
moshi/modules/seanet.py ADDED
@@ -0,0 +1,395 @@
+ # Copyright (c) Kyutai, all rights reserved.
+ # This source code is licensed under the license found in the
+ # LICENSE file in the root directory of this source tree.
+ 
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
+ # All rights reserved.
+ #
+ # This source code is licensed under the license found in the
+ # LICENSE file in the root directory of this source tree.
+ 
+ import typing as tp
+ 
+ import numpy as np
+ import torch.nn as nn
+ 
+ from .conv import StreamingConv1d, StreamingConvTranspose1d
+ from .streaming import StreamingContainer, StreamingAdd
+ from ..utils.compile import torch_compile_lazy
+ 
+ 
+ class SEANetResnetBlock(StreamingContainer):
+     """Residual block from SEANet model.
+ 
+     Args:
+         dim (int): Dimension of the input/output.
+         kernel_sizes (list): List of kernel sizes for the convolutions.
+         dilations (list): List of dilations for the convolutions.
+         activation (str): Activation function.
+         activation_params (dict): Parameters to provide to the activation function.
+         norm (str): Normalization method.
+         norm_params (dict): Parameters to provide to the underlying normalization used along with the convolution.
+         causal (bool): Whether to use fully causal convolution.
+         pad_mode (str): Padding mode for the convolutions.
+         compress (int): Reduced dimensionality in residual branches (from Demucs v3).
+         true_skip (bool): Whether to use true skip connection or a simple
+             (streamable) convolution as the skip connection.
+     """
+ 
+     def __init__(
+         self,
+         dim: int,
+         kernel_sizes: tp.List[int] = [3, 1],
+         dilations: tp.List[int] = [1, 1],
+         activation: str = "ELU",
+         activation_params: dict = {"alpha": 1.0},
+         norm: str = "none",
+         norm_params: tp.Dict[str, tp.Any] = {},
+         causal: bool = False,
+         pad_mode: str = "reflect",
+         compress: int = 2,
+         true_skip: bool = True,
+     ):
+         super().__init__()
+         assert len(kernel_sizes) == len(
+             dilations
+         ), "Number of kernel sizes should match number of dilations"
+         act = getattr(nn, activation)
+         hidden = dim // compress
+         block = []
+         for i, (kernel_size, dilation) in enumerate(zip(kernel_sizes, dilations)):
+             in_chs = dim if i == 0 else hidden
+             out_chs = dim if i == len(kernel_sizes) - 1 else hidden
+             block += [
+                 act(**activation_params),
+                 StreamingConv1d(
+                     in_chs,
+                     out_chs,
+                     kernel_size=kernel_size,
+                     dilation=dilation,
+                     norm=norm,
+                     norm_kwargs=norm_params,
+                     causal=causal,
+                     pad_mode=pad_mode,
+                 ),
+             ]
+         self.block = nn.Sequential(*block)
+         self.add = StreamingAdd()
+         self.shortcut: nn.Module
+         if true_skip:
+             self.shortcut = nn.Identity()
+         else:
+             self.shortcut = StreamingConv1d(
+                 dim,
+                 dim,
+                 kernel_size=1,
+                 norm=norm,
+                 norm_kwargs=norm_params,
+                 causal=causal,
+                 pad_mode=pad_mode,
+             )
+ 
+     def forward(self, x):
+         u, v = self.shortcut(x), self.block(x)
+         return self.add(u, v)
+ 
+ 
+ class SEANetEncoder(StreamingContainer):
+     """SEANet encoder.
+ 
+     Args:
+         channels (int): Audio channels.
+         dimension (int): Intermediate representation dimension.
+         n_filters (int): Base width for the model.
+         n_residual_layers (int): Number of residual layers.
+         ratios (Sequence[int]): kernel size and stride ratios. The encoder uses downsampling ratios instead of
+             upsampling ratios, hence it will use the ratios in reverse order from the ones specified here,
+             which must match the decoder order. We use the decoder order as some models may only employ the decoder.
+         activation (str): Activation function.
+         activation_params (dict): Parameters to provide to the activation function.
+         norm (str): Normalization method.
+         norm_params (dict): Parameters to provide to the underlying normalization used along with the convolution.
+         kernel_size (int): Kernel size for the initial convolution.
+         last_kernel_size (int): Kernel size for the final convolution.
+         residual_kernel_size (int): Kernel size for the residual layers.
+         dilation_base (int): How much to increase the dilation with each layer.
+         causal (bool): Whether to use fully causal convolution.
+         pad_mode (str): Padding mode for the convolutions.
+         true_skip (bool): Whether to use true skip connection or a simple
+             (streamable) convolution as the skip connection in the residual network blocks.
+         compress (int): Reduced dimensionality in residual branches (from Demucs v3).
+         disable_norm_outer_blocks (int): Number of blocks for which we don't apply norm.
+             For the encoder, it corresponds to the N first blocks.
+         mask_fn (nn.Module): Optional mask function to apply after convolution layers.
+         mask_position (int): Position of the mask function, with mask_position == 0 for the first convolution layer,
+             mask_position == 1 for the first conv block, etc.
+     """
+ 
+     def __init__(
+         self,
+         channels: int = 1,
+         dimension: int = 128,
+         n_filters: int = 32,
+         n_residual_layers: int = 3,
+         ratios: tp.List[int] = [8, 5, 4, 2],
+         activation: str = "ELU",
+         activation_params: dict = {"alpha": 1.0},
+         norm: str = "none",
+         norm_params: tp.Dict[str, tp.Any] = {},
+         kernel_size: int = 7,
+         last_kernel_size: int = 7,
+         residual_kernel_size: int = 3,
+         dilation_base: int = 2,
+         causal: bool = False,
+         pad_mode: str = "reflect",
+         true_skip: bool = True,
+         compress: int = 2,
+         disable_norm_outer_blocks: int = 0,
+         mask_fn: tp.Optional[nn.Module] = None,
+         mask_position: tp.Optional[int] = None,
+     ):
+         super().__init__()
+         self.channels = channels
+         self.dimension = dimension
+         self.n_filters = n_filters
+         self.ratios = list(reversed(ratios))
+         del ratios
+         self.n_residual_layers = n_residual_layers
+         self.hop_length = int(np.prod(self.ratios))
+         self.n_blocks = len(self.ratios) + 2  # first and last conv + residual blocks
+         self.disable_norm_outer_blocks = disable_norm_outer_blocks
+         assert (
+             self.disable_norm_outer_blocks >= 0
+             and self.disable_norm_outer_blocks <= self.n_blocks
+         ), (
+             "Number of blocks for which to disable norm is invalid. "
+             "It should be lower or equal to the actual number of blocks in the network and greater or equal to 0."
+         )
+ 
+         act = getattr(nn, activation)
+         mult = 1
+         model: tp.List[nn.Module] = [
+             StreamingConv1d(
+                 channels,
+                 mult * n_filters,
+                 kernel_size,
+                 norm="none" if self.disable_norm_outer_blocks >= 1 else norm,
+                 norm_kwargs=norm_params,
+                 causal=causal,
+                 pad_mode=pad_mode,
+             )
+         ]
+         if mask_fn is not None and mask_position == 0:
+             model += [mask_fn]
+         # Downsample to raw audio scale
+         for i, ratio in enumerate(self.ratios):
+             block_norm = "none" if self.disable_norm_outer_blocks >= i + 2 else norm
+             # Add residual layers
+             for j in range(n_residual_layers):
+                 model += [
+                     SEANetResnetBlock(
+                         mult * n_filters,
+                         kernel_sizes=[residual_kernel_size, 1],
+                         dilations=[dilation_base**j, 1],
+                         norm=block_norm,
+                         norm_params=norm_params,
+                         activation=activation,
+                         activation_params=activation_params,
+                         causal=causal,
+                         pad_mode=pad_mode,
+                         compress=compress,
+                         true_skip=true_skip,
+                     )
+                 ]
+ 
+             # Add downsampling layers
+             model += [
+                 act(**activation_params),
+                 StreamingConv1d(
+                     mult * n_filters,
+                     mult * n_filters * 2,
+                     kernel_size=ratio * 2,
+                     stride=ratio,
+                     norm=block_norm,
+                     norm_kwargs=norm_params,
+                     causal=causal,
+                     pad_mode=pad_mode,
+                 ),
+             ]
+             mult *= 2
+             if mask_fn is not None and mask_position == i + 1:
+                 model += [mask_fn]
+ 
+         model += [
+             act(**activation_params),
+             StreamingConv1d(
+                 mult * n_filters,
+                 dimension,
+                 last_kernel_size,
+                 norm=(
+                     "none" if self.disable_norm_outer_blocks == self.n_blocks else norm
+                 ),
+                 norm_kwargs=norm_params,
+                 causal=causal,
+                 pad_mode=pad_mode,
+             ),
+         ]
+ 
+         self.model = nn.Sequential(*model)
+ 
+     @torch_compile_lazy
+     def forward(self, x):
+         return self.model(x)
+ 
+ 
+ class SEANetDecoder(StreamingContainer):
+     """SEANet decoder.
+ 
+     Args:
+         channels (int): Audio channels.
+         dimension (int): Intermediate representation dimension.
+         n_filters (int): Base width for the model.
+         n_residual_layers (int): Number of residual layers.
+         ratios (Sequence[int]): kernel size and stride ratios.
+         activation (str): Activation function.
+         activation_params (dict): Parameters to provide to the activation function.
+         final_activation (str): Final activation function after all convolutions.
+         final_activation_params (dict): Parameters to provide to the activation function.
+         norm (str): Normalization method.
+         norm_params (dict): Parameters to provide to the underlying normalization used along with the convolution.
+         kernel_size (int): Kernel size for the initial convolution.
+         last_kernel_size (int): Kernel size for the final convolution.
+         residual_kernel_size (int): Kernel size for the residual layers.
+         dilation_base (int): How much to increase the dilation with each layer.
+         causal (bool): Whether to use fully causal convolution.
+         pad_mode (str): Padding mode for the convolutions.
+         true_skip (bool): Whether to use true skip connection or a simple
+             (streamable) convolution as the skip connection in the residual network blocks.
+         compress (int): Reduced dimensionality in residual branches (from Demucs v3).
+         disable_norm_outer_blocks (int): Number of blocks for which we don't apply norm.
+             For the decoder, it corresponds to the N last blocks.
+         trim_right_ratio (float): Ratio for trimming at the right of the transposed convolution under the causal setup.
+             If equal to 1.0, it means that all the trimming is done at the right.
+     """
+ 
+     def __init__(
+         self,
+         channels: int = 1,
+         dimension: int = 128,
+         n_filters: int = 32,
+         n_residual_layers: int = 3,
+         ratios: tp.List[int] = [8, 5, 4, 2],
+         activation: str = "ELU",
+         activation_params: dict = {"alpha": 1.0},
+         final_activation: tp.Optional[str] = None,
+         final_activation_params: tp.Optional[dict] = None,
+         norm: str = "none",
+         norm_params: tp.Dict[str, tp.Any] = {},
+         kernel_size: int = 7,
+         last_kernel_size: int = 7,
289
+ residual_kernel_size: int = 3,
290
+ dilation_base: int = 2,
291
+ causal: bool = False,
292
+ pad_mode: str = "reflect",
293
+ true_skip: bool = True,
294
+ compress: int = 2,
295
+ disable_norm_outer_blocks: int = 0,
296
+ trim_right_ratio: float = 1.0,
297
+ ):
298
+ super().__init__()
299
+ self.dimension = dimension
300
+ self.channels = channels
301
+ self.n_filters = n_filters
302
+ self.ratios = ratios
303
+ del ratios
304
+ self.n_residual_layers = n_residual_layers
305
+ self.hop_length = int(np.prod(self.ratios))
306
+ self.n_blocks = len(self.ratios) + 2 # first and last conv + residual blocks
307
+ self.disable_norm_outer_blocks = disable_norm_outer_blocks
308
+ assert (
309
+ self.disable_norm_outer_blocks >= 0 and self.disable_norm_outer_blocks <= self.n_blocks
310
+ ), (
311
+ "Number of blocks for which to disable norm is invalid."
312
+ "It should be lower or equal to the actual number of blocks in the network and greater or equal to 0."
313
+ )
314
+
315
+ act = getattr(nn, activation)
316
+ mult = int(2 ** len(self.ratios))
317
+ model: tp.List[nn.Module] = [
318
+ StreamingConv1d(
319
+ dimension,
320
+ mult * n_filters,
321
+ kernel_size,
322
+ norm=(
323
+ "none" if self.disable_norm_outer_blocks == self.n_blocks else norm
324
+ ),
325
+ norm_kwargs=norm_params,
326
+ causal=causal,
327
+ pad_mode=pad_mode,
328
+ )
329
+ ]
330
+
331
+ # Upsample to raw audio scale
332
+ for i, ratio in enumerate(self.ratios):
333
+ block_norm = (
334
+ "none"
335
+ if self.disable_norm_outer_blocks >= self.n_blocks - (i + 1)
336
+ else norm
337
+ )
338
+ # Add upsampling layers
339
+ model += [
340
+ act(**activation_params),
341
+ StreamingConvTranspose1d(
342
+ mult * n_filters,
343
+ mult * n_filters // 2,
344
+ kernel_size=ratio * 2,
345
+ stride=ratio,
346
+ norm=block_norm,
347
+ norm_kwargs=norm_params,
348
+ causal=causal,
349
+ trim_right_ratio=trim_right_ratio,
350
+ ),
351
+ ]
352
+ # Add residual layers
353
+ for j in range(n_residual_layers):
354
+ model += [
355
+ SEANetResnetBlock(
356
+ mult * n_filters // 2,
357
+ kernel_sizes=[residual_kernel_size, 1],
358
+ dilations=[dilation_base**j, 1],
359
+ activation=activation,
360
+ activation_params=activation_params,
361
+ norm=block_norm,
362
+ norm_params=norm_params,
363
+ causal=causal,
364
+ pad_mode=pad_mode,
365
+ compress=compress,
366
+ true_skip=true_skip,
367
+ )
368
+ ]
369
+
370
+ mult //= 2
371
+
372
+ # Add final layers
373
+ model += [
374
+ act(**activation_params),
375
+ StreamingConv1d(
376
+ n_filters,
377
+ channels,
378
+ last_kernel_size,
379
+ norm="none" if self.disable_norm_outer_blocks >= 1 else norm,
380
+ norm_kwargs=norm_params,
381
+ causal=causal,
382
+ pad_mode=pad_mode,
383
+ ),
384
+ ]
385
+ # Add optional final activation to decoder (eg. tanh)
386
+ if final_activation is not None:
387
+ final_act = getattr(nn, final_activation)
388
+ final_activation_params = final_activation_params or {}
389
+ model += [final_act(**final_activation_params)]
390
+ self.model = nn.Sequential(*model)
391
+
392
+ @torch_compile_lazy
393
+ def forward(self, z):
394
+ y = self.model(z)
395
+ return y
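The decoder's `hop_length` is the product of the stride ratios, so each latent frame maps to that many output samples. A minimal sketch of the arithmetic, assuming the default `ratios`; the 24 kHz sample rate is an illustrative assumption, not taken from this file:

```python
import math

# Default `ratios` of SEANetDecoder; hop_length matches int(np.prod(self.ratios)).
ratios = [8, 5, 4, 2]
hop_length = math.prod(ratios)   # audio samples produced per latent frame

sample_rate = 24_000             # assumed sample rate, for illustration only
frame_rate = sample_rate / hop_length

print(hop_length, frame_rate)    # 320 samples per frame, 75.0 frames per second
```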
moshi/modules/streaming.py ADDED
@@ -0,0 +1,363 @@
+ # Copyright (c) Kyutai, all rights reserved.
+ # This source code is licensed under the license found in the
+ # LICENSE file in the root directory of this source tree.
+
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
+ # All rights reserved.
+ #
+ # This source code is licensed under the license found in the
+ # LICENSE file in the root directory of this source tree.
+
+ """
+ Streaming module API that should be implemented by all streaming components.
+ """
+
+ import abc
+ from contextlib import contextmanager
+ from dataclasses import dataclass
+ import itertools
+ import math
+ import typing as tp
+ from torch import nn
+ import torch
+
+
+ class Resetable(tp.Protocol):
+ def reset(self) -> None:
+ pass
+
+
+ State = tp.TypeVar("State", bound=Resetable)
+
+
+ class StreamingModule(abc.ABC, nn.Module, tp.Generic[State]):
+ """Common API for streaming components.
+
+ Each streaming component has a streaming state, which is just a dict[str, Tensor].
+ By convention, the first dim of each tensor must be the batch size.
+ Don't use dots in the key names, as this would clash with submodules
+ (like in state_dict).
+
+ If `self.is_streaming` is True, the component should use and remember
+ the proper state inside `self._streaming_state`.
+
+ To set a streaming component in streaming state, use
+
+ with module.streaming(batch_size):
+ ...
+
+ This will automatically reset the streaming state when exiting the context manager.
+ This also automatically propagates to all streaming children modules.
+
+ Some modules might also implement the `StreamingModule.flush` method, although
+ this one is trickier, as all parent modules must be `StreamingModule` and implement
+ it as well for it to work properly. See `StreamingSequential` below.
+ """
+
+ def __init__(self) -> None:
+ super().__init__()
+ self._streaming_state: State | None = None
+ self._streaming_propagate: bool = True
+
+ @property
+ def is_streaming(self):
+ return self._streaming_state is not None
+
+ def set_streaming_propagate(self, streaming_propagate: bool):
+ self._streaming_propagate = streaming_propagate
+
+ def _apply_named_streaming(self, fn: tp.Any):
+ def _handle_module(prefix: str, module: nn.Module, recurse: bool = True):
+ propagate = True
+ if isinstance(module, StreamingModule):
+ if module._streaming_propagate:
+ fn(prefix, module)
+ else:
+ propagate = False
+ if not recurse:
+ return
+ if propagate:
+ for name, child in module.named_children():
+ _handle_module(prefix + "." + name, child)
+
+ _handle_module("", self, recurse=False)
+ for name, child in self.named_children():
+ _handle_module(name, child)
+
+ def _start_streaming(self, batch_size: int):
+ def _start_streaming(name: str, module: StreamingModule):
+ module._streaming_state = module._init_streaming_state(batch_size)
+
+ self._apply_named_streaming(_start_streaming)
+
+ def _stop_streaming(self):
+ def _stop_streaming(name: str, module: StreamingModule):
+ module._streaming_state = None
+
+ self._apply_named_streaming(_stop_streaming)
+
+ @abc.abstractmethod
+ def _init_streaming_state(self, batch_size: int) -> State: ...
+
+ def streaming_forever(self, batch_size: int):
+ self._start_streaming(batch_size)
+
+ @contextmanager
+ def streaming(self, batch_size: int):
+ """Context manager to enter streaming mode. Resets the streaming state on exit."""
+
+ self._start_streaming(batch_size)
+ try:
+ yield
+ finally:
+ self._stop_streaming()
+
+ def reset_streaming(self):
+ """Reset the streaming state."""
+
+ def _reset(name: str, module: StreamingModule):
+ state = module._streaming_state
+ if state is None:
+ raise ValueError(
+ f"Trying to reset streaming, but {name} wasn't streaming."
+ )
+ state.reset()
+
+ self._apply_named_streaming(_reset)
+
+ def get_streaming_state(self) -> dict[str, tp.Any]:
+ """Return the complete streaming state, including that of sub-modules."""
+ state: dict[str, tp.Any] = {}
+
+ def _add(name: str, module: StreamingModule):
+ state[name] = module._streaming_state
+
+ self._apply_named_streaming(_add)
+ return state
+
+ def set_streaming_state(self, state: dict[str, tp.Any]):
+ """Set the streaming state, including that of sub-modules."""
+ state = dict(state)
+
+ def _set(name: str, module: StreamingModule):
+ if name in state:
+ module._streaming_state = state[name]
+ state.pop(name)
+ else:
+ raise RuntimeError(f"Expected to find a streaming state for {name}.")
+
+ self._apply_named_streaming(_set)
+ if state:
+ raise RuntimeError(f"Some states were not consumed: {list(state.keys())}")
+
+
+ @dataclass
+ class _NullState:
+ pass
+
+ def reset(self) -> None:
+ pass
+
+
+ class StreamingContainer(StreamingModule[_NullState]):
+ def _init_streaming_state(self, batch_size: int) -> _NullState:
+ return _NullState()
+
+
+ @dataclass
+ class _StreamingAddState:
+ previous_x: torch.Tensor | None = None
+ previous_y: torch.Tensor | None = None
+
+ def reset(self):
+ self.previous_x = None
+ self.previous_y = None
+
+
+ class StreamingAdd(StreamingModule[_StreamingAddState]):
+ def _init_streaming_state(self, batch_size: int) -> _StreamingAddState:
+ return _StreamingAddState()
+
+ def forward(self, x: torch.Tensor, y: torch.Tensor):
+ if self._streaming_state is None:
+ return x + y
+ else:
+ prev_x = self._streaming_state.previous_x
+ prev_y = self._streaming_state.previous_y
+ if prev_x is not None:
+ x = torch.cat([prev_x, x], dim=-1)
+ if prev_y is not None:
+ y = torch.cat([prev_y, y], dim=-1)
+ m_l = min(x.shape[-1], y.shape[-1])
+ self._streaming_state.previous_x = x[..., m_l:]
+ self._streaming_state.previous_y = y[..., m_l:]
+ return x[..., :m_l] + y[..., :m_l]
+
+
+ @dataclass
+ class _StreamingConvState:
+ previous: torch.Tensor | None = None
+
+ def reset(self):
+ self.previous = None
+
+
+ class RawStreamingConv1d(nn.Conv1d, StreamingModule[_StreamingConvState]):
+ def __init__(self, *args, **kwargs):
+ super().__init__(*args, **kwargs)
+ assert self.padding[0] == 0, "Padding should be handled outside."
+ assert (
+ self.stride[0] <= self.kernel_size[0]
+ ), "stride must not exceed kernel_size."
+
+ def _init_streaming_state(self, batch_size: int) -> _StreamingConvState:
+ return _StreamingConvState()
+
+ def forward(self, input: torch.Tensor) -> torch.Tensor:
+ stride = self.stride[0]
+ # Effective kernel size accounting for dilation.
+ kernel = (self.kernel_size[0] - 1) * self.dilation[0] + 1
+ if self._streaming_state is None:
+ return super().forward(input)
+ else:
+ # Due to the potential overlap, we might have some cache of the previous time steps.
+ previous = self._streaming_state.previous
+ if previous is not None:
+ input = torch.cat([previous, input], dim=-1)
+ B, C, T = input.shape
+ # We now compute the number of full convolution frames, i.e. the frames
+ # that are ready to be computed.
+ num_frames = max(0, int(math.floor((T - kernel) / stride) + 1))
+ offset = num_frames * stride
+ # We will compute `num_frames` outputs, and we are advancing by `stride`
+ # for each of the frames, so we know the data before `stride * num_frames`
+ # will never be used again.
+ self._streaming_state.previous = input[..., offset:]
+ if num_frames > 0:
+ input_length = (num_frames - 1) * stride + kernel
+ out = super().forward(input[..., :input_length])
+ else:
+ # Not enough data at this point to output any new frames.
+ out = torch.empty(
+ B, self.out_channels, 0, device=input.device, dtype=input.dtype
+ )
+ return out
+
+
+ @dataclass
+ class _StreamingConvTrState:
+ partial: torch.Tensor | None = None
+
+ def reset(self):
+ self.partial = None
+
+
+ class RawStreamingConvTranspose1d(
+ nn.ConvTranspose1d, StreamingModule[_StreamingConvTrState]
+ ):
+ def __init__(self, *args, **kwargs):
+ super().__init__(*args, **kwargs)
+ assert self.padding[0] == 0, "Padding should be handled outside."
+ assert self.dilation[0] == 1, "No dilation for now"
+ assert (
+ self.stride[0] <= self.kernel_size[0]
+ ), "stride must not exceed kernel_size."
+ assert self.output_padding[0] == 0, "Output padding not supported."
+
+ def _init_streaming_state(self, batch_size: int) -> _StreamingConvTrState:
+ return _StreamingConvTrState()
+
+ def forward(self, x: torch.Tensor) -> torch.Tensor:  # type: ignore
+ B, C, T = x.shape
+ stride = self.stride[0]
+ kernel = self.kernel_size[0]
+ if self._streaming_state is None:
+ return super().forward(x)
+ else:
+ if T == 0:
+ return torch.empty(
+ B, self.out_channels, 0, device=x.device, dtype=x.dtype
+ )
+ out = super().forward(x)
+ OT = out.shape[-1]
+ partial = self._streaming_state.partial
+ if partial is not None:
+ # Due to the potential overlap, the rightmost output of the conv transpose is not
+ # ready to be output, as it will receive contributions from the next input frames.
+ # Here we recover those `partial` output frames. We know that the first time step
+ # of the `partial` tensor corresponds to the first time step of `out`, as anything
+ # coming before the first time step of `out` would already have been flushed.
+ PT = partial.shape[-1]
+ if self.bias is not None:
+ out[..., :PT] += partial - self.bias[:, None]
+ else:
+ out[..., :PT] += partial
+ # The input is T, the output is S * (T - 1) + K.
+ # The offset of the left of the next frame will be S * T,
+ # so everything between 0 and S * T is ready to be output, and we need
+ # to keep in the internal state everything beyond that, i.e. S (T - 1) + K - S T = K - S.
+ invalid_steps = kernel - stride
+ partial = out[..., OT - invalid_steps :]
+ out = out[..., : OT - invalid_steps]
+ self._streaming_state.partial = partial
+ return out
+
+
+ def test():
+ torch.manual_seed(1234)
+ device = "cpu"
+ if torch.cuda.is_available():
+ # Avoid the cuda optimizations that would take place on single precision
+ # floats for convolutions.
+ torch.backends.cudnn.enabled = True
+ torch.backends.cudnn.benchmark = False
+ torch.backends.cudnn.deterministic = True
+ torch.backends.cuda.matmul.allow_tf32 = False
+ torch.backends.cudnn.allow_tf32 = False
+ device = "cuda:0"
+
+ kernel_sizes = [1, 3, 4, 8, 15, 16]
+ strides = [1, 2, 3, 4, 5, 6, 7, 8, 9]
+ chin = 6
+ chout = 12
+
+ for kernel, stride in itertools.product(kernel_sizes, strides):
+ if stride > kernel:
+ continue
+ conv = RawStreamingConv1d(chin, chout, kernel, stride).to(device)
+ convtr = RawStreamingConvTranspose1d(chout, chin, kernel, stride).to(device)
+
+ for length in [4, 8, 32, 54, 65, 128, 1043]:
+ print(f"ksize {kernel} strides {stride} len {length}")
+ if length < kernel:
+ continue
+ batch_size = 3
+ x = torch.randn(batch_size, chin, length).to(device)
+ y = conv(x)
+ z = convtr(y)
+ for chunk_size in [1, 3, 5, 8]:
+ ys = []
+ zs = []
+ with conv.streaming(batch_size), convtr.streaming(batch_size):
+ for offset in range(0, length, chunk_size):
+ chunk = x[..., offset : offset + chunk_size]
+ ys.append(conv(chunk))
+ zs.append(convtr(ys[-1]))
+ y_stream = torch.cat(ys, dim=-1)
+ z_stream = torch.cat(zs, dim=-1)
+ y = y[..., : y_stream.shape[-1]]
+ z = z[..., : z_stream.shape[-1]]
+ assert y.shape == y_stream.shape, (y.shape, y_stream.shape)
+ delta = (y_stream - y).norm() / y.norm()
+ assert delta <= 1e-6, delta
+ num_frames = int((length - kernel) / stride) + 1
+ assert num_frames == y_stream.shape[-1]
+
+ assert z.shape == z_stream.shape, (z.shape, z_stream.shape)
+ delta = (z_stream - z).norm() / z.norm()
+ assert delta <= 1e-6, (delta, (z_stream - z).abs().mean(dim=(0, 1)))
+
+
+ if __name__ == "__main__":
+ with torch.no_grad():
+ test()
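The `num_frames` computation in `RawStreamingConv1d.forward` is the standard valid-convolution frame count, with the kernel size widened by dilation. A small pure-Python sketch (no torch required) checks the closed-form count against a brute-force sliding window:

```python
import itertools

def num_frames(T: int, kernel: int, stride: int, dilation: int = 1) -> int:
    # Effective kernel size accounting for dilation, as in the module.
    k_eff = (kernel - 1) * dilation + 1
    # Closed form used by RawStreamingConv1d: floor((T - k_eff) / stride) + 1.
    return max(0, (T - k_eff) // stride + 1)

def brute_force(T: int, kernel: int, stride: int, dilation: int = 1) -> int:
    # Count windows of effective size k_eff that fully fit in T samples.
    k_eff = (kernel - 1) * dilation + 1
    return sum(1 for start in range(0, max(T, 1), stride) if start + k_eff <= T)

for T, k, s in itertools.product(range(0, 40), [1, 3, 4, 8], [1, 2, 3, 4]):
    assert num_frames(T, k, s) == brute_force(T, k, s), (T, k, s)

print("frame counts verified")
```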
moshi/modules/transformer.py ADDED
@@ -0,0 +1,750 @@
+ # Copyright (c) Kyutai, all rights reserved.
+ # This source code is licensed under the license found in the
+ # LICENSE file in the root directory of this source tree.
+
+ """
+ Transformer model, with streaming support and CUDA Graph compatibility.
+ Optimized for inference.
+
+ See `StreamingTransformer` for more information.
+ """
+
+ from contextlib import ExitStack
+ from dataclasses import dataclass
+ import typing as tp
+
+ from einops import rearrange
+ import torch
+ import torch.nn as nn
+ from torch.nn import functional as F
+
+ from ..utils.compile import no_compile
+ from .gating import make_gating
+ from .rope import RotaryEmbedding
+ from .streaming import StreamingModule, StreamingContainer
+
+
+ class LayerNormF32(nn.LayerNorm):
+ def forward(self, input: torch.Tensor) -> torch.Tensor:
+ x_f32 = input.float()
+ out_f32 = super().forward(x_f32)
+ return out_f32.to(input.dtype)
+
+
+ def _rms_norm(
+ x: torch.Tensor,
+ alpha: torch.Tensor,
+ dtype: tp.Optional[torch.dtype],
+ eps: float,
+ ):
+ assert x.dim() == 3, f"RMSNorm expects 3D inputs but got {x.shape}"
+ x_dtype = x.dtype
+ if dtype is not None:
+ x = x.to(dtype)
+ var = eps + torch.mean(x**2, dim=2, keepdim=True)
+ y = (x * (alpha.to(var) * torch.rsqrt(var))).to(x_dtype)
+ return y
+
+
+ class RMSNorm(nn.Module):
+ def __init__(
+ self,
+ dim: int,
+ eps: float = 1e-5,
+ dtype: tp.Optional[torch.dtype] = None,
+ device=None,
+ ):
+ super().__init__()
+ self.eps = eps
+ self.dtype = dtype
+ self.alpha = nn.Parameter(
+ torch.full((1, 1, dim), 1.0, requires_grad=True, device=device, dtype=dtype)
+ )
+
+ def forward(self, x: torch.Tensor):
+ return _rms_norm(x, self.alpha, self.dtype, self.eps)
+
+
+ class LayerScale(nn.Module):
+ """Layer scale from [Touvron et al 2021] (https://arxiv.org/pdf/2103.17239.pdf).
+ This diagonally rescales the residual outputs close to 0, with a learnt scale.
+
+ Args:
+ channels (int): Number of channels.
+ init (float): Initial scale.
+ channel_last (bool): If True, expect `[*, C]` shaped tensors, otherwise, `[*, C, T]`.
+ device (torch.device or str, optional): Device on which to initialize the module.
+ dtype (torch.dtype, optional): dtype to use to initialize the module.
+ """
+
+ def __init__(
+ self,
+ channels: int,
+ init: float = 1e-4,
+ channel_last: bool = True,
+ device=None,
+ dtype=None,
+ ):
+ super().__init__()
+ self.channel_last = channel_last
+ self.scale = nn.Parameter(
+ torch.full(
+ (channels,), init, requires_grad=True, device=device, dtype=dtype
+ )
+ )
+
+ def forward(self, x: torch.Tensor):
+ if self.channel_last:
+ return self.scale * x
+ else:
+ return self.scale[:, None] * x
+
+
+ def create_norm_fn(norm_type: str, dim: int, **kwargs) -> nn.Module:
+ """Create normalization module for transformer encoder layer.
+
+ Args:
+ norm_type (str): Normalization method.
+ dim (int): Dimension of the normalized layer.
+ **kwargs (dict): Additional parameters for normalization layer.
+ Returns:
+ nn.Module: Normalization module.
+ """
+ if norm_type == "layer_norm":
+ return nn.LayerNorm(dim, eps=1e-5, **kwargs)
+ elif norm_type == "layer_norm_f32":
+ kwargs.pop("dtype", None)
+ return LayerNormF32(dim, eps=1e-8, **kwargs)
+ elif norm_type in {"rms_norm"}:
+ return RMSNorm(dim, eps=1e-5, **kwargs)
+ elif norm_type in {"rms_norm_f32"}:
+ kwargs.pop("dtype", None)
+ return RMSNorm(dim, eps=1e-8, dtype=torch.float, **kwargs)
+ else:
+ raise ValueError(f"Unknown norm type: {norm_type}")
+
+
+ def create_sin_embedding(
+ positions: torch.Tensor,
+ dim: int,
+ max_period: float = 10000,
+ dtype: torch.dtype = torch.float32,
+ ) -> torch.Tensor:
+ """Create sinusoidal positional embedding, with shape `[B, T, C]`.
+
+ Args:
+ positions (torch.Tensor): LongTensor of positions.
+ dim (int): Dimension of the embedding.
+ max_period (float): Maximum period of the cosine/sine functions.
+ dtype (torch.dtype or str): dtype to use to generate the embedding.
+ Returns:
+ torch.Tensor: Sinusoidal positional embedding.
+ """
+ # We aim for BTC format
+ assert dim % 2 == 0
+ half_dim = dim // 2
+ positions = positions.to(dtype)
+ adim = torch.arange(half_dim, device=positions.device, dtype=dtype).view(1, 1, -1)
+ max_period_tensor = torch.full(
+ [], max_period, device=positions.device, dtype=dtype
+ )  # avoid sync point
+ phase = positions / (max_period_tensor ** (adim / (half_dim - 1)))
+ return torch.cat([torch.cos(phase), torch.sin(phase)], dim=-1)
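For a single position, the phase computation in `create_sin_embedding` reduces to the following pure-Python sketch. It makes the channel layout concrete: the first half of the channels are cosines, the second half sines.

```python
import math

def sin_embedding(position: float, dim: int, max_period: float = 10000.0) -> list[float]:
    # Same formula as create_sin_embedding, for one position; dim must be even
    # and > 2 (the exponent divides by half - 1, matching the torch code).
    assert dim % 2 == 0
    half = dim // 2
    phases = [position / max_period ** (i / (half - 1)) for i in range(half)]
    return [math.cos(p) for p in phases] + [math.sin(p) for p in phases]

print(sin_embedding(0.0, 8))  # cosines are all 1.0, sines all 0.0
```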
+
+
+ def multi_linear(
+ num_linear: int,
+ weight: torch.Tensor,
+ x: torch.Tensor,
+ offset: int,
+ ):
+ """Utility to apply a multi linear layer to the given input. A multi linear layer
+ applies a different set of weights for each time step.
+
+ Args:
+ num_linear (int): Number of possible time steps and so number of linears.
+ weight (torch.Tensor): Weight tensor, with shape `[num_linear * chout, chin]`.
+ x (torch.Tensor): Input tensor, with shape `[B, T, C]`.
+ offset (int): offset for the current time step, in particular for decoding, with
+ time steps provided one by one.
+ """
+ B, T, C = x.shape
+ ys = []
+ chout, chin = weight.shape
+ weight = weight.view(num_linear, -1, chin)
+ for t in range(T):
+ y = F.linear(x[:, t], weight[t + offset])
+ ys.append(y)
+ out = torch.stack(ys, 1)
+ return out
+
+
+ def set_attention_context(model: nn.Module, context: tp.Optional[int] = None) -> None:
+ """Deactivates or changes the context span (in time steps) in a model.
+ Args:
+ model (nn.Module): model over which to look for attentions.
+ context (int or None): new temporary context value.
+
+ .. note:: This is not a context manager but a plain function changing the context forever.
+ Initially, it was a context manager, but that led to interesting bugs when using
+ activation checkpointing, with the context being inconsistent between the forward
+ and backward.
+ """
+ for module in model.modules():
+ if isinstance(module, StreamingMultiheadAttention):
+ module.context = context
+
+
+ class KVCacheResult(tp.NamedTuple):
+ keys: torch.Tensor
+ values: torch.Tensor
+ positions: torch.Tensor
+
+ @staticmethod
+ def from_kv(keys: torch.Tensor, values: torch.Tensor) -> "KVCacheResult":
+ B, H, T, D = keys.shape
+ assert tuple(values.shape[:-1]) == (B, H, T)
+ positions = torch.arange(T, device=keys.device, dtype=torch.long)
+ return KVCacheResult(keys, values, positions)
+
+
+ class RingKVCache:
+ """Efficient streaming KVCache, compatible with CUDA Graphs.
+
+ Args:
+ batch_size (int): Batch size.
+ num_heads (int): Number of heads in the attention.
+ dim_per_head (int): Dimension per head.
+ capacity (int): Maximum number of time steps the cache can hold.
+ device (torch.device): Device on which to initialize the cache.
+ dtype (torch.dtype): dtype to use for the cache.
+ """
+
+ def __init__(
+ self,
+ batch_size: int,
+ num_heads: int,
+ dim_per_head: int,
+ capacity: int,
+ device: torch.device = torch.device("cuda"),
+ dtype: torch.dtype = torch.bfloat16,
+ ):
+ self.capacity = capacity
+ self.cache = torch.zeros(
+ (2, batch_size, num_heads, capacity, dim_per_head),
+ device=device,
+ dtype=dtype,
+ )
+ self.end_offset = torch.zeros(1, device=device, dtype=torch.long)
+
+ def reset(self):
+ self.end_offset.zero_()
+
+ def complete(self, k: torch.Tensor, v: torch.Tensor) -> KVCacheResult:
+ assert k.shape[:-1] == v.shape[:-1], (k.shape, v.shape)
+ B, H, T, D = k.shape
+ indexes = torch.arange(T, device=self.end_offset.device, dtype=self.end_offset.dtype) + self.end_offset
+ indexes = indexes % self.capacity
+ self.cache[0].index_copy_(2, indexes, k)
+ self.cache[1].index_copy_(2, indexes, v)
+ self.end_offset.add_(T)
+
+ keys = self.cache[0]
+ values = self.cache[1]
+
+ indexes = torch.arange(
+ self.capacity, device=self.end_offset.device, dtype=torch.long
+ )
+ invalid = indexes >= self.end_offset
+
+ end_index = self.end_offset % self.capacity
+ delta = indexes - end_index
+
+ # If the last key is for step S, and the capacity is C, the last key was written
+ # at index S % C. Then end_offset = S + 1, and end_index = (S + 1) % C.
+ # For index = (S % C), delta = -1, and the code below gives us:
+ # position(index) = (S + 1) - 1 = S, all good.
+ # Now the time step at end_index is actually the oldest in the KVCache, i.e. its
+ # position should be (S - self.capacity + 1).
+ # The code below gives us:
+ # position(index + 1) = S + 1 + 0 - self.capacity.
+
+ positions = torch.where(
+ delta <= 0,
+ self.end_offset + delta,
+ self.end_offset + delta - self.capacity,
+ )
+ positions = torch.where(invalid, torch.full_like(positions, -1), positions)
+
+ return KVCacheResult(keys, values, positions)
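The position arithmetic of `RingKVCache.complete` can be replayed in pure Python, as a sketch of what the `torch.where` computes per slot. One detail worth noting (my reading of the arithmetic, not a claim from the source): with the `delta <= 0` branch, once the buffer has wrapped, the slot at `end_index` (holding the oldest entry) is reported with position `end_offset`, one step past the newest key, so a causal mask will treat it as future and ignore it.

```python
def slot_positions(end_offset: int, capacity: int) -> list[int]:
    """Absolute time step reported for each ring-buffer slot (-1 if never
    written), mirroring the torch.where arithmetic in RingKVCache.complete."""
    end_index = end_offset % capacity
    positions = []
    for index in range(capacity):
        if index >= end_offset:           # slot never written yet ("invalid")
            positions.append(-1)
            continue
        delta = index - end_index
        if delta <= 0:
            positions.append(end_offset + delta)
        else:
            positions.append(end_offset + delta - capacity)
    return positions

# Before wrapping: positions are just the time steps written so far.
print(slot_positions(2, 4))  # [0, 1, -1, -1]
# After wrapping (6 steps written, capacity 4): slot 2 holds the oldest step,
# but is reported as position 6, one past the newest key (step 5).
print(slot_positions(6, 4))  # [4, 5, 6, 3]
```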
279
+
280
+
281
+ @dataclass
282
+ class _MHAState:
283
+ kv_cache: RingKVCache
284
+ offset: torch.Tensor
285
+ offset_cpu: int
286
+
287
+ def reset(self):
288
+ self.kv_cache.reset()
289
+ self.offset.zero_()
290
+ self.offset_cpu = 0
291
+
292
+
293
+ class StreamingMultiheadAttention(StreamingModule[_MHAState]):
294
+ """Similar to `nn.MultiheadAttention` but with support for streaming, causal evaluation.
295
+
296
+ Args:
297
+ embed_dim (int): Dimension to project to.
298
+ num_heads (int): Number of heads.
299
+ causal (bool): Causal mask applied automatically.
300
+ context (int, optional): Number of time steps the attention can access to.
301
+ When causal, can access `context` time steps into the past, and when non causal,
302
+ can access `context // 2` steps in the past, and the same in the future.
303
+ rope (`RotaryEmbedding`, optional): Rope embedding to use.
304
+ weights_per_step (int): use different weights per time step. If non zero, should correspond to the
305
+ number of possible time steps.
306
+ device (torch.device, optional): Device on which to initialize.
307
+ dtype (torch.dtype, optional): dtype to use.
308
+ """
309
+
310
+ _fsdp_final = True
311
+
312
+ def __init__(
313
+ self,
314
+ embed_dim: int,
315
+ num_heads: int,
316
+ causal: bool = False,
317
+ context: tp.Optional[int] = None,
318
+ rope: tp.Optional[RotaryEmbedding] = None,
319
+ weights_per_step: int = 0,
320
+ device=None,
321
+ dtype=None,
322
+ ):
323
+ super().__init__()
324
+ factory_kwargs = {"device": device, "dtype": dtype}
325
+
326
+ self.embed_dim = embed_dim
327
+ self.causal = causal
328
+ self.context = context
329
+ self.rope = rope
330
+ self.num_heads = num_heads
331
+
332
+ out_dim = embed_dim
333
+ out_dim = 3 * embed_dim
334
+ mult = 1
335
+ self.weights_per_step = weights_per_step
336
+ if weights_per_step:
337
+ mult = weights_per_step
338
+ in_proj = nn.Linear(embed_dim, mult * out_dim, bias=False, **factory_kwargs)
339
+ # We try to follow the default PyTorch MHA convention, to easily compare results.
340
+ self.in_proj_weight = in_proj.weight
341
+ self.in_proj_bias = in_proj.bias
342
+ self.out_proj = nn.Linear(
343
+ embed_dim, mult * embed_dim, bias=False, **factory_kwargs
344
+ )
345
+
346
+ def _init_streaming_state(self, batch_size: int) -> _MHAState:
347
+ if self.context is None:
348
+ if self.weights_per_step:
349
+ capacity = self.weights_per_step
350
+ else:
351
+ raise RuntimeError(
352
+ "Cannot create a streaming KVCache without a context to estimate capacity."
353
+ )
354
+ else:
355
+ capacity = self.context
356
+ device = self.in_proj_weight.device
357
+ # TODO: the following estimation will not work great with FSDP.
358
+ dtype = self.in_proj_weight.dtype
359
+ dim_per_head = self.embed_dim // self.num_heads
360
+ kv_cache = RingKVCache(
361
+ batch_size, self.num_heads, dim_per_head, capacity, device, dtype
362
+ )
363
+ return _MHAState(
364
+ kv_cache,
365
+ offset=torch.zeros(1, device=device, dtype=torch.long),
366
+ offset_cpu=0,
367
+ )
368
+
369
+ def _complete_kv(self, k, v) -> KVCacheResult:
370
+ state = self._streaming_state
371
+ if state is None:
372
+ return KVCacheResult.from_kv(k, v)
373
+ else:
374
+ return state.kv_cache.complete(k, v)
375
+
376
+ def forward(self, query: torch.Tensor, key: torch.Tensor, value: torch.Tensor):
377
+ state = self._streaming_state
378
+ T = query.shape[1]
379
+
380
+ if state is None:
381
+ offset = torch.zeros(1, device=query.device, dtype=torch.long)
382
+ offset_cpu = 0
383
+ else:
384
+ assert self.causal, "Streaming only available for causal"
385
+ offset = state.offset
386
+ offset_cpu = state.offset_cpu
387
+
388
+ if self.weights_per_step:
389
+ projected = multi_linear(
390
+ self.weights_per_step, self.in_proj_weight, query, offset_cpu
391
+ )
392
+ else:
393
+ projected = nn.functional.linear(query, self.in_proj_weight, self.in_proj_bias)
394
+ q, k, v = rearrange(
395
+ projected, "b t (p h d) -> p b h t d", p=3, h=self.num_heads
396
+ )
397
+
398
+ if self.rope:
399
+ q, k = self.rope(q, k, offset, time_before_heads=False)
400
+
401
+ k, v, pos_k = self._complete_kv(k, v)
402
+ if self.causal:
403
+ pos_k = pos_k.view(1, -1)
404
+ pos_q = offset + torch.arange(T, device=q.device, dtype=torch.long).view(
405
+ -1, 1
406
+ )
407
+ delta = pos_q - pos_k
408
+ attn_bias = (pos_k >= 0) & (delta >= 0)
409
+ if self.context is not None:
410
+ attn_bias = attn_bias & (delta < self.context)
411
+ else:
412
+ attn_bias = None
413
+ x = F.scaled_dot_product_attention(q, k, v, attn_bias, dropout_p=0.0)
414
+
415
+ x = rearrange(x, "b h t d -> b t (h d)")
416
+ if self.weights_per_step:
417
+ x = multi_linear(self.weights_per_step, self.out_proj.weight, x, offset_cpu)
418
+ else:
419
+ x = self.out_proj(x)
420
+ if state is not None:
421
+ state.offset.add_(T)
422
+ state.offset_cpu += T
423
+ return x
424
+
425
+
426
+ @dataclass
427
+ class _LayerState:
428
+ offset_cpu: int
429
+
430
+ def reset(self):
431
+ self.offset_cpu = 0
432
+
433
+
434
+ class StreamingTransformerLayer(StreamingModule[_LayerState]):
435
+ """TransformerLayer with Streaming / Causal support.
436
+
437
+ Args:
438
+ d_model (int): Dimension of the data.
439
+ num_heads (int): Number of heads.
440
+ dim_feedforward (int): Intermediate dimension of FF module.
441
+ causal (bool): Causal mask applied automatically.
442
+ context (int, optional): Receptive field for the causal mask, infinite if None.
444
+ rope (`RotaryEmbedding`, optional): Rope embedding to use.
445
+ norm (str): Normalization to use. Currently, only 'layer_norm' is supported.
446
+ layer_scale (float, optional): If not None, LayerScale will be used with the given value as initial scale.
447
+ gating (str): if provided, replaces FFN with special gating, like GLU, GSiGLU etc.
448
+ weights_per_step (int): Use different weights per time step. If non-zero, should correspond to the
449
+ number of possible time steps.
450
+ skip_self_attn (bool): If True, skips the self-attention module and its norm.
451
+ device (torch.device, optional): Device on which to initialize.
452
+ dtype (torch.dtype, optional): dtype to use.
453
+ """
454
+
455
+ _fsdp_final = True
456
+
457
+ def __init__(
458
+ self,
459
+ d_model: int,
460
+ num_heads: int,
461
+ dim_feedforward: int | list[int] = 2048,
462
+ causal: bool = False,
463
+ context: tp.Optional[int] = None,
464
+ rope: tp.Optional[RotaryEmbedding] = None,
465
+ norm: str = "layer_norm",
466
+ layer_scale: tp.Optional[float] = None,
467
+ gating: str = "none",
468
+ weights_per_step: int = 0,
469
+ activation=F.gelu,
470
+ skip_self_attn: bool = False,
471
+ device=None,
472
+ dtype=None,
473
+ ):
474
+ super().__init__()
475
+ factory_kwargs = {"device": device, "dtype": dtype}
476
+ # Redefine self_attn to our streaming multi-head attention
477
+ attn_kwargs: tp.Dict[str, tp.Any] = {
478
+ "embed_dim": d_model,
479
+ "num_heads": num_heads,
480
+ }
481
+ if not skip_self_attn:
482
+ self.self_attn: StreamingMultiheadAttention = StreamingMultiheadAttention(
483
+ causal=causal,
484
+ context=context,
485
+ rope=rope,
486
+ weights_per_step=weights_per_step,
487
+ **attn_kwargs, # type: ignore
488
+ **factory_kwargs, # type: ignore
489
+ ) # type: ignore
490
+ self.norm1 = create_norm_fn(norm, d_model, **factory_kwargs)
491
+ self.norm2 = create_norm_fn(norm, d_model, **factory_kwargs)
492
+ # Redefine feedforward layers to expose bias parameter
493
+ self.weights_per_step = weights_per_step
494
+ self.gating: tp.Optional[nn.Module] = None
495
+ self.linear1: tp.Optional[nn.Module] = None
496
+ self.linear2: tp.Optional[nn.Module] = None
497
+ self.activation = activation
498
+ self.skip_self_attn = skip_self_attn
499
+
500
+ if isinstance(dim_feedforward, list):
501
+ assert dim_feedforward
502
+ assert len(dim_feedforward) == weights_per_step, (
503
+ "Length of dim_feedforward must match weights_per_step,"
504
+ f" got {len(dim_feedforward)} != {weights_per_step}"
505
+ )
506
+ if gating == "none":
507
+ assert (
508
+ not weights_per_step
509
+ ), "weights_per_step without gating not supported for now."
510
+ assert not isinstance(
511
+ dim_feedforward, list
512
+ ), "List dim_feedforward without gating not supported for now."
513
+ self.linear1 = nn.Linear(
514
+ d_model, dim_feedforward, bias=False, **factory_kwargs
515
+ )
516
+ self.linear2 = nn.Linear(
517
+ dim_feedforward, d_model, bias=False, **factory_kwargs
518
+ )
519
+ else:
520
+ self.linear1 = None
521
+ self.linear2 = None
522
+ if weights_per_step:
523
+ if isinstance(dim_feedforward, int):
524
+ dim_feedforward = [dim_feedforward] * weights_per_step
525
+ assert isinstance(dim_feedforward, list), dim_feedforward
526
+ self.gating = nn.ModuleList(
527
+ [
528
+ make_gating(gating, d_model, dim, **factory_kwargs)
529
+ for dim in dim_feedforward
530
+ ]
531
+ )
532
+ else:
533
+ assert isinstance(dim_feedforward, int)
534
+ self.gating = make_gating(
535
+ gating, d_model, dim_feedforward, **factory_kwargs
536
+ )
537
+
538
+ self.layer_scale_1: nn.Module
539
+ self.layer_scale_2: nn.Module
540
+ if layer_scale is None:
541
+ self.layer_scale_1 = nn.Identity()
542
+ self.layer_scale_2 = nn.Identity()
543
+ else:
544
+ self.layer_scale_1 = LayerScale(d_model, layer_scale, **factory_kwargs) # type: ignore
545
+ self.layer_scale_2 = LayerScale(d_model, layer_scale, **factory_kwargs) # type: ignore
546
+
547
+ def _init_streaming_state(self, batch_size: int) -> _LayerState:
548
+ return _LayerState(offset_cpu=0)
549
+
550
+ # feed forward block
551
+ def _ff_block(self, x: torch.Tensor) -> torch.Tensor:
552
+ state = self._streaming_state
553
+ offset = 0
554
+ if state is not None:
555
+ offset = state.offset_cpu
556
+ x_orig = x
557
+ x = self.norm2(x)
558
+ if self.gating is None:
559
+ assert self.linear1 is not None
560
+ assert self.linear2 is not None
561
+ update = self.linear2(self.activation(self.linear1(x)))
562
+ else:
563
+ if self.weights_per_step:
564
+ assert isinstance(self.gating, nn.ModuleList)
565
+ B, T, D = x.shape
566
+ ys = []
567
+ for t in range(T):
568
+ y = self.gating[offset + t](x[:, t : t + 1])
569
+ ys.append(y)
570
+ update = torch.cat(ys, dim=1)
571
+ else:
572
+ update = self.gating(x)
573
+ return x_orig + self.layer_scale_2(update)
574
+
575
+ def _sa_block(self, x: torch.Tensor):
576
+ if self.skip_self_attn:
577
+ return x
578
+ x_orig = x
579
+ x = self.norm1(x)
580
+ update = self.self_attn(x, x, x)
581
+ return x_orig + self.layer_scale_1(update)
582
+
583
+ def forward(self, x: torch.Tensor):
584
+ with ExitStack() as stack:
585
+ if x.device.type != 'cuda':
586
+ stack.enter_context(no_compile())
587
+ x = self._sa_block(x)
588
+ x = self._ff_block(x)
589
+ state = self._streaming_state
590
+ if state:
591
+ state.offset_cpu += x.shape[1]
592
+ return x
593
+
594
+
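Both `_sa_block` and `_ff_block` follow the same pre-norm residual pattern: normalize, apply the sublayer, scale, and add the result back to the un-normalized stream. A toy sketch (names are ours):

```python
import torch
from torch import nn

def residual_block(x, norm, sublayer, layer_scale):
    # Pre-norm: the sublayer sees a normalized input, but the residual
    # connection carries the original (un-normalized) activations.
    return x + layer_scale(sublayer(norm(x)))

x = torch.randn(2, 5, 8)
y = residual_block(x, nn.LayerNorm(8), nn.Identity(), nn.Identity())
```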
595
+ @dataclass
596
+ class _TransformerState:
597
+ offset: torch.Tensor
598
+
599
+ def reset(self):
600
+ self.offset.zero_()
601
+
602
+
603
+ class StreamingTransformer(StreamingModule[_TransformerState]):
604
+ """Transformer with Streaming / Causal support.
605
+
606
+ Args:
607
+ d_model (int): Dimension of the data.
608
+ num_heads (int): Number of heads.
609
+ dim_feedforward (int): Intermediate dimension of FF module.
610
+ causal (bool): Causal mask applied automatically.
611
+ context (int, optional): Receptive field for the causal mask, infinite if None.
612
+ layer_scale (float, optional): If not None, LayerScale will be used
613
+ with the given value as initial scale.
614
+ positional_embedding (str): Positional embedding strategy (sin, rope, sin_rope, or none).
615
+ max_period (float): Maximum period of the time embedding.
616
+ positional_scale (float): Scale of positional embedding, set to 0 to deactivate.
617
+ layer_class (subclass of `StreamingTransformerLayer`): class to use
618
+ to initialize the layers, allowing further customization outside of AudioCraft.
619
+ device (torch.device, optional): Device on which to initialize.
620
+ dtype (torch.dtype, optional): dtype to use.
621
+ **kwargs: See `StreamingTransformerLayer`.
622
+ """
623
+
624
+ def __init__(
625
+ self,
626
+ d_model: int,
627
+ num_heads: int,
628
+ num_layers: int,
629
+ dim_feedforward: int | list[int] = 2048,
630
+ causal: bool = False,
631
+ context: tp.Optional[int] = None,
632
+ positional_embedding: str = "sin",
633
+ max_period: float = 10_000,
634
+ positional_scale: float = 1.0,
635
+ betas: tp.Optional[tp.Tuple[float, float]] = None,
636
+ layer_class: tp.Type[StreamingTransformerLayer] = StreamingTransformerLayer,
637
+ device=None,
638
+ dtype=None,
639
+ **kwargs,
640
+ ):
641
+ super().__init__()
642
+ assert d_model % num_heads == 0
643
+
644
+ self.positional_embedding = positional_embedding
645
+ self.max_period = max_period
646
+ self.positional_scale = positional_scale
647
+ self.betas = betas
648
+
649
+ assert positional_embedding in {"sin", "rope", "sin_rope", "none"}
650
+ self.rope: tp.Optional[RotaryEmbedding] = None
651
+ if self.positional_embedding in {"rope", "sin_rope"}:
652
+ self.rope = RotaryEmbedding(max_period=max_period)
653
+
654
+ self.layers = nn.ModuleList()
655
+ for _ in range(num_layers):
656
+ self.layers.append(
657
+ layer_class(
658
+ d_model=d_model,
659
+ num_heads=num_heads,
660
+ dim_feedforward=dim_feedforward,
661
+ causal=causal,
662
+ context=context,
663
+ rope=self.rope,
664
+ device=device,
665
+ dtype=dtype,
666
+ **kwargs,
667
+ )
668
+ )
669
+
670
+ def _init_streaming_state(self, batch_size: int) -> _TransformerState:
671
+ device = next(self.parameters()).device
672
+ return _TransformerState(offset=torch.zeros(1, device=device, dtype=torch.long))
673
+
674
+ def forward(self, x: torch.Tensor, *args, **kwargs):
675
+ B, T, C = x.shape
676
+
677
+ state = self._streaming_state
678
+ if state is None:
679
+ offset = torch.zeros(1, dtype=torch.long, device=x.device)
680
+ else:
681
+ offset = state.offset
682
+
683
+ if self.positional_embedding in {"sin", "sin_rope"}:
684
+ positions = torch.arange(T, device=x.device).view(1, -1, 1)
685
+ positions = positions + offset.view(-1, 1, 1)
686
+ pos_emb = create_sin_embedding(
687
+ positions, C, max_period=self.max_period, dtype=x.dtype
688
+ )
689
+ x = x + self.positional_scale * pos_emb
690
+
691
+ for layer in self.layers:
692
+ x = layer(x, *args, **kwargs)
693
+
694
+ if state is not None:
695
+ state.offset.add_(T)
696
+ return x
697
+
698
+
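When `positional_embedding` is `"sin"` or `"sin_rope"`, the forward pass adds a sinusoidal embedding computed from absolute positions shifted by the streaming offset. `create_sin_embedding` is defined elsewhere in the repo; this is a hedged sketch of a standard sinusoidal embedding with the same shape conventions:

```python
import torch

def sin_embedding(positions: torch.Tensor, dim: int,
                  max_period: float = 10_000.0) -> torch.Tensor:
    # positions: [B, T, 1] absolute positions; returns [B, T, dim].
    half = dim // 2
    adim = torch.arange(half).view(1, 1, -1)
    phase = positions / (max_period ** (adim / (half - 1)))
    return torch.cat([torch.cos(phase), torch.sin(phase)], dim=-1)

positions = torch.arange(4, dtype=torch.float).view(1, -1, 1)
emb = sin_embedding(positions, dim=8)
```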
699
+ class ProjectedTransformer(StreamingContainer):
700
+ """Transformer with optional projections of the input and output to different dimensions when needed.
701
+ Supports multiple outputs.
702
+
703
+ Args:
704
+ input_dimension (int): dimension of the input.
705
+ output_dimensions (tuple[int]): dimensions of the outputs.
706
+ d_model (int): inner dimension of the Transformer.
707
+ conv_layout (bool): If True, expects `[B, C, T]` shaped tensors, otherwise, `[B, T, C]`.
708
+ Similarly, the output will have the same layout.
709
+ """
710
+
711
+ def __init__(
712
+ self,
713
+ input_dimension: int,
714
+ output_dimensions: tp.Tuple[int, ...],
715
+ d_model: int,
716
+ *,
717
+ conv_layout: bool = False,
718
+ **kwargs,
719
+ ):
720
+ super().__init__()
721
+ self.transformer = StreamingTransformer(d_model=d_model, **kwargs)
722
+ self.input_dimension = input_dimension
723
+ self.output_dimensions = output_dimensions
724
+ self.conv_layout = conv_layout
725
+ self.input_proj = None
726
+ if d_model != input_dimension:
727
+ self.input_proj = nn.Linear(input_dimension, d_model, bias=False)
728
+
729
+ self.output_projs = nn.ModuleList()
730
+ for output_dimension in output_dimensions:
731
+ if d_model == output_dimension:
732
+ self.output_projs.append(nn.Identity())
733
+ else:
734
+ self.output_projs.append(
735
+ nn.Linear(d_model, output_dimension, bias=False)
736
+ )
737
+
738
+ def forward(self, x, *args, **kwargs):
739
+ if self.conv_layout:
740
+ x = x.transpose(1, 2)
741
+ if self.input_proj is not None:
742
+ x = self.input_proj(x)
743
+ z = self.transformer(x, *args, **kwargs)
744
+ ys = []
745
+ for output_proj in self.output_projs:
746
+ y = output_proj(z)
747
+ if self.conv_layout:
748
+ y = y.transpose(1, 2)
749
+ ys.append(y)
750
+ return ys
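With `conv_layout=True`, inputs follow the convolutional `[B, C, T]` convention and are transposed to the transformer's `[B, T, C]` on the way in and back on the way out:

```python
import torch

x = torch.randn(2, 16, 50)    # [B, C, T]: convolutional layout
x_t = x.transpose(1, 2)       # [B, T, C]: transformer layout
x_back = x_t.transpose(1, 2)  # back to [B, C, T] for the outputs
```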
moshi/quantization/__init__.py ADDED
@@ -0,0 +1,13 @@
1
+ # Copyright (c) Kyutai, all rights reserved.
2
+ # This source code is licensed under the license found in the
3
+ # LICENSE file in the root directory of this source tree.
4
+
5
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
6
+ # All rights reserved.
7
+ #
8
+ # This source code is licensed under the license found in the
9
+ # LICENSE file in the root directory of this source tree.
10
+ """RVQ."""
11
+ # flake8: noqa
12
+ from .vq import ResidualVectorQuantizer, SplitResidualVectorQuantizer
13
+ from .base import BaseQuantizer, DummyQuantizer, QuantizedResult
moshi/quantization/base.py ADDED
@@ -0,0 +1,170 @@
1
+ # Copyright (c) Kyutai, all rights reserved.
2
+ # This source code is licensed under the license found in the
3
+ # LICENSE file in the root directory of this source tree.
4
+
5
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
6
+ # All rights reserved.
7
+ #
8
+ # This source code is licensed under the license found in the
9
+ # LICENSE file in the root directory of this source tree.
10
+
11
+ """
12
+ Base class for all quantizers.
13
+ """
14
+
15
+ from dataclasses import dataclass, field
16
+ import typing as tp
17
+
18
+ import torch
19
+ from torch import nn
20
+
21
+
22
+ @dataclass
23
+ class QuantizedResult:
24
+ x: torch.Tensor
25
+ codes: torch.Tensor
26
+ bandwidth: torch.Tensor # bandwidth in kb/s used, per batch item.
27
+ penalty: tp.Optional[torch.Tensor] = None
28
+ metrics: dict = field(default_factory=dict)
29
+
30
+
31
+ class BaseQuantizer(nn.Module):
32
+ """Base class for quantizers."""
33
+
34
+ def __init__(self):
35
+ super().__init__()
36
+ self._ema_frozen = False
37
+
38
+ def forward(self, x: torch.Tensor, frame_rate: int) -> QuantizedResult:
39
+ """
40
+ Given input tensor x, returns first the quantized (or approximately quantized)
41
+ representation along with quantized codes, bandwidth, and any penalty term for the loss.
42
+ Finally, this returns a dict of metrics to update logging etc.
43
+ Frame rate must be passed so that the bandwidth is properly computed.
44
+ """
45
+ raise NotImplementedError()
46
+
47
+ def encode(self, x: torch.Tensor) -> torch.Tensor:
48
+ """Encode a given input tensor with the specified sample rate at the given bandwidth."""
49
+ raise NotImplementedError()
50
+
51
+ def decode(self, codes: torch.Tensor) -> torch.Tensor:
52
+ """Decode the given codes to the quantized representation."""
53
+ raise NotImplementedError()
54
+
55
+ @property
56
+ def cardinality(self) -> int:
57
+ """Cardinality of each codebook."""
58
+ raise NotImplementedError()
59
+
60
+ @property
61
+ def total_codebooks(self) -> int:
62
+ """Total number of codebooks."""
63
+ raise NotImplementedError()
64
+
65
+ @property
66
+ def num_codebooks(self) -> int:
67
+ """Number of active codebooks."""
68
+ raise NotImplementedError()
69
+
70
+ @property
71
+ def semantic_quantizer(self) -> 'BaseQuantizer':
72
+ """This returns the quantizer that models the first level of the hierarchy (typically semantic).
73
+
74
+ In this case, it's the quantizer itself.
75
+ """
76
+ return self
77
+
78
+ @property
79
+ def acoustic_quantizer(self) -> 'BaseQuantizer':
80
+ """This returns the quantizer that models the higher levels of the hierarchy (typically acoustic).
81
+
82
+ In this case, it's the quantizer itself.
83
+ """
84
+ return self
85
+
86
+ def set_num_codebooks(self, n: int) -> None:
87
+ """Set the number of active codebooks."""
88
+ raise NotImplementedError()
89
+
90
+ @property
91
+ def ema_frozen(self) -> bool:
92
+ """Whether to apply ema to the codebooks."""
93
+ return self._ema_frozen
94
+
95
+ def ema_frozen_(self, ema_frozen: bool) -> None:
96
+ """Set whether ema should be applied to the codebooks."""
97
+ self._ema_frozen = ema_frozen
98
+
99
+
100
+ class DummyQuantizer(BaseQuantizer):
101
+ """Fake quantizer that actually does not perform any quantization."""
102
+
103
+ def __init__(
104
+ self,
105
+ dimension: int,
106
+ input_dimension: tp.Optional[int] = None,
107
+ output_dimension: tp.Optional[int] = None,
108
+ ):
109
+ super().__init__()
110
+ self.dimension = dimension
111
+ self.input_dimension = input_dimension or dimension
112
+ self.output_dimension = output_dimension or dimension
113
+ self.input_proj: torch.nn.Module
114
+ self.output_proj: torch.nn.Module
115
+ if self.input_dimension == self.dimension:
116
+ self.input_proj = torch.nn.Identity()
117
+ else:
118
+ self.input_proj = torch.nn.Conv1d(
119
+ self.input_dimension, self.dimension, 1, bias=False
120
+ )
121
+ if self.input_dimension == self.dimension:
122
+ self.output_proj = torch.nn.Identity()
123
+ else:
124
+ self.output_proj = torch.nn.Conv1d(
125
+ self.dimension, self.output_dimension, 1, bias=False
126
+ )
127
+
128
+ def forward(self, x: torch.Tensor, frame_rate: int):
129
+ q = x.unsqueeze(1)
130
+ x = self.output_proj(self.input_proj(x))
131
+ return QuantizedResult(
132
+ x, q, torch.tensor(q.numel() * 32 * frame_rate / 1000 / len(x)).to(x)
133
+ )
134
+
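The bandwidth scalar returned by `forward` above, `q.numel() * 32 * frame_rate / 1000 / len(x)`, corresponds to sending every value as an uncompressed 32-bit float. Illustrating how it scales, with toy shapes of our choosing:

```python
B, D, T = 2, 8, 5          # batch, dimension, time steps
frame_rate = 50            # frames per second
numel = B * 1 * D * T      # q = x.unsqueeze(1) adds a codebook axis of size 1
bandwidth_kbps = numel * 32 * frame_rate / 1000 / B  # per batch item, in kb/s
```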
135
+ def encode(self, x: torch.Tensor) -> torch.Tensor:
136
+ """Encode a given input tensor with the specified sample rate at the given bandwidth.
137
+ In the case of the DummyQuantizer, the codes are actually identical
138
+ to the input and resulting quantized representation as no quantization is done.
139
+ """
140
+ x = self.input_proj(x)
141
+ return x.unsqueeze(1)
142
+
143
+ def decode(self, codes: torch.Tensor) -> torch.Tensor:
144
+ """Decode the given codes to the quantized representation.
145
+ In the case of the DummyQuantizer, the codes are actually identical
146
+ to the input and resulting quantized representation as no quantization is done.
147
+ """
148
+ y = codes.squeeze(1)
149
+ return self.output_proj(y)
150
+
151
+ @property
152
+ def total_codebooks(self):
153
+ """Total number of codebooks."""
154
+ return 1
155
+
156
+ @property
157
+ def num_codebooks(self):
158
+ """Total number of codebooks."""
159
+ return self.total_codebooks
160
+
161
+ def set_num_codebooks(self, n: int):
162
+ """Set the number of active codebooks."""
163
+ raise AttributeError(
164
+ "Cannot override the number of codebooks for the dummy quantizer"
165
+ )
166
+
167
+ @property
168
+ def cardinality(self) -> int:
169
+ """Cardinality of each codebook."""
170
+ return 1
moshi/quantization/core_vq.py ADDED
@@ -0,0 +1,384 @@
1
+ # Copyright (c) Kyutai, all rights reserved.
2
+ # This source code is licensed under the license found in the
3
+ # LICENSE file in the root directory of this source tree.
4
+
5
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
6
+ # All rights reserved.
7
+ #
8
+ # This source code is licensed under the license found in the
9
+ # LICENSE file in the root directory of this source tree.
10
+
11
+ import typing as tp
12
+
13
+ from einops import rearrange
14
+ import torch
15
+ from torch import nn
16
+ from torch import distributed
17
+ import torch.nn.functional as F
18
+
19
+
20
+ class _CodebookForwardResult(tp.NamedTuple):
21
+ quantized: torch.Tensor
22
+ codes: torch.Tensor
23
+ metrics: tp.Dict[str, torch.Tensor]
24
+
25
+
26
+ class _VQForwardResult(tp.NamedTuple):
27
+ quantized: torch.Tensor
28
+ codes: torch.Tensor
29
+ loss: torch.Tensor
30
+ metrics: tp.Dict[str, torch.Tensor]
31
+
32
+
33
+ def _ema_inplace(moving_avg: torch.Tensor, new: torch.Tensor, decay: float) -> None:
34
+ moving_avg.data.mul_(decay).add_(new, alpha=(1 - decay))
35
+
36
+
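`_ema_inplace` above is the standard exponential moving average update, `m ← decay·m + (1 − decay)·new`:

```python
import torch

def ema_inplace(moving_avg: torch.Tensor, new: torch.Tensor, decay: float) -> None:
    # In-place: m <- decay * m + (1 - decay) * new
    moving_avg.mul_(decay).add_(new, alpha=1 - decay)

m = torch.tensor([0.0])
ema_inplace(m, torch.tensor([10.0]), decay=0.9)
```

After one step, `m` holds `0.9 * 0 + 0.1 * 10 = 1.0`.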
37
+ def _uniform_init(*shape: int) -> torch.Tensor:
38
+ t = torch.empty(shape)
39
+ nn.init.kaiming_uniform_(t)
40
+ return t
41
+
42
+
43
+ def _sample_vectors(samples: torch.Tensor, num: int) -> torch.Tensor:
44
+ num_samples, device = samples.shape[0], samples.device
45
+
46
+ if num_samples >= num:
47
+ indices = torch.randperm(num_samples, device=device)[:num]
48
+ else:
49
+ indices = torch.randint(0, num_samples, (num,), device=device)
50
+
51
+ return samples[indices]
52
+
53
+
54
+ def _compute_entropy(usage: torch.Tensor) -> torch.Tensor:
55
+ # Usage is some unnormalized distribution.
56
+ proba = usage / usage.sum()
57
+ p_log_p = torch.where(
58
+ proba == 0, zero_scalar(usage.device), proba * torch.log(proba)
59
+ )
60
+ return -p_log_p.sum()
61
+
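`_compute_entropy` treats the usage counts as an unnormalized distribution, so uniform usage yields the maximal entropy `log(codebook_size)`. Restated standalone:

```python
import math
import torch

def compute_entropy(usage: torch.Tensor) -> torch.Tensor:
    proba = usage / usage.sum()
    # Use the 0 * log(0) = 0 convention for unused clusters.
    p_log_p = torch.where(proba == 0, torch.zeros((), dtype=proba.dtype),
                          proba * torch.log(proba))
    return -p_log_p.sum()

entropy = compute_entropy(torch.ones(4))  # uniform usage over 4 codes
```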
62
+
63
+ def _is_distributed() -> bool:
64
+ # Checks if we need to use distributed routines.
65
+ return distributed.is_initialized() and distributed.get_world_size() > 1
66
+
67
+
68
+ def zero_scalar(device) -> torch.Tensor:
69
+ """Returns a 0. value on the given device without introducing a synchronization point."""
70
+ return torch.zeros([1], device=device)[0]
71
+
72
+
73
+ class EuclideanCodebook(nn.Module):
74
+ """Codebook with Euclidean distance.
75
+
76
+ Args:
77
+ dim (int): Dimension.
78
+ codebook_size (int): Codebook size.
79
+ decay (float): Decay for exponential moving average over the codebooks.
80
+ epsilon (float): Epsilon value for numerical stability.
81
+ threshold_usage_ratio (float): Defines the threshold for the cluster usage under which a centroid
82
+ is replaced. This is expressed as a fraction of the usage a centroid would get under
83
+ a uniform distribution, so that it doesn't depend on the batch size etc.
84
+ replaced_usage_ratio (float): When replacing a centroid, use this as an initial centroid usage,
85
+ to avoid the centroid getting replaced too quickly.
86
+ check_unused_every (int): Check for unused centroids every `check_unused_every` iterations.
87
+ This is to avoid too many synchronization points.
88
+
89
+ Buffers:
90
+ cluster_usage (torch.Tensor): EMA of the cluster usage per batch, e.g. this will
91
+ be dependent on the batch size etc.
92
+ embedding_sum (torch.Tensor): EMA of the sum of the assigned points to each cluster.
93
+ In particular, this can be normalized by `cluster_usage` to obtain the
94
+ actual cluster centroids.
95
+ """
96
+
97
+ def __init__(
98
+ self,
99
+ dim: int,
100
+ codebook_size: int,
101
+ decay: float = 0.99,
102
+ epsilon: float = 1e-5,
103
+ threshold_usage_ratio: float = 0.1,
104
+ replaced_usage_ratio: float = 1.0,
105
+ check_unused_every: int = 5,
106
+ ):
107
+ super().__init__()
108
+ self.decay = decay
109
+ embedding = torch.zeros(codebook_size, dim)
110
+
111
+ self.dim = dim
112
+ self.codebook_size = codebook_size
113
+
114
+ self.epsilon = epsilon
115
+ self.threshold_usage_ratio = threshold_usage_ratio
116
+ self.replaced_usage_ratio = replaced_usage_ratio
117
+ self.check_unused_every = check_unused_every
118
+ self._next_unused_check = check_unused_every
119
+
120
+ self.register_buffer("_initialized", torch.tensor([False], dtype=torch.float))
121
+ self.register_buffer("cluster_usage", torch.ones(codebook_size))
122
+ self.register_buffer("embedding_sum", embedding)
123
+ self.register_buffer("_embedding", None, persistent=False)
124
+ self._cached_initialized = False
125
+
126
+ def _load_from_state_dict(self, state_dict, prefix, *args, **kwargs) -> None:
127
+ # Mapping old names to new names
128
+ mappings = {
129
+ "inited": "_initialized",
130
+ "cluster_size": "cluster_usage",
131
+ "embed_avg": "embedding_sum",
132
+ "embed_sum": "embedding_sum",
133
+ }
134
+ for old_name, new_name in mappings.items():
135
+ old_name = prefix + old_name
136
+ if old_name in state_dict:
137
+ value = state_dict.pop(old_name)
138
+ if new_name is not None:
139
+ state_dict[prefix + new_name] = value
140
+ super()._load_from_state_dict(state_dict, prefix, *args, **kwargs)
141
+
142
+ @property
143
+ def embedding(self) -> torch.Tensor:
144
+ if self._embedding is None:
145
+ embedding = (
146
+ self.embedding_sum / self.cluster_usage.clamp(min=self.epsilon)[:, None]
147
+ )
148
+ self.register_buffer("_embedding", embedding, persistent=False)
149
+ return embedding
150
+ return self._embedding
151
+
152
+ def _broadcast_buffers(self) -> None:
153
+ if _is_distributed():
154
+ for buffer in self.buffers():
155
+ distributed.broadcast(buffer, 0)
156
+
157
+ def _replace_expired_codes(self, samples: torch.Tensor, mask: torch.Tensor) -> None:
158
+ # Replaces expired centroids, as indicated by `mask` (a true value indicate the code needs to be replaced).
159
+ # The new codes are sampled from the batch `samples`.
160
+ new_vectors = _sample_vectors(samples, self.codebook_size)
161
+ replace_cluster_usage = (
162
+ self.replaced_usage_ratio * self.cluster_usage.sum() / self.codebook_size
163
+ )
164
+ self.embedding_sum[:] = torch.where(
165
+ mask[:, None], replace_cluster_usage * new_vectors, self.embedding_sum
166
+ )
167
+ self.cluster_usage[:] = torch.where(
168
+ mask, replace_cluster_usage, self.cluster_usage
169
+ )
170
+
171
+ def _reshape_input(self, x: torch.Tensor) -> torch.Tensor:
172
+ # Flattens all the dimensions but the last one, e.g. return a vector of shape `[N, D]`.
173
+ x = rearrange(x, "... d -> (...) d")
174
+ return x
175
+
176
+ def _reshape_codes(self, codes: torch.Tensor, shape: torch.Size) -> torch.Tensor:
177
+ return codes.view(*shape[:-1])
178
+
179
+ def _quantize(self, x: torch.Tensor) -> torch.Tensor:
180
+ # Projects each vector in `x` over the nearest centroid and return its index.
181
+ # `x` should be `[N, D]` with `N` the number of input vectors and `D` the dimension.
182
+ assert x.dim() == 2
183
+ dists = torch.cdist(x[None], self.embedding[None], p=2)[0]
184
+ codes = dists.argmin(dim=-1)
185
+ return codes
186
+
187
+ def encode(self, x: torch.Tensor) -> torch.Tensor:
188
+ """Given a tensor `x` of shape `[*, D]`, returns a tensor of integer codes of shape `[*]`.
189
+ The codes are defined as the indexes of the centroids nearest to each vector in `x`.
190
+ """
191
+ assert x.dtype.is_floating_point, f"Input should be floats, got {x.dtype}"
192
+ shape = x.shape
193
+ x = self._reshape_input(x)
194
+ codes = self._quantize(x)
195
+ codes = self._reshape_codes(codes, shape)
196
+ return codes
197
+
198
+ def decode(self, codes: torch.Tensor) -> torch.Tensor:
199
+ """Given a tensor of codes of shape `[*]`, returns a tensor of shape `[*, D]`,
200
+ corresponding to the centroids associated to each code index.
201
+ """
202
+ assert (
203
+ not codes.dtype.is_floating_point
204
+ ), f"Codes should be integers, got {codes.dtype}"
205
+ quantized = F.embedding(codes, self.embedding)
206
+ return quantized
207
+
208
+ def forward(
209
+ self, x: torch.Tensor, initialize: bool = True
210
+ ) -> _CodebookForwardResult:
211
+ shape = x.shape
212
+ x = self._reshape_input(x)
213
+
214
+ flat_codes = self._quantize(x)
215
+ codes = self._reshape_codes(flat_codes, shape)
216
+ quantized = self.decode(codes)
217
+ metrics: tp.Dict[str, torch.Tensor] = {}
218
+
219
+ return _CodebookForwardResult(quantized, codes, metrics)
220
+
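The encode/decode round trip of `EuclideanCodebook` is a nearest-centroid assignment followed by a table lookup. A tiny concrete example (the centroids here are ours, for illustration):

```python
import torch

embedding = torch.tensor([[0.0, 0.0],   # centroid 0
                          [1.0, 1.0]])  # centroid 1
x = torch.tensor([[0.1, 0.0],
                  [0.9, 1.2]])

# encode: index of the nearest centroid, as in _quantize.
dists = torch.cdist(x[None], embedding[None], p=2)[0]
codes = dists.argmin(dim=-1)
# decode: look the centroids back up.
quantized = embedding[codes]
```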
221
+
222
+ class VectorQuantization(nn.Module):
223
+ """Vector quantization implementation.
224
+ Currently supports only euclidean distance.
225
+
226
+ Args:
227
+ dim (int): Dimension
228
+ codebook_size (int): Codebook size
229
+ codebook_dim (int): Codebook dimension. If not defined, uses the specified dimension in dim.
230
+ decay (float): Decay for exponential moving average over the codebooks.
231
+ epsilon (float): Epsilon value for numerical stability.
232
+ threshold_usage_ratio (float): Defines the threshold for the cluster usage under which a centroid
233
+ is replaced. This is expressed as a fraction of the usage a centroid would get under
234
+ a uniform distribution, so that it doesn't depend on the batch size etc.
235
+ replaced_usage_ratio (float): When replacing a centroid, use this as an initial centroid usage,
236
+ to avoid the centroid getting replaced too quickly.
237
+ check_unused_every (int): Check for unused centroids every `check_unused_every` iterations.
238
+ This is to avoid too many synchronization points.
239
+ """
240
+
241
+ def __init__(
242
+ self,
243
+ dim: int,
244
+ codebook_size: int,
245
+ codebook_dim: tp.Optional[int] = None,
246
+ decay: float = 0.99,
247
+ epsilon: float = 1e-5,
248
+ threshold_usage_ratio: float = 0.1,
249
+ **kwargs,
250
+ ):
251
+ super().__init__()
252
+ if codebook_dim is None:
253
+ codebook_dim = dim
254
+
255
+ requires_projection = codebook_dim != dim
256
+ self.project_in = (
257
+ nn.Linear(dim, codebook_dim) if requires_projection else nn.Identity()
258
+ )
259
+ self.project_out = (
260
+ nn.Linear(codebook_dim, dim) if requires_projection else nn.Identity()
261
+ )
262
+ self.epsilon = epsilon
263
+ self._codebook = EuclideanCodebook(
264
+ dim=codebook_dim,
265
+ codebook_size=codebook_size,
266
+ decay=decay,
267
+ epsilon=epsilon,
268
+ threshold_usage_ratio=threshold_usage_ratio,
269
+ **kwargs,
270
+ )
271
+ self.codebook_size = codebook_size
272
+
273
+ @property
274
+ def embedding(self):
275
+ return self._codebook.embedding
276
+
277
+ def _rearrange_input(self, x):
278
+ x = rearrange(x, "b d n -> b n d")
279
+ return x
280
+
281
+ def _rearrange_output(self, quantized):
282
+ quantized = rearrange(quantized, "b n d -> b d n")
283
+ return quantized
284
+
285
+ def encode(self, x: torch.Tensor) -> torch.Tensor:
286
+ """Encodes `x` into discrete integer codes."""
287
+ x = self._rearrange_input(x)
288
+ x = self.project_in(x)
289
+ codes = self._codebook.encode(x)
290
+ return codes
291
+
292
+ def decode(self, codes: torch.Tensor) -> torch.Tensor:
293
+ """Converts integer codes into quantized vectors."""
294
+ quantized = self._codebook.decode(codes)
295
+ quantized = self.project_out(quantized)
296
+ quantized = self._rearrange_output(quantized)
297
+ return quantized
298
+
299
+ def forward(self, x: torch.Tensor, initialize: bool = True) -> _VQForwardResult:
300
+ x = self._rearrange_input(x)
301
+ quantized, codes, metrics = self._codebook(x, initialize=initialize)
302
+
303
+ loss = zero_scalar(x.device)
304
+
305
+ quantized = self.project_out(quantized)
306
+ quantized = self._rearrange_output(quantized)
307
+
308
+ return _VQForwardResult(quantized, codes, loss, metrics)
309
+
310
+
311
+ class ResidualVectorQuantization(nn.Module):
312
+ """Residual vector quantization implementation.
313
+
314
+ Follows Algorithm 1. in https://arxiv.org/pdf/2107.03312.pdf
315
+ """
316
+
317
+ def __init__(self, *, num_quantizers: int, codebook_offset: int, **kwargs):
318
+ super().__init__()
319
+ self.layers = nn.ModuleList(
320
+ [VectorQuantization(**kwargs) for _ in range(num_quantizers)]
321
+ )
322
+ self.codebook_offset = codebook_offset
323
+
324
+ def forward(
325
+ self, x: torch.Tensor, n_q: tp.Optional[int] = None
326
+ ) -> _VQForwardResult:
327
+ """
328
+ Args:
329
+ x (torch.Tensor): input tensor to quantize, of shape `[B, C, T]`.
330
+ n_q (int or None): if provided, number of codebook levels to use in RVQ.
331
+ """
332
+
333
+ quantized_out = zero_scalar(x.device)
334
+ residual = x
335
+
336
+ all_losses = []
337
+ all_codes = []
338
+ all_metrics: tp.Dict[str, torch.Tensor] = {}
339
+
340
+ n_q = n_q or len(self.layers)
341
+ previous_layer_is_initialized = True
342
+
343
+ for i, layer in enumerate(self.layers[:n_q]): # type: ignore
344
+ quantized, codes, loss, metrics = layer(
345
+ residual, initialize=previous_layer_is_initialized
346
+ )
347
+
348
+ quantized = quantized.detach()
349
+ residual = residual - quantized
350
+ quantized_out = quantized_out + quantized
351
+
352
+ all_codes.append(codes)
353
+ all_losses.append(loss)
354
+
355
+ for key, value in metrics.items():
356
+ if key in all_metrics:
357
+ all_metrics[key] += value / n_q
358
+ else:
359
+ all_metrics[key] = value / n_q
360
+ all_metrics[key + f"_{i + self.codebook_offset}"] = value
361
+
362
+ out_losses, out_codes = map(torch.stack, (all_losses, all_codes))
363
+ return _VQForwardResult(quantized_out, out_codes, out_losses, all_metrics)
364
+
365
+ def encode(self, x: torch.Tensor, n_q: tp.Optional[int] = None) -> torch.Tensor:
366
+ """Encodes `x` into discrete integer codes. If `n_q` is provided, only uses the first `n_q` codebook levels."""
367
+ residual = x
368
+ all_indices = []
369
+ n_q = n_q or len(self.layers)
370
+ for layer in self.layers[:n_q]: # type: ignore
371
+ indices = layer.encode(residual)
372
+ quantized = layer.decode(indices)
373
+ residual = residual - quantized
374
+ all_indices.append(indices)
375
+ out_indices = torch.stack(all_indices)
376
+ return out_indices
377
+
378
+ def decode(self, codes: torch.Tensor) -> torch.Tensor:
379
+ """Converts the integer codes into quantized vectors."""
380
+ quantized = zero_scalar(codes.device)
381
+ for idx, layer_codes in enumerate(codes):
382
+ layer = self.layers[idx]
383
+ quantized = quantized + layer.decode(layer_codes)
384
+ return quantized
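The loop in `ResidualVectorQuantization.forward` — quantize the residual, subtract the reconstruction, accumulate, repeat — can be sketched in miniature. The toy below uses fixed scalar codebooks and hypothetical helper names (it is not the module's API) to show why each extra level refines the previous ones:

```python
# Toy residual vector quantization on scalars. The real module learns
# multi-dimensional codebooks with EMA updates; this sketch only
# illustrates the residual scheme (Algorithm 1 of the SoundStream paper).

def nearest(codebook, value):
    """Index of the codebook entry closest to `value` (euclidean distance)."""
    return min(range(len(codebook)), key=lambda i: abs(codebook[i] - value))

def rvq_encode(codebooks, value):
    """Each level quantizes the residual left over by the previous levels."""
    codes = []
    residual = value
    for cb in codebooks:
        idx = nearest(cb, residual)
        codes.append(idx)
        residual -= cb[idx]
    return codes

def rvq_decode(codebooks, codes):
    """Decoding sums the selected entries across levels."""
    return sum(cb[i] for cb, i in zip(codebooks, codes))

# Coarse-to-fine codebooks: each level covers the error range of the last.
codebooks = [[-1.0, 0.0, 1.0], [-0.25, 0.0, 0.25], [-0.05, 0.0, 0.05]]
codes = rvq_encode(codebooks, 0.8)
approx = rvq_decode(codebooks, codes)
```

Here level 1 picks 1.0, level 2 corrects by -0.25, and level 3 by +0.05, so the three-level reconstruction recovers 0.8 almost exactly; truncating to fewer levels (as `n_q` does above) trades quality for bitrate.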
moshi/quantization/vq.py ADDED
@@ -0,0 +1,340 @@
+# Copyright (c) Kyutai, all rights reserved.
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+import math
+import typing as tp
+
+import torch
+
+from .base import BaseQuantizer, QuantizedResult
+from .core_vq import ResidualVectorQuantization
+
+
+class ResidualVectorQuantizer(BaseQuantizer):
+    """Residual Vector Quantizer.
+
+    Args:
+        dimension (int): Dimension of the codebooks.
+        input_dimension (None or int): dimension of the input, defaults to `dimension` if not provided.
+        output_dimension (None or int): dimension of the output, defaults to `dimension` if not provided.
+        n_q (int): Number of vector quantizers used.
+        q_dropout (bool): Random quantizer dropout at train time.
+        no_quantization_rate (float): Gives the probability of applying no quantization at all
+            at train time. The RVQ codebooks will still get the input value to learn the proper codebook.
+        bins (int): Codebook size.
+        decay (float): Decay for exponential moving average over the codebooks.
+        threshold_usage_ratio (float): Defines the threshold for the cluster usage under which a centroid
+            is replaced. This is expressed as a fraction of the usage a centroid would get under
+            a uniform distribution, so that it doesn't depend on the batch size etc.
+        replaced_usage_ratio (float): When replacing a centroid, use this as an initial centroid usage,
+            to avoid the centroid getting replaced too quickly.
+        codebook_offset (int): Offset to use for the codebook indices. This is useful when using multiple quantizers
+            such as in SplitResidualVectorQuantizer.
+        force_projection (bool): Whether to force input and output projections even when dimension is constant.
+        generator_seed (int or None): seed used to initialize the RNG used for no quantization.
+    """
+
+    def __init__(
+        self,
+        dimension: int = 128,
+        input_dimension: tp.Optional[int] = None,
+        output_dimension: tp.Optional[int] = None,
+        n_q: int = 8,
+        q_dropout: bool = False,
+        q_first_only_proba: float = 0.0,
+        no_quantization_rate: float = 0.0,
+        bins: int = 1024,
+        decay: float = 0.99,
+        threshold_usage_ratio: float = 0.1,
+        replaced_usage_ratio: float = 1.0,
+        codebook_offset: int = 0,
+        force_projection: bool = False,
+        generator_seed: tp.Optional[int] = None,
+    ):
+        super().__init__()
+        self.max_n_q = n_q
+        self.n_q = n_q
+        self.q_dropout = q_dropout
+        self.no_quantization_rate = no_quantization_rate
+        self.q_first_only_proba = q_first_only_proba
+        self.dimension = dimension
+        self.input_dimension = input_dimension or dimension
+        self.output_dimension = output_dimension or dimension
+        self.bins = bins
+        self.decay = decay
+        self.input_proj: torch.nn.Module
+        self.output_proj: torch.nn.Module
+        self.generator = None
+        if generator_seed is not None:
+            self.generator = torch.Generator(
+                device="cuda" if torch.cuda.is_available() else "cpu"
+            )
+            self.generator.manual_seed(generator_seed)
+        if self.input_dimension == self.dimension and not force_projection:
+            self.input_proj = torch.nn.Identity()
+        else:
+            self.input_proj = torch.nn.Conv1d(
+                self.input_dimension, self.dimension, 1, bias=False
+            )
+        if self.output_dimension == self.dimension and not force_projection:
+            self.output_proj = torch.nn.Identity()
+        else:
+            self.output_proj = torch.nn.Conv1d(
+                self.dimension, self.output_dimension, 1, bias=False
+            )
+        self.vq = ResidualVectorQuantization(
+            dim=self.dimension,
+            codebook_size=self.bins,
+            num_quantizers=self.n_q,
+            decay=self.decay,
+            threshold_usage_ratio=threshold_usage_ratio,
+            replaced_usage_ratio=replaced_usage_ratio,
+            codebook_offset=codebook_offset,
+        )
+
+    def forward(self, x: torch.Tensor, frame_rate: int):
+        """
+        Args:
+            x (torch.Tensor): Input tensor of shape [B, C, T] with `C` number of channels.
+            frame_rate (int): frame rate of the input (e.g. `T = frame_rate * duration`), used to compute
+                the bandwidth.
+
+        Returns:
+            QuantizedResult: Quantized result with the following attributes:
+                - `x` (torch.Tensor): Quantized tensor of shape [B, C, T].
+                - `codes` (torch.Tensor): Quantized codes of shape [B, K, T] with `K` number of codebooks.
+                - `bw` (torch.Tensor): Bandwidth of the quantized tensor in kbits per second.
+                - `penalty` (torch.Tensor): Commitment loss.
+                - `metrics` (dict): RVQ metrics, in particular rate of dead code replacement, and entropy.
+        """
+        n_q = self.n_q
+        x = self.input_proj(x)
+
+        bw_per_q = math.log2(self.bins) * frame_rate / 1000
+        quantized, codes, commit_loss, metrics = self.vq(x, n_q=n_q)
+        B, _, _ = quantized.shape
+        quantized = self.output_proj(quantized)
+        codes = codes.transpose(0, 1)
+        # codes is [B, K, T], with T frames, K nb of codebooks.
+        bw = torch.tensor(n_q * bw_per_q).to(x)
+        return QuantizedResult(
+            quantized, codes, bw, penalty=torch.mean(commit_loss), metrics=metrics
+        )
+
+    def encode(self, x: torch.Tensor) -> torch.Tensor:
+        """Encode a given input tensor with the specified frame rate at the given bandwidth.
+
+        The RVQ encode method sets the appropriate number of quantizers to use
+        and returns indices for each quantizer.
+        """
+        n_q = self.n_q
+        if x.shape[-1] == 0:
+            return torch.empty((x.shape[0], n_q, 0), device=x.device, dtype=torch.int64)
+
+        x = self.input_proj(x)
+        codes = self.vq.encode(x, n_q=n_q)
+        codes = codes.transpose(0, 1)
+        # codes is [B, K, T], with T frames, K nb of codebooks.
+        return codes
+
+    def decode(self, codes: torch.Tensor) -> torch.Tensor:
+        """Decode the given codes to the quantized representation."""
+        # codes is [B, K, T], with T frames, K nb of codebooks; vq.decode expects [K, B, T].
+        codes = codes.transpose(0, 1)
+        quantized = self.vq.decode(codes)
+        quantized = self.output_proj(quantized)
+        return quantized
+
+    @property
+    def total_codebooks(self):
+        return self.max_n_q
+
+    @property
+    def num_codebooks(self):
+        return self.n_q
+
+    def set_num_codebooks(self, n: int):
+        assert n >= 0 and n <= self.max_n_q
+        self.n_q = n
+
+    @property
+    def cardinality(self) -> int:
+        return self.bins
+
+
+class SplitResidualVectorQuantizer(BaseQuantizer):
+    """Residual Vector Quantizer with separate projections for the first quantizer and the rest.
+
+    Args:
+        n_q (int): Number of residual vector quantizers used.
+        n_q_semantic (int): Number of residual vector quantizers used for the semantic quantizer.
+        no_quantization_mode (str): if `true_skip`, when doing no quantization, the input will not go
+            through the sub quantizers. If `independent`, independent decisions are taken by
+            the semantic and acoustic quantizers. If `same` (the default), the same decision is taken by both.
+        **kwargs: Arguments to the constructor of `ResidualVectorQuantizer` that are shared between both.
+    """
+
+    def __init__(
+        self,
+        *,
+        n_q: int = 8,
+        no_quantization_rate: float = 0.0,
+        no_quantization_mode: str = "same",
+        n_q_semantic: int = 1,
+        **kwargs,
+    ):
+        super().__init__()
+        assert n_q > n_q_semantic, (
+            f"Number of quantizers {n_q} must be larger "
+            f"than the number of semantic quantizers {n_q_semantic}."
+        )
+        self.max_n_q = n_q
+        self.n_q_semantic = n_q_semantic
+        self.n_q_acoustic = n_q - n_q_semantic
+        if no_quantization_mode == "true_skip":
+            self.no_quantization_rate = no_quantization_rate
+            # Setting to zero for the underlying RVQ.
+            no_quantization_rate = 0.0
+        else:
+            self.no_quantization_rate = 0.0
+        if no_quantization_mode == "same":
+            kwargs["generator_seed"] = 1234
+        kwargs["no_quantization_rate"] = no_quantization_rate
+        q_dropout = kwargs.pop("q_dropout", False)
+        self.rvq_first = ResidualVectorQuantizer(
+            n_q=n_q_semantic, force_projection=True, q_dropout=False, **kwargs
+        )
+        self.rvq_rest = ResidualVectorQuantizer(
+            n_q=n_q - n_q_semantic,
+            codebook_offset=1,
+            force_projection=True,
+            q_dropout=q_dropout,
+            **kwargs,
+        )
+        if no_quantization_mode == "true_skip":
+            assert self.rvq_first.input_dimension == self.rvq_first.output_dimension
+            assert self.rvq_rest.input_dimension == self.rvq_rest.output_dimension
+
+    def _renorm_and_add(
+        self,
+        first_val: torch.Tensor,
+        rest_val: torch.Tensor,
+        n_q_semantic: int,
+        n_q_acoustic: int,
+    ):
+        """Renormalizes values from `rvq_first` and `rvq_rest` and adds them.
+
+        This allows correcting statistics that are normalized by the number of quantizers. To renormalize, we use the
+        number of quantizers that are actually used, e.g. taking into account quantizer dropout.
+        """
+        n_q = n_q_semantic + n_q_acoustic
+        renorm_first_val = first_val * n_q_semantic / n_q
+        renorm_rest_val = rest_val * n_q_acoustic / n_q
+        return renorm_first_val + renorm_rest_val
+
+    def forward(self, x: torch.Tensor, frame_rate: int):
+        """
+        Args:
+            x (torch.Tensor): Input tensor of shape [B, C, T] with `C` number of channels.
+            frame_rate (int): frame rate of the input (e.g. `T = frame_rate * duration`), used to compute
+                the bandwidth.
+
+        Returns:
+            QuantizedResult: Quantized result with the following attributes:
+                - `x` (torch.Tensor): Quantized tensor of shape [B, C, T].
+                - `codes` (torch.Tensor): Quantized codes of shape [B, K, T] with `K` number of codebooks.
+                - `bw` (torch.Tensor): Bandwidth of the quantized tensor in kbits per second.
+                - `penalty` (torch.Tensor): Commitment loss.
+                - `metrics` (dict): RVQ metrics, in particular rate of dead code replacement, and entropy.
+        """
+        semantic_result = self.rvq_first(x, frame_rate)
+        if self.n_q == self.n_q_semantic:
+            return semantic_result
+        acoustic_result = self.rvq_rest(x, frame_rate)
+        full_quantized_emb = semantic_result.x + acoustic_result.x
+        full_quantized_codes = torch.cat(
+            [semantic_result.codes, acoustic_result.codes], dim=1
+        )
+        # This is the actual number of quantizers used, e.g. taking into account quantizer dropout.
+        n_q_semantic = semantic_result.codes.shape[1]
+        n_q_acoustic = acoustic_result.codes.shape[1]
+        full_quantized_bandwidth = semantic_result.bandwidth + acoustic_result.bandwidth
+        full_quantized_penalty = self._renorm_and_add(
+            semantic_result.penalty, acoustic_result.penalty, n_q_semantic, n_q_acoustic
+        )
+        full_quantized_metrics = semantic_result.metrics
+        for key, value in acoustic_result.metrics.items():
+            if key in full_quantized_metrics:
+                full_quantized_metrics[key] = self._renorm_and_add(
+                    full_quantized_metrics[key], value, n_q_semantic, n_q_acoustic
+                )
+            else:
+                full_quantized_metrics[key] = value
+        return QuantizedResult(
+            full_quantized_emb,
+            full_quantized_codes,
+            full_quantized_bandwidth,
+            penalty=full_quantized_penalty,
+            metrics=full_quantized_metrics,
+        )
+
+    def encode(self, x: torch.Tensor) -> torch.Tensor:
+        """Encode a given input tensor with the specified frame rate at the given bandwidth.
+
+        The RVQ encode method sets the appropriate number of quantizers to use
+        and returns indices for each quantizer.
+        """
+        codes = self.rvq_first.encode(x)
+        if self.n_q > self.n_q_semantic:
+            acoustic_codes = self.rvq_rest.encode(x)
+            codes = torch.cat([codes, acoustic_codes], dim=1)
+        # codes is [B, K, T], with T frames, K nb of codebooks.
+        return codes
+
+    def decode(self, codes: torch.Tensor) -> torch.Tensor:
+        """Decode the given codes to the quantized representation."""
+        # codes is [B, K, T], with T frames, K nb of codebooks.
+        quantized = self.rvq_first.decode(codes[:, : self.n_q_semantic])
+        if codes.shape[1] > self.n_q_semantic:
+            quantized += self.rvq_rest.decode(codes[:, self.n_q_semantic :])
+        return quantized
+
+    @property
+    def total_codebooks(self):
+        return self.rvq_first.max_n_q + self.rvq_rest.max_n_q
+
+    @property
+    def num_codebooks(self):
+        return self.rvq_first.num_codebooks + self.rvq_rest.num_codebooks
+
+    @property
+    def n_q(self):
+        return self.rvq_first.n_q + self.rvq_rest.n_q
+
+    @property
+    def dimension(self):
+        return self.rvq_first.dimension
+
+    @property
+    def semantic_quantizer(self) -> ResidualVectorQuantizer:
+        """The quantizer that models the first level of the hierarchy (typically semantic)."""
+        return self.rvq_first
+
+    @property
+    def acoustic_quantizer(self) -> ResidualVectorQuantizer:
+        """The quantizer that models the higher levels of the hierarchy (typically acoustic)."""
+        return self.rvq_rest
+
+    def set_num_codebooks(self, n: int):
+        assert n >= self.n_q_semantic and n <= self.total_codebooks
+        self.rvq_rest.set_num_codebooks(n - self.n_q_semantic)
+
+    @property
+    def cardinality(self) -> int:
+        assert self.rvq_rest.cardinality == self.rvq_first.cardinality
+        return self.rvq_first.cardinality
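The bandwidth reported by `forward` follows directly from `bw_per_q = math.log2(self.bins) * frame_rate / 1000`: each codebook emits `log2(bins)` bits per frame. A standalone sketch of that arithmetic (the numeric values below are illustrative, not read from any checkpoint):

```python
import math

def rvq_bandwidth_kbps(bins: int, frame_rate: float, n_q: int) -> float:
    """Mirror of the computation in ResidualVectorQuantizer.forward:
    log2(bins) bits per frame per codebook, summed over n_q codebooks,
    expressed in kbits per second."""
    bw_per_q = math.log2(bins) * frame_rate / 1000  # kbits/s for one codebook
    return n_q * bw_per_q

# Example: 2048-entry codebooks (11 bits each) at 12.5 frames/s,
# with 8 residual levels.
bw = rvq_bandwidth_kbps(bins=2048, frame_rate=12.5, n_q=8)
```

Dropping codebooks via `set_num_codebooks` scales the bitrate linearly, which is the quality/bandwidth trade-off RVQ exposes.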
moshi/server.py ADDED
@@ -0,0 +1,256 @@
+# Copyright (c) Kyutai, all rights reserved.
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+import argparse
+import asyncio
+from dataclasses import dataclass
+import random
+import os
+from pathlib import Path
+import tarfile
+import time
+import secrets
+import sys
+
+import aiohttp
+from aiohttp import web
+from huggingface_hub import hf_hub_download
+import numpy as np
+import sentencepiece
+import sphn
+import torch
+
+from .client_utils import make_log
+from .models import loaders, MimiModel, LMModel, LMGen
+
+
+def log(level: str, msg: str):
+    print(make_log(level, msg))
+
+
+def seed_all(seed):
+    torch.manual_seed(seed)
+    if torch.cuda.is_available():
+        torch.cuda.manual_seed(seed)
+        torch.cuda.manual_seed_all(seed)  # for multi-GPU setups
+    random.seed(seed)
+    np.random.seed(seed)
+    torch.backends.cudnn.deterministic = False
+    torch.backends.cudnn.benchmark = False
+
+
+@dataclass
+class ServerState:
+    mimi: MimiModel
+    text_tokenizer: sentencepiece.SentencePieceProcessor
+    lm_gen: LMGen
+    lock: asyncio.Lock
+
+    def __init__(self, mimi: MimiModel, text_tokenizer: sentencepiece.SentencePieceProcessor,
+                 lm: LMModel, device: str | torch.device):
+        self.mimi = mimi
+        self.text_tokenizer = text_tokenizer
+        self.lm_gen = LMGen(lm)
+
+        self.device = device
+        self.frame_size = int(self.mimi.sample_rate / self.mimi.frame_rate)
+        self.lock = asyncio.Lock()
+
+        self.mimi.streaming_forever(1)
+        self.lm_gen.streaming_forever(1)
+
+    def warmup(self):
+        for _ in range(4):
+            chunk = torch.zeros(1, 1, self.frame_size, dtype=torch.float32, device=self.device)
+            codes = self.mimi.encode(chunk)
+            for c in range(codes.shape[-1]):
+                tokens = self.lm_gen.step(codes[:, :, c: c + 1])
+                if tokens is None:
+                    continue
+                _ = self.mimi.decode(tokens[:, 1:])
+        torch.cuda.synchronize()
+
+    async def handle_chat(self, request):
+        ws = web.WebSocketResponse()
+        await ws.prepare(request)
+
+        async def recv_loop():
+            nonlocal close
+            try:
+                async for message in ws:
+                    if message.type == aiohttp.WSMsgType.ERROR:
+                        log("error", f"{ws.exception()}")
+                        break
+                    elif message.type == aiohttp.WSMsgType.CLOSED:
+                        break
+                    elif message.type != aiohttp.WSMsgType.BINARY:
+                        log("error", f"unexpected message type {message.type}")
+                        continue
+                    message = message.data
+                    if not isinstance(message, bytes):
+                        log("error", f"unsupported message type {type(message)}")
+                        continue
+                    if len(message) == 0:
+                        log("warning", "empty message")
+                        continue
+                    kind = message[0]
+                    if kind == 1:  # audio
+                        payload = message[1:]
+                        opus_reader.append_bytes(payload)
+                    else:
+                        log("warning", f"unknown message kind {kind}")
+            finally:
+                close = True
+                log("info", "connection closed")
+
+        async def opus_loop():
+            all_pcm_data = None
+
+            while True:
+                if close:
+                    return
+                await asyncio.sleep(0.001)
+                pcm = opus_reader.read_pcm()
+                if pcm.shape[-1] == 0:
+                    continue
+                if all_pcm_data is None:
+                    all_pcm_data = pcm
+                else:
+                    all_pcm_data = np.concatenate((all_pcm_data, pcm))
+                while all_pcm_data.shape[-1] >= self.frame_size:
+                    be = time.time()
+                    chunk = all_pcm_data[: self.frame_size]
+                    all_pcm_data = all_pcm_data[self.frame_size:]
+                    chunk = torch.from_numpy(chunk)
+                    chunk = chunk.to(device=self.device)[None, None]
+                    codes = self.mimi.encode(chunk)
+                    for c in range(codes.shape[-1]):
+                        tokens = self.lm_gen.step(codes[:, :, c: c + 1])
+                        if tokens is None:
+                            continue
+                        assert tokens.shape[1] == self.lm_gen.lm_model.dep_q + 1
+                        main_pcm = self.mimi.decode(tokens[:, 1:])
+                        main_pcm = main_pcm.cpu()
+                        opus_writer.append_pcm(main_pcm[0, 0].numpy())
+                        text_token = tokens[0, 0, 0].item()
+                        if text_token not in (0, 3):
+                            _text = self.text_tokenizer.id_to_piece(text_token)  # type: ignore
+                            _text = _text.replace("▁", " ")
+                            msg = b"\x02" + bytes(_text, encoding="utf8")
+                            log("info", f"text token '{_text}'")
+                            await ws.send_bytes(msg)
+                    log("info", f"frame handled in {1000 * (time.time() - be):.1f}ms")
+
+        async def send_loop():
+            while True:
+                if close:
+                    return
+                await asyncio.sleep(0.001)
+                msg = opus_writer.read_bytes()
+                if len(msg) > 0:
+                    await ws.send_bytes(b"\x01" + msg)
+
+        log("info", "accepted connection")
+        close = False
+        async with self.lock:
+            opus_writer = sphn.OpusStreamWriter(self.mimi.sample_rate)
+            opus_reader = sphn.OpusStreamReader(self.mimi.sample_rate)
+            self.mimi.reset_streaming()
+            self.lm_gen.reset_streaming()
+            # Send the handshake.
+            await ws.send_bytes(b"\x00")
+            await asyncio.gather(opus_loop(), recv_loop(), send_loop())
+        log("info", "done with connection")
+        return ws
+
+
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--host", default="localhost", type=str)
+    parser.add_argument("--port", default=8998, type=int)
+    parser.add_argument("--static", type=str)
+    parser.add_argument("--gradio-tunnel", action='store_true', help='Activate a gradio tunnel.')
+    parser.add_argument("--gradio-tunnel-token",
+                        help='Provide a custom (secret) token here to keep getting the same URL.')
+
+    parser.add_argument("--tokenizer", type=str, help="Path to a local tokenizer file.")
+    parser.add_argument("--moshi-weight", type=str, help="Path to a local checkpoint file for Moshi.")
+    parser.add_argument("--mimi-weight", type=str, help="Path to a local checkpoint file for Mimi.")
+    parser.add_argument("--hf-repo", type=str, default=loaders.DEFAULT_REPO,
+                        help="HF repo to look into, defaults to Moshiko. "
+                             "Use this to select a different pre-trained model.")
+    parser.add_argument("--device", type=str, default="cuda", help="Device on which to run, defaults to 'cuda'.")
+
+    args = parser.parse_args()
+    seed_all(42424242)
+
+    setup_tunnel = None
+    tunnel_token = ''
+    if args.gradio_tunnel:
+        try:
+            from gradio import networking  # type: ignore
+        except ImportError:
+            log("error", "Cannot find gradio which is required to activate a tunnel. "
+                         "Please install with `pip install gradio`.")
+            sys.exit(1)
+        setup_tunnel = networking.setup_tunnel
+        if args.gradio_tunnel_token is None:
+            tunnel_token = secrets.token_urlsafe(32)
+        else:
+            tunnel_token = args.gradio_tunnel_token
+
+    log("info", "loading mimi")
+    if args.mimi_weight is None:
+        args.mimi_weight = hf_hub_download(args.hf_repo, loaders.MIMI_NAME)
+    mimi = loaders.get_mimi(args.mimi_weight, args.device)
+    log("info", "mimi loaded")
+
+    if args.tokenizer is None:
+        args.tokenizer = hf_hub_download(args.hf_repo, loaders.TEXT_TOKENIZER_NAME)
+    text_tokenizer = sentencepiece.SentencePieceProcessor(args.tokenizer)  # type: ignore
+
+    log("info", "loading moshi")
+    if args.moshi_weight is None:
+        args.moshi_weight = hf_hub_download(args.hf_repo, loaders.MOSHI_NAME)
+    lm = loaders.get_moshi_lm(args.moshi_weight, args.device)
+    log("info", "moshi loaded")
+
+    state = ServerState(mimi, text_tokenizer, lm, args.device)
+    log("info", "warming up the model")
+    state.warmup()
+    app = web.Application()
+    app.router.add_get("/api/chat", state.handle_chat)
+    static_path: None | str = None
+    if args.static is None:
+        log("info", "retrieving the static content")
+        dist_tgz = hf_hub_download("kyutai/moshi-artifacts", "dist.tgz")
+        dist_tgz = Path(dist_tgz)
+        dist = dist_tgz.parent / "dist"
+        if not dist.exists():
+            with tarfile.open(dist_tgz, "r:gz") as tar:
+                tar.extractall(path=dist_tgz.parent)
+        static_path = str(dist)
+    elif args.static != "none":
+        # When set to the "none" string, we don't serve any static content.
+        static_path = args.static
+    if static_path is not None:
+        async def handle_root(_):
+            return web.FileResponse(os.path.join(static_path, "index.html"))
+
+        log("info", f"serving static content from {static_path}")
+        app.router.add_get("/", handle_root)
+        app.router.add_static(
+            "/", path=static_path, follow_symlinks=True, name="static"
+        )
+    log("info", f"Access the Web UI directly at http://{args.host}:{args.port}")
+    if setup_tunnel is not None:
+        tunnel = setup_tunnel('localhost', args.port, tunnel_token, None)
+        log("info", f"Tunnel started; if executing on a remote GPU, you can use {tunnel}.")
+        log("info", "Note that this tunnel goes through the US and you might experience high latency in Europe.")
+    web.run_app(app, port=args.port)
+
+
+with torch.no_grad():
+    main()
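`handle_chat` frames every binary WebSocket message with a one-byte kind tag: `0x00` for the server handshake, `0x01` for Opus audio, and `0x02` for UTF-8 text tokens. A client-side sketch of that framing (the helper names here are illustrative, not part of the server module):

```python
# Pack/unpack helpers for the one-byte-tag framing used on the /api/chat
# WebSocket: recv_loop reads message[0] as the kind and message[1:] as
# the payload; the server prepends the tag the same way when sending.

KIND_HANDSHAKE, KIND_AUDIO, KIND_TEXT = 0, 1, 2

def pack_frame(kind: int, payload: bytes = b"") -> bytes:
    """Prefix a payload with its one-byte kind tag."""
    return bytes([kind]) + payload

def unpack_frame(message: bytes) -> tuple[int, bytes]:
    """Split a received message into (kind, payload)."""
    if not message:
        raise ValueError("empty message")
    return message[0], message[1:]

# A client would send Opus packets tagged as audio...
frame = pack_frame(KIND_AUDIO, b"<opus packet bytes>")
# ...and route incoming frames on the tag.
kind, payload = unpack_frame(b"\x02hello")
```

The empty handshake frame (`b"\x00"`) sent after the locks are acquired tells the client the server is ready before any audio flows.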
moshi/utils/__init__.py ADDED
@@ -0,0 +1,10 @@
+# Copyright (c) Kyutai, all rights reserved.
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+"""Utilities."""
moshi/utils/autocast.py ADDED
@@ -0,0 +1,45 @@
+# Copyright (c) Kyutai, all rights reserved.
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+import torch
+
+
+class TorchAutocast:
+    """TorchAutocast utility class.
+    Allows you to enable and disable autocast. This is especially useful
+    when dealing with different architectures and clusters with different
+    levels of support.
+
+    Args:
+        enabled (bool): Whether to enable torch.autocast or not.
+        args: Additional args for torch.autocast.
+        kwargs: Additional kwargs for torch.autocast.
+    """
+
+    def __init__(self, enabled: bool, *args, **kwargs):
+        self.autocast = torch.autocast(*args, **kwargs) if enabled else None
+
+    def __enter__(self):
+        if self.autocast is None:
+            return
+        try:
+            self.autocast.__enter__()
+        except RuntimeError:
+            device = self.autocast.device
+            dtype = self.autocast.fast_dtype
+            raise RuntimeError(
+                f"There was an error autocasting with dtype={dtype} device={device}\n"
+                "If you are on the FAIR Cluster, you might need to use autocast_dtype=float16"
+            )
+
+    def __exit__(self, *args, **kwargs):
+        if self.autocast is None:
+            return
+        self.autocast.__exit__(*args, **kwargs)
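The pattern behind `TorchAutocast` is an "optional context manager": a no-op when `enabled` is false, delegation to the wrapped context otherwise. The same idea in plain Python, with a recording context standing in for `torch.autocast` (names here are illustrative, not part of the module):

```python
class MaybeContext:
    """No-op unless `enabled`; otherwise defers to the wrapped context.
    A minimal sketch of the TorchAutocast pattern above."""

    def __init__(self, enabled: bool, ctx_factory):
        self.ctx = ctx_factory() if enabled else None

    def __enter__(self):
        if self.ctx is not None:
            return self.ctx.__enter__()

    def __exit__(self, *exc):
        if self.ctx is not None:
            return self.ctx.__exit__(*exc)

events = []

class Recorder:
    """Stand-in for torch.autocast that records enter/exit."""
    def __enter__(self):
        events.append("enter")
    def __exit__(self, *exc):
        events.append("exit")

with MaybeContext(False, Recorder):
    events.append("body")   # Recorder is never constructed's enter/exit
with MaybeContext(True, Recorder):
    events.append("body")   # Recorder wraps the body
```

Keeping the decision in `__init__` (rather than branching at every call site) lets callers write a single `with` block regardless of whether autocast is supported on the current hardware.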
moshi/utils/compile.py ADDED
@@ -0,0 +1,284 @@
+ # Copyright (c) Kyutai, all rights reserved.
+ # This source code is licensed under the license found in the
+ # LICENSE file in the root directory of this source tree.
+
+ """
+ Provides some extra utilities around torch compile, in particular a way
+ to fully deactivate it easily with a context manager.
+ Provides a simple activation checkpointing that is compatible with FSDP and torch compile.
+ Finally, provides some utilities for CUDA graphing functions.
+ """
+ from contextlib import contextmanager
+ from functools import wraps
+ import inspect
+ import os
+ import typing as tp
+
+ import torch
+ from torch import cuda
+
+
+ _compile_disabled: bool = False
+
+
+ @contextmanager
+ def no_compile():
+     """Disable torch.compile locally. PyTorch 2.4 now provides a builtin function to do that."""
+     global _compile_disabled
+
+     prev_disabled = _compile_disabled
+     _compile_disabled = True
+     try:
+         yield
+     finally:
+         _compile_disabled = prev_disabled
+
+
+ def torch_compile_lazy(fun):
+     """torch.compile creates a huge pool of processes, even when not using the function at all,
+     e.g. with Dora. This can pollute stderr when doing CTRL+C. So we compile in a lazy way.
+     """
+     if os.environ.get("NO_TORCH_COMPILE"):
+         return fun
+     fun_compiled = None
+
+     @wraps(fun)
+     def _wrapped(*args, **kwargs):
+         nonlocal fun_compiled
+         if _compile_disabled:
+             return fun(*args, **kwargs)
+         if fun_compiled is None:
+             fun_compiled = torch.compile(fun)
+         return fun_compiled(*args, **kwargs)
+
+     return _wrapped
+
+
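The lazy wrapper above can be exercised without torch at all. The following sketch uses a hypothetical `fake_compile` standing in for `torch.compile`, and shows that compilation happens at most once, and only on the first call made outside the disabling context:

```python
from contextlib import contextmanager
from functools import wraps

_disabled = False
compile_calls = 0  # counts how often the (stand-in) compiler actually runs


def fake_compile(fun):
    """Stand-in for torch.compile: records that compilation happened."""
    global compile_calls
    compile_calls += 1
    return fun


@contextmanager
def no_compile():
    global _disabled
    prev, _disabled = _disabled, True
    try:
        yield
    finally:
        _disabled = prev


def compile_lazy(fun):
    compiled = None

    @wraps(fun)
    def _wrapped(*args, **kwargs):
        nonlocal compiled
        if _disabled:
            return fun(*args, **kwargs)   # bypass: never triggers compilation
        if compiled is None:
            compiled = fake_compile(fun)  # compile on first real use only
        return compiled(*args, **kwargs)

    return _wrapped


@compile_lazy
def double(x):
    return 2 * x


with no_compile():
    assert double(3) == 6      # runs eagerly; no compilation yet
assert compile_calls == 0
assert double(4) == 8          # first call outside the context compiles
assert double(5) == 10         # reuses the cached compiled function
assert compile_calls == 1
```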
+ class Checkpoint(torch.autograd.Function):
+     @staticmethod
+     def forward(ctx, function, *args) -> tp.Any:
+         to_save = []
+         ctx.others = []
+         ctx.function = function
+         # Sources will indicate whether the arg in position N is
+         # a tensor stored in ctx.save_for_backward, or inside ctx.others.
+         ctx.sources = []
+         new_args = []
+         for arg in args:
+             if isinstance(arg, torch.Tensor):
+                 to_save.append(arg)
+                 ctx.sources.append("tensor")
+                 new_args.append(arg.detach())
+             else:
+                 ctx.sources.append("other")
+                 ctx.others.append(arg)
+                 new_args.append(arg)
+         ctx.save_for_backward(*to_save)
+         # During the forward, we just make a pass with no gradient computed.
+         with torch.no_grad():
+             res = function(*new_args)
+         return res
+
+     @staticmethod
+     def backward(ctx, *grads) -> tp.Tuple[tp.Optional[torch.Tensor], ...]:
+         pseudo_tensors = []
+         with torch.set_grad_enabled(True):
+             # We create leaf tensors to collect the output gradients.
+             # We call them pseudo_tensors because they are pretending to be the input
+             # to `function`, while not being the original inputs.
+             for tensor in ctx.saved_tensors:
+                 pseudo_tensor = tensor.detach()
+                 pseudo_tensor.requires_grad_(True)
+                 pseudo_tensors.append(pseudo_tensor)
+             pseudo_tensors_copy = list(pseudo_tensors)
+             args = []
+             for source in ctx.sources:
+                 if source == "other":
+                     args.append(ctx.others.pop(0))
+                 else:
+                     assert source == "tensor"
+                     args.append(pseudo_tensors_copy.pop(0))
+             res = ctx.function(*args)
+             # The second forward with grad computation allows us to connect the input leaf tensors
+             # inside pseudo_tensors to the outputs of the function called.
+             if not isinstance(res, tuple):
+                 res = (res,)
+             # Now we just ask Torch to compute the derivative of `res` given the gradient coming from above
+             # in `grads`. The computed gradients will end up in the `pseudo_tensors` grad attributes.
+             torch.autograd.backward(res, grads)
+         out: tp.List[tp.Optional[torch.Tensor]] = [None]
+         for source in ctx.sources:
+             # We still need to output `None` values for non tensor parameters.
+             if source == "other":
+                 out.append(None)
+             else:
+                 assert source == "tensor"
+                 out.append(pseudo_tensors.pop(0).grad)
+         return tuple(out)
+
+
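For comparison only (this is PyTorch's builtin API, not this file's), `torch.utils.checkpoint` exhibits the same two-pass behaviour that `Checkpoint` implements by hand: the forward runs once without grad, then is re-run during backward to rebuild the graph. A small sketch, assuming torch is available:

```python
import torch
from torch.utils.checkpoint import checkpoint

calls = {"forward": 0}


def layer(x):
    # Count how many times the forward actually executes.
    calls["forward"] += 1
    return (x * x).sum()


x = torch.ones(4, requires_grad=True)
# Forward pass runs under no_grad internally; intermediate activations are dropped.
out = checkpoint(layer, x, use_reentrant=False)
# During backward, the forward is recomputed to reconnect the graph.
out.backward()
assert calls["forward"] == 2
assert torch.allclose(x.grad, 2 * torch.ones(4))  # d/dx sum(x^2) = 2x
```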
+ def simple_checkpoint(module: torch.nn.Module, *args, **kwargs):
+     """Custom implementation of checkpointing in PyTorch as the builtin implementation is broken
+     when using torch compile. Only supports wrapping a `nn.Module` whose forward takes no `*args` or `**kwargs`.
+
+     See https://github.com/pytorch/pytorch/issues/97436.
+     Should be resolved in nightlies, but it is quite fun and simple to code it ourselves.
+     """
+     if hasattr(module, "_fsdp_wrapped_module"):
+         module_for_sig = module._fsdp_wrapped_module
+     else:
+         module_for_sig = module
+     sig = inspect.signature(module_for_sig.forward)
+     # We first flatten all arguments to use only *args, to make things easier and because
+     # torch.autograd.Function has weird support for kwargs.
+     bounded = sig.bind(*args, **kwargs)
+     new_args = []
+     for name, param in sig.parameters.items():
+         if param.kind in {
+             inspect.Parameter.VAR_POSITIONAL,
+             inspect.Parameter.VAR_KEYWORD,
+         }:
+             raise RuntimeError("simple_checkpoint doesn't support var args.")
+         if name not in bounded.arguments:
+             break
+         new_args.append(bounded.arguments[name])
+     return Checkpoint.apply(module, *new_args)
+
+
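The argument flattening in `simple_checkpoint` is plain `inspect` machinery: bind the mixed positional/keyword call, then read the arguments back in declaration order. A minimal, torch-free sketch (the `forward` signature here is hypothetical):

```python
import inspect


def forward(x, scale=1.0, bias=0.0):
    return x * scale + bias


# Bind mixed positional/keyword arguments against the signature.
sig = inspect.signature(forward)
bounded = sig.bind(2.0, 4.0, bias=3.0)

# Flatten back into purely positional args, in declaration order.
new_args = []
for name, param in sig.parameters.items():
    if name not in bounded.arguments:
        break  # stop at the first argument the caller did not supply
    new_args.append(bounded.arguments[name])

assert new_args == [2.0, 4.0, 3.0]
```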
+ _in_cuda_graph = False
+ _disable_cuda_graph = False
+
+
+ def in_cuda_graph() -> bool:
+     """Indicate whether we are in a function that is being CUDA graphed (or soon will be)."""
+     return _in_cuda_graph
+
+
+ @contextmanager
+ def _set_in_cuda_graph():
+     global _in_cuda_graph
+     assert not _in_cuda_graph
+     _in_cuda_graph = True
+     try:
+         yield
+     finally:
+         _in_cuda_graph = False
+
+
+ def _is_cuda_graph_enabled() -> bool:
+     if _disable_cuda_graph:
+         return False
+     no_cuda_graph = os.environ.get("NO_CUDA_GRAPH", "")
+     if no_cuda_graph.lower() not in {"0", "no", "n", ""}:
+         return False
+     return True
+
+
+ @contextmanager
+ def no_cuda_graph():
+     """Deactivate CUDA graphing for all the calls inside this context manager."""
+     global _disable_cuda_graph
+     old_value = _disable_cuda_graph
+     _disable_cuda_graph = True
+     try:
+         yield
+     finally:
+         _disable_cuda_graph = old_value
+
+
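The `NO_CUDA_GRAPH` check treats the variable as an opt-out flag: the empty string and falsy-looking values (`"0"`, `"no"`, `"n"`, case-insensitive) leave graphing on, anything else turns it off. A torch-free sketch (`graphing_enabled` is an illustrative name, not part of the module):

```python
def graphing_enabled(env: dict) -> bool:
    """Mirror of the NO_CUDA_GRAPH parsing above, taking the env as a dict."""
    flag = env.get("NO_CUDA_GRAPH", "")
    return flag.lower() in {"0", "no", "n", ""}


assert graphing_enabled({})                          # unset: graphing on
assert graphing_enabled({"NO_CUDA_GRAPH": "0"})      # explicit falsy value: still on
assert graphing_enabled({"NO_CUDA_GRAPH": "No"})     # case-insensitive
assert not graphing_enabled({"NO_CUDA_GRAPH": "1"})  # anything else disables
assert not graphing_enabled({"NO_CUDA_GRAPH": "yes"})
```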
+ class CUDAGraphed:
+     """Allow simple CUDA graphing of a function.
+
+     Args:
+         func: callable, taking any number of arguments. Its tensor arguments should
+             be top level args, not nested in structures (tuples, dicts, etc). Keyword
+             arguments are NOT supported for simplicity.
+         warmup_steps: how many calls to make normally before CUDA graphing. In particular, this
+             allows torch.compiled functions to get properly compiled.
+         disable: if True, just call the func directly, useful to quickly deactivate on CPU.
+     """
+
+     def __init__(self, func: tp.Callable, warmup_steps: int = 1, disable: bool = False):
+         self.func = func
+         self.warmup_steps = warmup_steps
+         self.disable = disable
+         self._graph: cuda.CUDAGraph | None = None
+         self._output: tuple | None = None
+         self._args: tuple | None = None
+
+     def reset(self, warmup_steps: int = 0) -> None:
+         """Reset the state, meaning the next call will get CUDA graphed again. Useful if some
+         shapes have changed, or external state (e.g. KVCache) has changed."""
+         self.warmup_steps = warmup_steps
+         self._graph = None
+         self._output = None
+         self._args = None
+
+     def __call__(self, *args, **kwargs) -> tp.Any:
+         if kwargs:
+             raise RuntimeError("Named arguments not supported for now.")
+         if self.disable or not _is_cuda_graph_enabled() or in_cuda_graph():
+             return self.func(*args, **kwargs)
+
+         def _clone_tensors(args: tuple) -> tuple:
+             out: list = []
+             for arg in args:
+                 if isinstance(arg, torch.Tensor):
+                     arg = arg.clone()
+                 out.append(arg)
+             return tuple(out)
+
+         def _match_values_copy_tensors(args: tuple, target_args: tuple) -> None:
+             if len(args) != len(target_args):
+                 raise ValueError(
+                     f"Expected {len(target_args)} arguments, but got {len(args)} for CUDA graphed function."
+                 )
+             for idx, (source, target) in enumerate(zip(args, target_args)):
+                 if isinstance(target, torch.Tensor):
+                     if not isinstance(source, torch.Tensor):
+                         raise ValueError(
+                             f"Argument #{idx} was a tensor, and is no longer (now {source})."
+                         )
+                     if source.shape != target.shape:
+                         raise ValueError(
+                             f"Argument #{idx} had shape {target.shape}, but got shape {source.shape}."
+                         )
+                     target.copy_(source)
+                 else:
+                     if isinstance(source, torch.Tensor):
+                         raise ValueError(
+                             f"Argument #{idx} was not a tensor {target}, but is now one."
+                         )
+                     if source is not target and source != target:
+                         raise ValueError(
+                             f"Argument #{idx} changed value from {target} to {source}."
+                         )
+
+         with _set_in_cuda_graph():
+             # Prevent anyone below us from trying to CUDA graph things.
+             if self._graph is None:
+                 if self.warmup_steps <= 0:
+                     self._graph = cuda.CUDAGraph()
+                     # Making a copy just to ensure those are not used elsewhere.
+                     self._args = _clone_tensors(args)
+                     with cuda.graph(self._graph):
+                         self._output = self.func(*self._args)
+                     # At this point nothing has really run, so we have to replay the graph once for real.
+                     self._graph.replay()
+                     return self._output
+                 else:
+                     self.warmup_steps -= 1
+                     return self.func(*args)
+             else:
+                 assert self._args is not None
+                 assert self._output is not None
+                 _match_values_copy_tensors(args, self._args)
+                 self._graph.replay()
+                 return self._output
+
+
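A CPU-only analogy of the buffer discipline enforced by `_match_values_copy_tensors` (illustrative names, no real graph capture happens here): replay always reads the captured storages, so new inputs must be copied in place with `copy_`, never passed as fresh tensors:

```python
import torch


def capture(func, example_input):
    """Freeze which tensor storages the computation reads and writes.

    A real CUDA graph would record the kernels; here we just keep the buffers
    and recompute in place, which preserves the same calling discipline.
    """
    static_in = example_input.clone()  # captured input buffer
    static_out = func(static_in)       # captured output buffer

    def replay():
        # Stands in for graph.replay(): re-runs on the same storages.
        static_out.copy_(func(static_in))

    return static_in, static_out, replay


static_in, static_out, replay = capture(lambda x: x * 2, torch.zeros(3))

# New data goes *into* the captured buffer, as _match_values_copy_tensors does.
static_in.copy_(torch.tensor([1.0, 2.0, 3.0]))
replay()
assert torch.equal(static_out, torch.tensor([2.0, 4.0, 6.0]))
```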
+ def cuda_graph(func: tp.Callable, warmup_steps: int = 1):
+     """Just calls `CUDAGraphed` on the given function."""
+     if not _is_cuda_graph_enabled():
+         return func
+     return CUDAGraphed(func, warmup_steps)
moshi/utils/sampling.py ADDED
@@ -0,0 +1,126 @@
+ # Copyright (c) Kyutai, all rights reserved.
+ # This source code is licensed under the license found in the
+ # LICENSE file in the root directory of this source tree.
+
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
+ # All rights reserved.
+ #
+ # This source code is licensed under the license found in the
+ # LICENSE file in the root directory of this source tree.
+
+
+ import torch
+
+
+ def multinomial(
+     input: torch.Tensor, num_samples: int, replacement=False, *, generator=None
+ ):
+     """torch.multinomial with an arbitrary number of dimensions, and number of candidates on the last dimension.
+
+     Args:
+         input (torch.Tensor): The input tensor containing probabilities.
+         num_samples (int): Number of samples to draw.
+         replacement (bool): Whether to draw with replacement or not.
+     Keyword args:
+         generator (torch.Generator): A pseudorandom number generator for sampling.
+     Returns:
+         torch.Tensor: Last dimension contains num_samples indices
+             sampled from the multinomial probability distribution
+             located in the last dimension of tensor input.
+     """
+     input_ = input.reshape(-1, input.shape[-1])
+     # We should probably be able to remove this once the following PR has landed:
+     # https://github.com/pytorch/pytorch/pull/134818/files
+     # In the meantime, we specialize the no-replacement, num_samples=1 case so as not
+     # to have a synchronization point.
+     if replacement or num_samples != 1:
+         output_ = torch.multinomial(
+             input_,
+             num_samples=num_samples,
+             replacement=replacement,
+             generator=generator,
+         )
+     else:
+         q = torch.empty_like(input_).exponential_(1, generator=generator)
+         q = input_ / q
+         output_ = q.argmax(dim=-1, keepdim=True)
+     output = output_.reshape(*list(input.shape[:-1]), -1)
+     return output
+
+
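The no-replacement fast path relies on the "exponential race": if q_i ~ Exp(1), then q_i / p_i is exponential with rate p_i, the smallest such arrival wins with probability proportional to its rate, and argmin(q / p) = argmax(p / q) is therefore distributed according to the normalized p, with no CPU-GPU synchronization. A quick statistical check (tolerances chosen loosely):

```python
import torch

torch.manual_seed(0)
probs = torch.tensor([0.1, 0.6, 0.3])

# Draw many samples via the exponential-race trick and compare frequencies to probs.
n = 20000
q = torch.empty(n, 3).exponential_(1.0)
samples = (probs / q).argmax(dim=-1)
freq = torch.bincount(samples, minlength=3).float() / n
assert torch.allclose(freq, probs, atol=0.02)
```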
+ def sample_top_k(probs: torch.Tensor, k: int) -> torch.Tensor:
+     """Sample next token from top K values along the last dimension of the input probs tensor.
+
+     Args:
+         probs (torch.Tensor): Input probabilities with token candidates on the last dimension.
+         k (int): The k in “top-k”.
+     Returns:
+         torch.Tensor: Sampled tokens.
+     """
+     probs, indices = torch.topk(probs, k, dim=-1)
+     next_token = multinomial(probs, num_samples=1)
+     next_token = indices.gather(-1, next_token)
+     return next_token
+
+
+ def sample_top_p(probs: torch.Tensor, p: float) -> torch.Tensor:
+     """Sample next token from top P probabilities along the last dimension of the input probs tensor.
+
+     Args:
+         probs (torch.Tensor): Input probabilities with token candidates on the last dimension.
+         p (float): The p in “top-p”.
+     Returns:
+         torch.Tensor: Sampled tokens.
+     """
+     probs_sort, probs_idx = torch.sort(probs, dim=-1, descending=True)
+     probs_sum = torch.cumsum(probs_sort, dim=-1)
+     mask = probs_sum - probs_sort > p
+     probs_sort *= (~mask).float()
+     probs_sort.div_(probs_sort.sum(dim=-1, keepdim=True))
+     next_token = multinomial(probs_sort, num_samples=1)
+     next_token = torch.gather(probs_idx, -1, next_token)
+     return next_token
+
+
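The masking rule `probs_sum - probs_sort > p` keeps every token whose cumulative mass *excluding itself* is at most p, so the most probable token always survives even when its own mass exceeds p. A small check with hand-picked values:

```python
import torch

probs = torch.tensor([0.5, 0.3, 0.15, 0.05])
p = 0.7

probs_sort, probs_idx = torch.sort(probs, descending=True)
cum = torch.cumsum(probs_sort, dim=-1)
# Mass before each token: [0.0, 0.5, 0.8, 0.95]; tokens whose prior mass
# exceeds p fall outside the nucleus and get masked.
mask = cum - probs_sort > p
kept = probs_idx[~mask]
assert set(kept.tolist()) == {0, 1}
```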
+ def sample_token(
+     logits: torch.Tensor,
+     use_sampling: bool = False,
+     temp: float = 1.0,
+     top_k: int = 0,
+     top_p: float = 0.0,
+ ) -> torch.Tensor:
+     """Given logits of shape [*, Card], returns a LongTensor of shape [*]."""
+     # Apply softmax for sampling if temp > 0. Else, do greedy sampling to avoid zero division error.
+     if use_sampling and temp > 0.0:
+         probs = torch.softmax(logits / temp, dim=-1)
+         if top_p > 0.0:
+             next_token = sample_top_p(probs, p=top_p)
+         elif top_k > 0:
+             next_token = sample_top_k(probs, k=top_k)
+         else:
+             next_token = multinomial(probs, num_samples=1)
+     else:
+         next_token = torch.argmax(logits, dim=-1, keepdim=True)
+     assert next_token.shape[-1] == 1
+     return next_token[..., 0]
+
+
+ if __name__ == "__main__":
+     torch.manual_seed(1234)
+     device = "cpu"
+     if torch.cuda.is_available():
+         torch.backends.cuda.matmul.allow_tf32 = False
+         torch.backends.cudnn.allow_tf32 = False
+         device = "cuda:0"
+
+     ps = torch.tensor([5.0, 2.0, 12.0, 6.0, 8.0, 1.0, 0.0, 4.0], device=device)
+     cnts = torch.zeros(ps.shape, dtype=torch.long, device=device)
+     total_samples = 1000
+     for _ in range(total_samples):
+         vs = multinomial(ps, num_samples=1, replacement=False)
+         cnts[vs] += 1
+     diff = cnts / cnts.sum() - ps / ps.sum()
+     max_diff = diff.abs().max().cpu().item()
+     print(ps / ps.sum())
+     print(cnts / cnts.sum())
+     assert max_diff < 1.5e-2
pyproject.toml ADDED
@@ -0,0 +1,33 @@
+ [project]
+ name = "moshi"
+ requires-python = ">= 3.10"
+ description = "Moshi is moshi"
+ dependencies = [
+     "numpy >= 1.26, < 2.2",
+     "safetensors >= 0.4.0, < 0.5",
+     "huggingface-hub >= 0.24, < 0.25",
+     "einops == 0.7",
+     "sentencepiece == 0.2",
+     "sounddevice == 0.5",
+     "sphn >= 0.1.4",
+     "torch >= 2.2.0, < 2.5",
+     "aiohttp >= 3.10.5, < 3.11",
+ ]
+ authors = [{name="Laurent Mazaré", email="laurent@kyutai.org"}]
+ maintainers = [{name="Laurent Mazaré", email="laurent@kyutai.org"}]
+ license = {text = "MIT"}
+ dynamic = ["version"]
+
+ [tool.setuptools.dynamic]
+ version = {attr = "moshi.__version__"}
+
+ [build-system]
+ requires = ["setuptools"]
+ build-backend = "setuptools.build_meta"
+
+ [project.optional-dependencies]
+ dev = [
+     "pyright",
+     "flake8",
+     "pre-commit",
+ ]
requirements.txt ADDED
@@ -0,0 +1,10 @@
+ einops==0.7.0
+ safetensors==0.4.4
+ sentencepiece==0.2.0
+ sounddevice==0.5.0
+ soundfile==0.12.1
+ sphn==0.1.4
+ torch==2.2.0
+ numpy==1.26.4
+ aiohttp>=3.10.5, <3.11
+ huggingface-hub==0.24.6
setup.cfg ADDED
@@ -0,0 +1,10 @@
+ [pep8]
+ max-line-length = 120
+
+ [flake8]
+ max-line-length = 120
+ ignore = E203,E704
+ exclude =
+     dist
+     build
+