Upload WavCoch random-init model (WavCochV8192CausalConfig)

Browse files

Files changed (6) hide show

README.md +47 -0
config.json +71 -0
configuration_wavcoch.py +73 -0
configure_wavcoch.py +8 -0
model.safetensors +3 -0
modeling_wavcoch.py +583 -0

README.md ADDED Viewed

	@@ -0,0 +1,47 @@

+---
+license: apache-2.0
+tags:
+- audio
+- speech
+- tokenizer
+- vocoder
+- wavcoch
+library_name: transformers
+---
+# WavCochCausalV8192-vocoder-randinit
+**WavCoch** is a causal waveform-to-cochleagram tokenizer by **Greta Tuckute** and **Klemen Kotar**.
+This repository contains a freshly initialized `WavCochV8192CausalConfig` model with a bundled, randomly initialized vocoder. The weights are randomly initialized; they have not been trained and were not loaded from a checkpoint.
+## Model Details
+| Parameter | Value |
+|-----------|-------|
+| Parameters | ~24.42M |
+| Window Size | 1001 |
+| Hop Length | 80 |
+| Encoder Dim | 512 |
+| Vocabulary Size | 8192 |
+| Includes Vocoder | True |
+## Usage
+```python
+from transformers import AutoModel
+wavcoch = AutoModel.from_pretrained(
+    "TuKoResearch/WavCochCausalV8192-vocoder-randinit",
+    trust_remote_code=True,
+)
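+# waveform_tensor: 16 kHz audio as a tensor shaped (T,), (B, T) or (B, 1, T)
+codes = wavcoch.quantize(waveform_tensor)  # integer token ids, (B, frames)
+coch = wavcoch.decode(codes)  # predicted cochleagram, (B, frames, 211)
+audio = wavcoch.decode_audio(codes)  # waveform from the bundled vocoder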
+```
+## Notes
+This repo includes a bundled vocoder and supports `decode_audio(...)` for end-to-end waveform synthesis.

config.json ADDED Viewed

	@@ -0,0 +1,71 @@

+{
+  "model_type": "wavcoch",
+  "architectures": [
+    "WavCoch"
+  ],
+  "auto_map": {
+    "AutoConfig": "configuration_wavcoch.WavCochConfig",
+    "AutoModel": "modeling_wavcoch.WavCoch"
+  },
+  "torch_dtype": "float32",
+  "transformers_version": "4.40.0",
+  "sample_rate": 16000,
+  "causal_pad_mode": "repeat",
+  "out_channels": 211,
+  "has_vocoder": true,
+  "vocoder_upsample_rates": [
+    5,
+    4,
+    2,
+    2
+  ],
+  "vocoder_upsample_kernel_sizes": [
+    10,
+    8,
+    4,
+    4
+  ],
+  "vocoder_upsample_initial_channel": 512,
+  "vocoder_resblock": "1",
+  "vocoder_resblock_kernel_sizes": [
+    11,
+    7,
+    3
+  ],
+  "vocoder_resblock_dilation_sizes": [
+    [
+      1,
+      3,
+      5
+    ],
+    [
+      1,
+      3,
+      5
+    ],
+    [
+      1,
+      3,
+      5
+    ]
+  ],
+  "window_size": 1001,
+  "window_padding": 1000,
+  "hop_length": 80,
+  "causal_convs": true,
+  "encoder_layers": 8,
+  "encoder_dim": 512,
+  "encoder_kernel_size": 3,
+  "decoder_layers": 8,
+  "decoder_dim": 512,
+  "decoder_kernel_size": 9,
+  "quantizer": "FSQ",
+  "channels": [
+    8,
+    8,
+    8,
+    4,
+    4
+  ],
+  "vocab_size": 8192
+}

configuration_wavcoch.py ADDED Viewed

	@@ -0,0 +1,73 @@

+"""
+WavCoch configuration for Hugging Face Transformers.
+"""
+from transformers import PretrainedConfig
+class WavCochConfig(PretrainedConfig):
+    """Configuration class for WavCoch checkpoints with optional vocoder."""
+    model_type = "wavcoch"
+    def __init__(
+        self,
+        window_size: int = 1001,
+        window_padding: int = 1000,
+        hop_length: int = 80,
+        out_channels: int = 211,
+        causal_convs: bool = True,
+        causal_pad_mode: str = "repeat",
+        encoder_layers: int = 8,
+        encoder_dim: int = 512,
+        encoder_kernel_size: int = 3,
+        decoder_layers: int = 8,
+        decoder_dim: int = 512,
+        decoder_kernel_size: int = 9,
+        quantizer: str = "FSQ",
+        channels=None,
+        vocab_size: int = None,
+        sample_rate: int = 16000,
+        has_vocoder: bool = False,
+        vocoder_upsample_rates=None,
+        vocoder_upsample_kernel_sizes=None,
+        vocoder_upsample_initial_channel: int = 512,
+        vocoder_resblock: str = "1",
+        vocoder_resblock_kernel_sizes=None,
+        vocoder_resblock_dilation_sizes=None,
+        **kwargs,
+    ):
+        channels = list(channels or [8, 8, 8, 4, 4])
+        if vocab_size is None:
+            vocab_size = 1
+            for level in channels:
+                vocab_size *= int(level)
+        self.window_size = int(window_size)
+        self.window_padding = int(window_padding)
+        self.hop_length = int(hop_length)
+        self.out_channels = int(out_channels)
+        self.causal_convs = bool(causal_convs)
+        self.causal_pad_mode = str(causal_pad_mode)
+        self.encoder_layers = int(encoder_layers)
+        self.encoder_dim = int(encoder_dim)
+        self.encoder_kernel_size = int(encoder_kernel_size)
+        self.decoder_layers = int(decoder_layers)
+        self.decoder_dim = int(decoder_dim)
+        self.decoder_kernel_size = int(decoder_kernel_size)
+        self.quantizer = str(quantizer)
+        self.channels = channels
+        self.vocab_size = int(vocab_size)
+        self.sample_rate = int(sample_rate)
+        self.has_vocoder = bool(has_vocoder)
+        self.vocoder_upsample_rates = list(vocoder_upsample_rates or [5, 4, 2, 2])
+        self.vocoder_upsample_kernel_sizes = list(vocoder_upsample_kernel_sizes or [10, 8, 4, 4])
+        self.vocoder_upsample_initial_channel = int(vocoder_upsample_initial_channel)
+        self.vocoder_resblock = str(vocoder_resblock)
+        self.vocoder_resblock_kernel_sizes = list(vocoder_resblock_kernel_sizes or [11, 7, 3])
+        self.vocoder_resblock_dilation_sizes = [
+            list(d) for d in (vocoder_resblock_dilation_sizes or [[1, 3, 5], [1, 3, 5], [1, 3, 5]])
+        ]
+        super().__init__(**kwargs)

configure_wavcoch.py ADDED Viewed

	@@ -0,0 +1,8 @@

+"""
+Backward-compatible import shim for older WavCoch repos.
+"""
+# Re-export the config class from its current module so that imports of
+# ``configure_wavcoch.WavCochConfig`` keep resolving.
+from .configuration_wavcoch import WavCochConfig
+# Only the config class is part of this shim's public surface.
+__all__ = ["WavCochConfig"]

model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:487d0a8c2dba58367919fe898e3a1812bcaf931b5ef5b97792d7c3ac8f4de15a
+size 97726648

modeling_wavcoch.py ADDED Viewed

	@@ -0,0 +1,583 @@

+"""
+WavCoch model for Hugging Face Transformers.
+This implementation is self-contained so HF-hosted WavCoch checkpoints do not
+depend on the local auristream package or vector_quantize_pytorch.
+"""
+import math
+import os
+from typing import List, Optional
+os.environ.setdefault("USE_TORCH_XLA", "0")
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from torch.nn import Conv1d, ConvTranspose1d
+from torch.nn.utils import remove_weight_norm
+try:
+    from torch.nn.utils.parametrizations import weight_norm
+except ImportError:  # pragma: no cover - older PyTorch compatibility
+    from torch.nn.utils import weight_norm
+from transformers import PreTrainedModel
+try:
+    from transformers.tokenization_utils_base import BatchEncoding
+except ImportError:  # pragma: no cover - compatibility with older Transformers
+    from transformers.tokenization_utils import BatchEncoding
+import transformers.modeling_utils as transformers_modeling_utils
+import transformers.utils.import_utils as transformers_import_utils
+# Replace ``is_torch_xla_available`` on both Transformers modules with a stub
+# that accepts any arguments and always returns False.
+# NOTE(review): these assignments mutate the imported modules themselves, so
+# the override is visible to all other code in the same process -- confirm
+# that this process-wide effect is intended.
+transformers_import_utils.is_torch_xla_available = lambda *args, **kwargs: False
+transformers_modeling_utils.is_torch_xla_available = lambda *args, **kwargs: False
+try:
+    from .configuration_wavcoch import WavCochConfig
+except ImportError:  # pragma: no cover - compatibility with older repos
+    from .configure_wavcoch import WavCochConfig
+class CausalConv1d(nn.Module):
+    """1D causal convolution with left-only padding."""
+    def __init__(
+        self,
+        in_channels: int,
+        out_channels: int,
+        kernel_size: int,
+        stride: int = 1,
+        dilation: int = 1,
+        bias: bool = True,
+        groups: int = 1,
+        pad_mode: str = "repeat",
+        constant_value: float = 0.0,
+    ):
+        super().__init__()
+        left_pad = dilation * (kernel_size - 1)
+        if pad_mode == "repeat":
+            self.pad = nn.ReplicationPad1d((left_pad, 0))
+        elif pad_mode == "constant":
+            self.pad = nn.ConstantPad1d((left_pad, 0), constant_value)
+        else:
+            raise ValueError(f"Unsupported pad_mode: {pad_mode}")
+        self.conv = nn.Conv1d(
+            in_channels,
+            out_channels,
+            kernel_size=kernel_size,
+            stride=stride,
+            padding=0,
+            dilation=dilation,
+            groups=groups,
+            bias=bias,
+        )
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        return self.conv(self.pad(x))
+class FSQ(nn.Module):
+    """Finite Scalar Quantization with the subset of functionality needed for inference."""
+    def __init__(self, levels: List[int], dim: int):
+        super().__init__()
+        if not levels:
+            raise ValueError("FSQ levels must be non-empty")
+        self.levels = [int(level) for level in levels]
+        self.codebook_dim = len(self.levels)
+        self.dim = int(dim)
+        level_tensor = torch.tensor(self.levels, dtype=torch.int32)
+        basis = torch.cumprod(torch.tensor([1] + self.levels[:-1], dtype=torch.int32), dim=0)
+        self.register_buffer("_levels", level_tensor, persistent=False)
+        self.register_buffer("_basis", basis, persistent=False)
+        if self.dim != self.codebook_dim:
+            self.project_in = nn.Linear(self.dim, self.codebook_dim)
+            self.project_out = nn.Linear(self.codebook_dim, self.dim)
+        else:
+            self.project_in = nn.Identity()
+            self.project_out = nn.Identity()
+    def _refresh_level_buffers(self, device: Optional[torch.device] = None):
+        level_values = [int(level) for level in self.levels]
+        if device is None:
+            if isinstance(self.project_in, nn.Linear):
+                device = self.project_in.weight.device
+            elif isinstance(self.project_out, nn.Linear):
+                device = self.project_out.weight.device
+            else:
+                device = self._levels.device
+        self._levels = torch.tensor(level_values, dtype=torch.int32, device=device)
+        self._basis = torch.cumprod(
+            torch.tensor([1] + level_values[:-1], dtype=torch.int32, device=device),
+            dim=0,
+        )
+    def bound(self, z: torch.Tensor, eps: float = 1e-3) -> torch.Tensor:
+        levels = self._levels.to(dtype=z.dtype, device=z.device)
+        half_l = (levels - 1) * (1 + eps) / 2
+        offset = torch.where(
+            (self._levels % 2).to(device=z.device) == 0,
+            torch.tensor(0.5, device=z.device, dtype=z.dtype),
+            torch.tensor(0.0, device=z.device, dtype=z.dtype),
+        )
+        shift = (offset / half_l).atanh()
+        return (z + shift).tanh() * half_l - offset
+    def _scale_and_shift(self, zhat_normalized: torch.Tensor) -> torch.Tensor:
+        half_width = (self._levels // 2).to(dtype=zhat_normalized.dtype, device=zhat_normalized.device)
+        return (zhat_normalized * half_width) + half_width
+    def _scale_and_shift_inverse(self, zhat: torch.Tensor) -> torch.Tensor:
+        half_width = (self._levels // 2).to(dtype=zhat.dtype, device=zhat.device)
+        return (zhat - half_width) / half_width
+    def quantize_values(self, z: torch.Tensor) -> torch.Tensor:
+        self._refresh_level_buffers(device=z.device)
+        half_width = (self._levels // 2).to(dtype=z.dtype, device=z.device)
+        return self.bound(z).round() / half_width
+    def codes_to_indices(self, zhat: torch.Tensor) -> torch.Tensor:
+        self._refresh_level_buffers(device=zhat.device)
+        zhat = self._scale_and_shift(zhat)
+        basis = self._basis.to(device=zhat.device, dtype=zhat.dtype)
+        return (zhat * basis).sum(dim=-1).to(torch.int32)
+    def indices_to_level_indices(self, indices: torch.Tensor) -> torch.Tensor:
+        self._refresh_level_buffers(device=indices.device)
+        indices = indices.unsqueeze(-1)
+        levels = self._levels.to(device=indices.device)
+        basis = self._basis.to(device=indices.device)
+        return (indices // basis) % levels
+    def indices_to_codes(self, indices: torch.Tensor) -> torch.Tensor:
+        self._refresh_level_buffers(device=indices.device)
+        level_indices = self.indices_to_level_indices(indices)
+        codes = self._scale_and_shift_inverse(level_indices.to(dtype=torch.float32))
+        return self.project_out(codes)
+    def forward(self, z: torch.Tensor):
+        orig_dtype = z.dtype
+        z = self.project_in(z.to(torch.float32))
+        q = self.quantize_values(z)
+        indices = self.codes_to_indices(q)
+        out = self.project_out(q).to(orig_dtype)
+        return out, indices.long()
+LRELU_SLOPE = 0.1
+def get_padding(kernel_size: int, dilation: int = 1) -> int:
+    return int((kernel_size * dilation - dilation) / 2)
+def init_weights(module, mean: float = 0.0, std: float = 0.01):
+    classname = module.__class__.__name__
+    if classname.find("Conv") != -1 and hasattr(module, "weight"):
+        module.weight.data.normal_(mean, std)
+class ResBlock1(nn.Module):
+    __constants__ = ["lrelu_slope"]
+    def __init__(self, channels: int, kernel_size: int = 3, dilation=(1, 3, 5)):
+        super().__init__()
+        self.lrelu_slope = LRELU_SLOPE
+        ch = channels
+        ks = kernel_size
+        self.convs1 = nn.Sequential(
+            weight_norm(Conv1d(ch, ch, ks, 1, get_padding(ks, dilation[0]), dilation[0])),
+            weight_norm(Conv1d(ch, ch, ks, 1, get_padding(ks, dilation[1]), dilation[1])),
+            weight_norm(Conv1d(ch, ch, ks, 1, get_padding(ks, dilation[2]), dilation[2])),
+        )
+        self.convs2 = nn.Sequential(
+            weight_norm(Conv1d(ch, ch, ks, 1, get_padding(ks, 1))),
+            weight_norm(Conv1d(ch, ch, ks, 1, get_padding(ks, 1))),
+            weight_norm(Conv1d(ch, ch, ks, 1, get_padding(ks, 1))),
+        )
+        self.convs1.apply(init_weights)
+        self.convs2.apply(init_weights)
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        for conv1, conv2 in zip(self.convs1, self.convs2):
+            xt = F.leaky_relu(x, self.lrelu_slope)
+            xt = conv1(xt)
+            xt = F.leaky_relu(xt, self.lrelu_slope)
+            xt = conv2(xt)
+            x = xt + x
+        return x
+    def remove_weight_norm(self):
+        for layer in self.convs1:
+            remove_weight_norm(layer)
+        for layer in self.convs2:
+            remove_weight_norm(layer)
+class ResBlock2(nn.Module):
+    __constants__ = ["lrelu_slope"]
+    def __init__(self, channels: int, kernel_size: int = 3, dilation=(1, 3)):
+        super().__init__()
+        self.lrelu_slope = LRELU_SLOPE
+        ch = channels
+        ks = kernel_size
+        self.convs = nn.ModuleList(
+            [
+                weight_norm(Conv1d(ch, ch, ks, 1, get_padding(kernel_size, dilation[0]), dilation[0])),
+                weight_norm(Conv1d(ch, ch, ks, 1, get_padding(kernel_size, dilation[1]), dilation[1])),
+            ]
+        )
+        self.convs.apply(init_weights)
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        for conv in self.convs:
+            xt = F.leaky_relu(x, self.lrelu_slope)
+            xt = conv(xt)
+            x = xt + x
+        return x
+    def remove_weight_norm(self):
+        for layer in self.convs:
+            remove_weight_norm(layer)
+class Generator(nn.Module):
+    __constants__ = ["lrelu_slope", "num_kernels", "num_upsamples"]
+    def __init__(
+        self,
+        out_channels: int = 211,
+        upsample_rates=None,
+        upsample_kernel_sizes=None,
+        upsample_initial_channel: int = 512,
+        resblock: str = "1",
+        resblock_kernel_sizes=None,
+        resblock_dilation_sizes=None,
+    ):
+        super().__init__()
+        upsample_rates = list(upsample_rates or [5, 4, 2, 2])
+        upsample_kernel_sizes = list(upsample_kernel_sizes or [10, 8, 4, 4])
+        resblock_kernel_sizes = list(resblock_kernel_sizes or [11, 7, 3])
+        resblock_dilation_sizes = [list(d) for d in (resblock_dilation_sizes or [[1, 3, 5], [1, 3, 5], [1, 3, 5]])]
+        self.num_kernels = len(resblock_kernel_sizes)
+        self.num_upsamples = len(upsample_rates)
+        self.lrelu_slope = LRELU_SLOPE
+        self.conv_pre = weight_norm(Conv1d(out_channels, upsample_initial_channel, 7, 1, padding=3))
+        resblock_cls = ResBlock1 if resblock == "1" else ResBlock2
+        ups = []
+        for i, (rate, kernel) in enumerate(zip(upsample_rates, upsample_kernel_sizes)):
+            ups.append(
+                weight_norm(
+                    ConvTranspose1d(
+                        upsample_initial_channel // (2 ** i),
+                        upsample_initial_channel // (2 ** (i + 1)),
+                        kernel,
+                        rate,
+                        padding=(kernel - rate) // 2,
+                    )
+                )
+            )
+        self.ups = nn.Sequential(*ups)
+        resblocks = []
+        for i in range(len(self.ups)):
+            ch = upsample_initial_channel // (2 ** (i + 1))
+            resblocks.append(
+                nn.Sequential(
+                    *[
+                        resblock_cls(ch, kernel, dilation)
+                        for kernel, dilation in zip(resblock_kernel_sizes, resblock_dilation_sizes)
+                    ]
+                )
+            )
+        self.resblocks = nn.Sequential(*resblocks)
+        self.conv_post = weight_norm(Conv1d(ch, 1, 17, 1, padding=0))
+        self.ups.apply(init_weights)
+        self.conv_post.apply(init_weights)
+    def load_state_dict(self, state_dict, strict: bool = True):
+        new_state_dict = {}
+        for key, value in state_dict.items():
+            new_key = key
+            if "resblocks" in key:
+                parts = key.split(".")
+                if len(parts) == 5:
+                    layer = int(parts[1])
+                    new_key = f"resblocks.{layer // 3}.{layer % 3}.{'.'.join(parts[2:])}"
+            new_state_dict[new_key] = value
+        current_state = self.state_dict()
+        for key, value in list(new_state_dict.items()):
+            if key not in current_state:
+                continue
+            len_diff = value.dim() - current_state[key].dim()
+            if len_diff == -1:
+                new_state_dict[key] = value.unsqueeze(-1)
+            elif len_diff == 1:
+                new_state_dict[key] = value.squeeze(-1)
+        super().load_state_dict(new_state_dict, strict=strict)
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        x = self.conv_pre(x.permute(0, 2, 1))
+        for upsample_layer, resblock_group in zip(self.ups, self.resblocks):
+            x = F.leaky_relu(x, self.lrelu_slope)
+            x = upsample_layer(x)
+            xs = 0
+            for resblock in resblock_group:
+                xs = xs + resblock(x)
+            x = xs / self.num_kernels
+        x = F.leaky_relu(x)
+        x = self.conv_post(x)
+        return torch.tanh(x)
+    def remove_weight_norm(self):
+        for layer in self.ups:
+            remove_weight_norm(layer)
+        for group in self.resblocks:
+            for block in group:
+                block.remove_weight_norm()
+        remove_weight_norm(self.conv_pre)
+        remove_weight_norm(self.conv_post)
+class WavCoch(PreTrainedModel):
+    """Causal waveform-to-cochleagram tokenizer with optional vocoder."""
+    config_class = WavCochConfig
+    main_input_name = "wav"
+    def __init__(self, config: WavCochConfig):
+        super().__init__(config)
+        self.config = config
+        self.N = int(config.window_size)
+        self.hop_length = int(config.hop_length)
+        self.window_padding = int(getattr(config, "window_padding", self.N - self.hop_length))
+        self.causal_convs = bool(getattr(config, "causal_convs", True))
+        self.causal_pad_mode = getattr(config, "causal_pad_mode", "repeat")
+        out_bins = self.N // 2 + 1
+        self.conv_real_filters = nn.Conv1d(1, out_bins, kernel_size=self.N, stride=self.hop_length)
+        self.conv_imag_filters = nn.Conv1d(1, out_bins, kernel_size=self.N, stride=self.hop_length)
+        self._initialize_conv_filters()
+        self.encoder = self._build_conv_stack(
+            in_channels=out_bins,
+            out_channels=config.encoder_dim,
+            num_layers=config.encoder_layers,
+            kernel_size=config.encoder_kernel_size,
+            causal=self.causal_convs,
+        )
+        self.quantizer = FSQ(levels=list(config.channels), dim=config.encoder_dim)
+        self.decoder = self._build_conv_stack(
+            in_channels=config.decoder_dim,
+            out_channels=config.out_channels,
+            num_layers=config.decoder_layers,
+            kernel_size=config.decoder_kernel_size,
+            causal=self.causal_convs,
+        )
+        self.has_vocoder = bool(getattr(config, "has_vocoder", False))
+        if self.has_vocoder:
+            if int(config.out_channels) != 211:
+                raise ValueError("Bundled vocoder currently expects 211 cochleagram channels")
+            self.vocoder = Generator(
+                out_channels=config.out_channels,
+                upsample_rates=config.vocoder_upsample_rates,
+                upsample_kernel_sizes=config.vocoder_upsample_kernel_sizes,
+                upsample_initial_channel=config.vocoder_upsample_initial_channel,
+                resblock=config.vocoder_resblock,
+                resblock_kernel_sizes=config.vocoder_resblock_kernel_sizes,
+                resblock_dilation_sizes=config.vocoder_resblock_dilation_sizes,
+            )
+        else:
+            self.vocoder = None
+        self._vocab_size = int(config.vocab_size)
+        self.post_init()
+    def _build_conv_stack(
+        self,
+        in_channels: int,
+        out_channels: int,
+        num_layers: int,
+        kernel_size: int,
+        causal: bool,
+    ) -> nn.Sequential:
+        layers = []
+        for layer_idx in range(int(num_layers)):
+            input_channels = in_channels if layer_idx == 0 else out_channels
+            if causal:
+                conv = CausalConv1d(
+                    input_channels,
+                    out_channels,
+                    kernel_size=kernel_size,
+                    stride=1,
+                    pad_mode=self.causal_pad_mode,
+                )
+            else:
+                conv = nn.Conv1d(
+                    input_channels,
+                    out_channels,
+                    kernel_size=kernel_size,
+                    stride=1,
+                    padding=kernel_size // 2,
+                )
+            layers.extend([conv, nn.ReLU()])
+        return nn.Sequential(*layers)
+    def _compute_twiddle_factors(self):
+        n = torch.arange(self.N, dtype=torch.float32).unsqueeze(1)
+        k = torch.arange(self.N, dtype=torch.float32).unsqueeze(0)
+        angles = -2.0 * math.pi * n * k / float(self.N)
+        return torch.cos(angles), torch.sin(angles)
+    def _initialize_conv_filters(self):
+        with torch.no_grad():
+            cos_matrix, sin_matrix = self._compute_twiddle_factors()
+            cos_matrix = cos_matrix[: self.N // 2 + 1, :]
+            sin_matrix = sin_matrix[: self.N // 2 + 1, :]
+            window = torch.hann_window(self.N, periodic=True).view(1, 1, -1)
+            real_weights = (cos_matrix.unsqueeze(1) * window).to(dtype=self.conv_real_filters.weight.dtype)
+            imag_weights = (sin_matrix.unsqueeze(1) * window).to(dtype=self.conv_imag_filters.weight.dtype)
+            self.conv_real_filters.weight.copy_(real_weights)
+            self.conv_imag_filters.weight.copy_(imag_weights)
+        for param in self.conv_real_filters.parameters():
+            param.requires_grad_(False)
+        for param in self.conv_imag_filters.parameters():
+            param.requires_grad_(False)
+    def _normalize_sample_rate(self, sample_rate: Optional[int], sampling_rate: Optional[int]) -> int:
+        if sample_rate is not None and sampling_rate is not None and sample_rate != sampling_rate:
+            raise ValueError(f"sample_rate ({sample_rate}) and sampling_rate ({sampling_rate}) conflict")
+        resolved = int(sample_rate or sampling_rate or self.config.sample_rate)
+        if resolved != int(self.config.sample_rate):
+            raise ValueError(
+                f"WavCoch expects {self.config.sample_rate} Hz audio, but received {resolved} Hz"
+            )
+        return resolved
+    def _prepare_wav_batch(self, wav) -> torch.Tensor:
+        if isinstance(wav, list):
+            wav = [item if isinstance(item, torch.Tensor) else torch.tensor(item) for item in wav]
+            normalized = []
+            for item in wav:
+                if item.ndim == 1:
+                    normalized.append(item)
+                elif item.ndim == 2 and 1 in item.shape:
+                    normalized.append(item.reshape(-1))
+                else:
+                    raise ValueError(f"Unexpected list element shape {tuple(item.shape)}")
+            wav = torch.nn.utils.rnn.pad_sequence(normalized, batch_first=True).unsqueeze(1)
+        elif isinstance(wav, torch.Tensor):
+            if wav.ndim == 1:
+                wav = wav.unsqueeze(0).unsqueeze(0)
+            elif wav.ndim == 2:
+                wav = wav.unsqueeze(1)
+            elif wav.ndim != 3:
+                raise ValueError(f"Unexpected tensor shape {tuple(wav.shape)}, expected 1D, 2D or 3D")
+        else:
+            raise TypeError(f"Unsupported input type: {type(wav)}")
+        return wav.to(dtype=torch.float32)
+    @property
+    def vocab_size(self) -> int:
+        return self._vocab_size
+    def forward(
+        self,
+        wav: torch.Tensor,
+        coch: Optional[torch.Tensor] = None,
+        return_tensors: str = "pt",
+        sample_rate: Optional[int] = None,
+        sampling_rate: Optional[int] = None,
+        pad: bool = True,
+    ):
+        del return_tensors  # unused, kept for tokenizer-like API compatibility
+        self._normalize_sample_rate(sample_rate, sampling_rate)
+        wav = self._prepare_wav_batch(wav)
+        if coch is None:
+            codes = self.quantize(wav, pad=pad)
+            return BatchEncoding({"input_values": codes, "input_ids": codes})
+        if pad:
+            wav = F.pad(wav, (self.window_padding, 0), mode="constant", value=0.0)
+        with torch.no_grad():
+            real_part = self.conv_real_filters(wav)
+            imag_part = self.conv_imag_filters(wav)
+        x = real_part + imag_part
+        x = self.encoder(x).permute(0, 2, 1)
+        quantized, _ = self.quantizer(x)
+        pred_coch = self.decoder(quantized.permute(0, 2, 1)).permute(0, 2, 1)
+        loss = F.l1_loss(pred_coch, coch)
+        return pred_coch, loss, None
+    @torch.no_grad()
+    def quantize(self, wav: torch.Tensor, pad: bool = True) -> torch.Tensor:
+        wav = self._prepare_wav_batch(wav)
+        if pad:
+            wav = F.pad(wav, (self.window_padding, 0), mode="constant", value=0.0)
+        real_part = self.conv_real_filters(wav)
+        imag_part = self.conv_imag_filters(wav)
+        x = real_part + imag_part
+        x = self.encoder(x).permute(0, 2, 1)
+        _, indices = self.quantizer(x)
+        return indices.long()
+    @torch.no_grad()
+    def decode(self, indices: torch.Tensor) -> torch.Tensor:
+        if indices.ndim == 1:
+            indices = indices.unsqueeze(0)
+        emb = self.quantizer.indices_to_codes(indices.long())
+        return self.decoder(emb.permute(0, 2, 1)).permute(0, 2, 1)
+    @torch.no_grad()
+    def wav2coch(self, wav: torch.Tensor, pad: bool = True) -> torch.Tensor:
+        wav = self._prepare_wav_batch(wav)
+        if pad:
+            wav = F.pad(wav, (self.window_padding, 0), mode="constant", value=0.0)
+        real_part = self.conv_real_filters(wav)
+        imag_part = self.conv_imag_filters(wav)
+        x = real_part + imag_part
+        x = self.encoder(x).permute(0, 2, 1)
+        quantized, _ = self.quantizer(x)
+        return self.decoder(quantized.permute(0, 2, 1)).permute(0, 2, 1)
+    @torch.no_grad()
+    def vocode(self, coch: torch.Tensor) -> torch.Tensor:
+        if self.vocoder is None:
+            raise ValueError("This WavCoch checkpoint does not include a bundled vocoder")
+        if coch.ndim == 2:
+            coch = coch.unsqueeze(0)
+        elif coch.ndim != 3:
+            raise ValueError(f"Unexpected cochleagram shape {tuple(coch.shape)}")
+        if coch.shape[-1] != self.config.out_channels and coch.shape[1] == self.config.out_channels:
+            coch = coch.transpose(1, 2)
+        return self.vocoder(coch)
+    @torch.no_grad()
+    def decode_audio(self, indices: torch.Tensor) -> torch.Tensor:
+        return self.vocode(self.decode(indices))