Update modeling_wavtokenizer.py

modeling_wavtokenizer.py CHANGED (+451 −617)

@@ -1,611 +1,541 @@
-"""
-WavTokenizer
-
-This is an acoustic discrete codec tokenizer for audio language modeling.
-All dependencies are included to avoid external imports.
-
-The architecture follows the original WavTokenizer implementation:
-- Encoder: Strided convolutions for audio compression
-- VQ: Vector quantization with single codebook
-- Decoder: Vocos-style backbone with ConvNeXt blocks + iSTFT head
-
-Reference: https://github.com/jishengpeng/WavTokenizer
-Paper: "WavTokenizer: an Efficient Acoustic Discrete Codec Tokenizer for Audio Language Modeling"
-"""
-
-import math
-from typing import List, Tuple
-from dataclasses import dataclass
-
-import torch
-import torch.nn as nn
-import torch.nn.functional as F
-from torch import Tensor
-from torch.nn.utils import weight_norm, remove_weight_norm
-
-from transformers import PreTrainedModel
-from transformers.tokenization_utils_base import BatchEncoding
-
-from .configuration_wavtokenizer import WavTokenizerConfig
-
-
-# ==============================================================================
-# Audio Utilities
-# ==============================================================================
-
-def convert_audio(wav: Tensor, sr: int, target_sr: int, target_channels: int) -> Tensor:
-    """
-    Convert audio to target sample rate and number of channels.
-
-    Args:
-        wav: Input waveform [C, T] or [T]
-        sr: Source sample rate
-        target_sr: Target sample rate
-        target_channels: Target number of channels (1 for mono, 2 for stereo)
-
-    Returns:
-        Converted waveform [target_channels, T']
-    """
-    import torchaudio
-
-    # Ensure 2D
-    if wav.dim() == 1:
-        wav = wav.unsqueeze(0)
-
-    # Convert channels
-    if wav.size(0) > target_channels:
-        wav = wav[:target_channels]
-    elif wav.size(0) < target_channels:
-        wav = wav.expand(target_channels, -1)
-
-    # Resample if needed
-    if sr != target_sr:
-        wav = torchaudio.functional.resample(wav, sr, target_sr)
-
-    return wav
-
-
-# ==============================================================================
-# Basic Building Blocks
-# ==============================================================================
-
-def WNConv1d(*args, **kwargs):
-    """Weight-normalized Conv1d."""
-    return weight_norm(nn.Conv1d(*args, **kwargs))
-
-
-def WNConvTranspose1d(*args, **kwargs):
-    """Weight-normalized ConvTranspose1d."""
-    return weight_norm(nn.ConvTranspose1d(*args, **kwargs))
-
-
-class ResidualUnit(nn.Module):
-    """Residual unit with dilated convolutions."""
-
-    def __init__(self, dim: int = 16, dilation: int = 1):
-        super().__init__()
-        pad = ((7 - 1) * dilation) // 2
-        self.block = nn.Sequential(
-            nn.ELU(),
-            WNConv1d(dim, dim, kernel_size=7, dilation=dilation, padding=pad),
-            nn.ELU(),
-            WNConv1d(dim, dim, kernel_size=1),
-        )
-
-    def forward(self, x: Tensor) -> Tensor:
-        return x + self.block(x)
-
-
-class EncoderBlock(nn.Module):
-    """
-    Encoder block: residual units followed by a strided downsampling convolution.
-    """
-
-    def __init__(self, dim: int = 16, stride: int = 1):
-        super().__init__()
-        self.block = nn.Sequential(
-            ResidualUnit(dim // 2, dilation=1),
-            ResidualUnit(dim // 2, dilation=3),
-            ResidualUnit(dim // 2, dilation=9),
-            nn.ELU(),
-            WNConv1d(
-                dim // 2, dim,
-                kernel_size=2 * stride,
-                stride=stride,
-                padding=math.ceil(stride / 2),
-            ),
-        )
-
-    def forward(self, x: Tensor) -> Tensor:
-        return self.block(x)
-
-
-class Encoder(nn.Module):
-    """
-    Strided convolutional encoder that compresses audio into latent vectors.
-    """
-
-    def __init__(
-        self,
-        d_model: int = 64,
-        strides: List[int] = [8, 5, 4, 2],
-        d_latent: int = 512,
-    ):
-        super().__init__()
-
-        # First convolution
-        self.block = [WNConv1d(1, d_model, kernel_size=7, padding=3)]
-
-        # Downsampling blocks
-        for stride in strides:
-            d_model *= 2
-            self.block.append(EncoderBlock(d_model, stride=stride))
-
-        # Final convolution
-        self.block += [
-            nn.ELU(),
-            WNConv1d(d_model, d_latent, kernel_size=3, padding=1),
-        ]
-
-        self.block = nn.Sequential(*self.block)
-        self.enc_dim = d_model
-
-    def forward(self, x: Tensor) -> Tensor:
-        return self.block(x)
-
-
-# ==============================================================================
-# Quantization
-# ==============================================================================
-
-class VectorQuantize(nn.Module):
-    """
-    Vector quantization with a single codebook.
-
-    Uses L2-normalized codes for better stability.
-    """
-
-    def __init__(
-        self,
-        input_dim: int,
-        codebook_size: int,
-        codebook_dim: int,
-        commitment: float = 0.25,
-    ):
-        super().__init__()
-
-        self.input_dim = input_dim
-        self.codebook_size = codebook_size
-        self.codebook_dim = codebook_dim
-        self.commitment = commitment
-
-        # Projections
-        requires_projection = input_dim != codebook_dim
-        self.project_in = nn.Linear(input_dim, codebook_dim) if requires_projection else nn.Identity()
-        self.project_out = nn.Linear(codebook_dim, input_dim) if requires_projection else nn.Identity()
-
-        # Codebook
-        self.codebook = nn.Embedding(codebook_size, codebook_dim)
-        nn.init.uniform_(self.codebook.weight, -1.0 / codebook_size, 1.0 / codebook_size)
-
-    def forward(self, z: Tensor) -> Tuple[Tensor, Tensor, Tensor]:
-        """
-        Forward pass.
-
-        Args:
-            z: Input latents [B, D, T]
-
-        Returns:
-            z_q: Quantized latents [B, D, T]
-            commitment_loss: Commitment loss
-            indices: Codes [B, T]
-        """
-        # [B, D, T] -> [B, T, D]
-        z = z.transpose(1, 2)
-        z_e = self.project_in(z)
-
-        # L2 normalize
-        z_e_norm = F.normalize(z_e, dim=-1)
-        codebook_norm = F.normalize(self.codebook.weight, dim=-1)
-
-        # Find nearest codes
-        dist = (
-            z_e_norm.pow(2).sum(-1, keepdim=True)
-            + codebook_norm.pow(2).sum(-1)
-            - 2 * torch.einsum('btd,kd->btk', z_e_norm, codebook_norm)
-        )
-        indices = dist.argmin(dim=-1)
-
-        # Quantize
-        z_q = F.embedding(indices, codebook_norm)
-
-        # Commitment loss
-        commitment_loss = self.commitment * F.mse_loss(z_e_norm, z_q.detach())
-
-        # Straight-through
-        z_q = z_e_norm + (z_q - z_e_norm).detach()
-
-        z_q = self.project_out(z_q)
-        z_q = z_q.transpose(1, 2)  # [B, D, T]
-
-        return z_q, commitment_loss, indices
-
-    def decode(self, indices: Tensor) -> Tensor:
-        """Decode indices to continuous vectors."""
-        codebook = F.normalize(self.codebook.weight, dim=-1)
-        z_q = F.embedding(indices, codebook)
-        z_q = self.project_out(z_q)
-        return z_q.transpose(1, 2)
-
-
-class ResidualVectorQuantize(nn.Module):
-    """
-    Residual vector quantization with one or more codebooks.
-    """
-
-    def __init__(
-        self,
-        input_dim: int,
-        codebook_size: int,
-        codebook_dim: int,
-        num_quantizers: int = 1,
-        commitment: float = 0.25,
-    ):
-        super().__init__()
-
-        self.num_quantizers = num_quantizers
-        self.quantizers = nn.ModuleList([
-            VectorQuantize(input_dim, codebook_size, codebook_dim, commitment)
-            for _ in range(num_quantizers)
-        ])
-
-    def forward(self, z: Tensor) -> Tuple[Tensor, Tensor, Tensor]:
-        """
-        Quantize input, one residual stage per codebook.
-        """
-        residual = z
-        z_q = torch.zeros_like(z)
-        all_indices = []
-        all_losses = []
-
-        for quantizer in self.quantizers:
-            z_q_i, loss, indices_i = quantizer(residual)
-            residual = residual - z_q_i
-            z_q = z_q + z_q_i
-            all_indices.append(indices_i)
-            all_losses.append(loss)
-
-        codes = torch.stack(all_indices, dim=0)  # [N_q, B, T]
-        loss = torch.stack(all_losses).mean()
-
-        return z_q, loss, codes
-
-    def decode(self, codes: Tensor) -> Tensor:
-        """
-        Decode codes [N_q, B, T] back to continuous features.
-        """
-        z_q = None
-        for i, quantizer in enumerate(self.quantizers):
-            _z_q = quantizer.decode(codes[i])
-            z_q = _z_q if z_q is None else z_q + _z_q
-
-        return z_q
-
-
-# ==============================================================================
-# Decoder Components (Vocos-style)
-# ==============================================================================
-
-class ConvNeXtBlock(nn.Module):
-    """
-    ConvNeXt block adapted for 1D audio.
-    """
-
-    def __init__(
-        self,
-        dim: int,
-        intermediate_dim: int,
-        kernel_size: int = 7,
-        layer_scale_init_value: float = 1e-6,
-    ):
-        super().__init__()
-
-        padding = (kernel_size - 1) // 2
-        self.dwconv = nn.Conv1d(dim, dim, kernel_size, padding=padding, groups=dim)
-        self.norm = nn.LayerNorm(dim, eps=1e-6)
-        self.pwconv1 = nn.Linear(dim, intermediate_dim)
-        self.act = nn.GELU()
-        self.pwconv2 = nn.Linear(intermediate_dim, dim)
-
-        self.gamma = nn.Parameter(
-            layer_scale_init_value * torch.ones(dim)
-        ) if layer_scale_init_value > 0 else None
-
-    def forward(self, x: Tensor) -> Tensor:
-        residual = x
-        x = self.dwconv(x)
-        x = x.transpose(1, 2)  # [B, T, D]
-        x = self.norm(x)
-        x = self.pwconv1(x)
-        x = self.act(x)
-        x = self.pwconv2(x)
-        if self.gamma is not None:
-            x = self.gamma * x
-        x = x.transpose(1, 2)  # [B, D, T]
-        return residual + x
-
-
-class VocosBackbone(nn.Module):
-    """
-    Vocos backbone: ConvNeXt blocks with optional self-attention layers.
-    """
-
-    def __init__(
-        self,
-        input_dim: int,
-        dim: int,
-        intermediate_dim: int,
-        num_blocks: int,
-        layer_scale_init_value: float = 1e-6,
-        use_attention: bool = True,
-        num_heads: int = 8,
-        num_attention_layers: int = 1,
-    ):
-        super().__init__()
-
-        # Input projection
-        self.embed = nn.Conv1d(input_dim, dim, kernel_size=7, padding=3)
-        self.norm = nn.LayerNorm(dim)
-
-        # Attention layers
-        self.use_attention = use_attention
-        if use_attention:
-            self.attention = nn.ModuleList([
-                nn.MultiheadAttention(dim, num_heads, batch_first=True)
-                for _ in range(num_attention_layers)
-            ])
-            self.attn_norms = nn.ModuleList([
-                nn.LayerNorm(dim) for _ in range(num_attention_layers)
-            ])
-
-        # ConvNeXt blocks
-        self.convnext = nn.ModuleList([
-            ConvNeXtBlock(dim, intermediate_dim, layer_scale_init_value=layer_scale_init_value)
-            for _ in range(num_blocks)
-        ])
-
-        self.final_layer_norm = nn.LayerNorm(dim, eps=1e-6)
-
-    def forward(self, x: Tensor) -> Tensor:
-        # Input projection
-        x = self.embed(x)
-        x = x.transpose(1, 2)  # [B, T, D]
-        x = self.norm(x)
-        x = x.transpose(1, 2)  # [B, D, T]
-
-        # Attention
-        if self.use_attention:
-            for attn, norm in zip(self.attention, self.attn_norms):
-                x_t = x.transpose(1, 2)  # [B, T, D]
-                residual = x_t
-                x_t = norm(x_t)
-                x_t, _ = attn(x_t, x_t, x_t)
-                x_t = residual + x_t
-                x = x_t.transpose(1, 2)  # [B, D, T]
-
-        # ConvNeXt blocks
-        for block in self.convnext:
-            x = block(x)
-
-        # Final norm
-        x = x.transpose(1, 2)
-        x = self.final_layer_norm(x)
-        x = x.transpose(1, 2)
-
-        return x
-
-
-class ISTFTHead(nn.Module):
-    """
-    iSTFT head: projects features to a complex spectrum and inverts it.
-    """
-
-    def __init__(
-        self,
-        dim: int,
-        n_fft: int,
-        hop_length: int,
-        padding: str = "center",
-    ):
-        super().__init__()
-
-        self.n_fft = n_fft
-        self.hop_length = hop_length
-        self.padding = padding
-
-        self.out_dim = n_fft // 2 + 1
-        self.proj = nn.Conv1d(dim, self.out_dim * 2, kernel_size=1)
-
-        # Register window buffer
-        self.register_buffer(
-            "window",
-            torch.hann_window(n_fft),
-            persistent=False
-        )
-
-    def forward(self, x: Tensor) -> Tensor:
-        """
-        Args:
-            x: [B, D, T]
-        Returns:
-            wav: [B, 1, T']
-        """
-        x = self.proj(x)
-
-        # Split mag/phase
-        mag, phase = x.chunk(2, dim=1)
-
-        # Process
-        mag = torch.exp(mag)
-        phase = torch.sin(phase)
-
-        # Complex spectrum
-        S = torch.complex(mag * torch.cos(phase * math.pi), mag * torch.sin(phase * math.pi))
-
-        # Ensure window is on same device
-        window = self.window.to(x.device)
-
-        # iSTFT
-        wav = torch.istft(
-            S,
-            n_fft=self.n_fft,
-            hop_length=self.hop_length,
-            window=window,
-            center=True,
-            normalized=False,
-            onesided=True,
-            return_complex=False,
-        )
-
-        return wav.unsqueeze(1)
-
-
-# ==============================================================================
-# Feature Extraction
-# ==============================================================================
-
-class MelSpectrogramFeatures(nn.Module):
-    """Extract mel spectrogram features from audio."""
-
-    def __init__(
-        self,
-        sample_rate: int,
-        n_fft: int,
-        hop_length: int,
-        n_mels: int,
-        f_min: float = 0.0,
-        f_max: float = None,
-        padding: str = "center",
-    ):
-        super().__init__()
-
-        self.sample_rate = sample_rate
-        self.n_fft = n_fft
-        self.hop_length = hop_length
-        self.n_mels = n_mels
-        self.padding = padding
-
-        # Mel filterbank
-        import torchaudio
-        mel_fb = torchaudio.functional.melscale_fbanks(
-            n_freqs=n_fft // 2 + 1,
-            f_min=f_min,
-            f_max=f_max or sample_rate // 2,
-            n_mels=n_mels,
-            sample_rate=sample_rate,
-            norm="slaney",
-            mel_scale="slaney",
-        )
-        self.register_buffer("mel_fb", mel_fb, persistent=False)
-        self.register_buffer("window", torch.hann_window(n_fft), persistent=False)
-
-    def forward(self, wav: Tensor) -> Tensor:
-        """
-        Args:
-            wav: [B, T]
-        Returns:
-            mel: [B, n_mels, T']
-        """
-        stft = torch.stft(
-            wav,
-            n_fft=self.n_fft,
-            hop_length=self.hop_length,
-            window=self.window,
-            center=True,
-            return_complex=True,
-        )
-
-        # Power spectrum
-        power = stft.abs().pow(2)
-
-        # Mel spectrogram
-        mel = torch.matmul(self.mel_fb.T.to(power.device), power)
-
-        # Log scale
-        mel = torch.log(mel.clamp(min=1e-5))
-
-        return mel
-
-
-# ==============================================================================
-# Main WavTokenizer Model
-# ==============================================================================
-
-class WavTokenizer(PreTrainedModel):
-    """
-    WavTokenizer model.
-
-    Architecture:
-    - Encoder: Strided convolutions for audio compression
-    - VQ: Single-codebook vector quantization (4096 codes)
-    - Decoder: Vocos backbone (ConvNeXt + attention) + iSTFT head
-
-    Usage:
-    ```python
-    model = WavTokenizer.from_pretrained("TuKoResearch/WavTokenizerSmall", trust_remote_code=True)
-
-    # Encode
-    features, codes = model.encode_infer(wav, bandwidth_id=torch.tensor([0]))
-
-    # Decode
-    wav_out = model.decode(features, bandwidth_id=torch.tensor([0]))
-
-    # Or use codes directly
-    features = model.codes_to_features(codes)
-    wav_out = model.decode(features, bandwidth_id=torch.tensor([0]))
-    ```
-    """
-
-    config_class = WavTokenizerConfig
-
-    def __init__(self, config: WavTokenizerConfig):
-        super().__init__(config)
-
-        # Encoder
-        self.encoder = Encoder(
-            d_model=config.encoder_dim,
-            strides=config.encoder_rates,
-            d_latent=config.latent_dim,
-        )
-
-        # Quantizer
-        self.quantizer = ResidualVectorQuantize(
-            input_dim=config.latent_dim,
-            codebook_size=config.codebook_size,
-            codebook_dim=config.codebook_dim,
-            num_quantizers=config.num_quantizers,
-        )
-
-        # Decoder backbone
-        self.backbone = VocosBackbone(
-            input_dim=config.backbone_dim,
-            dim=config.backbone_dim,
-            intermediate_dim=config.backbone_intermediate_dim,
-            num_blocks=config.backbone_num_blocks,
-            layer_scale_init_value=config.backbone_layer_scale_init_value,
-            use_attention=config.use_attention,
-            num_heads=config.attention_heads,
-            num_attention_layers=config.attention_layers,
-        )
-
-        # iSTFT head
-        self.head = ISTFTHead(
-            dim=config.backbone_dim,
-            n_fft=config.n_fft,
@@ -613,201 +543,105 @@ class WavTokenizer(PreTrainedModel):
-            padding=config.padding,
-        )
-
-        # Bandwidth embedding
-        self.bandwidth_emb = nn.Embedding(4, config.backbone_dim)
-
-        self.post_init()
-
-    @property
-    def vocab_size(self) -> int:
-        return self.config.codebook_size
-
-    @property
-    def frame_rate(self) -> float:
-        return self.config.sample_rate / self.config.hop_length
-
-    def encode(
-        self, wav: Tensor, bandwidth_id: Tensor = None
-    ) -> Tuple[Tensor, Tensor, Tensor]:
-        """
-        Encode audio to quantized features and codes.
-
-        Args:
-            wav: Input waveform [B, T]
-            bandwidth_id: Optional bandwidth index
-
-        Returns:
-            z_q: Quantized features [B, D, T']
-            loss: Commitment loss
-            codes: Discrete codes [N_q, B, T']
-        """
-        if wav.dim() == 2:
-            wav = wav.unsqueeze(1)
-
-        z = self.encoder(wav)
-        z_q, loss, codes = self.quantizer(z)
-
-        return z_q, loss, codes
-
-    def encode_infer(
-        self, wav: Tensor, bandwidth_id: Tensor = None
-    ) -> Tuple[Tensor, Tensor]:
-        """
-        Encode audio for inference.
-
-        Args:
-            wav: Input waveform [B, T]
-            bandwidth_id: Optional bandwidth index
-
-        Returns:
-            features: Quantized features [B, D, T']
-            codes: Discrete codes [B, T']
-        """
-        if wav.dim() == 2:
-            wav = wav.unsqueeze(1)  # [B, T] -> [B, 1, T]
-
-        z = self.encoder(wav)
-        z_q, _, codes = self.quantizer(z)
-
-        # Squeeze for single quantizer
-        if codes.size(0) == 1:
-            codes = codes.squeeze(0)
-
-        return z_q, codes
-
-    def decode(
-        self, features: Tensor, bandwidth_id: Tensor = None
-    ) -> Tensor:
-        """
-        Decode features to a waveform.
-
-        Args:
-            features: Quantized features [B, D, T']
-            bandwidth_id: Optional bandwidth index
-
-        Returns:
-            wav: Reconstructed waveform [B, 1, T]
-        """
-        x = features
-
-        if bandwidth_id is not None:
-            bw_emb = self.bandwidth_emb(bandwidth_id)
-            x = x + bw_emb.unsqueeze(-1)
-
-        x = self.backbone(x)
-        wav = self.head(x)
-
-        return wav
-
-    def codes_to_features(self, codes: Tensor) -> Tensor:
-        """
-        Convert codes to features.
-
-        Args:
-            codes: Discrete codes [B, T] or [N_q, B, T]
-
-        Returns:
-            features: Continuous features [B, D, T]
-        """
-        if codes.dim() == 2:
-            codes = codes.unsqueeze(0)
-        return self.quantizer.decode(codes)
-
-    def forward(
-        self,
-        wav: Tensor = None,
-        codes: Tensor = None,
-        bandwidth_id: Tensor = None,
-        **kwargs
-    ):
-        """
-        Forward pass.
-
-        If wav provided: encode to get tokens
-        If codes provided: decode to get wav
-        """
-        if wav is not None:
-            features, codes = self.encode_infer(wav, bandwidth_id)
-            return BatchEncoding({
-                "input_values": features,
-                "input_ids": codes,
-            })
-        elif codes is not None:
-            features = self.codes_to_features(codes)
-            return self.decode(features, bandwidth_id)
-        else:
-            raise ValueError("Provide either 'wav' or 'codes'")
-
-    @classmethod
-    def from_pretrained0802(
-        cls,
-        config_path: str,
-        checkpoint_path: str,
-        device: str = "cpu",
-    ) -> "WavTokenizer":
-        """
-        Load from an original WavTokenizer YAML config and checkpoint.
-
-        Args:
-            config_path: Path to the original YAML config
-            checkpoint_path: Path to the original checkpoint
-            device: Device to load the checkpoint on
-
-        Returns:
-            Loaded WavTokenizer model
-        """
-        import yaml
-
-        with open(config_path) as f:
-            model_args = yaml.safe_load(f).get('model', {}).get('init_args', {})
-
-        config = WavTokenizerConfig(
-            codebook_size=model_args.get('quantizer', {}).get('init_args', {}).get('codebook_size', 4096),
-            codebook_dim=model_args.get('quantizer', {}).get('init_args', {}).get('codebook_dim', 8),
-            num_quantizers=model_args.get('quantizer', {}).get('init_args', {}).get('num_quantizers', 1),
-            use_attention=True,
-            attention_dim=model_args.get('backbone', {}).get('init_args', {}).get('dim', 512),
-            attention_heads=8,
-            attention_layers=1,
-        )
-
-        # Create model
-        model = cls(config)
-
-        # Load checkpoint
-        ckpt = torch.load(checkpoint_path, map_location=device)
-        state_dict = ckpt.get('state_dict', ckpt)
-
-        # Clean state dict
-        new_state_dict = {}
-        for k, v in state_dict.items():
-            # Remove 'model.' prefix if present
-            if k.startswith('model.'):
-                k = k[6:]
-            new_state_dict[k] = v
-
-        # Load (non-strict to handle mismatches)
-        missing, unexpected = model.load_state_dict(new_state_dict, strict=False)
-
-        if missing:
-            print(f"Missing keys: {len(missing)}")
-        if unexpected:
-            print(f"Unexpected keys: {len(unexpected)}")
-
-        return model
+"""
+WavTokenizer model implementation for HuggingFace.
+
+This implementation exactly matches the checkpoint structure for direct weight loading.
+"""
+
+import math
+from typing import Optional, Tuple, Union
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from transformers import PreTrainedModel
+from transformers.modeling_outputs import BaseModelOutput
+
+from .configuration_wavtokenizer import WavTokenizerConfig
+
+
+# =============================================================================
+# Audio Utilities
+# =============================================================================
+
+def convert_audio(wav, sr, target_sr, target_channels=1):
+    """Convert audio to target sample rate and channels."""
+    if wav.dim() == 1:
+        wav = wav.unsqueeze(0).unsqueeze(0)
+    elif wav.dim() == 2:
+        wav = wav.unsqueeze(1)
+
+    if wav.shape[1] > target_channels:
+        wav = wav[:, :target_channels, :]
+    elif wav.shape[1] < target_channels:
+        wav = wav.repeat(1, target_channels, 1)
+
+    if sr != target_sr:
+        wav = F.interpolate(wav, size=int(wav.shape[-1] * target_sr / sr), mode='linear', align_corners=False)
+
+    return wav
+
+
+# =============================================================================
+# Weight-Normalized Conv1d (matching checkpoint's weight_g/weight_v structure)
+# =============================================================================
+
+class WNConv1d(nn.Module):
+    """Weight-normalized Conv1d matching checkpoint structure with weight_g/weight_v."""
+    def __init__(self, in_channels, out_channels, kernel_size, stride=1, padding=0, dilation=1, groups=1, bias=True):
+        super().__init__()
+        self.conv = nn.utils.weight_norm(
+            nn.Conv1d(in_channels, out_channels, kernel_size, stride, padding, dilation, groups, bias)
+        )
+
+    def forward(self, x):
+        return self.conv(x)
+
+
+class WNConvTranspose1d(nn.Module):
+    """Weight-normalized ConvTranspose1d."""
+    def __init__(self, in_channels, out_channels, kernel_size, stride=1, padding=0, output_padding=0, groups=1, bias=True):
+        super().__init__()
+        self.convtr = nn.utils.weight_norm(
+            nn.ConvTranspose1d(in_channels, out_channels, kernel_size, stride, padding, output_padding, groups, bias)
+        )
+
+    def forward(self, x):
+        return self.convtr(x)
+
+
+# =============================================================================
+# Encoder (EnCodec-style, matching feature_extractor.encodec.encoder.model.*)
+# =============================================================================
+
+class _ConvWrapper(nn.Module):
+    """Wrapper to match checkpoint structure: conv.conv.weight_g, conv.conv.weight_v, conv.conv.bias"""
+    def __init__(self, in_ch, out_ch, kernel_size, stride=1, padding=0):
+        super().__init__()
+        self.conv = WNConv1d(in_ch, out_ch, kernel_size, stride=stride, padding=padding)
+
+    def forward(self, x):
+        return self.conv(x)
+
+
+class _ResBlockWrapper(nn.Module):
+    """Wrapper to match checkpoint structure: block.1.conv.conv, block.3.conv.conv, shortcut.conv.conv"""
+    def __init__(self, dim):
+        super().__init__()
+        self.block = nn.Sequential()
+        self.block.add_module('0', nn.ELU())
+        self.block.add_module('1', _ConvWrapper(dim, dim // 2, 3, padding=1))
+        self.block.add_module('2', nn.ELU())
+        self.block.add_module('3', _ConvWrapper(dim // 2, dim, 1))
+        self.shortcut = _ConvWrapper(dim, dim, 1)
+
+    def forward(self, x):
+        return self.shortcut(x) + self.block(x)
+
+
+class _LSTMWrapper(nn.Module):
+    """LSTM wrapper matching checkpoint: lstm.weight_ih_l0, etc."""
+    def __init__(self, dim, num_layers=2):
+        super().__init__()
+        self.lstm = nn.LSTM(dim, dim, num_layers=num_layers, batch_first=True)
+
+    def forward(self, x):
+        x = x.transpose(1, 2)
+        y, _ = self.lstm(x)
+        y = y + x
+        return y.transpose(1, 2)
+
+
+class EncoderModel(nn.Module):
+    """
+    Encoder matching checkpoint: feature_extractor.encodec.encoder.model.*
+
+    Structure based on checkpoint:
+    - model.0: initial conv (1 -> 32)
+    - model.1: residual block (32)
+    - model.2: ELU (not saved)
+    - model.3: downsample conv (32->64, stride=2)
+    - model.4: residual block (64)
+    - model.5: ELU
+    - model.6: downsample conv (64->128, stride=4)
+    - model.7: residual block (128)
+    - model.8: ELU
+    - model.9: downsample conv (128->256, stride=5)
+    - model.10: residual block (256)
+    - model.11: ELU
+    - model.12: downsample conv (256->512, stride=8)
+    - model.13: LSTM
+    - model.14: ELU
+    - model.15: output conv (512->512)
+    """
+    def __init__(self, channels=1, n_filters=32, dimension=512, ratios=[2, 4, 5, 8]):
+        super().__init__()
+
+        layers = []
+
+        # model.0: Initial conv
+        layers.append(_ConvWrapper(channels, n_filters, 7, padding=3))
+
+        # Encoder blocks with downsampling
+        in_ch = n_filters
+        for ratio in ratios:
+            out_ch = in_ch * 2
+            # Residual block
+            layers.append(_ResBlockWrapper(in_ch))
+            # ELU (implicit in original, but we need it)
+            layers.append(nn.ELU())
+            # Downsample conv
+            layers.append(_ConvWrapper(in_ch, out_ch, ratio * 2, stride=ratio, padding=ratio // 2))
+            in_ch = out_ch
+
+        # LSTM
+        layers.append(_LSTMWrapper(in_ch))
+
+        # ELU
+        layers.append(nn.ELU())
+
+        # Output conv
+        layers.append(_ConvWrapper(in_ch, dimension, 7, padding=3))
+
+        self.model = nn.Sequential(*layers)
+
+    def forward(self, x):
+        return self.model(x)
+
+
+# =============================================================================
+# Quantizer (matching feature_extractor.encodec.quantizer.vq.layers.0._codebook.*)
+# =============================================================================
+
+class Codebook(nn.Module):
+    """Codebook matching checkpoint: _codebook.embed, _codebook.inited, _codebook.cluster_size, _codebook.embed_avg"""
+    def __init__(self, num_embeddings, embedding_dim):
+        super().__init__()
+        # These match checkpoint structure exactly
+        self.register_buffer('inited', torch.zeros(1))
+        self.register_buffer('cluster_size', torch.zeros(num_embeddings))
+        self.register_buffer('embed', torch.randn(num_embeddings, embedding_dim))
+        self.register_buffer('embed_avg', torch.randn(num_embeddings, embedding_dim))
+
+    def forward(self, x):
+        """
+        Args:
+            x: (B, T, D) input
+        Returns:
+            quantized: (B, T, D) quantized output
+            indices: (B, T) codebook indices
+        """
+        # L2 normalize
+        embed = F.normalize(self.embed, dim=-1)
+        x_norm = F.normalize(x, dim=-1)
+
+        # Find nearest
+        dist = torch.cdist(x_norm, embed)
+        indices = dist.argmin(dim=-1)
+
+        # Quantize
+        quantized = F.embedding(indices, embed)
+
+        # Straight-through
+        quantized = x_norm + (quantized - x_norm).detach()
+
+        return quantized, indices
+
+    def decode(self, indices):
+        embed = F.normalize(self.embed, dim=-1)
+        return F.embedding(indices, embed)
+
+
+class VQLayer(nn.Module):
+    """VQ layer matching checkpoint: vq.layers.0._codebook.*"""
+    def __init__(self, dim, codebook_size):
+        super().__init__()
+        self._codebook = Codebook(codebook_size, dim)
+
+    def forward(self, x):
+        # x: (B, D, T)
+        x = x.transpose(1, 2)  # (B, T, D)
+        quantized, indices = self._codebook(x)
+        return quantized.transpose(1, 2), indices
+
+    def decode(self, indices):
+        quantized = self._codebook.decode(indices)
+        return quantized.transpose(1, 2)
+
+
+class VQ(nn.Module):
+    """VQ wrapper matching checkpoint: vq.layers"""
+    def __init__(self, dim, codebook_size, num_quantizers=1):
+        super().__init__()
+        self.layers = nn.ModuleList([
+            VQLayer(dim, codebook_size) for _ in range(num_quantizers)
+        ])
+
+    def forward(self, x):
+        indices_list = []
+        quantized = torch.zeros_like(x)
+        residual = x
+
+        for layer in self.layers:
+            q, idx = layer(residual)
+            residual = residual - q
+            quantized = quantized + q
+            indices_list.append(idx)
+
+        indices = torch.stack(indices_list, dim=1)
+        return quantized, indices
+
+    def decode(self, indices):
+        quantized = None
+        for i, layer in enumerate(self.layers):
+            q = layer.decode(indices[:, i])
+            quantized = q if quantized is None else quantized + q
+        return quantized
+
+
+class Quantizer(nn.Module):
+    """Quantizer matching checkpoint: quantizer.vq"""
+    def __init__(self, dim, codebook_size, num_quantizers=1):
+        super().__init__()
+        self.vq = VQ(dim, codebook_size, num_quantizers)
+
+    def forward(self, x):
+        return self.vq(x)
+
+    def decode(self, indices):
+        return self.vq.decode(indices)
+
+
+class EnCodecWrapper(nn.Module):
+    """Wrapper matching checkpoint: encodec.encoder, encodec.quantizer"""
+    def __init__(self, channels=1, n_filters=32, dimension=512, ratios=[2, 4, 5, 8],
+                 codebook_size=4096, num_quantizers=1):
+        super().__init__()
+        self.encoder = EncoderModel(channels, n_filters, dimension, ratios)
+        self.quantizer = Quantizer(dimension, codebook_size, num_quantizers)
+        # Note: decoder exists in checkpoint but we use Vocos backbone instead
+
+    def encode(self, x):
+        z = self.encoder(x)
+        z_q, codes = self.quantizer(z)
+        return z_q, codes
+
+
+class FeatureExtractor(nn.Module):
+    """Feature extractor matching checkpoint: feature_extractor.encodec"""
+    def __init__(self, **kwargs):
+        super().__init__()
+        self.encodec = EnCodecWrapper(**kwargs)
+
+    def encode(self, x):
+        return self.encodec.encode(x)
+
+    def decode_codes(self, codes):
+        return self.encodec.quantizer.decode(codes)
+
+
+# =============================================================================
+# Backbone (Vocos-style with bandwidth-conditioned AdaLayerNorm)
+# =============================================================================
+
+class AdaLayerNorm(nn.Module):
+    """
+    Bandwidth-conditioned Adaptive LayerNorm.
+
+    Checkpoint structure:
+    - norm.scale.weight: [4, 768] (4 bandwidth conditions)
+    - norm.shift.weight: [4, 768]
+    """
+    def __init__(self, dim, num_bandwidths=4, eps=1e-6):
+        super().__init__()
+        self.eps = eps
+        self.dim = dim
+        # Match checkpoint: scale.weight and shift.weight are [num_bandwidths, dim]
+        self.scale = nn.Embedding(num_bandwidths, dim)
+        self.shift = nn.Embedding(num_bandwidths, dim)
+
+        # Initialize
+        nn.init.ones_(self.scale.weight)
+        nn.init.zeros_(self.shift.weight)
+
+    def forward(self, x, bandwidth_id=None):
+        """
+        Args:
+            x: (B, C, T) input
+            bandwidth_id: (B,) bandwidth index, or None for default (0)
+        """
+        # Normalize
+        mean = x.mean(dim=1, keepdim=True)
+        var = x.var(dim=1, keepdim=True, unbiased=False)
+        x = (x - mean) / torch.sqrt(var + self.eps)
+
+        # Get scale/shift based on bandwidth_id
+        if bandwidth_id is None:
+            bandwidth_id = torch.zeros(x.shape[0], dtype=torch.long, device=x.device)
+
+        scale = self.scale(bandwidth_id)  # (B, dim)
+        shift = self.shift(bandwidth_id)  # (B, dim)
+
+        # Apply: (B, dim, 1) for broadcasting
+        x = x * scale.unsqueeze(-1) + shift.unsqueeze(-1)
+
+        return x
+
+
+class ConvNeXtBlock(nn.Module):
+    """
+    ConvNeXt block matching checkpoint structure exactly.
+
+    Checkpoint keys:
+    - dwconv.weight: [768, 1, 7]
+    - dwconv.bias: [768]
+    - norm.scale.weight: [4, 768]
+    - norm.shift.weight: [4, 768]
+    - pwconv1.weight: [2304, 768]
+    - pwconv1.bias: [2304]
+    - pwconv2.weight: [768, 2304]
+    - pwconv2.bias: [768]
+    - gamma: [768]
+    """
+    def __init__(self, dim, intermediate_dim, kernel_size=7, layer_scale_init=1e-6, num_bandwidths=4):
+        super().__init__()
+        padding = (kernel_size - 1) // 2
+
+        self.dwconv = nn.Conv1d(dim, dim, kernel_size, padding=padding, groups=dim)
+        self.norm = AdaLayerNorm(dim, num_bandwidths)
+        self.pwconv1 = nn.Linear(dim, intermediate_dim)
+        self.pwconv2 = nn.Linear(intermediate_dim, dim)
+        self.gamma = nn.Parameter(layer_scale_init * torch.ones(dim))
+
+    def forward(self, x, bandwidth_id=None):
+        residual = x
+        x = self.dwconv(x)
+        x = self.norm(x, bandwidth_id)
+        x = x.transpose(1, 2)  # (B, T, C)
+        x = self.pwconv1(x)
+        x = F.gelu(x)
+        x = self.pwconv2(x)
+        x = x.transpose(1, 2)  # (B, C, T)
+        x = self.gamma.unsqueeze(0).unsqueeze(-1) * x
+        return residual + x
+
+
+class Backbone(nn.Module):
+    """
+    Vocos backbone matching checkpoint structure.
+
+    Checkpoint keys:
+    - embed.weight, embed.bias
+    - norm.scale.weight, norm.shift.weight
+    - convnext.0-11.*
+    - final_layer_norm.weight, final_layer_norm.bias
+    """
+    def __init__(self, input_dim=512, dim=768, intermediate_dim=2304, num_blocks=12,
+                 num_bandwidths=4):
+        super().__init__()
+
+        # Input projection: backbone.embed
+        self.embed = nn.Conv1d(input_dim, dim, kernel_size=3, padding=1)
+
+        # Input normalization: backbone.norm
+        self.norm = AdaLayerNorm(dim, num_bandwidths)
+
+        # ConvNeXt blocks: backbone.convnext.0-11
+        self.convnext = nn.ModuleList([
+            ConvNeXtBlock(dim, intermediate_dim, num_bandwidths=num_bandwidths)
+            for _ in range(num_blocks)
+        ])
+
+        # Final norm: backbone.final_layer_norm
+        self.final_layer_norm = nn.LayerNorm(dim)
+
+    def forward(self, x, bandwidth_id=None):
+        # Input projection
+        x = self.embed(x)
+        x = self.norm(x, bandwidth_id)
+
+        # ConvNeXt blocks
+        for block in self.convnext:
+            x = block(x, bandwidth_id)
+
+        # Final norm
+        x = x.transpose(1, 2)  # (B, T, C)
+        x = self.final_layer_norm(x)
+        x = x.transpose(1, 2)  # (B, C, T)
+
+        return x
+
+
+# =============================================================================
+# Head (iSTFT)
+# =============================================================================
+
+class ISTFT(nn.Module):
+    """ISTFT module matching checkpoint: istft.window"""
+    def __init__(self, n_fft=1280):
+        super().__init__()
+        self.n_fft = n_fft
+        self.register_buffer('window', torch.hann_window(n_fft))
+
+
+class ISTFTHead(nn.Module):
+    """
+    iSTFT head matching checkpoint structure.
+
+    Checkpoint keys:
+    - out.weight: [1282, 768]
+    - out.bias: [1282]
+    - istft.window: [1280]
+    """
+    def __init__(self, dim, n_fft=1280, hop_length=320, padding='center'):
+        super().__init__()
+        self.n_fft = n_fft
+        self.hop_length = hop_length
+        self.padding = padding
+
+        # Output projection: head.out
+        self.out = nn.Linear(dim, n_fft + 2)
+
+        # ISTFT window: head.istft.window
+        self.istft = ISTFT(n_fft)
+
+    def forward(self, x):
+        """
+        Args:
+            x: (B, C, T) backbone output
+        Returns:
+            audio: (B, 1, samples)
+        """
+        B, C, T = x.shape
+        x = x.transpose(1, 2)  # (B, T, C)
+        x = self.out(x)  # (B, T, n_fft + 2)
+
+        # Split magnitude and phase
+        n_bins = self.n_fft // 2 + 1  # 641
+        mag = torch.exp(x[:, :, :n_bins])
+        phase = x[:, :, n_bins:]
+
+        # Construct complex STFT
+        stft = torch.complex(mag * torch.cos(phase), mag * torch.sin(phase))
+        stft = stft.transpose(1, 2)  # (B, n_bins, T)
+
+        # Inverse STFT
+        audio = torch.istft(
+            stft,
+            n_fft=self.n_fft,
+            hop_length=self.hop_length,
+            win_length=self.n_fft,
+            window=self.istft.window,
+            center=(self.padding == 'center'),
+            return_complex=False,
+        )
+
+        return audio.unsqueeze(1)
+
+
+# =============================================================================
+# Main WavTokenizer Model
+# =============================================================================
+
+class WavTokenizer(PreTrainedModel):
+    """
+    WavTokenizer model for audio tokenization.
+
+    This implementation exactly matches the checkpoint structure for direct weight loading.
+    """
+
+    config_class = WavTokenizerConfig
+    base_model_prefix = "wavtokenizer"
+
+    def __init__(self, config: WavTokenizerConfig):
+        super().__init__(config)
+        self.config = config
+
+        # Feature extractor (encoder + quantizer)
+        # Matches: feature_extractor.encodec.*
+        self.feature_extractor = FeatureExtractor(
+            channels=1,
+            n_filters=config.encoder_dim,
+            dimension=config.latent_dim,
+            ratios=config.encoder_rates,
+            codebook_size=config.codebook_size,
+            num_quantizers=config.num_quantizers,
+        )
+
+        # Backbone (Vocos-style decoder)
+        # Matches: backbone.*
+        self.backbone = Backbone(
+            input_dim=config.latent_dim,
+            dim=config.backbone_dim,
+            intermediate_dim=config.backbone_intermediate_dim,
+            num_blocks=config.backbone_num_blocks,
+            num_bandwidths=4,
+        )
+
+        # Head (iSTFT)
+        # Matches: head.*
+        self.head = ISTFTHead(
+            dim=config.backbone_dim,
+            n_fft=config.n_fft,
             hop_length=config.hop_length,
+            padding=config.padding,
+        )
+
+        self.post_init()
+
+    def encode(self, audio, bandwidth_id=None):
+        """
+        Encode audio to quantized features and codes.
+
+        Args:
+            audio: (B, 1, T) audio waveform
+            bandwidth_id: Optional (B,) bandwidth index
+
+        Returns:
+            features: (B, D, T') quantized features
+            codes: (B, num_quantizers, T') discrete codes
+        """
+        return self.feature_extractor.encode(audio)
+
+    def encode_infer(self, audio, bandwidth_id=None):
+        """
+        Encode audio for inference.
+
+        Args:
+            audio: (B, 1, T) audio waveform
+            bandwidth_id: Optional bandwidth index (scalar or tensor)
+
+        Returns:
+            features: (B, D, T') quantized features
+            codes: (B, T') discrete codes (squeezed for single quantizer)
+        """
+        features, codes = self.encode(audio, bandwidth_id)
+        if codes.shape[1] == 1:
+            codes = codes.squeeze(1)
+        return features, codes
+
+    def decode(self, features, bandwidth_id=None):
+        """
+        Decode features to audio.
+
+        Args:
+            features: (B, D, T') quantized features
+            bandwidth_id: Optional (B,) bandwidth index
+
+        Returns:
+            audio: (B, 1, T) reconstructed waveform
+        """
+        x = self.backbone(features, bandwidth_id)
+        return self.head(x)
+
+    def codes_to_features(self, codes):
+        """
+        Convert discrete codes back to continuous features.
+
+        Args:
+            codes: (B, T) or (B, num_quantizers, T) discrete codes
+
+        Returns:
+            features: (B, D, T) continuous features
+        """
+        if codes.dim() == 2:
+            codes = codes.unsqueeze(1)
+        return self.feature_extractor.decode_codes(codes)
+
+    def forward(
+        self,
+        input_values: Optional[torch.Tensor] = None,
+        input_ids: Optional[torch.Tensor] = None,
+        bandwidth_id: Optional[torch.Tensor] = None,
+        **kwargs,
+    ):
+        """
+        HuggingFace-style forward pass.
+
+        Args:
+            input_values: (B, 1, T) or (B, T) audio waveform
+            input_ids: (B, T) or (B, num_quantizers, T) discrete codes
+            bandwidth_id: Optional (B,) bandwidth index
+
+        Returns:
+            BaseModelOutput with last_hidden_state (features) and hidden_states (codes, audio)
+        """
+        if input_values is not None:
+            if input_values.dim() == 2:
+                input_values = input_values.unsqueeze(1)
+
+            features, codes = self.encode(input_values, bandwidth_id)
+            audio = self.decode(features, bandwidth_id)
+
+            return BaseModelOutput(
+                last_hidden_state=features,
+                hidden_states=(codes, audio),
+            )
+
+        elif input_ids is not None:
+            features = self.codes_to_features(input_ids)
+            audio = self.decode(features, bandwidth_id)
+
+            return BaseModelOutput(
+                last_hidden_state=features,
+                hidden_states=(input_ids, audio),
+            )
+
+        else:
+            raise ValueError("Either input_values or input_ids must be provided")