# Source: WavTokenizer / configuration_wavtokenizer.py
# Hugging Face Hub commit 3592734 (verified), "Update configuration_wavtokenizer.py", uploaded by klemenk
"""
WavTokenizer Configuration for HuggingFace Transformers
This configuration class defines all the hyperparameters for WavTokenizer,
an acoustic discrete codec tokenizer for audio language modeling.
"""
from typing import List, Optional

from transformers import PretrainedConfig
class WavTokenizerConfig(PretrainedConfig):
    """
    Configuration class for WavTokenizer model.

    WavTokenizer is a SOTA discrete acoustic codec model that compresses audio
    into discrete tokens (40 or 75 tokens per second) while maintaining high
    reconstruction quality.

    Args:
        sample_rate (`int`, *optional*, defaults to 24000):
            The sample rate of input audio.
        n_fft (`int`, *optional*, defaults to 1280):
            FFT size for STFT.
        hop_length (`int`, *optional*, defaults to 320):
            Hop length for STFT (determines frame rate: 24000/320 = 75 fps).
        n_mels (`int`, *optional*, defaults to 128):
            Number of mel filterbank channels.
        padding (`str`, *optional*, defaults to `"center"`):
            Padding mode for STFT ("center" or "same").
        feature_dim (`int`, *optional*, defaults to 512):
            Dimension of the feature backbone.
        encoder_dim (`int`, *optional*, defaults to 32):
            Dimension of encoder output.
        encoder_rates (`List[int]`, *optional*, defaults to `[2, 4, 5, 8]`):
            Downsampling rates for the encoder.
        latent_dim (`int`, *optional*):
            Dimension of the latent space (defaults to `feature_dim`).
        codebook_size (`int`, *optional*, defaults to 4096):
            Size of the VQ codebook.
        codebook_dim (`int`, *optional*, defaults to 512):
            Dimension of codebook vectors.
        num_quantizers (`int`, *optional*, defaults to 1):
            Number of residual vector quantizers.
        backbone_type (`str`, *optional*, defaults to `"vocos"`):
            Type of decoder backbone ("vocos").
        backbone_dim (`int`, *optional*, defaults to 768):
            Dimension of the decoder backbone.
        backbone_num_blocks (`int`, *optional*, defaults to 12):
            Number of ConvNeXt blocks in the backbone.
        backbone_intermediate_dim (`int`, *optional*, defaults to 2304):
            Intermediate dimension in ConvNeXt blocks.
        backbone_kernel_size (`int`, *optional*, defaults to 7):
            Kernel size for depthwise convolutions.
        backbone_layer_scale_init_value (`float`, *optional*, defaults to 1e-6):
            Initial value for layer scale.
        head_type (`str`, *optional*, defaults to `"istft"`):
            Type of waveform synthesis head ("istft").
        head_dim (`int`, *optional*, defaults to 1025):
            Output dimension for the head.
            NOTE(review): 1025 equals ``n_fft // 2 + 1`` only for n_fft=2048;
            the default n_fft=1280 would give 641 — verify this default
            against the released checkpoint before relying on it.
        use_attention (`bool`, *optional*, defaults to `True`):
            Whether to use attention in the decoder.
        attention_dim (`int`, *optional*, defaults to 512):
            Dimension for attention layers.
        attention_heads (`int`, *optional*, defaults to 8):
            Number of attention heads.
        attention_layers (`int`, *optional*, defaults to 1):
            Number of attention layers.
    """

    model_type = "wavtokenizer"

    def __init__(
        self,
        # Audio parameters
        sample_rate: int = 24000,
        n_fft: int = 1280,
        hop_length: int = 320,
        n_mels: int = 128,
        padding: str = "center",
        # Feature dimensions
        feature_dim: int = 512,
        encoder_dim: int = 32,
        # None sentinel avoids a shared mutable default; resolved below.
        encoder_rates: Optional[List[int]] = None,
        latent_dim: Optional[int] = None,
        # Quantizer parameters
        codebook_size: int = 4096,
        codebook_dim: int = 512,
        num_quantizers: int = 1,
        # Backbone parameters
        backbone_type: str = "vocos",
        backbone_dim: int = 768,
        backbone_num_blocks: int = 12,
        backbone_intermediate_dim: int = 2304,
        backbone_kernel_size: int = 7,
        backbone_layer_scale_init_value: float = 1e-6,
        # Head parameters
        head_type: str = "istft",
        head_dim: int = 1025,
        # Attention parameters
        use_attention: bool = True,
        attention_dim: int = 512,
        attention_heads: int = 8,
        attention_layers: int = 1,
        **kwargs,
    ):
        super().__init__(**kwargs)
        # Audio / STFT front-end
        self.sample_rate = sample_rate
        self.n_fft = n_fft
        self.hop_length = hop_length
        self.n_mels = n_mels
        self.padding = padding
        # Feature dimensions
        self.feature_dim = feature_dim
        self.encoder_dim = encoder_dim
        self.encoder_rates = encoder_rates if encoder_rates is not None else [2, 4, 5, 8]
        # Latent space falls back to the backbone feature width.
        self.latent_dim = latent_dim if latent_dim is not None else feature_dim
        # Vector quantizer
        self.codebook_size = codebook_size
        self.codebook_dim = codebook_dim
        self.num_quantizers = num_quantizers
        # Decoder backbone (ConvNeXt-style "vocos" stack)
        self.backbone_type = backbone_type
        self.backbone_dim = backbone_dim
        self.backbone_num_blocks = backbone_num_blocks
        self.backbone_intermediate_dim = backbone_intermediate_dim
        self.backbone_kernel_size = backbone_kernel_size
        self.backbone_layer_scale_init_value = backbone_layer_scale_init_value
        # Waveform synthesis head
        self.head_type = head_type
        self.head_dim = head_dim
        # Optional decoder attention
        self.use_attention = use_attention
        self.attention_dim = attention_dim
        self.attention_heads = attention_heads
        self.attention_layers = attention_layers

    @property
    def vocab_size(self) -> int:
        """Returns the vocabulary size (codebook size)."""
        return self.codebook_size

    @property
    def frame_rate(self) -> float:
        """Returns the frame rate (tokens per second): sample_rate / hop_length."""
        return self.sample_rate / self.hop_length