|
|
""" |
|
|
WavTokenizer Configuration for HuggingFace Transformers |
|
|
|
|
|
This configuration class defines all the hyperparameters for WavTokenizer, |
|
|
an acoustic discrete codec tokenizer for audio language modeling. |
|
|
""" |
|
|
|
|
|
from transformers import PretrainedConfig |
|
|
|
|
|
|
|
|
class WavTokenizerConfig(PretrainedConfig):
    """
    Configuration class for WavTokenizer model.

    WavTokenizer is a SOTA discrete acoustic codec model that compresses audio
    into discrete tokens (40 or 75 tokens per second) while maintaining high
    reconstruction quality.

    Args:
        sample_rate (`int`, *optional*, defaults to 24000):
            The sample rate of input audio.
        n_fft (`int`, *optional*, defaults to 1280):
            FFT size for STFT.
        hop_length (`int`, *optional*, defaults to 320):
            Hop length for STFT (determines frame rate: 24000/320 = 75 fps).
        n_mels (`int`, *optional*, defaults to 128):
            Number of mel filterbank channels.
        padding (`str`, *optional*, defaults to `"center"`):
            Padding mode for STFT ("center" or "same").

        feature_dim (`int`, *optional*, defaults to 512):
            Dimension of the feature backbone.
        encoder_dim (`int`, *optional*, defaults to 32):
            Dimension of encoder output.
        encoder_rates (`list[int]`, *optional*, defaults to `[2, 4, 5, 8]`):
            Downsampling rates for the encoder.
        latent_dim (`int`, *optional*):
            Dimension of the latent space (defaults to `feature_dim`).

        codebook_size (`int`, *optional*, defaults to 4096):
            Size of the VQ codebook.
        codebook_dim (`int`, *optional*, defaults to 512):
            Dimension of codebook vectors.
        num_quantizers (`int`, *optional*, defaults to 1):
            Number of residual vector quantizers.

        backbone_type (`str`, *optional*, defaults to `"vocos"`):
            Type of decoder backbone ("vocos").
        backbone_dim (`int`, *optional*, defaults to 768):
            Dimension of the decoder backbone.
        backbone_num_blocks (`int`, *optional*, defaults to 12):
            Number of ConvNeXt blocks in the backbone.
        backbone_intermediate_dim (`int`, *optional*, defaults to 2304):
            Intermediate dimension in ConvNeXt blocks.
        backbone_kernel_size (`int`, *optional*, defaults to 7):
            Kernel size for depthwise convolutions.
        backbone_layer_scale_init_value (`float`, *optional*, defaults to 1e-6):
            Initial value for layer scale.

        head_type (`str`, *optional*, defaults to `"istft"`):
            Type of waveform synthesis head ("istft").
        head_dim (`int`, *optional*, defaults to 1025):
            Output dimension for the head (n_fft // 2 + 1).

        use_attention (`bool`, *optional*, defaults to `True`):
            Whether to use attention in the decoder.
        attention_dim (`int`, *optional*, defaults to 512):
            Dimension for attention layers.
        attention_heads (`int`, *optional*, defaults to 8):
            Number of attention heads.
        attention_layers (`int`, *optional*, defaults to 1):
            Number of attention layers.
    """

    model_type = "wavtokenizer"

    def __init__(
        self,
        # --- Audio / STFT parameters ---
        sample_rate: int = 24000,
        n_fft: int = 1280,
        hop_length: int = 320,
        n_mels: int = 128,
        padding: str = "center",
        # --- Encoder parameters ---
        feature_dim: int = 512,
        encoder_dim: int = 32,
        encoder_rates: list = None,
        latent_dim: int = None,
        # --- Quantizer parameters ---
        codebook_size: int = 4096,
        codebook_dim: int = 512,
        num_quantizers: int = 1,
        # --- Decoder backbone parameters ---
        backbone_type: str = "vocos",
        backbone_dim: int = 768,
        backbone_num_blocks: int = 12,
        backbone_intermediate_dim: int = 2304,
        backbone_kernel_size: int = 7,
        backbone_layer_scale_init_value: float = 1e-6,
        # --- Waveform synthesis head parameters ---
        head_type: str = "istft",
        head_dim: int = 1025,
        # --- Decoder attention parameters ---
        use_attention: bool = True,
        attention_dim: int = 512,
        attention_heads: int = 8,
        attention_layers: int = 1,
        **kwargs
    ):
        super().__init__(**kwargs)

        # Audio / STFT
        self.sample_rate = sample_rate
        self.n_fft = n_fft
        self.hop_length = hop_length
        self.n_mels = n_mels
        self.padding = padding

        # Encoder
        self.feature_dim = feature_dim
        self.encoder_dim = encoder_dim
        # None sentinel avoids a shared mutable default argument.
        self.encoder_rates = encoder_rates if encoder_rates is not None else [2, 4, 5, 8]
        self.latent_dim = latent_dim if latent_dim is not None else feature_dim

        # Quantizer
        self.codebook_size = codebook_size
        self.codebook_dim = codebook_dim
        self.num_quantizers = num_quantizers

        # Decoder backbone
        self.backbone_type = backbone_type
        self.backbone_dim = backbone_dim
        self.backbone_num_blocks = backbone_num_blocks
        self.backbone_intermediate_dim = backbone_intermediate_dim
        self.backbone_kernel_size = backbone_kernel_size
        self.backbone_layer_scale_init_value = backbone_layer_scale_init_value

        # Waveform synthesis head
        self.head_type = head_type
        self.head_dim = head_dim

        # Decoder attention
        self.use_attention = use_attention
        self.attention_dim = attention_dim
        self.attention_heads = attention_heads
        self.attention_layers = attention_layers

    @property
    def vocab_size(self) -> int:
        """Returns the vocabulary size (codebook size)."""
        return self.codebook_size

    @property
    def frame_rate(self) -> float:
        """Returns the frame rate (tokens per second)."""
        return self.sample_rate / self.hop_length
|
|
|