File size: 6,054 Bytes
1b82420 3592734 1b82420 3592734 1b82420 3592734 1b82420 3592734 1b82420 3592734 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 |
"""
WavTokenizer Configuration for HuggingFace Transformers
This configuration class defines all the hyperparameters for WavTokenizer,
an acoustic discrete codec tokenizer for audio language modeling.
"""
from transformers import PretrainedConfig
class WavTokenizerConfig(PretrainedConfig):
    """
    Configuration class for WavTokenizer model.

    WavTokenizer is a SOTA discrete acoustic codec model that compresses audio
    into discrete tokens (40 or 75 tokens per second) while maintaining high
    reconstruction quality.

    Args:
        sample_rate (`int`, *optional*, defaults to 24000):
            The sample rate of input audio.
        n_fft (`int`, *optional*, defaults to 1280):
            FFT size for STFT.
        hop_length (`int`, *optional*, defaults to 320):
            Hop length for STFT (determines frame rate: 24000/320 = 75 fps).
        n_mels (`int`, *optional*, defaults to 128):
            Number of mel filterbank channels.
        padding (`str`, *optional*, defaults to "center"):
            Padding mode for STFT ("center" or "same").
        feature_dim (`int`, *optional*, defaults to 512):
            Dimension of the feature backbone.
        encoder_dim (`int`, *optional*, defaults to 32):
            Dimension of encoder output.
        encoder_rates (`list[int]`, *optional*, defaults to [2, 4, 5, 8]):
            Downsampling rates for the encoder. The product (2*4*5*8 = 320)
            matches `hop_length`, so encoder and STFT share one frame rate.
        latent_dim (`int`, *optional*):
            Dimension of the latent space (defaults to `feature_dim`).
        codebook_size (`int`, *optional*, defaults to 4096):
            Size of the VQ codebook.
        codebook_dim (`int`, *optional*, defaults to 512):
            Dimension of codebook vectors.
        num_quantizers (`int`, *optional*, defaults to 1):
            Number of residual vector quantizers.
        backbone_type (`str`, *optional*, defaults to "vocos"):
            Type of decoder backbone ("vocos").
        backbone_dim (`int`, *optional*, defaults to 768):
            Dimension of the decoder backbone.
        backbone_num_blocks (`int`, *optional*, defaults to 12):
            Number of ConvNeXt blocks in the backbone.
        backbone_intermediate_dim (`int`, *optional*, defaults to 2304):
            Intermediate dimension in ConvNeXt blocks (3x `backbone_dim`).
        backbone_kernel_size (`int`, *optional*, defaults to 7):
            Kernel size for depthwise convolutions.
        backbone_layer_scale_init_value (`float`, *optional*, defaults to 1e-6):
            Initial value for layer scale.
        head_type (`str`, *optional*, defaults to "istft"):
            Type of waveform synthesis head ("istft").
        head_dim (`int`, *optional*, defaults to 1025):
            Output dimension for the head. NOTE(review): 1025 equals
            n_fft // 2 + 1 only for n_fft = 2048; the default n_fft = 1280
            would give 641 — confirm the intended (n_fft, head_dim) pairing.
        use_attention (`bool`, *optional*, defaults to True):
            Whether to use attention in the decoder.
        attention_dim (`int`, *optional*, defaults to 512):
            Dimension for attention layers.
        attention_heads (`int`, *optional*, defaults to 8):
            Number of attention heads.
        attention_layers (`int`, *optional*, defaults to 1):
            Number of attention layers.
    """

    model_type = "wavtokenizer"

    def __init__(
        self,
        # Audio parameters
        sample_rate: int = 24000,
        n_fft: int = 1280,
        hop_length: int = 320,
        n_mels: int = 128,
        padding: str = "center",
        # Feature dimensions
        feature_dim: int = 512,
        encoder_dim: int = 32,
        encoder_rates: list = None,  # None sentinel: avoids a mutable default argument
        latent_dim: int = None,  # None means "follow feature_dim"
        # Quantizer parameters
        codebook_size: int = 4096,
        codebook_dim: int = 512,
        num_quantizers: int = 1,
        # Backbone parameters
        backbone_type: str = "vocos",
        backbone_dim: int = 768,
        backbone_num_blocks: int = 12,
        backbone_intermediate_dim: int = 2304,
        backbone_kernel_size: int = 7,
        backbone_layer_scale_init_value: float = 1e-6,
        # Head parameters
        head_type: str = "istft",
        head_dim: int = 1025,
        # Attention parameters
        use_attention: bool = True,
        attention_dim: int = 512,
        attention_heads: int = 8,
        attention_layers: int = 1,
        **kwargs
    ):
        # Forward HF-generic kwargs (e.g. name_or_path) to PretrainedConfig.
        super().__init__(**kwargs)
        # Audio
        self.sample_rate = sample_rate
        self.n_fft = n_fft
        self.hop_length = hop_length
        self.n_mels = n_mels
        self.padding = padding
        # Feature dimensions
        self.feature_dim = feature_dim
        self.encoder_dim = encoder_dim
        # Default [2, 4, 5, 8]: product is 320, matching the default hop_length.
        self.encoder_rates = encoder_rates if encoder_rates is not None else [2, 4, 5, 8]
        self.latent_dim = latent_dim if latent_dim is not None else feature_dim
        # Quantizer
        self.codebook_size = codebook_size
        self.codebook_dim = codebook_dim
        self.num_quantizers = num_quantizers
        # Backbone
        self.backbone_type = backbone_type
        self.backbone_dim = backbone_dim
        self.backbone_num_blocks = backbone_num_blocks
        self.backbone_intermediate_dim = backbone_intermediate_dim
        self.backbone_kernel_size = backbone_kernel_size
        self.backbone_layer_scale_init_value = backbone_layer_scale_init_value
        # Head
        self.head_type = head_type
        self.head_dim = head_dim
        # Attention
        self.use_attention = use_attention
        self.attention_dim = attention_dim
        self.attention_heads = attention_heads
        self.attention_layers = attention_layers

    @property
    def vocab_size(self) -> int:
        """Returns the vocabulary size (codebook size)."""
        return self.codebook_size

    @property
    def frame_rate(self) -> float:
        """Returns the frame rate (tokens per second): sample_rate / hop_length."""
        return self.sample_rate / self.hop_length
|