"""
WavTokenizer Configuration for HuggingFace Transformers

This configuration class defines all the hyperparameters for WavTokenizer,
an acoustic discrete codec tokenizer for audio language modeling.
"""

from typing import List, Optional

from transformers import PretrainedConfig


class WavTokenizerConfig(PretrainedConfig):
    """
    Configuration class for WavTokenizer model.

    WavTokenizer is a SOTA discrete acoustic codec model that compresses audio
    into discrete tokens (40 or 75 tokens per second) while maintaining high
    reconstruction quality. With the defaults below the token rate is
    ``sample_rate / hop_length = 24000 / 320 = 75`` tokens per second.

    Args:
        sample_rate (`int`, *optional*, defaults to 24000):
            The sample rate of input audio.
        n_fft (`int`, *optional*, defaults to 1280):
            FFT size for STFT.
        hop_length (`int`, *optional*, defaults to 320):
            Hop length for STFT (determines frame rate: 24000/320 = 75 fps).
        n_mels (`int`, *optional*, defaults to 128):
            Number of mel filterbank channels.
        padding (`str`, *optional*, defaults to `"center"`):
            Padding mode for STFT ("center" or "same").
        feature_dim (`int`, *optional*, defaults to 512):
            Dimension of the feature backbone.
        encoder_dim (`int`, *optional*, defaults to 32):
            Dimension of encoder output.
        encoder_rates (`list[int]`, *optional*, defaults to `[2, 4, 5, 8]`):
            Downsampling rates for the encoder. The values are copied, so
            later changes to the caller's sequence do not affect the config.
        latent_dim (`int`, *optional*):
            Dimension of the latent space. When `None`, `feature_dim` is used.
        codebook_size (`int`, *optional*, defaults to 4096):
            Size of the VQ codebook.
        codebook_dim (`int`, *optional*, defaults to 512):
            Dimension of codebook vectors.
        num_quantizers (`int`, *optional*, defaults to 1):
            Number of residual vector quantizers.
        backbone_type (`str`, *optional*, defaults to `"vocos"`):
            Type of decoder backbone ("vocos").
        backbone_dim (`int`, *optional*, defaults to 768):
            Dimension of the decoder backbone.
        backbone_num_blocks (`int`, *optional*, defaults to 12):
            Number of ConvNeXt blocks in the backbone.
        backbone_intermediate_dim (`int`, *optional*, defaults to 2304):
            Intermediate dimension in ConvNeXt blocks.
        backbone_kernel_size (`int`, *optional*, defaults to 7):
            Kernel size for depthwise convolutions.
        backbone_layer_scale_init_value (`float`, *optional*, defaults to 1e-6):
            Initial value for layer scale.
        head_type (`str`, *optional*, defaults to `"istft"`):
            Type of waveform synthesis head ("istft").
        head_dim (`int`, *optional*, defaults to 1025):
            Output dimension for the head. The value is stored as given and is
            NOT derived from `n_fft`: with the default `n_fft=1280`,
            `n_fft // 2 + 1` is 641, whereas 1025 equals `2048 // 2 + 1`.
            NOTE(review): confirm which value the checkpoints expect.
        use_attention (`bool`, *optional*, defaults to `True`):
            Whether to use attention in the decoder.
        attention_dim (`int`, *optional*, defaults to 512):
            Dimension for attention layers.
        attention_heads (`int`, *optional*, defaults to 8):
            Number of attention heads.
        attention_layers (`int`, *optional*, defaults to 1):
            Number of attention layers.
        **kwargs:
            Forwarded unchanged to `PretrainedConfig.__init__`.
    """

    model_type = "wavtokenizer"

    def __init__(
        self,
        # Audio parameters
        sample_rate: int = 24000,
        n_fft: int = 1280,
        hop_length: int = 320,
        n_mels: int = 128,
        padding: str = "center",
        # Feature dimensions
        feature_dim: int = 512,
        encoder_dim: int = 32,
        encoder_rates: Optional[List[int]] = None,
        latent_dim: Optional[int] = None,
        # Quantizer parameters
        codebook_size: int = 4096,
        codebook_dim: int = 512,
        num_quantizers: int = 1,
        # Backbone parameters
        backbone_type: str = "vocos",
        backbone_dim: int = 768,
        backbone_num_blocks: int = 12,
        backbone_intermediate_dim: int = 2304,
        backbone_kernel_size: int = 7,
        backbone_layer_scale_init_value: float = 1e-6,
        # Head parameters
        head_type: str = "istft",
        head_dim: int = 1025,
        # Attention parameters
        use_attention: bool = True,
        attention_dim: int = 512,
        attention_heads: int = 8,
        attention_layers: int = 1,
        **kwargs
    ):
        super().__init__(**kwargs)

        # Audio
        self.sample_rate = sample_rate
        self.n_fft = n_fft
        self.hop_length = hop_length
        self.n_mels = n_mels
        self.padding = padding

        # Feature dimensions
        self.feature_dim = feature_dim
        self.encoder_dim = encoder_dim
        # Copy the caller's sequence so the config never aliases a list that
        # the caller may mutate afterwards; None selects the built-in default.
        self.encoder_rates = (
            list(encoder_rates) if encoder_rates is not None else [2, 4, 5, 8]
        )
        # An unspecified latent size falls back to the feature dimension.
        self.latent_dim = latent_dim if latent_dim is not None else feature_dim

        # Quantizer
        self.codebook_size = codebook_size
        self.codebook_dim = codebook_dim
        self.num_quantizers = num_quantizers

        # Backbone
        self.backbone_type = backbone_type
        self.backbone_dim = backbone_dim
        self.backbone_num_blocks = backbone_num_blocks
        self.backbone_intermediate_dim = backbone_intermediate_dim
        self.backbone_kernel_size = backbone_kernel_size
        self.backbone_layer_scale_init_value = backbone_layer_scale_init_value

        # Head
        self.head_type = head_type
        self.head_dim = head_dim

        # Attention
        self.use_attention = use_attention
        self.attention_dim = attention_dim
        self.attention_heads = attention_heads
        self.attention_layers = attention_layers

    @property
    def vocab_size(self) -> int:
        """Returns the vocabulary size (codebook size)."""
        return self.codebook_size

    @property
    def frame_rate(self) -> float:
        """Returns the frame rate (tokens per second).

        Computed as ``sample_rate / hop_length`` using true division, so the
        result is a float (75.0 for the defaults).

        Raises:
            ZeroDivisionError: If ``hop_length`` is 0.
        """
        return self.sample_rate / self.hop_length