File size: 6,054 Bytes
1b82420
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3592734
1b82420
 
 
 
 
3592734
1b82420
 
 
 
3592734
 
 
1b82420
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3592734
1b82420
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3592734
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
"""
WavTokenizer Configuration for HuggingFace Transformers

This configuration class defines all the hyperparameters for WavTokenizer,
an acoustic discrete codec tokenizer for audio language modeling.
"""

from typing import List, Optional

from transformers import PretrainedConfig


class WavTokenizerConfig(PretrainedConfig):
    """
    Configuration class for the WavTokenizer model.

    WavTokenizer is a discrete acoustic codec model that compresses audio
    into discrete tokens (40 or 75 tokens per second) while maintaining high
    reconstruction quality.

    All documented defaults below match the `__init__` signature exactly.

    Args:
        sample_rate (`int`, *optional*, defaults to 24000):
            The sample rate of input audio.
        n_fft (`int`, *optional*, defaults to 1280):
            FFT size for STFT.
        hop_length (`int`, *optional*, defaults to 320):
            Hop length for STFT (determines frame rate: 24000 / 320 = 75 fps).
        n_mels (`int`, *optional*, defaults to 128):
            Number of mel filterbank channels.
        padding (`str`, *optional*, defaults to `"center"`):
            Padding mode for STFT (`"center"` or `"same"`).

        feature_dim (`int`, *optional*, defaults to 512):
            Dimension of the feature backbone.
        encoder_dim (`int`, *optional*, defaults to 32):
            Dimension of encoder output.
        encoder_rates (`List[int]`, *optional*, defaults to `[2, 4, 5, 8]`):
            Downsampling rates for the encoder.
        latent_dim (`int`, *optional*):
            Dimension of the latent space (defaults to `feature_dim`).

        codebook_size (`int`, *optional*, defaults to 4096):
            Size of the VQ codebook.
        codebook_dim (`int`, *optional*, defaults to 512):
            Dimension of codebook vectors.
        num_quantizers (`int`, *optional*, defaults to 1):
            Number of residual vector quantizers.

        backbone_type (`str`, *optional*, defaults to `"vocos"`):
            Type of decoder backbone (`"vocos"`).
        backbone_dim (`int`, *optional*, defaults to 768):
            Dimension of the decoder backbone.
        backbone_num_blocks (`int`, *optional*, defaults to 12):
            Number of ConvNeXt blocks in the backbone.
        backbone_intermediate_dim (`int`, *optional*, defaults to 2304):
            Intermediate dimension in ConvNeXt blocks.
        backbone_kernel_size (`int`, *optional*, defaults to 7):
            Kernel size for depthwise convolutions.
        backbone_layer_scale_init_value (`float`, *optional*, defaults to 1e-6):
            Initial value for layer scale.

        head_type (`str`, *optional*, defaults to `"istft"`):
            Type of waveform synthesis head (`"istft"`).
        head_dim (`int`, *optional*, defaults to 1025):
            Output dimension for the head. NOTE(review): 1025 equals
            2048 // 2 + 1, not `n_fft // 2 + 1` (which would be 641 for the
            default `n_fft=1280`) — verify against the checkpoint/head config.

        use_attention (`bool`, *optional*, defaults to `True`):
            Whether to use attention in the decoder.
        attention_dim (`int`, *optional*, defaults to 512):
            Dimension for attention layers.
        attention_heads (`int`, *optional*, defaults to 8):
            Number of attention heads.
        attention_layers (`int`, *optional*, defaults to 1):
            Number of attention layers.
    """

    model_type = "wavtokenizer"

    def __init__(
        self,
        # Audio parameters
        sample_rate: int = 24000,
        n_fft: int = 1280,
        hop_length: int = 320,
        n_mels: int = 128,
        padding: str = "center",

        # Feature dimensions
        feature_dim: int = 512,
        encoder_dim: int = 32,
        encoder_rates: Optional[List[int]] = None,
        latent_dim: Optional[int] = None,

        # Quantizer parameters
        codebook_size: int = 4096,
        codebook_dim: int = 512,
        num_quantizers: int = 1,

        # Backbone parameters
        backbone_type: str = "vocos",
        backbone_dim: int = 768,
        backbone_num_blocks: int = 12,
        backbone_intermediate_dim: int = 2304,
        backbone_kernel_size: int = 7,
        backbone_layer_scale_init_value: float = 1e-6,

        # Head parameters
        head_type: str = "istft",
        head_dim: int = 1025,

        # Attention parameters
        use_attention: bool = True,
        attention_dim: int = 512,
        attention_heads: int = 8,
        attention_layers: int = 1,

        **kwargs,
    ):
        super().__init__(**kwargs)

        # Audio
        self.sample_rate = sample_rate
        self.n_fft = n_fft
        self.hop_length = hop_length
        self.n_mels = n_mels
        self.padding = padding

        # Feature dimensions
        self.feature_dim = feature_dim
        self.encoder_dim = encoder_dim
        # None sentinel avoids a shared mutable default for the list.
        self.encoder_rates = encoder_rates if encoder_rates is not None else [2, 4, 5, 8]
        self.latent_dim = latent_dim if latent_dim is not None else feature_dim

        # Quantizer
        self.codebook_size = codebook_size
        self.codebook_dim = codebook_dim
        self.num_quantizers = num_quantizers

        # Backbone
        self.backbone_type = backbone_type
        self.backbone_dim = backbone_dim
        self.backbone_num_blocks = backbone_num_blocks
        self.backbone_intermediate_dim = backbone_intermediate_dim
        self.backbone_kernel_size = backbone_kernel_size
        self.backbone_layer_scale_init_value = backbone_layer_scale_init_value

        # Head
        self.head_type = head_type
        self.head_dim = head_dim

        # Attention
        self.use_attention = use_attention
        self.attention_dim = attention_dim
        self.attention_heads = attention_heads
        self.attention_layers = attention_layers

    @property
    def vocab_size(self) -> int:
        """Vocabulary size seen by a language model: one token id per codebook entry."""
        return self.codebook_size

    @property
    def frame_rate(self) -> float:
        """Token frame rate in tokens per second (sample_rate / hop_length; 75.0 with defaults)."""
        return self.sample_rate / self.hop_length