| | """ |
| | Dasheng Audio Tokenizer Configuration |
| | """ |
| |
|
| | from transformers import PretrainedConfig |
| |
|
class DashengTokenizerConfig(PretrainedConfig):
    """
    Configuration class for the Dasheng Audio Tokenizer.

    Stores the hyperparameters used to initialize the Dasheng model.
    All defaults below mirror the ``__init__`` signature exactly.

    Args:
        embed_dim (int): Encoder embedding dimension. Default: 1280
        depth (int): Number of encoder layers. Default: 32
        num_heads (int): Number of attention heads. Default: 16
        decoder_embed_dim (int): Decoder embedding dimension. Default: 1280
        decoder_depth (int): Number of decoder layers. Default: 12
        decoder_intermediate_size (int): Decoder intermediate (MLP) size.
            Default: 5120
        istft_n_fft (int): ISTFT ``n_fft`` parameter. Default: 1280
        istft_hop (int): ISTFT hop parameter. Default: 320
        upsample_tokens (int): Upsample factor for tokens. Default: 2
        n_mels_patch (int): Number of Mel bins for patch embedding.
            Default: 128
        hop_length (int): Hop length for the Mel spectrogram. Default: 160
        **kwargs: Forwarded unchanged to ``PretrainedConfig``.
    """

    # Identifier used by the HF Auto* registration machinery.
    model_type = "dashengtokenizer"

    def __init__(
        self,
        embed_dim: int = 1280,
        depth: int = 32,
        num_heads: int = 16,
        decoder_embed_dim: int = 1280,
        decoder_depth: int = 12,
        decoder_intermediate_size: int = 5120,
        istft_n_fft: int = 1280,
        istft_hop: int = 320,
        upsample_tokens: int = 2,
        n_mels_patch: int = 128,
        hop_length: int = 160,
        **kwargs,
    ):
        super().__init__(**kwargs)
        self.embed_dim = embed_dim
        self.depth = depth
        self.num_heads = num_heads
        self.decoder_embed_dim = decoder_embed_dim
        self.decoder_depth = decoder_depth
        self.decoder_intermediate_size = decoder_intermediate_size
        self.istft_n_fft = istft_n_fft
        self.istft_hop = istft_hop
        self.upsample_tokens = upsample_tokens
        self.n_mels_patch = n_mels_patch
        self.hop_length = hop_length
|