| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| """XYTokenizer model configuration""" |
|
|
| from transformers.configuration_utils import PretrainedConfig |
| from transformers.utils import logging |
|
|
# Module-level logger obtained through the `transformers` logging helper, named after this module.
logger = logging.get_logger(__name__)
|
|
|
|
class XYTokenizerConfig(PretrainedConfig):
    r"""
    Configuration container for a [`XYTokenizerModel`].

    An instance holds the arguments that define the architecture of an XY Tokenizer model and is what such a model
    is instantiated from.

    The class derives from [`PretrainedConfig`], so configuration objects can be used to control the model outputs.
    See the [`PretrainedConfig`] documentation for the inherited behaviour.

    Args:
        input_sample_rate (`int`, *optional*, defaults to 16000):
            Sampling rate of the input audio.
        output_sample_rate (`int`, *optional*, defaults to 16000):
            Sampling rate of the output audio.
        encoder_downsample_rate (`int`, *optional*, defaults to 1280):
            Total downsampling factor of the encoder part.
        decoder_upsample_rate (`int`, *optional*, defaults to 1920):
            Total upsampling factor of the decoder part.
        code_dim (`int`, *optional*, defaults to 1280):
            Dimension of the code embeddings.
        kwargs (*optional*):
            Any further keyword arguments. They are kept together as a dictionary on the `params` attribute and
            are also passed on to `PretrainedConfig.__init__`. Settings such as `semantic_encoder_d_model` (hidden
            dimension of the semantic encoder) or `num_quantizers` (number of residual quantizers) are not part of
            this signature and have to be supplied this way; this class defines no default values for them.
    """

    model_type = "xy_tokenizer"

    def __init__(
        self,
        input_sample_rate=16000,
        output_sample_rate=16000,
        encoder_downsample_rate=1280,
        decoder_upsample_rate=1920,
        code_dim=1280,
        **kwargs,
    ):
        # Sampling rates of the input and output audio.
        self.input_sample_rate, self.output_sample_rate = input_sample_rate, output_sample_rate

        # Overall resampling factors of the encoder and decoder parts.
        self.encoder_downsample_rate, self.decoder_upsample_rate = (
            encoder_downsample_rate,
            decoder_upsample_rate,
        )

        self.code_dim = code_dim

        # Everything not named in the signature is kept together in one dictionary.
        # This assignment deliberately stays ahead of the parent call, which receives
        # the very same keyword arguments.
        self.params = kwargs

        super().__init__(**kwargs)
|
|
|
|
# Public names of this module for `from <module> import *`: only the configuration class.
__all__ = ["XYTokenizerConfig"]