|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
"""MossAudioTokenizer model configuration""" |
|
|
|
|
|
from typing import Any |
|
|
|
|
|
from transformers.configuration_utils import PreTrainedConfig |
|
|
from transformers.utils import logging |
|
|
|
|
|
|
|
|
logger = logging.get_logger(__name__) |
|
|
|
|
|
|
|
|
class MossAudioTokenizerConfig(PreTrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`MossAudioTokenizerModel`]. It is used to instantiate a
    MossAudioTokenizer model according to the specified arguments, defining the model architecture.

    Instantiating a configuration with the defaults will yield a similar configuration to that of the
    [VoiceAgentGroup/moss_audio_tokenizer](https://huggingface.co/VoiceAgentGroup/moss_audio_tokenizer) architecture.

    Configuration objects inherit from [`PreTrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PreTrainedConfig`] for more information.

    Args:
        version (`str`, *optional*):
            Optional version identifier stored alongside the configuration.
        sampling_rate (`int`, *optional*, defaults to 24000):
            The sampling rate at which the audio waveform should be digitalized expressed in hertz (Hz).
        downsample_rate (`int`, *optional*, defaults to 1920):
            Total downsampling rate from waveform to tokens.
        causal_transformer_context_duration (`float`, *optional*, defaults to 10.0):
            Context duration in seconds for causal transformer.
        encoder_kwargs (`list[dict]`, *optional*):
            List of encoder module configurations. Each dict specifies a module type and its parameters.
        decoder_kwargs (`list[dict]`, *optional*):
            List of decoder module configurations in execution order.
        quantizer_type (`str`, *optional*, defaults to `"rlfq"`):
            Quantizer type. Options include `"rvq"`, `"spec_rvq"`, `"rlfq"`, `"random_prefix_rlfq"`.
        quantizer_kwargs (`dict`, *optional*):
            Configuration for the quantizer including `input_dim`, `rvq_dim`, `output_dim`, `num_quantizers`,
            `codebook_size`, and `codebook_dim`. A `quantizer_type` key inside this dict takes precedence
            over the standalone `quantizer_type` argument.

    Example:

    ```python
    >>> from transformers import MossAudioTokenizerModel, MossAudioTokenizerConfig

    >>> # Initializing a MossAudioTokenizer style configuration
    >>> configuration = MossAudioTokenizerConfig()

    >>> # Initializing a model (with random weights) from the configuration
    >>> model = MossAudioTokenizerModel(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```
    """

    model_type = "moss-audio-tokenizer"

    # Expose `config.sample_rate` as an alias of `config.sampling_rate`.
    attribute_map = {"sample_rate": "sampling_rate"}

    sampling_rate: int
    downsample_rate: int
    causal_transformer_context_duration: float
    encoder_kwargs: list[dict[str, Any]]
    decoder_kwargs: list[dict[str, Any]]
    quantizer_type: str
    quantizer_kwargs: dict[str, Any]

    @staticmethod
    def _patch_module(patch_size: int) -> dict[str, Any]:
        """Build a `PatchedPretransform` module config with the given patch size."""
        return {
            "module_type": "PatchedPretransform",
            "patch_size": patch_size,
        }

    @staticmethod
    def _transformer_module(
        input_dimension: int,
        output_dimension: int,
        d_model: int,
        num_heads: int,
        num_layers: int,
        dim_feedforward: int,
    ) -> dict[str, Any]:
        """Build a causal `Transformer` module config sharing the architecture-wide defaults.

        Key insertion order matches the historical hand-written dicts so that
        serialized configs remain byte-identical.
        """
        return {
            "module_type": "Transformer",
            "input_dimension": input_dimension,
            "output_dimension": output_dimension,
            "d_model": d_model,
            "num_heads": num_heads,
            "num_layers": num_layers,
            "dim_feedforward": dim_feedforward,
            "causal": True,
            "norm": "layer_norm",
            "positional_embedding": "rope",
            "max_period": 10000,
            "gating": "none",
            "layer_scale": 0.01,
            "conv_layout": True,
        }

    def __init__(
        self,
        version: str | None = None,
        sampling_rate: int = 24000,
        downsample_rate: int = 1920,
        causal_transformer_context_duration: float = 10.0,
        encoder_kwargs: list[dict[str, Any]] | None = None,
        decoder_kwargs: list[dict[str, Any]] | None = None,
        quantizer_type: str = "rlfq",
        quantizer_kwargs: dict[str, Any] | None = None,
        **kwargs,
    ):
        # `model_type` is a class attribute; drop any copy coming from a
        # serialized config dict so it is not forwarded to the superclass.
        kwargs.pop("model_type", None)

        self.version = version
        self.sampling_rate = sampling_rate
        self.downsample_rate = downsample_rate
        self.causal_transformer_context_duration = causal_transformer_context_duration

        if encoder_kwargs is None:
            # Default encoder: alternate patching (temporal downsampling) with
            # causal transformer stages; the final stage is the widest (d_model 1280).
            encoder_kwargs = [
                self._patch_module(240),
                self._transformer_module(240, 384, 768, 12, 12, 3072),
                self._patch_module(2),
                self._transformer_module(768, 384, 768, 12, 12, 3072),
                self._patch_module(2),
                self._transformer_module(768, 640, 768, 12, 12, 3072),
                self._patch_module(2),
                self._transformer_module(1280, 768, 1280, 20, 32, 5120),
            ]
        self.encoder_kwargs = encoder_kwargs

        if decoder_kwargs is None:
            # Default decoder mirrors the encoder in reverse: transformer stages
            # interleaved with patch-based upsampling back to the waveform rate.
            decoder_kwargs = [
                self._transformer_module(768, 1280, 1280, 20, 32, 5120),
                self._patch_module(2),
                self._transformer_module(640, 768, 768, 12, 12, 3072),
                self._patch_module(2),
                self._transformer_module(384, 768, 768, 12, 12, 3072),
                self._patch_module(2),
                self._transformer_module(384, 768, 768, 12, 12, 3072),
                self._patch_module(2),
                self._transformer_module(384, 240, 768, 12, 12, 3072),
                self._patch_module(240),
            ]
        self.decoder_kwargs = decoder_kwargs

        if quantizer_kwargs is None:
            quantizer_kwargs = {
                "input_dim": 768,
                "rvq_dim": 512,
                "output_dim": 768,
                "num_quantizers": 32,
                "codebook_size": 1024,
                "codebook_dim": 8,
                "quantizer_type": "rlfq",
            }
        else:
            # Shallow-copy so we never mutate the caller's dict below.
            quantizer_kwargs = dict(quantizer_kwargs)

        # A `quantizer_type` embedded in `quantizer_kwargs` (e.g. loaded from a
        # serialized config) wins over the standalone argument; otherwise the
        # argument is mirrored into the kwargs so the two stay consistent.
        kw_qtype = quantizer_kwargs.get("quantizer_type")
        if kw_qtype is not None:
            self.quantizer_type = kw_qtype
        else:
            self.quantizer_type = quantizer_type
            quantizer_kwargs["quantizer_type"] = quantizer_type

        self.quantizer_kwargs = quantizer_kwargs

        super().__init__(**kwargs)

    @property
    def num_quantizers(self) -> int:
        """Return the number of quantizers from quantizer_kwargs."""
        return self.quantizer_kwargs.get("num_quantizers", 32)

    @property
    def codebook_size(self) -> int:
        """Return the codebook size from quantizer_kwargs."""
        # Fallback matches the default `quantizer_kwargs` codebook_size (was
        # inconsistently 4096 before).
        return self.quantizer_kwargs.get("codebook_size", 1024)

    @property
    def frame_rate(self) -> float:
        """Return the frame rate (tokens per second)."""
        return self.sampling_rate / self.downsample_rate
|
|
|
|
|
|
|
|
__all__ = ["MossAudioTokenizerConfig"] |
|
|
|