{
  "architectures": [
    "MossAudioTokenizerModel"
  ],
  "auto_map": {
    "AutoConfig": "configuration_moss_audio_tokenizer.MossAudioTokenizerConfig",
    "AutoModel": "modeling_moss_audio_tokenizer.MossAudioTokenizerModel"
  },
  "model_type": "moss-audio-tokenizer",
  "sample_rate": 48000,
  "sampling_rate": 48000,
  "downsample_rate": 3840,
  "causal_transformer_context_duration": 10.0,
  "number_channels": 2,
  "enable_channel_interleave": true,
  "attention_implementation": "sdpa",
  "compute_dtype": "fp32",
  "dtype": "float32",
  "code_dim": 768,
  "encoder_kwargs": [
    {
      "module_type": "PatchedPretransform",
      "patch_size": 240
    },
    {
      "causal": true,
      "context_duration": 4.0,
      "conv_layout": true,
      "d_model": 256,
      "dim_feedforward": 1024,
      "gating": "none",
      "input_dimension": 240,
      "layer_scale": 0.01,
      "max_period": 10000,
      "module_type": "Transformer",
      "norm": "layer_norm",
      "num_heads": 4,
      "num_layers": 4,
      "output_dimension": 384,
      "positional_embedding": "rope"
    },
    {
      "module_type": "PatchedPretransform",
      "patch_size": 2
    },
    {
      "causal": true,
      "context_duration": 6.0,
      "conv_layout": true,
      "d_model": 256,
      "dim_feedforward": 1024,
      "gating": "none",
      "input_dimension": 768,
      "layer_scale": 0.01,
      "max_period": 10000,
      "module_type": "Transformer",
      "norm": "layer_norm",
      "num_heads": 4,
      "num_layers": 2,
      "output_dimension": 384,
      "positional_embedding": "rope"
    },
    {
      "module_type": "PatchedPretransform",
      "patch_size": 2
    },
    {
      "causal": true,
      "context_duration": 8.0,
      "conv_layout": true,
      "d_model": 256,
      "dim_feedforward": 1024,
      "gating": "none",
      "input_dimension": 768,
      "layer_scale": 0.01,
      "max_period": 10000,
      "module_type": "Transformer",
      "norm": "layer_norm",
      "num_heads": 4,
      "num_layers": 2,
      "output_dimension": 384,
      "positional_embedding": "rope"
    },
    {
      "module_type": "PatchedPretransform",
      "patch_size": 2
    },
    {
      "causal": true,
      "context_duration": 10.0,
      "conv_layout": true,
      "d_model": 256,
      "dim_feedforward": 1024,
      "gating": "none",
      "input_dimension": 768,
      "layer_scale": 0.01,
      "max_period": 10000,
      "module_type": "Transformer",
      "norm": "layer_norm",
      "num_heads": 4,
      "num_layers": 4,
      "output_dimension": 192,
      "positional_embedding": "rope"
    },
    {
      "module_type": "PatchedPretransform",
      "patch_size": 4
    }
  ],
  "decoder_kwargs": [
    {
      "module_type": "PatchedPretransform",
      "patch_size": 4
    },
    {
      "causal": true,
      "context_duration": 10.0,
      "conv_layout": true,
      "d_model": 256,
      "dim_feedforward": 1024,
      "gating": "none",
      "input_dimension": 192,
      "layer_scale": 0.01,
      "max_period": 10000,
      "module_type": "Transformer",
      "norm": "layer_norm",
      "num_heads": 4,
      "num_layers": 4,
      "output_dimension": 768,
      "positional_embedding": "rope"
    },
    {
      "module_type": "PatchedPretransform",
      "patch_size": 2
    },
    {
      "causal": true,
      "context_duration": 8.0,
      "conv_layout": true,
      "d_model": 256,
      "dim_feedforward": 1024,
      "gating": "none",
      "input_dimension": 384,
      "layer_scale": 0.01,
      "max_period": 10000,
      "module_type": "Transformer",
      "norm": "layer_norm",
      "num_heads": 4,
      "num_layers": 2,
      "output_dimension": 768,
      "positional_embedding": "rope"
    },
    {
      "module_type": "PatchedPretransform",
      "patch_size": 2
    },
    {
      "causal": true,
      "context_duration": 6.0,
      "conv_layout": true,
      "d_model": 256,
      "dim_feedforward": 1024,
      "gating": "none",
      "input_dimension": 384,
      "layer_scale": 0.01,
      "max_period": 10000,
      "module_type": "Transformer",
      "norm": "layer_norm",
      "num_heads": 4,
      "num_layers": 2,
      "output_dimension": 768,
      "positional_embedding": "rope"
    },
    {
      "module_type": "PatchedPretransform",
      "patch_size": 2
    },
    {
      "causal": true,
      "context_duration": 4.0,
      "conv_layout": true,
      "d_model": 256,
      "dim_feedforward": 1024,
      "gating": "none",
      "input_dimension": 384,
      "layer_scale": 0.01,
      "max_period": 10000,
      "module_type": "Transformer",
      "norm": "layer_norm",
      "num_heads": 4,
      "num_layers": 4,
      "output_dimension": 240,
      "positional_embedding": "rope"
    },
    {
      "module_type": "PatchedPretransform",
      "patch_size": 240
    }
  ],
  "quantizer_type": "rlfq",
  "quantizer_kwargs": {
    "codebook_dim": 8,
    "codebook_loss_weight": 1.0,
    "codebook_size": 1024,
    "commitment_loss_weight": 0.25,
    "input_dim": 768,
    "num_quantizers": 16,
    "output_dim": 768,
    "quantizer_dropout": 1.0,
    "quantizer_type": "rlfq",
    "rvq_dim": 512
  },
  "transformers_version": "4.56.0.dev0",
  "reversed_decoder_kwargs": [
    {
      "module_type": "PatchedPretransform",
      "patch_size": 240
    },
    {
      "causal": true,
      "context_duration": 4.0,
      "conv_layout": true,
      "d_model": 256,
      "dim_feedforward": 1024,
      "gating": "none",
      "input_dimension": 240,
      "layer_scale": 0.01,
      "max_period": 10000,
      "module_type": "Transformer",
      "norm": "layer_norm",
      "num_heads": 4,
      "num_layers": 4,
      "output_dimension": 384,
      "positional_embedding": "rope"
    },
    {
      "module_type": "PatchedPretransform",
      "patch_size": 2
    },
    {
      "causal": true,
      "context_duration": 6.0,
      "conv_layout": true,
      "d_model": 256,
      "dim_feedforward": 1024,
      "gating": "none",
      "input_dimension": 768,
      "layer_scale": 0.01,
      "max_period": 10000,
      "module_type": "Transformer",
      "norm": "layer_norm",
      "num_heads": 4,
      "num_layers": 2,
      "output_dimension": 384,
      "positional_embedding": "rope"
    },
    {
      "module_type": "PatchedPretransform",
      "patch_size": 2
    },
    {
      "causal": true,
      "context_duration": 8.0,
      "conv_layout": true,
      "d_model": 256,
      "dim_feedforward": 1024,
      "gating": "none",
      "input_dimension": 768,
      "layer_scale": 0.01,
      "max_period": 10000,
      "module_type": "Transformer",
      "norm": "layer_norm",
      "num_heads": 4,
      "num_layers": 2,
      "output_dimension": 384,
      "positional_embedding": "rope"
    },
    {
      "module_type": "PatchedPretransform",
      "patch_size": 2
    },
    {
      "causal": true,
      "context_duration": 10.0,
      "conv_layout": true,
      "d_model": 256,
      "dim_feedforward": 1024,
      "gating": "none",
      "input_dimension": 768,
      "layer_scale": 0.01,
      "max_period": 10000,
      "module_type": "Transformer",
      "norm": "layer_norm",
      "num_heads": 4,
      "num_layers": 4,
      "output_dimension": 192,
      "positional_embedding": "rope"
    },
    {
      "module_type": "PatchedPretransform",
      "patch_size": 4
    }
  ]
}