Image Feature Extraction
Transformers
Safetensors
moss-audio-tokenizer
audio
audio-tokenizer
neural-codec
moss-tts-family
MOSS Audio Tokenizer
speech-tokenizer
trust-remote-code
custom_code
Instructions to use OpenMOSS-Team/MOSS-Audio-Tokenizer-v2 with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Transformers
How to use OpenMOSS-Team/MOSS-Audio-Tokenizer-v2 with Transformers:
```python
# Use a pipeline as a high-level helper
from transformers import pipeline

pipe = pipeline("image-feature-extraction", model="OpenMOSS-Team/MOSS-Audio-Tokenizer-v2", trust_remote_code=True)

# Load model directly
from transformers import AutoModel

model = AutoModel.from_pretrained("OpenMOSS-Team/MOSS-Audio-Tokenizer-v2", trust_remote_code=True, dtype="auto")
```
- Notebooks
- Google Colab
- Kaggle
| { | |
| "architectures": [ | |
| "MossAudioTokenizerModel" | |
| ], | |
| "auto_map": { | |
| "AutoConfig": "configuration_moss_audio_tokenizer.MossAudioTokenizerConfig", | |
| "AutoModel": "modeling_moss_audio_tokenizer.MossAudioTokenizerModel" | |
| }, | |
| "model_type": "moss-audio-tokenizer", | |
| "sample_rate": 48000, | |
| "sampling_rate": 48000, | |
| "downsample_rate": 3840, | |
| "causal_transformer_context_duration": 10.0, | |
| "number_channels": 2, | |
| "enable_channel_interleave": true, | |
| "attention_implementation": "flash_attention_2", | |
| "compute_dtype": "bf16", | |
| "dtype": "float32", | |
| "code_dim": 768, | |
| "encoder_kwargs": [ | |
| { | |
| "module_type": "PatchedPretransform", | |
| "patch_size": 240 | |
| }, | |
| { | |
| "module_type": "Transformer", | |
| "input_dimension": 240, | |
| "output_dimension": 384, | |
| "d_model": 768, | |
| "num_heads": 12, | |
| "num_layers": 12, | |
| "dim_feedforward": 3072, | |
| "causal": true, | |
| "norm": "layer_norm", | |
| "positional_embedding": "rope", | |
| "max_period": 10000, | |
| "gating": "none", | |
| "layer_scale": 0.01, | |
| "conv_layout": true, | |
| "context_duration": 1.0 | |
| }, | |
| { | |
| "module_type": "PatchedPretransform", | |
| "patch_size": 2 | |
| }, | |
| { | |
| "module_type": "Transformer", | |
| "input_dimension": 768, | |
| "output_dimension": 384, | |
| "d_model": 768, | |
| "num_heads": 12, | |
| "num_layers": 12, | |
| "dim_feedforward": 3072, | |
| "causal": true, | |
| "norm": "layer_norm", | |
| "positional_embedding": "rope", | |
| "max_period": 10000, | |
| "gating": "none", | |
| "layer_scale": 0.01, | |
| "conv_layout": true, | |
| "context_duration": 2.0 | |
| }, | |
| { | |
| "module_type": "PatchedPretransform", | |
| "patch_size": 2 | |
| }, | |
| { | |
| "module_type": "Transformer", | |
| "input_dimension": 768, | |
| "output_dimension": 384, | |
| "d_model": 768, | |
| "num_heads": 12, | |
| "num_layers": 12, | |
| "dim_feedforward": 3072, | |
| "causal": true, | |
| "norm": "layer_norm", | |
| "positional_embedding": "rope", | |
| "max_period": 10000, | |
| "gating": "none", | |
| "layer_scale": 0.01, | |
| "conv_layout": true, | |
| "context_duration": 4.0 | |
| }, | |
| { | |
| "module_type": "PatchedPretransform", | |
| "patch_size": 2 | |
| }, | |
| { | |
| "module_type": "Transformer", | |
| "input_dimension": 768, | |
| "output_dimension": 384, | |
| "d_model": 768, | |
| "num_heads": 12, | |
| "num_layers": 12, | |
| "dim_feedforward": 3072, | |
| "causal": true, | |
| "norm": "layer_norm", | |
| "positional_embedding": "rope", | |
| "max_period": 10000, | |
| "gating": "none", | |
| "layer_scale": 0.01, | |
| "conv_layout": true, | |
| "context_duration": 8.0 | |
| }, | |
| { | |
| "module_type": "PatchedPretransform", | |
| "patch_size": 2 | |
| }, | |
| { | |
| "module_type": "Transformer", | |
| "input_dimension": 768, | |
| "output_dimension": 640, | |
| "d_model": 768, | |
| "num_heads": 12, | |
| "num_layers": 12, | |
| "dim_feedforward": 3072, | |
| "causal": true, | |
| "norm": "layer_norm", | |
| "positional_embedding": "rope", | |
| "max_period": 10000, | |
| "gating": "none", | |
| "layer_scale": 0.01, | |
| "conv_layout": true, | |
| "context_duration": 10.0 | |
| }, | |
| { | |
| "module_type": "PatchedPretransform", | |
| "patch_size": 2 | |
| }, | |
| { | |
| "module_type": "Transformer", | |
| "input_dimension": 1280, | |
| "output_dimension": 768, | |
| "d_model": 1280, | |
| "num_heads": 20, | |
| "num_layers": 32, | |
| "dim_feedforward": 5120, | |
| "causal": true, | |
| "norm": "layer_norm", | |
| "positional_embedding": "rope", | |
| "max_period": 10000, | |
| "gating": "none", | |
| "layer_scale": 0.01, | |
| "conv_layout": true, | |
| "context_duration": 10.0 | |
| } | |
| ], | |
| "decoder_kwargs": [ | |
| { | |
| "module_type": "Transformer", | |
| "input_dimension": 768, | |
| "output_dimension": 1280, | |
| "d_model": 1280, | |
| "num_heads": 20, | |
| "num_layers": 32, | |
| "dim_feedforward": 5120, | |
| "causal": true, | |
| "norm": "layer_norm", | |
| "positional_embedding": "rope", | |
| "max_period": 10000, | |
| "gating": "none", | |
| "layer_scale": 0.01, | |
| "conv_layout": true, | |
| "context_duration": 10.0 | |
| }, | |
| { | |
| "module_type": "PatchedPretransform", | |
| "patch_size": 2 | |
| }, | |
| { | |
| "module_type": "Transformer", | |
| "input_dimension": 640, | |
| "output_dimension": 768, | |
| "d_model": 768, | |
| "num_heads": 12, | |
| "num_layers": 12, | |
| "dim_feedforward": 3072, | |
| "causal": true, | |
| "norm": "layer_norm", | |
| "positional_embedding": "rope", | |
| "max_period": 10000, | |
| "gating": "none", | |
| "layer_scale": 0.01, | |
| "conv_layout": true, | |
| "context_duration": 10.0 | |
| }, | |
| { | |
| "module_type": "PatchedPretransform", | |
| "patch_size": 2 | |
| }, | |
| { | |
| "module_type": "Transformer", | |
| "input_dimension": 384, | |
| "output_dimension": 768, | |
| "d_model": 768, | |
| "num_heads": 12, | |
| "num_layers": 12, | |
| "dim_feedforward": 3072, | |
| "causal": true, | |
| "norm": "layer_norm", | |
| "positional_embedding": "rope", | |
| "max_period": 10000, | |
| "gating": "none", | |
| "layer_scale": 0.01, | |
| "conv_layout": true, | |
| "context_duration": 8.0 | |
| }, | |
| { | |
| "module_type": "PatchedPretransform", | |
| "patch_size": 2 | |
| }, | |
| { | |
| "module_type": "Transformer", | |
| "input_dimension": 384, | |
| "output_dimension": 768, | |
| "d_model": 768, | |
| "num_heads": 12, | |
| "num_layers": 12, | |
| "dim_feedforward": 3072, | |
| "causal": true, | |
| "norm": "layer_norm", | |
| "positional_embedding": "rope", | |
| "max_period": 10000, | |
| "gating": "none", | |
| "layer_scale": 0.01, | |
| "conv_layout": true, | |
| "context_duration": 4.0 | |
| }, | |
| { | |
| "module_type": "PatchedPretransform", | |
| "patch_size": 2 | |
| }, | |
| { | |
| "module_type": "Transformer", | |
| "input_dimension": 384, | |
| "output_dimension": 768, | |
| "d_model": 768, | |
| "num_heads": 12, | |
| "num_layers": 12, | |
| "dim_feedforward": 3072, | |
| "causal": true, | |
| "norm": "layer_norm", | |
| "positional_embedding": "rope", | |
| "max_period": 10000, | |
| "gating": "none", | |
| "layer_scale": 0.01, | |
| "conv_layout": true, | |
| "context_duration": 2.0 | |
| }, | |
| { | |
| "module_type": "PatchedPretransform", | |
| "patch_size": 2 | |
| }, | |
| { | |
| "module_type": "Transformer", | |
| "input_dimension": 384, | |
| "output_dimension": 240, | |
| "d_model": 768, | |
| "num_heads": 12, | |
| "num_layers": 12, | |
| "dim_feedforward": 3072, | |
| "causal": true, | |
| "norm": "layer_norm", | |
| "positional_embedding": "rope", | |
| "max_period": 10000, | |
| "gating": "none", | |
| "layer_scale": 0.01, | |
| "conv_layout": true, | |
| "context_duration": 1.0 | |
| }, | |
| { | |
| "module_type": "PatchedPretransform", | |
| "patch_size": 240 | |
| } | |
| ], | |
| "reversed_decoder_kwargs": [ | |
| { | |
| "module_type": "PatchedPretransform", | |
| "patch_size": 240 | |
| }, | |
| { | |
| "module_type": "Transformer", | |
| "input_dimension": 240, | |
| "output_dimension": 384, | |
| "d_model": 768, | |
| "num_heads": 12, | |
| "num_layers": 12, | |
| "dim_feedforward": 3072, | |
| "causal": true, | |
| "norm": "layer_norm", | |
| "positional_embedding": "rope", | |
| "max_period": 10000, | |
| "gating": "none", | |
| "layer_scale": 0.01, | |
| "conv_layout": true, | |
| "context_duration": 1.0 | |
| }, | |
| { | |
| "module_type": "PatchedPretransform", | |
| "patch_size": 2 | |
| }, | |
| { | |
| "module_type": "Transformer", | |
| "input_dimension": 768, | |
| "output_dimension": 384, | |
| "d_model": 768, | |
| "num_heads": 12, | |
| "num_layers": 12, | |
| "dim_feedforward": 3072, | |
| "causal": true, | |
| "norm": "layer_norm", | |
| "positional_embedding": "rope", | |
| "max_period": 10000, | |
| "gating": "none", | |
| "layer_scale": 0.01, | |
| "conv_layout": true, | |
| "context_duration": 2.0 | |
| }, | |
| { | |
| "module_type": "PatchedPretransform", | |
| "patch_size": 2 | |
| }, | |
| { | |
| "module_type": "Transformer", | |
| "input_dimension": 768, | |
| "output_dimension": 384, | |
| "d_model": 768, | |
| "num_heads": 12, | |
| "num_layers": 12, | |
| "dim_feedforward": 3072, | |
| "causal": true, | |
| "norm": "layer_norm", | |
| "positional_embedding": "rope", | |
| "max_period": 10000, | |
| "gating": "none", | |
| "layer_scale": 0.01, | |
| "conv_layout": true, | |
| "context_duration": 4.0 | |
| }, | |
| { | |
| "module_type": "PatchedPretransform", | |
| "patch_size": 2 | |
| }, | |
| { | |
| "module_type": "Transformer", | |
| "input_dimension": 768, | |
| "output_dimension": 384, | |
| "d_model": 768, | |
| "num_heads": 12, | |
| "num_layers": 12, | |
| "dim_feedforward": 3072, | |
| "causal": true, | |
| "norm": "layer_norm", | |
| "positional_embedding": "rope", | |
| "max_period": 10000, | |
| "gating": "none", | |
| "layer_scale": 0.01, | |
| "conv_layout": true, | |
| "context_duration": 8.0 | |
| }, | |
| { | |
| "module_type": "PatchedPretransform", | |
| "patch_size": 2 | |
| }, | |
| { | |
| "module_type": "Transformer", | |
| "input_dimension": 768, | |
| "output_dimension": 640, | |
| "d_model": 768, | |
| "num_heads": 12, | |
| "num_layers": 12, | |
| "dim_feedforward": 3072, | |
| "causal": true, | |
| "norm": "layer_norm", | |
| "positional_embedding": "rope", | |
| "max_period": 10000, | |
| "gating": "none", | |
| "layer_scale": 0.01, | |
| "conv_layout": true, | |
| "context_duration": 10.0 | |
| }, | |
| { | |
| "module_type": "PatchedPretransform", | |
| "patch_size": 2 | |
| }, | |
| { | |
| "module_type": "Transformer", | |
| "input_dimension": 1280, | |
| "output_dimension": 768, | |
| "d_model": 1280, | |
| "num_heads": 20, | |
| "num_layers": 32, | |
| "dim_feedforward": 5120, | |
| "causal": true, | |
| "norm": "layer_norm", | |
| "positional_embedding": "rope", | |
| "max_period": 10000, | |
| "gating": "none", | |
| "layer_scale": 0.01, | |
| "conv_layout": true, | |
| "context_duration": 10.0 | |
| } | |
| ], | |
| "quantizer_type": "rlfq", | |
| "quantizer_kwargs": { | |
| "input_dim": 768, | |
| "rvq_dim": 512, | |
| "output_dim": 768, | |
| "num_quantizers": 32, | |
| "codebook_size": 1024, | |
| "codebook_dim": 8, | |
| "quantizer_type": "rlfq" | |
| }, | |
| "transformers_version": "4.56.0.dev0" | |
| } |