Spaces:
Running
Running
| { | |
| "architectures": [ | |
| "MossAudioTokenizerModel" | |
| ], | |
| "auto_map": { | |
| "AutoConfig": "configuration_moss_audio_tokenizer.MossAudioTokenizerConfig", | |
| "AutoModel": "modeling_moss_audio_tokenizer.MossAudioTokenizerModel" | |
| }, | |
| "model_type": "moss-audio-tokenizer", | |
| "sample_rate": 48000, | |
| "sampling_rate": 48000, | |
| "downsample_rate": 3840, | |
| "causal_transformer_context_duration": 10.0, | |
| "number_channels": 2, | |
| "enable_channel_interleave": true, | |
| "attention_implementation": "sdpa", | |
| "compute_dtype": "fp32", | |
| "dtype": "float32", | |
| "code_dim": 768, | |
| "encoder_kwargs": [ | |
| { | |
| "module_type": "PatchedPretransform", | |
| "patch_size": 240 | |
| }, | |
| { | |
| "causal": true, | |
| "context_duration": 4.0, | |
| "conv_layout": true, | |
| "d_model": 256, | |
| "dim_feedforward": 1024, | |
| "gating": "none", | |
| "input_dimension": 240, | |
| "layer_scale": 0.01, | |
| "max_period": 10000, | |
| "module_type": "Transformer", | |
| "norm": "layer_norm", | |
| "num_heads": 4, | |
| "num_layers": 4, | |
| "output_dimension": 384, | |
| "positional_embedding": "rope" | |
| }, | |
| { | |
| "module_type": "PatchedPretransform", | |
| "patch_size": 2 | |
| }, | |
| { | |
| "causal": true, | |
| "context_duration": 6.0, | |
| "conv_layout": true, | |
| "d_model": 256, | |
| "dim_feedforward": 1024, | |
| "gating": "none", | |
| "input_dimension": 768, | |
| "layer_scale": 0.01, | |
| "max_period": 10000, | |
| "module_type": "Transformer", | |
| "norm": "layer_norm", | |
| "num_heads": 4, | |
| "num_layers": 2, | |
| "output_dimension": 384, | |
| "positional_embedding": "rope" | |
| }, | |
| { | |
| "module_type": "PatchedPretransform", | |
| "patch_size": 2 | |
| }, | |
| { | |
| "causal": true, | |
| "context_duration": 8.0, | |
| "conv_layout": true, | |
| "d_model": 256, | |
| "dim_feedforward": 1024, | |
| "gating": "none", | |
| "input_dimension": 768, | |
| "layer_scale": 0.01, | |
| "max_period": 10000, | |
| "module_type": "Transformer", | |
| "norm": "layer_norm", | |
| "num_heads": 4, | |
| "num_layers": 2, | |
| "output_dimension": 384, | |
| "positional_embedding": "rope" | |
| }, | |
| { | |
| "module_type": "PatchedPretransform", | |
| "patch_size": 2 | |
| }, | |
| { | |
| "causal": true, | |
| "context_duration": 10.0, | |
| "conv_layout": true, | |
| "d_model": 256, | |
| "dim_feedforward": 1024, | |
| "gating": "none", | |
| "input_dimension": 768, | |
| "layer_scale": 0.01, | |
| "max_period": 10000, | |
| "module_type": "Transformer", | |
| "norm": "layer_norm", | |
| "num_heads": 4, | |
| "num_layers": 4, | |
| "output_dimension": 192, | |
| "positional_embedding": "rope" | |
| }, | |
| { | |
| "module_type": "PatchedPretransform", | |
| "patch_size": 4 | |
| } | |
| ], | |
| "decoder_kwargs": [ | |
| { | |
| "module_type": "PatchedPretransform", | |
| "patch_size": 4 | |
| }, | |
| { | |
| "causal": true, | |
| "context_duration": 10.0, | |
| "conv_layout": true, | |
| "d_model": 256, | |
| "dim_feedforward": 1024, | |
| "gating": "none", | |
| "input_dimension": 192, | |
| "layer_scale": 0.01, | |
| "max_period": 10000, | |
| "module_type": "Transformer", | |
| "norm": "layer_norm", | |
| "num_heads": 4, | |
| "num_layers": 4, | |
| "output_dimension": 768, | |
| "positional_embedding": "rope" | |
| }, | |
| { | |
| "module_type": "PatchedPretransform", | |
| "patch_size": 2 | |
| }, | |
| { | |
| "causal": true, | |
| "context_duration": 8.0, | |
| "conv_layout": true, | |
| "d_model": 256, | |
| "dim_feedforward": 1024, | |
| "gating": "none", | |
| "input_dimension": 384, | |
| "layer_scale": 0.01, | |
| "max_period": 10000, | |
| "module_type": "Transformer", | |
| "norm": "layer_norm", | |
| "num_heads": 4, | |
| "num_layers": 2, | |
| "output_dimension": 768, | |
| "positional_embedding": "rope" | |
| }, | |
| { | |
| "module_type": "PatchedPretransform", | |
| "patch_size": 2 | |
| }, | |
| { | |
| "causal": true, | |
| "context_duration": 6.0, | |
| "conv_layout": true, | |
| "d_model": 256, | |
| "dim_feedforward": 1024, | |
| "gating": "none", | |
| "input_dimension": 384, | |
| "layer_scale": 0.01, | |
| "max_period": 10000, | |
| "module_type": "Transformer", | |
| "norm": "layer_norm", | |
| "num_heads": 4, | |
| "num_layers": 2, | |
| "output_dimension": 768, | |
| "positional_embedding": "rope" | |
| }, | |
| { | |
| "module_type": "PatchedPretransform", | |
| "patch_size": 2 | |
| }, | |
| { | |
| "causal": true, | |
| "context_duration": 4.0, | |
| "conv_layout": true, | |
| "d_model": 256, | |
| "dim_feedforward": 1024, | |
| "gating": "none", | |
| "input_dimension": 384, | |
| "layer_scale": 0.01, | |
| "max_period": 10000, | |
| "module_type": "Transformer", | |
| "norm": "layer_norm", | |
| "num_heads": 4, | |
| "num_layers": 4, | |
| "output_dimension": 240, | |
| "positional_embedding": "rope" | |
| }, | |
| { | |
| "module_type": "PatchedPretransform", | |
| "patch_size": 240 | |
| } | |
| ], | |
| "quantizer_type": "rlfq", | |
| "quantizer_kwargs": { | |
| "codebook_dim": 8, | |
| "codebook_loss_weight": 1.0, | |
| "codebook_size": 1024, | |
| "commitment_loss_weight": 0.25, | |
| "input_dim": 768, | |
| "num_quantizers": 16, | |
| "output_dim": 768, | |
| "quantizer_dropout": 1.0, | |
| "quantizer_type": "rlfq", | |
| "rvq_dim": 512 | |
| }, | |
| "transformers_version": "4.56.0.dev0", | |
| "reversed_decoder_kwargs": [ | |
| { | |
| "module_type": "PatchedPretransform", | |
| "patch_size": 240 | |
| }, | |
| { | |
| "causal": true, | |
| "context_duration": 4.0, | |
| "conv_layout": true, | |
| "d_model": 256, | |
| "dim_feedforward": 1024, | |
| "gating": "none", | |
| "input_dimension": 240, | |
| "layer_scale": 0.01, | |
| "max_period": 10000, | |
| "module_type": "Transformer", | |
| "norm": "layer_norm", | |
| "num_heads": 4, | |
| "num_layers": 4, | |
| "output_dimension": 384, | |
| "positional_embedding": "rope" | |
| }, | |
| { | |
| "module_type": "PatchedPretransform", | |
| "patch_size": 2 | |
| }, | |
| { | |
| "causal": true, | |
| "context_duration": 6.0, | |
| "conv_layout": true, | |
| "d_model": 256, | |
| "dim_feedforward": 1024, | |
| "gating": "none", | |
| "input_dimension": 768, | |
| "layer_scale": 0.01, | |
| "max_period": 10000, | |
| "module_type": "Transformer", | |
| "norm": "layer_norm", | |
| "num_heads": 4, | |
| "num_layers": 2, | |
| "output_dimension": 384, | |
| "positional_embedding": "rope" | |
| }, | |
| { | |
| "module_type": "PatchedPretransform", | |
| "patch_size": 2 | |
| }, | |
| { | |
| "causal": true, | |
| "context_duration": 8.0, | |
| "conv_layout": true, | |
| "d_model": 256, | |
| "dim_feedforward": 1024, | |
| "gating": "none", | |
| "input_dimension": 768, | |
| "layer_scale": 0.01, | |
| "max_period": 10000, | |
| "module_type": "Transformer", | |
| "norm": "layer_norm", | |
| "num_heads": 4, | |
| "num_layers": 2, | |
| "output_dimension": 384, | |
| "positional_embedding": "rope" | |
| }, | |
| { | |
| "module_type": "PatchedPretransform", | |
| "patch_size": 2 | |
| }, | |
| { | |
| "causal": true, | |
| "context_duration": 10.0, | |
| "conv_layout": true, | |
| "d_model": 256, | |
| "dim_feedforward": 1024, | |
| "gating": "none", | |
| "input_dimension": 768, | |
| "layer_scale": 0.01, | |
| "max_period": 10000, | |
| "module_type": "Transformer", | |
| "norm": "layer_norm", | |
| "num_heads": 4, | |
| "num_layers": 4, | |
| "output_dimension": 192, | |
| "positional_embedding": "rope" | |
| }, | |
| { | |
| "module_type": "PatchedPretransform", | |
| "patch_size": 4 | |
| } | |
| ] | |
| } | |