{
"architectures": [
"MossAudioTokenizerModel"
],
"auto_map": {
"AutoConfig": "configuration_moss_audio_tokenizer.MossAudioTokenizerConfig",
"AutoModel": "modeling_moss_audio_tokenizer.MossAudioTokenizerModel"
},
"model_type": "moss-audio-tokenizer",
"sample_rate": 48000,
"sampling_rate": 48000,
"downsample_rate": 3840,
"causal_transformer_context_duration": 10.0,
"number_channels": 2,
"enable_channel_interleave": true,
"attention_implementation": "sdpa",
"compute_dtype": "fp32",
"dtype": "float32",
"code_dim": 768,
"encoder_kwargs": [
{
"module_type": "PatchedPretransform",
"patch_size": 240
},
{
"causal": true,
"context_duration": 4.0,
"conv_layout": true,
"d_model": 256,
"dim_feedforward": 1024,
"gating": "none",
"input_dimension": 240,
"layer_scale": 0.01,
"max_period": 10000,
"module_type": "Transformer",
"norm": "layer_norm",
"num_heads": 4,
"num_layers": 4,
"output_dimension": 384,
"positional_embedding": "rope"
},
{
"module_type": "PatchedPretransform",
"patch_size": 2
},
{
"causal": true,
"context_duration": 6.0,
"conv_layout": true,
"d_model": 256,
"dim_feedforward": 1024,
"gating": "none",
"input_dimension": 768,
"layer_scale": 0.01,
"max_period": 10000,
"module_type": "Transformer",
"norm": "layer_norm",
"num_heads": 4,
"num_layers": 2,
"output_dimension": 384,
"positional_embedding": "rope"
},
{
"module_type": "PatchedPretransform",
"patch_size": 2
},
{
"causal": true,
"context_duration": 8.0,
"conv_layout": true,
"d_model": 256,
"dim_feedforward": 1024,
"gating": "none",
"input_dimension": 768,
"layer_scale": 0.01,
"max_period": 10000,
"module_type": "Transformer",
"norm": "layer_norm",
"num_heads": 4,
"num_layers": 2,
"output_dimension": 384,
"positional_embedding": "rope"
},
{
"module_type": "PatchedPretransform",
"patch_size": 2
},
{
"causal": true,
"context_duration": 10.0,
"conv_layout": true,
"d_model": 256,
"dim_feedforward": 1024,
"gating": "none",
"input_dimension": 768,
"layer_scale": 0.01,
"max_period": 10000,
"module_type": "Transformer",
"norm": "layer_norm",
"num_heads": 4,
"num_layers": 4,
"output_dimension": 192,
"positional_embedding": "rope"
},
{
"module_type": "PatchedPretransform",
"patch_size": 4
}
],
"decoder_kwargs": [
{
"module_type": "PatchedPretransform",
"patch_size": 4
},
{
"causal": true,
"context_duration": 10.0,
"conv_layout": true,
"d_model": 256,
"dim_feedforward": 1024,
"gating": "none",
"input_dimension": 192,
"layer_scale": 0.01,
"max_period": 10000,
"module_type": "Transformer",
"norm": "layer_norm",
"num_heads": 4,
"num_layers": 4,
"output_dimension": 768,
"positional_embedding": "rope"
},
{
"module_type": "PatchedPretransform",
"patch_size": 2
},
{
"causal": true,
"context_duration": 8.0,
"conv_layout": true,
"d_model": 256,
"dim_feedforward": 1024,
"gating": "none",
"input_dimension": 384,
"layer_scale": 0.01,
"max_period": 10000,
"module_type": "Transformer",
"norm": "layer_norm",
"num_heads": 4,
"num_layers": 2,
"output_dimension": 768,
"positional_embedding": "rope"
},
{
"module_type": "PatchedPretransform",
"patch_size": 2
},
{
"causal": true,
"context_duration": 6.0,
"conv_layout": true,
"d_model": 256,
"dim_feedforward": 1024,
"gating": "none",
"input_dimension": 384,
"layer_scale": 0.01,
"max_period": 10000,
"module_type": "Transformer",
"norm": "layer_norm",
"num_heads": 4,
"num_layers": 2,
"output_dimension": 768,
"positional_embedding": "rope"
},
{
"module_type": "PatchedPretransform",
"patch_size": 2
},
{
"causal": true,
"context_duration": 4.0,
"conv_layout": true,
"d_model": 256,
"dim_feedforward": 1024,
"gating": "none",
"input_dimension": 384,
"layer_scale": 0.01,
"max_period": 10000,
"module_type": "Transformer",
"norm": "layer_norm",
"num_heads": 4,
"num_layers": 4,
"output_dimension": 240,
"positional_embedding": "rope"
},
{
"module_type": "PatchedPretransform",
"patch_size": 240
}
],
"quantizer_type": "rlfq",
"quantizer_kwargs": {
"codebook_dim": 8,
"codebook_loss_weight": 1.0,
"codebook_size": 1024,
"commitment_loss_weight": 0.25,
"input_dim": 768,
"num_quantizers": 16,
"output_dim": 768,
"quantizer_dropout": 1.0,
"quantizer_type": "rlfq",
"rvq_dim": 512
},
"transformers_version": "4.56.0.dev0",
"reversed_decoder_kwargs": [
{
"module_type": "PatchedPretransform",
"patch_size": 240
},
{
"causal": true,
"context_duration": 4.0,
"conv_layout": true,
"d_model": 256,
"dim_feedforward": 1024,
"gating": "none",
"input_dimension": 240,
"layer_scale": 0.01,
"max_period": 10000,
"module_type": "Transformer",
"norm": "layer_norm",
"num_heads": 4,
"num_layers": 4,
"output_dimension": 384,
"positional_embedding": "rope"
},
{
"module_type": "PatchedPretransform",
"patch_size": 2
},
{
"causal": true,
"context_duration": 6.0,
"conv_layout": true,
"d_model": 256,
"dim_feedforward": 1024,
"gating": "none",
"input_dimension": 768,
"layer_scale": 0.01,
"max_period": 10000,
"module_type": "Transformer",
"norm": "layer_norm",
"num_heads": 4,
"num_layers": 2,
"output_dimension": 384,
"positional_embedding": "rope"
},
{
"module_type": "PatchedPretransform",
"patch_size": 2
},
{
"causal": true,
"context_duration": 8.0,
"conv_layout": true,
"d_model": 256,
"dim_feedforward": 1024,
"gating": "none",
"input_dimension": 768,
"layer_scale": 0.01,
"max_period": 10000,
"module_type": "Transformer",
"norm": "layer_norm",
"num_heads": 4,
"num_layers": 2,
"output_dimension": 384,
"positional_embedding": "rope"
},
{
"module_type": "PatchedPretransform",
"patch_size": 2
},
{
"causal": true,
"context_duration": 10.0,
"conv_layout": true,
"d_model": 256,
"dim_feedforward": 1024,
"gating": "none",
"input_dimension": 768,
"layer_scale": 0.01,
"max_period": 10000,
"module_type": "Transformer",
"norm": "layer_norm",
"num_heads": 4,
"num_layers": 4,
"output_dimension": 192,
"positional_embedding": "rope"
},
{
"module_type": "PatchedPretransform",
"patch_size": 4
}
]
}