MOSS-Audio-Tokenizer / config.json
gaoyang07's picture
update config (#5)
6381a50
{
"architectures": [
"MossAudioTokenizerModel"
],
"auto_map": {
"AutoConfig": "configuration_moss_audio_tokenizer.MossAudioTokenizerConfig",
"AutoModel": "modeling_moss_audio_tokenizer.MossAudioTokenizerModel"
},
"causal_transformer_context_duration": 10,
"code_dim": 768,
"decoder_kwargs": [
{
"causal": true,
"conv_layout": true,
"d_model": 1280,
"dim_feedforward": 5120,
"gating": "none",
"input_dimension": 768,
"layer_scale": 0.01,
"max_period": 10000,
"module_type": "Transformer",
"norm": "layer_norm",
"num_heads": 20,
"num_layers": 32,
"output_dimension": 1280,
"positional_embedding": "rope"
},
{
"module_type": "PatchedPretransform",
"patch_size": 2
},
{
"causal": true,
"conv_layout": true,
"d_model": 768,
"dim_feedforward": 3072,
"gating": "none",
"input_dimension": 640,
"layer_scale": 0.01,
"max_period": 10000,
"module_type": "Transformer",
"norm": "layer_norm",
"num_heads": 12,
"num_layers": 12,
"output_dimension": 768,
"positional_embedding": "rope"
},
{
"module_type": "PatchedPretransform",
"patch_size": 2
},
{
"causal": true,
"conv_layout": true,
"d_model": 768,
"dim_feedforward": 3072,
"gating": "none",
"input_dimension": 384,
"layer_scale": 0.01,
"max_period": 10000,
"module_type": "Transformer",
"norm": "layer_norm",
"num_heads": 12,
"num_layers": 12,
"output_dimension": 768,
"positional_embedding": "rope"
},
{
"module_type": "PatchedPretransform",
"patch_size": 2
},
{
"causal": true,
"conv_layout": true,
"d_model": 768,
"dim_feedforward": 3072,
"gating": "none",
"input_dimension": 384,
"layer_scale": 0.01,
"max_period": 10000,
"module_type": "Transformer",
"norm": "layer_norm",
"num_heads": 12,
"num_layers": 12,
"output_dimension": 240,
"positional_embedding": "rope"
},
{
"module_type": "PatchedPretransform",
"patch_size": 240
}
],
"downsample_rate": 1920,
"dtype": "float32",
"encoder_kwargs": [
{
"module_type": "PatchedPretransform",
"patch_size": 240
},
{
"causal": true,
"conv_layout": true,
"d_model": 768,
"dim_feedforward": 3072,
"gating": "none",
"input_dimension": 240,
"layer_scale": 0.01,
"max_period": 10000,
"module_type": "Transformer",
"norm": "layer_norm",
"num_heads": 12,
"num_layers": 12,
"output_dimension": 384,
"positional_embedding": "rope"
},
{
"module_type": "PatchedPretransform",
"patch_size": 2
},
{
"causal": true,
"conv_layout": true,
"d_model": 768,
"dim_feedforward": 3072,
"gating": "none",
"input_dimension": 768,
"layer_scale": 0.01,
"max_period": 10000,
"module_type": "Transformer",
"norm": "layer_norm",
"num_heads": 12,
"num_layers": 12,
"output_dimension": 384,
"positional_embedding": "rope"
},
{
"module_type": "PatchedPretransform",
"patch_size": 2
},
{
"causal": true,
"conv_layout": true,
"d_model": 768,
"dim_feedforward": 3072,
"gating": "none",
"input_dimension": 768,
"layer_scale": 0.01,
"max_period": 10000,
"module_type": "Transformer",
"norm": "layer_norm",
"num_heads": 12,
"num_layers": 12,
"output_dimension": 640,
"positional_embedding": "rope"
},
{
"module_type": "PatchedPretransform",
"patch_size": 2
},
{
"causal": true,
"conv_layout": true,
"d_model": 1280,
"dim_feedforward": 5120,
"gating": "none",
"input_dimension": 1280,
"layer_scale": 0.01,
"max_period": 10000,
"module_type": "Transformer",
"norm": "layer_norm",
"num_heads": 20,
"num_layers": 32,
"output_dimension": 768,
"positional_embedding": "rope"
}
],
"model_type": "moss-audio-tokenizer",
"quantizer_kwargs": {
"codebook_dim": 8,
"codebook_size": 1024,
"input_dim": 768,
"num_quantizers": 32,
"output_dim": 768,
"quantizer_type": "rlfq",
"rvq_dim": 512
},
"quantizer_type": "rlfq",
"reversed_decoder_kwargs": [
{
"module_type": "PatchedPretransform",
"patch_size": 240
},
{
"causal": true,
"conv_layout": true,
"d_model": 768,
"dim_feedforward": 3072,
"gating": "none",
"input_dimension": 240,
"layer_scale": 0.01,
"max_period": 10000,
"module_type": "Transformer",
"norm": "layer_norm",
"num_heads": 12,
"num_layers": 12,
"output_dimension": 384,
"positional_embedding": "rope"
},
{
"module_type": "PatchedPretransform",
"patch_size": 2
},
{
"causal": true,
"conv_layout": true,
"d_model": 768,
"dim_feedforward": 3072,
"gating": "none",
"input_dimension": 768,
"layer_scale": 0.01,
"max_period": 10000,
"module_type": "Transformer",
"norm": "layer_norm",
"num_heads": 12,
"num_layers": 12,
"output_dimension": 384,
"positional_embedding": "rope"
},
{
"module_type": "PatchedPretransform",
"patch_size": 2
},
{
"causal": true,
"conv_layout": true,
"d_model": 768,
"dim_feedforward": 3072,
"gating": "none",
"input_dimension": 768,
"layer_scale": 0.01,
"max_period": 10000,
"module_type": "Transformer",
"norm": "layer_norm",
"num_heads": 12,
"num_layers": 12,
"output_dimension": 640,
"positional_embedding": "rope"
},
{
"module_type": "PatchedPretransform",
"patch_size": 2
},
{
"causal": true,
"conv_layout": true,
"d_model": 1280,
"dim_feedforward": 5120,
"gating": "none",
"input_dimension": 1280,
"layer_scale": 0.01,
"max_period": 10000,
"module_type": "Transformer",
"norm": "layer_norm",
"num_heads": 20,
"num_layers": 32,
"output_dimension": 768,
"positional_embedding": "rope"
}
],
"sample_rate": 24000,
"sampling_rate": 24000,
"transformers_version": "4.56.0.dev0",
"version": "4.26.1.a"
}