{
"model_type": "moss-audio-tokenizer",
"version": "4.26.1.a",
"sampling_rate": 24000,
"sample_rate": 24000,
"downsample_rate": 1920,
"causal_transformer_context_duration": 10.0,
"encoder_kwargs": [
{
"module_type": "PatchedPretransform",
"patch_size": 240
},
{
"module_type": "Transformer",
"causal": true,
"conv_layout": true,
"d_model": 768,
"dim_feedforward": 3072,
"gating": "none",
"input_dimension": 240,
"layer_scale": 0.01,
"max_period": 10000,
"norm": "layer_norm",
"num_heads": 12,
"num_layers": 12,
"output_dimension": 384,
"positional_embedding": "rope"
},
{
"module_type": "PatchedPretransform",
"patch_size": 2
},
{
"module_type": "Transformer",
"causal": true,
"conv_layout": true,
"d_model": 768,
"dim_feedforward": 3072,
"gating": "none",
"input_dimension": 768,
"layer_scale": 0.01,
"max_period": 10000,
"norm": "layer_norm",
"num_heads": 12,
"num_layers": 12,
"output_dimension": 384,
"positional_embedding": "rope"
},
{
"module_type": "PatchedPretransform",
"patch_size": 2
},
{
"module_type": "Transformer",
"causal": true,
"conv_layout": true,
"d_model": 768,
"dim_feedforward": 3072,
"gating": "none",
"input_dimension": 768,
"layer_scale": 0.01,
"max_period": 10000,
"norm": "layer_norm",
"num_heads": 12,
"num_layers": 12,
"output_dimension": 640,
"positional_embedding": "rope"
},
{
"module_type": "PatchedPretransform",
"patch_size": 2
},
{
"module_type": "Transformer",
"causal": true,
"conv_layout": true,
"d_model": 1280,
"dim_feedforward": 5120,
"gating": "none",
"input_dimension": 1280,
"layer_scale": 0.01,
"max_period": 10000,
"norm": "layer_norm",
"num_heads": 20,
"num_layers": 32,
"output_dimension": 768,
"positional_embedding": "rope"
}
],
"decoder_kwargs": [
{
"module_type": "Transformer",
"causal": true,
"conv_layout": true,
"d_model": 1280,
"dim_feedforward": 5120,
"gating": "none",
"input_dimension": 768,
"layer_scale": 0.01,
"max_period": 10000,
"norm": "layer_norm",
"num_heads": 20,
"num_layers": 32,
"output_dimension": 1280,
"positional_embedding": "rope"
},
{
"module_type": "PatchedPretransform",
"patch_size": 2
},
{
"module_type": "Transformer",
"causal": true,
"conv_layout": true,
"d_model": 768,
"dim_feedforward": 3072,
"gating": "none",
"input_dimension": 640,
"layer_scale": 0.01,
"max_period": 10000,
"norm": "layer_norm",
"num_heads": 12,
"num_layers": 12,
"output_dimension": 768,
"positional_embedding": "rope"
},
{
"module_type": "PatchedPretransform",
"patch_size": 2
},
{
"module_type": "Transformer",
"causal": true,
"conv_layout": true,
"d_model": 768,
"dim_feedforward": 3072,
"gating": "none",
"input_dimension": 384,
"layer_scale": 0.01,
"max_period": 10000,
"norm": "layer_norm",
"num_heads": 12,
"num_layers": 12,
"output_dimension": 768,
"positional_embedding": "rope"
},
{
"module_type": "PatchedPretransform",
"patch_size": 2
},
{
"module_type": "Transformer",
"causal": true,
"conv_layout": true,
"d_model": 768,
"dim_feedforward": 3072,
"gating": "none",
"input_dimension": 384,
"layer_scale": 0.01,
"max_period": 10000,
"norm": "layer_norm",
"num_heads": 12,
"num_layers": 12,
"output_dimension": 240,
"positional_embedding": "rope"
},
{
"module_type": "PatchedPretransform",
"patch_size": 240
}
],
"quantizer_type": "rlfq",
"quantizer_kwargs": {
"input_dim": 768,
"rvq_dim": 512,
"output_dim": 768,
"num_quantizers": 32,
"codebook_size": 1024,
"codebook_dim": 8,
"quantizer_type": "rlfq"
},
"architectures": [
"MossAudioTokenizerModel"
],
"auto_map": {
"AutoConfig": "configuration_moss_audio_tokenizer.MossAudioTokenizerConfig",
"AutoModel": "modeling_moss_audio_tokenizer.MossAudioTokenizerModel"
},
"code_dim": 768,
"dtype": "float32",
"reversed_decoder_kwargs": [
{
"module_type": "PatchedPretransform",
"patch_size": 240
},
{
"causal": true,
"conv_layout": true,
"d_model": 768,
"dim_feedforward": 3072,
"gating": "none",
"input_dimension": 240,
"layer_scale": 0.01,
"max_period": 10000,
"module_type": "Transformer",
"norm": "layer_norm",
"num_heads": 12,
"num_layers": 12,
"output_dimension": 384,
"positional_embedding": "rope"
},
{
"module_type": "PatchedPretransform",
"patch_size": 2
},
{
"causal": true,
"conv_layout": true,
"d_model": 768,
"dim_feedforward": 3072,
"gating": "none",
"input_dimension": 768,
"layer_scale": 0.01,
"max_period": 10000,
"module_type": "Transformer",
"norm": "layer_norm",
"num_heads": 12,
"num_layers": 12,
"output_dimension": 384,
"positional_embedding": "rope"
},
{
"module_type": "PatchedPretransform",
"patch_size": 2
},
{
"causal": true,
"conv_layout": true,
"d_model": 768,
"dim_feedforward": 3072,
"gating": "none",
"input_dimension": 768,
"layer_scale": 0.01,
"max_period": 10000,
"module_type": "Transformer",
"norm": "layer_norm",
"num_heads": 12,
"num_layers": 12,
"output_dimension": 640,
"positional_embedding": "rope"
},
{
"module_type": "PatchedPretransform",
"patch_size": 2
},
{
"causal": true,
"conv_layout": true,
"d_model": 1280,
"dim_feedforward": 5120,
"gating": "none",
"input_dimension": 1280,
"layer_scale": 0.01,
"max_period": 10000,
"module_type": "Transformer",
"norm": "layer_norm",
"num_heads": 20,
"num_layers": 32,
"output_dimension": 768,
"positional_embedding": "rope"
}
],
"transformers_version": "4.56.0.dev0",
"quantization": {
"bits": 8,
"group_size": 64,
"mode": "affine"
}
}