MOSS-TTS-Nano

Running

MOSS-TTS-Nano / weights /codec /config.json

--replace-all

Add Nano-TTS CPU Gradio Space

1459ef5 6 days ago

7.39 kB

	{
	"architectures": [
	"MossAudioTokenizerModel"
	],
	"auto_map": {
	"AutoConfig": "configuration_moss_audio_tokenizer.MossAudioTokenizerConfig",
	"AutoModel": "modeling_moss_audio_tokenizer.MossAudioTokenizerModel"
	},
	"model_type": "moss-audio-tokenizer",
	"sample_rate": 48000,
	"sampling_rate": 48000,
	"downsample_rate": 3840,
	"causal_transformer_context_duration": 10.0,
	"number_channels": 2,
	"enable_channel_interleave": true,
	"attention_implementation": "sdpa",
	"compute_dtype": "fp32",
	"dtype": "float32",
	"code_dim": 768,
	"encoder_kwargs": [
	{
	"module_type": "PatchedPretransform",
	"patch_size": 240
	},
	{
	"causal": true,
	"context_duration": 4.0,
	"conv_layout": true,
	"d_model": 256,
	"dim_feedforward": 1024,
	"gating": "none",
	"input_dimension": 240,
	"layer_scale": 0.01,
	"max_period": 10000,
	"module_type": "Transformer",
	"norm": "layer_norm",
	"num_heads": 4,
	"num_layers": 4,
	"output_dimension": 384,
	"positional_embedding": "rope"
	},
	{
	"module_type": "PatchedPretransform",
	"patch_size": 2
	},
	{
	"causal": true,
	"context_duration": 6.0,
	"conv_layout": true,
	"d_model": 256,
	"dim_feedforward": 1024,
	"gating": "none",
	"input_dimension": 768,
	"layer_scale": 0.01,
	"max_period": 10000,
	"module_type": "Transformer",
	"norm": "layer_norm",
	"num_heads": 4,
	"num_layers": 2,
	"output_dimension": 384,
	"positional_embedding": "rope"
	},
	{
	"module_type": "PatchedPretransform",
	"patch_size": 2
	},
	{
	"causal": true,
	"context_duration": 8.0,
	"conv_layout": true,
	"d_model": 256,
	"dim_feedforward": 1024,
	"gating": "none",
	"input_dimension": 768,
	"layer_scale": 0.01,
	"max_period": 10000,
	"module_type": "Transformer",
	"norm": "layer_norm",
	"num_heads": 4,
	"num_layers": 2,
	"output_dimension": 384,
	"positional_embedding": "rope"
	},
	{
	"module_type": "PatchedPretransform",
	"patch_size": 2
	},
	{
	"causal": true,
	"context_duration": 10.0,
	"conv_layout": true,
	"d_model": 256,
	"dim_feedforward": 1024,
	"gating": "none",
	"input_dimension": 768,
	"layer_scale": 0.01,
	"max_period": 10000,
	"module_type": "Transformer",
	"norm": "layer_norm",
	"num_heads": 4,
	"num_layers": 4,
	"output_dimension": 192,
	"positional_embedding": "rope"
	},
	{
	"module_type": "PatchedPretransform",
	"patch_size": 4
	}
	],
	"decoder_kwargs": [
	{
	"module_type": "PatchedPretransform",
	"patch_size": 4
	},
	{
	"causal": true,
	"context_duration": 10.0,
	"conv_layout": true,
	"d_model": 256,
	"dim_feedforward": 1024,
	"gating": "none",
	"input_dimension": 192,
	"layer_scale": 0.01,
	"max_period": 10000,
	"module_type": "Transformer",
	"norm": "layer_norm",
	"num_heads": 4,
	"num_layers": 4,
	"output_dimension": 768,
	"positional_embedding": "rope"
	},
	{
	"module_type": "PatchedPretransform",
	"patch_size": 2
	},
	{
	"causal": true,
	"context_duration": 8.0,
	"conv_layout": true,
	"d_model": 256,
	"dim_feedforward": 1024,
	"gating": "none",
	"input_dimension": 384,
	"layer_scale": 0.01,
	"max_period": 10000,
	"module_type": "Transformer",
	"norm": "layer_norm",
	"num_heads": 4,
	"num_layers": 2,
	"output_dimension": 768,
	"positional_embedding": "rope"
	},
	{
	"module_type": "PatchedPretransform",
	"patch_size": 2
	},
	{
	"causal": true,
	"context_duration": 6.0,
	"conv_layout": true,
	"d_model": 256,
	"dim_feedforward": 1024,
	"gating": "none",
	"input_dimension": 384,
	"layer_scale": 0.01,
	"max_period": 10000,
	"module_type": "Transformer",
	"norm": "layer_norm",
	"num_heads": 4,
	"num_layers": 2,
	"output_dimension": 768,
	"positional_embedding": "rope"
	},
	{
	"module_type": "PatchedPretransform",
	"patch_size": 2
	},
	{
	"causal": true,
	"context_duration": 4.0,
	"conv_layout": true,
	"d_model": 256,
	"dim_feedforward": 1024,
	"gating": "none",
	"input_dimension": 384,
	"layer_scale": 0.01,
	"max_period": 10000,
	"module_type": "Transformer",
	"norm": "layer_norm",
	"num_heads": 4,
	"num_layers": 4,
	"output_dimension": 240,
	"positional_embedding": "rope"
	},
	{
	"module_type": "PatchedPretransform",
	"patch_size": 240
	}
	],
	"quantizer_type": "rlfq",
	"quantizer_kwargs": {
	"codebook_dim": 8,
	"codebook_loss_weight": 1.0,
	"codebook_size": 1024,
	"commitment_loss_weight": 0.25,
	"input_dim": 768,
	"num_quantizers": 16,
	"output_dim": 768,
	"quantizer_dropout": 1.0,
	"quantizer_type": "rlfq",
	"rvq_dim": 512
	},
	"transformers_version": "4.56.0.dev0",
	"reversed_decoder_kwargs": [
	{
	"module_type": "PatchedPretransform",
	"patch_size": 240
	},
	{
	"causal": true,
	"context_duration": 4.0,
	"conv_layout": true,
	"d_model": 256,
	"dim_feedforward": 1024,
	"gating": "none",
	"input_dimension": 240,
	"layer_scale": 0.01,
	"max_period": 10000,
	"module_type": "Transformer",
	"norm": "layer_norm",
	"num_heads": 4,
	"num_layers": 4,
	"output_dimension": 384,
	"positional_embedding": "rope"
	},
	{
	"module_type": "PatchedPretransform",
	"patch_size": 2
	},
	{
	"causal": true,
	"context_duration": 6.0,
	"conv_layout": true,
	"d_model": 256,
	"dim_feedforward": 1024,
	"gating": "none",
	"input_dimension": 768,
	"layer_scale": 0.01,
	"max_period": 10000,
	"module_type": "Transformer",
	"norm": "layer_norm",
	"num_heads": 4,
	"num_layers": 2,
	"output_dimension": 384,
	"positional_embedding": "rope"
	},
	{
	"module_type": "PatchedPretransform",
	"patch_size": 2
	},
	{
	"causal": true,
	"context_duration": 8.0,
	"conv_layout": true,
	"d_model": 256,
	"dim_feedforward": 1024,
	"gating": "none",
	"input_dimension": 768,
	"layer_scale": 0.01,
	"max_period": 10000,
	"module_type": "Transformer",
	"norm": "layer_norm",
	"num_heads": 4,
	"num_layers": 2,
	"output_dimension": 384,
	"positional_embedding": "rope"
	},
	{
	"module_type": "PatchedPretransform",
	"patch_size": 2
	},
	{
	"causal": true,
	"context_duration": 10.0,
	"conv_layout": true,
	"d_model": 256,
	"dim_feedforward": 1024,
	"gating": "none",
	"input_dimension": 768,
	"layer_scale": 0.01,
	"max_period": 10000,
	"module_type": "Transformer",
	"norm": "layer_norm",
	"num_heads": 4,
	"num_layers": 4,
	"output_dimension": 192,
	"positional_embedding": "rope"
	},
	{
	"module_type": "PatchedPretransform",
	"patch_size": 4
	}
	]
	}