{
  "model_type": "spark-tts",
  "architectures": [
    "SparkTTSModel"
  ],
  "auto_map": {
    "AutoConfig": "configuration_spark_tts.SparkTTSConfig",
    "AutoModel": "modeling_spark_tts.SparkTTSModel",
    "AutoProcessor": "processing_spark_tts.SparkTTSProcessor"
  },
  "processor_class": "processing_spark_tts.SparkTTSProcessor",
  "llm_model_name_or_path": "./LLM",
  "bicodec_model_name_or_path": "./BiCodec",
  "wav2vec2_model_name_or_path": "./wav2vec2-large-xlsr-53",
  "sample_rate": 16000,
  "highpass_cutoff_freq": 40,
  "latent_hop_length": 320,
  "ref_segment_duration": 6.0,
  "volume_normalize": true,
  "torch_dtype": "bfloat16",
  "transformers_version": "4.50.3",
  "_commit_hash": null,
  "bicodec_config": {
    "mel_params": {
      "sample_rate": 16000,
      "n_fft": 1024,
      "win_length": 640,
      "hop_length": 320,
      "mel_fmin": 10,
      "mel_fmax": null,
      "num_mels": 128
    },
    "encoder_config": {
      "input_channels": 1024,
      "vocos_dim": 384,
      "vocos_intermediate_dim": 2048,
      "vocos_num_layers": 12,
      "out_channels": 1024,
      "sample_ratios": [1, 1]
    },
    "decoder_config": {
      "input_channel": 1024,
      "channels": 1536,
      "rates": [8, 5, 4, 2],
      "kernel_sizes": [16, 11, 8, 4]
    },
    "quantizer_config": {
      "input_dim": 1024,
      "codebook_size": 8192,
      "codebook_dim": 8,
      "commitment": 0.25,
      "codebook_loss_weight": 2.0,
      "decay": 0.99,
      "threshold_ema_dead_code": 0.2
    },
    "speaker_encoder_config": {
      "input_dim": 128,
      "out_dim": 1024,
      "latent_dim": 128,
      "token_num": 32,
      "fsq_levels": [4, 4, 4, 4, 4, 4],
      "fsq_num_quantizers": 1
    },
    "prenet_config": {
      "input_channels": 1024,
      "vocos_dim": 384,
      "vocos_intermediate_dim": 2048,
      "vocos_num_layers": 12,
      "out_channels": 1024,
      "condition_dim": 1024,
      "sample_ratios": [1, 1],
      "use_tanh_at_final": false
    },
    "postnet_config": {
      "input_channels": 1024,
      "vocos_dim": 384,
      "vocos_intermediate_dim": 2048,
      "vocos_num_layers": 6,
      "out_channels": 1024,
      "use_tanh_at_final": false
    }
  }
}