{
  "model_type": "spark-tts",
  "architectures": [
    "SparkTTSModel"
  ],
  "auto_map": {
    "AutoConfig": "configuration_spark_tts.SparkTTSConfig",
    "AutoModel": "modeling_spark_tts.SparkTTSModel",
    "AutoProcessor": "processing_spark_tts.SparkTTSProcessor"
  },
  "processor_class": "processing_spark_tts.SparkTTSProcessor",
  "llm_model_name_or_path": "./LLM",
  "bicodec_model_name_or_path": "./BiCodec",
  "wav2vec2_model_name_or_path": "./wav2vec2-large-xlsr-53",
  "sample_rate": 16000,
  "highpass_cutoff_freq": 40,
  "latent_hop_length": 320,
  "ref_segment_duration": 6.0,
  "volume_normalize": true,
  "torch_dtype": "bfloat16",
  "transformers_version": "4.50.3",
  "_commit_hash": null,
  "bicodec_config": {
    "mel_params": {
      "sample_rate": 16000,
      "n_fft": 1024,
      "win_length": 640,
      "hop_length": 320,
      "mel_fmin": 10,
      "mel_fmax": null,
      "num_mels": 128
    },
    "encoder_config": {
      "input_channels": 1024,
      "vocos_dim": 384,
      "vocos_intermediate_dim": 2048,
      "vocos_num_layers": 12,
      "out_channels": 1024,
      "sample_ratios": [1, 1]
    },
    "decoder_config": {
      "input_channel": 1024,
      "channels": 1536,
      "rates": [8, 5, 4, 2],
      "kernel_sizes": [16, 11, 8, 4]
    },
    "quantizer_config": {
      "input_dim": 1024,
      "codebook_size": 8192,
      "codebook_dim": 8,
      "commitment": 0.25,
      "codebook_loss_weight": 2.0,
      "decay": 0.99,
      "threshold_ema_dead_code": 0.2
    },
    "speaker_encoder_config": {
      "input_dim": 128,
      "out_dim": 1024,
      "latent_dim": 128,
      "token_num": 32,
      "fsq_levels": [4, 4, 4, 4, 4, 4],
      "fsq_num_quantizers": 1
    },
    "prenet_config": {
      "input_channels": 1024,
      "vocos_dim": 384,
      "vocos_intermediate_dim": 2048,
      "vocos_num_layers": 12,
      "out_channels": 1024,
      "condition_dim": 1024,
      "sample_ratios": [1, 1],
      "use_tanh_at_final": false
    },
    "postnet_config": {
      "input_channels": 1024,
      "vocos_dim": 384,
      "vocos_intermediate_dim": 2048,
      "vocos_num_layers": 6,
      "out_channels": 1024,
      "use_tanh_at_final": false
    }
  }
}