| model_type: spark-tts | |
| architectures: | |
| - SparkTTSModel | |
| auto_map: | |
| AutoConfig: configuration_spark_tts.SparkTTSConfig | |
| AutoModel: modeling_spark_tts.SparkTTSModel | |
| AutoProcessor: processing_spark_tts.SparkTTSProcessor | |
| processor_class: processing_spark_tts.SparkTTSProcessor | |
| llm_model_name_or_path: ./LLM | |
| bicodec_model_name_or_path: ./BiCodec | |
| wav2vec2_model_name_or_path: ./wav2vec2-large-xlsr-53 | |
| sample_rate: 16000 | |
| highpass_cutoff_freq: 40 | |
| latent_hop_length: 320 | |
| ref_segment_duration: 6.0 | |
| volume_normalize: true | |
| torch_dtype: bfloat16 | |
| transformers_version: "4.50.3" | |
| _commit_hash: null | |
| bicodec_config: | |
| mel_params: | |
| sample_rate: 16000 | |
| n_fft: 1024 | |
| win_length: 640 | |
| hop_length: 320 | |
| mel_fmin: 10 | |
| mel_fmax: null | |
| num_mels: 128 | |
| encoder_config: | |
| input_channels: 1024 | |
| vocos_dim: 384 | |
| vocos_intermediate_dim: 2048 | |
| vocos_num_layers: 12 | |
| out_channels: 1024 | |
| sample_ratios: [1, 1] | |
| decoder_config: | |
| input_channel: 1024 | |
| channels: 1536 | |
| rates: [8, 5, 4, 2] | |
| kernel_sizes: [16, 11, 8, 4] | |
| quantizer_config: | |
| input_dim: 1024 | |
| codebook_size: 8192 | |
| codebook_dim: 8 | |
| commitment: 0.25 | |
| codebook_loss_weight: 2.0 | |
| decay: 0.99 | |
| threshold_ema_dead_code: 0.2 | |
| speaker_encoder_config: | |
| input_dim: 128 | |
| out_dim: 1024 | |
| latent_dim: 128 | |
| token_num: 32 | |
| fsq_levels: [4, 4, 4, 4, 4, 4] | |
| fsq_num_quantizers: 1 | |
| prenet_config: | |
| input_channels: 1024 | |
| vocos_dim: 384 | |
| vocos_intermediate_dim: 2048 | |
| vocos_num_layers: 12 | |
| out_channels: 1024 | |
| condition_dim: 1024 | |
| sample_ratios: [1, 1] | |
| use_tanh_at_final: false | |
| postnet_config: | |
| input_channels: 1024 | |
| vocos_dim: 384 | |
| vocos_intermediate_dim: 2048 | |
| vocos_num_layers: 6 | |
| out_channels: 1024 | |
| use_tanh_at_final: false | |