---
# Spark-TTS model configuration.
#
# NOTE(review): this document was rebuilt from a source in which all line
# breaks and indentation had been collapsed onto a single line. Keys and
# values are unchanged; the nesting below (in particular, which sections
# live under `bicodec_config`, and `processor_class` being top-level) is
# inferred from the key names -- confirm against the consumer's schema.

model_type: spark-tts

architectures:
  - SparkTTSModel

# Auto* class name -> `module.ClassName` path.
# Presumably resolved by the loader for custom model code -- confirm.
auto_map:
  AutoConfig: configuration_spark_tts.SparkTTSConfig
  AutoModel: modeling_spark_tts.SparkTTSModel
  AutoProcessor: processing_spark_tts.SparkTTSProcessor

processor_class: processing_spark_tts.SparkTTSProcessor

# Relative locations of the sub-model directories.
# NOTE(review): the base directory these are resolved against is not
# visible here -- verify in the loading code.
llm_model_name_or_path: ./LLM
bicodec_model_name_or_path: ./BiCodec
wav2vec2_model_name_or_path: ./wav2vec2-large-xlsr-53

# Audio processing settings.
# NOTE(review): units are not stated in this file -- confirm with consumer.
sample_rate: 16000
highpass_cutoff_freq: 40
latent_hop_length: 320
ref_segment_duration: 6.0
volume_normalize: true

torch_dtype: bfloat16
# Quoted so the version stays a string.
transformers_version: "4.50.3"
_commit_hash: null

# Per-component settings for the BiCodec model.
bicodec_config:
  mel_params:
    sample_rate: 16000
    n_fft: 1024
    win_length: 640
    hop_length: 320
    mel_fmin: 10
    mel_fmax: null
    num_mels: 128

  encoder_config:
    input_channels: 1024
    vocos_dim: 384
    vocos_intermediate_dim: 2048
    vocos_num_layers: 12
    out_channels: 1024
    sample_ratios: [1, 1]

  # NOTE(review): this section uses `input_channel` (singular) while the
  # other sections use `input_channels`; kept verbatim from the source.
  decoder_config:
    input_channel: 1024
    channels: 1536
    rates: [8, 5, 4, 2]
    kernel_sizes: [16, 11, 8, 4]

  quantizer_config:
    input_dim: 1024
    codebook_size: 8192
    codebook_dim: 8
    commitment: 0.25
    codebook_loss_weight: 2.0
    decay: 0.99
    threshold_ema_dead_code: 0.2

  speaker_encoder_config:
    input_dim: 128
    out_dim: 1024
    latent_dim: 128
    token_num: 32
    fsq_levels: [4, 4, 4, 4, 4, 4]
    fsq_num_quantizers: 1

  prenet_config:
    input_channels: 1024
    vocos_dim: 384
    vocos_intermediate_dim: 2048
    vocos_num_layers: 12
    out_channels: 1024
    condition_dim: 1024
    sample_ratios: [1, 1]
    use_tanh_at_final: false

  postnet_config:
    input_channels: 1024
    vocos_dim: 384
    vocos_intermediate_dim: 2048
    vocos_num_layers: 6
    out_channels: 1024
    use_tanh_at_final: false