# Model identity plus the class references for the config, model, and
# processor implementations.
model_type: spark-tts
architectures:
  - SparkTTSModel
# Every auto_map value is written as <module>.<ClassName>. The modules named
# here are not part of this file.
# NOTE(review): presumably consumed by the Transformers Auto* loaders for
# custom code -- confirm against the loading code.
auto_map:
  AutoConfig: configuration_spark_tts.SparkTTSConfig
  AutoModel: modeling_spark_tts.SparkTTSModel
  AutoProcessor: processing_spark_tts.SparkTTSProcessor
# Same dotted path as auto_map.AutoProcessor; keep the two values in sync.
processor_class: processing_spark_tts.SparkTTSProcessor
# Locations of the three sub-models. All are "./"-relative paths; what they
# are resolved against is decided by the loading code, which is not shown here.
llm_model_name_or_path: ./LLM
bicodec_model_name_or_path: ./BiCodec
wav2vec2_model_name_or_path: ./wav2vec2-large-xlsr-53
# Audio front-end settings. sample_rate and latent_hop_length carry the same
# values as bicodec_config.mel_params.sample_rate (16000) and
# bicodec_config.mel_params.hop_length (320); change them together.
# NOTE(review): units are not stated in this file -- presumably Hz for
# sample_rate / highpass_cutoff_freq, samples for latent_hop_length, and
# seconds for ref_segment_duration. Confirm against the consumer.
sample_rate: 16000
highpass_cutoff_freq: 40
latent_hop_length: 320
ref_segment_duration: 6.0
volume_normalize: true
# Plain string scalar naming the dtype; the consumer maps it to a real dtype.
torch_dtype: bfloat16
# Quoted so the version stays a string.
transformers_version: "4.50.3"
# Explicit null (no commit hash recorded).
_commit_hash: null
# Hyper-parameters for the BiCodec sub-model, grouped into one mapping per
# component. The width 1024 recurs as the input/output size of the encoder,
# decoder, quantizer, prenet, and postnet entries, and as the speaker
# encoder's out_dim.
bicodec_config:
  # Mel-spectrogram settings. sample_rate and hop_length match the top-level
  # sample_rate (16000) and latent_hop_length (320).
  mel_params:
    sample_rate: 16000
    n_fft: 1024
    # win_length is exactly 2 x hop_length.
    win_length: 640
    hop_length: 320
    mel_fmin: 10
    # Explicit null; how the consumer interprets a null upper bound is not
    # shown here (presumably "no limit" / Nyquist -- TODO confirm).
    mel_fmax: null
    num_mels: 128
  encoder_config:
    input_channels: 1024
    vocos_dim: 384
    vocos_intermediate_dim: 2048
    vocos_num_layers: 12
    out_channels: 1024
    sample_ratios: [1, 1]
  decoder_config:
    # NOTE(review): this key is singular ("input_channel") while the encoder,
    # prenet, and postnet use "input_channels". Do not rename without checking
    # which spelling the consumer reads.
    input_channel: 1024
    channels: 1536
    # rates and kernel_sizes have four entries each. The product of rates
    # (8 * 5 * 4 * 2 = 320) equals mel_params.hop_length.
    rates: [8, 5, 4, 2]
    kernel_sizes: [16, 11, 8, 4]
  quantizer_config:
    input_dim: 1024
    # 8192 = 2^13 entries.
    codebook_size: 8192
    codebook_dim: 8
    commitment: 0.25
    codebook_loss_weight: 2.0
    decay: 0.99
    threshold_ema_dead_code: 0.2
  speaker_encoder_config:
    # Equals mel_params.num_mels (128).
    # NOTE(review): presumably the speaker encoder takes mel frames as input --
    # confirm against the model code.
    input_dim: 128
    out_dim: 1024
    latent_dim: 128
    token_num: 32
    # Six levels of 4 each, i.e. 4^6 = 4096 level combinations.
    fsq_levels: [4, 4, 4, 4, 4, 4]
    fsq_num_quantizers: 1
  # The prenet repeats the encoder's vocos_* sizes and sample_ratios, and adds
  # condition_dim and use_tanh_at_final.
  prenet_config:
    input_channels: 1024
    vocos_dim: 384
    vocos_intermediate_dim: 2048
    vocos_num_layers: 12
    out_channels: 1024
    condition_dim: 1024
    sample_ratios: [1, 1]
    use_tanh_at_final: false
  # The postnet uses the same vocos_dim / vocos_intermediate_dim as the prenet
  # but half as many layers (6 vs 12); it has no condition_dim or sample_ratios.
  postnet_config:
    input_channels: 1024
    vocos_dim: 384
    vocos_intermediate_dim: 2048
    vocos_num_layers: 6
    out_channels: 1024
    use_tanh_at_final: false