khanusa
/

text2speech

Model card Files Files and versions

text2speech / config.yaml

khanusa's picture

Create config.yaml

efc2070 verified 8 months ago

history blame contribute delete

1.77 kB

	model_type: spark-tts
	architectures:
	- SparkTTSModel
	auto_map:
	AutoConfig: configuration_spark_tts.SparkTTSConfig
	AutoModel: modeling_spark_tts.SparkTTSModel
	AutoProcessor: processing_spark_tts.SparkTTSProcessor
	processor_class: processing_spark_tts.SparkTTSProcessor
	llm_model_name_or_path: ./LLM
	bicodec_model_name_or_path: ./BiCodec
	wav2vec2_model_name_or_path: ./wav2vec2-large-xlsr-53
	sample_rate: 16000
	highpass_cutoff_freq: 40
	latent_hop_length: 320
	ref_segment_duration: 6.0
	volume_normalize: true
	torch_dtype: bfloat16
	transformers_version: "4.50.3"
	_commit_hash: null
	bicodec_config:
	mel_params:
	sample_rate: 16000
	n_fft: 1024
	win_length: 640
	hop_length: 320
	mel_fmin: 10
	mel_fmax: null
	num_mels: 128
	encoder_config:
	input_channels: 1024
	vocos_dim: 384
	vocos_intermediate_dim: 2048
	vocos_num_layers: 12
	out_channels: 1024
	sample_ratios: [1, 1]
	decoder_config:
	input_channel: 1024
	channels: 1536
	rates: [8, 5, 4, 2]
	kernel_sizes: [16, 11, 8, 4]
	quantizer_config:
	input_dim: 1024
	codebook_size: 8192
	codebook_dim: 8
	commitment: 0.25
	codebook_loss_weight: 2.0
	decay: 0.99
	threshold_ema_dead_code: 0.2
	speaker_encoder_config:
	input_dim: 128
	out_dim: 1024
	latent_dim: 128
	token_num: 32
	fsq_levels: [4, 4, 4, 4, 4, 4]
	fsq_num_quantizers: 1
	prenet_config:
	input_channels: 1024
	vocos_dim: 384
	vocos_intermediate_dim: 2048
	vocos_num_layers: 12
	out_channels: 1024
	condition_dim: 1024
	sample_ratios: [1, 1]
	use_tanh_at_final: false
	postnet_config:
	input_channels: 1024
	vocos_dim: 384
	vocos_intermediate_dim: 2048
	vocos_num_layers: 6
	out_channels: 1024
	use_tanh_at_final: false