FireRedTTS-1S / configs /config_24k.json
Shen Feiyu
add 1s
faadabf
{
"semantic_llm": {
"start_text_token": 32000,
"stop_text_token": 32001,
"num_text_tokens": 32002,
"start_audio_token": 16384,
"stop_audio_token": 16385,
"num_audio_tokens": 16386,
"llm_hidden_size": 1024,
"llm_intermediate_size": 4096,
"llm_num_layers": 30,
"llm_num_heads": 16,
"llm_max_audio_seq_len": 630,
"llm_max_text_seq_len": 402,
"llm_max_prompt_len": 250,
"code_stride_len": 640,
"EOS_TOKEN": 16385
},
"acoustic_llm": {
"n_stacks": 1,
"layers": 24,
"model_dim": 1536,
"heads": 16,
"max_text_tokens": 2048,
"max_speech_tokens": 2048,
"max_conditioning_inputs": 1,
"number_text_tokens": 16386,
"start_text_token": 16384,
"stop_text_token": 16385,
"n_frames_per_step": 1,
"n_heads_per_frame": 8,
"delay_prediction": 1,
"upsample_factors": 1,
"streaming_delayed_frames": 8,
"number_speech_tokens": 16386,
"start_speech_token": 16384,
"stop_speech_token": 16385,
"speaker_embedding_pretrained": true,
"speaker_embedding_ckpt": null,
"speaker_embedding_dim": 512,
"temperature": 0.5,
"repetition_penalty": 2.0,
"top_p": 0.5,
"top_k": 25
},
"acoustic_codec": {
"n_model_size": 1024,
"encoder_config": {
"ngf": 48,
"up_ratios": [
2,
4,
4,
4,
5
],
"causal": true
},
"decoder_config": {
"upsample_initial_channel": 1536,
"ngf": 48,
"up_ratios": [
6,
5,
4,
4,
2
],
"causal": true
},
"vq_config": {
"n_groups": 8,
"ordered": true,
"codebook_size": [
128,
128,
128,
128,
128,
128,
128,
128,
128,
128,
128,
128,
128,
128,
128,
128
],
"codebook_dim": [
8,
8,
8,
8,
8,
8,
8,
8,
8,
8,
8,
8,
8,
8,
8,
8
],
"requires_projection": true,
"decay": 0.99,
"threshold_ema_dead_code": 0,
"commitment_weight": 0.01
},
"resampler_config": {
"source_sr": 16000,
"target_sr": 16000
}
},
"semantic_tokenizer": {
"in_dim": 1024,
"out_dim": 80,
"n_model_size": 512,
"downsample_scales": [
1,
1,
1,
2
],
"upsample_scales": [
[
2,
1
],
[
2,
1,
1,
1
]
],
"mel_config": {
"style": "BigVGAN",
"filter_length": 1024,
"hop_length": 160,
"win_length": 640,
"n_mel_channels": 80,
"sampling_rate": 16000
},
"vq_config": {
"codebook_size": [
128,
128
],
"codebook_dim": [
128,
128
],
"requires_projection": true
},
"tree_config": [
{
"downsample_rate": 1,
"n_groups": 1,
"dropout": 0
}
],
"n_samples_per_token": 640,
"checkpointing": true
}
}