|
|
{ |
|
|
"preprocess": { |
|
|
"hop_size": 480, |
|
|
"sample_rate": 24000, |
|
|
"n_fft": 1920, |
|
|
"num_mels": 128, |
|
|
"win_size": 1920, |
|
|
"fmin": 0, |
|
|
"fmax": 12000, |
|
|
"mel_var": 8.14, |
|
|
"mel_mean": -4.92, |
|
|
"f0_fmin": 50.0, |
|
|
"f0_fmax": 1100.0, |
|
|
"wav_code_frame_rate": 18.75, |
|
|
"min_dur": 1, |
|
|
"max_dur": 30, |
|
|
"drop_prosody_id_prob": -1, |
|
|
"pad_token_id": 151643, |
|
|
"eos_token": "<|im_end|>", |
|
|
"eos_token_id": 151645, |
|
|
|
|
|
}, |
|
|
"model": { |
|
|
|
|
|
|
|
|
"use_intelligibility_reward": true, |
|
|
"use_chromagram_reward": true, |
|
|
"use_target_length_reward": true, |
|
|
"reward_combination_strategy": "advantage_first", |
|
|
"coco_style": { |
|
|
"coco_type": "style", |
|
|
"downsample_rate": 8, |
|
|
"codebook_size": 512, |
|
|
"hidden_size": 1024, |
|
|
"codebook_dim": 8, |
|
|
"encoder": { |
|
|
"vocos_dim": 384, |
|
|
"vocos_intermediate_dim": 2048, |
|
|
"vocos_num_layers": 12, |
|
|
}, |
|
|
"decoder": { |
|
|
"vocos_dim": 384, |
|
|
"vocos_intermediate_dim": 2048, |
|
|
"vocos_num_layers": 12, |
|
|
}, |
|
|
"use_normed_whisper": true, |
|
|
"whisper_stats_path": "models/svc/vevosing/config/whisper_stats.pt", |
|
|
"whisper_dim": 1024, |
|
|
"chromagram_dim": 24, |
|
|
}, |
|
|
"coco_content_style": { |
|
|
"coco_type": "content_style", |
|
|
"downsample_rate": 4, |
|
|
"codebook_size": 16384, |
|
|
"hidden_size": 1024, |
|
|
"codebook_dim": 8, |
|
|
"encoder": { |
|
|
"vocos_dim": 384, |
|
|
"vocos_intermediate_dim": 2048, |
|
|
"vocos_num_layers": 12, |
|
|
}, |
|
|
"decoder": { |
|
|
"vocos_dim": 384, |
|
|
"vocos_intermediate_dim": 2048, |
|
|
"vocos_num_layers": 12, |
|
|
}, |
|
|
"use_normed_whisper": true, |
|
|
"whisper_stats_path": "models/svc/vevosing/config/whisper_stats.pt", |
|
|
"whisper_dim": 1024, |
|
|
"chromagram_dim": 24, |
|
|
}, |
|
|
}, |
|
|
} |