Safetensors
tts
vc
svs
svc
music
RMSnow's picture
Upload folder using huggingface_hub
795b27d verified
{
// Configuration with two top-level sections: "preprocess" (audio feature and
// tokenizer settings) and "model" (reward switches plus the two "coco_*" codec
// configurations).
// NOTE: this file contains // comments and trailing commas, so it must be read
// with a comment-tolerant loader (JSON5/JSONC style); a strict JSON parser
// will reject it.
"preprocess": {
// Spectrogram settings. Assuming hop_size, n_fft and win_size are counted in
// samples (TODO confirm against the feature extractor): 24000 / 480 = 50 frames
// per second, and win_size == n_fft == 4 * hop_size.
"hop_size": 480,
"sample_rate": 24000,
"n_fft": 1920,
"num_mels": 128,
"win_size": 1920,
"fmin": 0,
"fmax": 12000, // Equals sample_rate / 2
// NOTE(review): presumably normalization statistics for the mel features --
// confirm how the loader applies them.
"mel_var": 8.14,
"mel_mean": -4.92,
// NOTE(review): presumably the F0 search range in Hz -- TODO confirm.
"f0_fmin": 50.0,
"f0_fmax": 1100.0,
"wav_code_frame_rate": 18.75, // Vevo2: 12.5 (Content-Style Code) + 6.25 (Prosody Code) = 18.75
// NOTE(review): presumably utterance duration bounds in seconds -- TODO confirm.
"min_dur": 1,
"max_dur": 30,
"drop_prosody_id_prob": -1, // Dropping prosody ids gives Text-to-CS; keeping them gives Text+Note-to-CS. NOTE(review): -1 presumably disables dropping -- confirm.
"pad_token_id": 151643, // <|endoftext|> for Qwen2.5-0.5B-Instruct
"eos_token": "<|im_end|>",
"eos_token_id": 151645, // <|im_end|> for Qwen2.5-0.5B-Instruct
// Disabled: machine-specific absolute path, kept for reference only.
// "tokenizer_path": "/mnt/data4/zhangxueyao/SpeechGenerationYC_ckpts/ckpts/vevo2/pretrained/Qwen2.5-0.5B-Instruct-add_prosody_contentstyle"
},
"model": {
// Disabled: machine-specific absolute paths, kept for reference only.
// "pretrained_model_path": "/mnt/data4/zhangxueyao/SpeechGenerationYC_ckpts/ckpts/vevo2/pretrained/Qwen2.5-0.5B-Instruct-add_prosody_contentstyle", // Qwen2.5 Model
// "rl_init_model_path": "/mnt/data4/zhangxueyao/SpeechGenerationYC_ckpts/ckpts/vevo2/llm_dpo/dpo_qwen0.5B_intp2_highsim_3e-5/checkpoint_backup/epoch-0023_step-0027000_loss-0.000961", // DPO Model
// Reward switches; all three are set to true here.
"use_intelligibility_reward": true,
"use_chromagram_reward": true,
"use_target_length_reward": true,
"reward_combination_strategy": "advantage_first", // Allowed values: "reward_first" or "advantage_first"
// Codec configuration with coco_type "style". Its 6.25 Hz output rate matches
// the Prosody Code rate quoted for "wav_code_frame_rate" above.
"coco_style": {
"coco_type": "style", // One of: content, style, content_style
"downsample_rate": 8, // The original frame rate is 50 Hz; 50 / 8 = 6.25 Hz
"codebook_size": 512,
"hidden_size": 1024, // Representation dimension
"codebook_dim": 8,
// Encoder and decoder use identical settings.
"encoder": {
"vocos_dim": 384,
"vocos_intermediate_dim": 2048,
"vocos_num_layers": 12,
},
"decoder": {
"vocos_dim": 384,
"vocos_intermediate_dim": 2048,
"vocos_num_layers": 12,
},
"use_normed_whisper": true,
"whisper_stats_path": "models/svc/vevosing/config/whisper_stats.pt",
"whisper_dim": 1024,
"chromagram_dim": 24,
},
// Codec configuration with coco_type "content_style". It differs from
// "coco_style" only in coco_type, downsample_rate and codebook_size; its
// 12.5 Hz output rate matches the Content-Style Code rate quoted for
// "wav_code_frame_rate" above.
"coco_content_style": {
"coco_type": "content_style", // One of: content, style, content_style
"downsample_rate": 4, // The original frame rate is 50 Hz; 50 / 4 = 12.5 Hz
"codebook_size": 16384,
"hidden_size": 1024, // Representation dimension
"codebook_dim": 8,
// Encoder and decoder use identical settings.
"encoder": {
"vocos_dim": 384,
"vocos_intermediate_dim": 2048,
"vocos_num_layers": 12,
},
"decoder": {
"vocos_dim": 384,
"vocos_intermediate_dim": 2048,
"vocos_num_layers": 12,
},
"use_normed_whisper": true,
"whisper_stats_path": "models/svc/vevosing/config/whisper_stats.pt",
"whisper_dim": 1024,
"chromagram_dim": 24,
},
},
}