// Configuration with two top-level sections: "preprocess" and "model".
// NOTE: this file uses // line comments and trailing commas, so it must be
// loaded with a parser that accepts them (e.g. JSON5); a strict JSON parser
// will reject it. Every // comment must stay on its own physical line, because
// a line comment swallows everything up to the end of that line.
{
    "preprocess": {
        "hop_size": 480,
        "sample_rate": 24000,
        "n_fft": 1920,
        "num_mels": 128,
        "win_size": 1920,
        "fmin": 0,
        "fmax": 12000,
        "mel_var": 8.14,
        "mel_mean": -4.92,
        "f0_fmin": 50.0,
        "f0_fmax": 1100.0,
        // Vevo2: 12.5 (Content-Style Code) + 6.25 (Prosody Code) = 18.75
        "wav_code_frame_rate": 18.75,
        "min_dur": 1,
        "max_dur": 30,
        // Dropping prosody ids corresponds to Text-to-CS; not dropping them
        // corresponds to Text+Note-to-CS.
        // NOTE(review): -1 presumably means "never drop" -- confirm against the consumer.
        "drop_prosody_id_prob": -1,
        // <|endoftext|> for Qwen2.5-0.5B-Instruct
        "pad_token_id": 151643,
        "eos_token": "<|im_end|>",
        // <|im_end|> for Qwen2.5-0.5B-Instruct
        "eos_token_id": 151645,
        // Left commented out; the value is an absolute local path.
        // "tokenizer_path": "/mnt/data4/zhangxueyao/SpeechGenerationYC_ckpts/ckpts/vevo2/pretrained/Qwen2.5-0.5B-Instruct-add_prosody_contentstyle"
    },
    "model": {
        // The two entries below are left commented out; their values are
        // absolute local paths.
        // Qwen2.5 Model:
        // "pretrained_model_path": "/mnt/data4/zhangxueyao/SpeechGenerationYC_ckpts/ckpts/vevo2/pretrained/Qwen2.5-0.5B-Instruct-add_prosody_contentstyle",
        // DPO Model:
        // "rl_init_model_path": "/mnt/data4/zhangxueyao/SpeechGenerationYC_ckpts/ckpts/vevo2/llm_dpo/dpo_qwen0.5B_intp2_highsim_3e-5/checkpoint_backup/epoch-0023_step-0027000_loss-0.000961",
        "use_intelligibility_reward": true,
        "use_chromagram_reward": true,
        "use_target_length_reward": true,
        // Either "reward_first" or "advantage_first"
        "reward_combination_strategy": "advantage_first",
        "coco_style": {
            // One of: content, style, content_style
            "coco_type": "style",
            // The original frame rate is 50 Hz; downsampled to 6.25 Hz
            "downsample_rate": 8,
            "codebook_size": 512,
            // Representation dimension
            "hidden_size": 1024,
            "codebook_dim": 8,
            "encoder": {
                "vocos_dim": 384,
                "vocos_intermediate_dim": 2048,
                "vocos_num_layers": 12,
            },
            "decoder": {
                "vocos_dim": 384,
                "vocos_intermediate_dim": 2048,
                "vocos_num_layers": 12,
            },
            "use_normed_whisper": true,
            "whisper_stats_path": "models/svc/vevosing/config/whisper_stats.pt",
            "whisper_dim": 1024,
            "chromagram_dim": 24,
        },
        "coco_content_style": {
            // One of: content, style, content_style
            "coco_type": "content_style",
            // The original frame rate is 50 Hz; downsampled to 12.5 Hz
            "downsample_rate": 4,
            "codebook_size": 16384,
            // Representation dimension
            "hidden_size": 1024,
            "codebook_dim": 8,
            "encoder": {
                "vocos_dim": 384,
                "vocos_intermediate_dim": 2048,
                "vocos_num_layers": 12,
            },
            "decoder": {
                "vocos_dim": 384,
                "vocos_intermediate_dim": 2048,
                "vocos_num_layers": 12,
            },
            "use_normed_whisper": true,
            "whisper_stats_path": "models/svc/vevosing/config/whisper_stats.pt",
            "whisper_dim": 1024,
            "chromagram_dim": 24,
        },
    },
}