Vevo2 / contentstyle_modeling /posttrained /amphion_config.json

Upload folder using huggingface_hub

795b27d verified 4 months ago

3.22 kB

	{
	"preprocess": {
	"hop_size": 480,
	"sample_rate": 24000,
	"n_fft": 1920,
	"num_mels": 128,
	"win_size": 1920,
	"fmin": 0,
	"fmax": 12000,
	"mel_var": 8.14,
	"mel_mean": -4.92,
	"f0_fmin": 50.0,
	"f0_fmax": 1100.0,
	"wav_code_frame_rate": 18.75, // Vevo2: 12.5 (Content-Style Code) + 6.25 (Prosody Code) = 18.75
	"min_dur": 1,
	"max_dur": 30,
	"drop_prosody_id_prob": -1, // Dropping prosody ids means the Text-to-CS, while not dropping means the Text+Note-to-CS,
	"pad_token_id": 151643, // <\|endoftext\|> for Qwen2.5-0.5B-Instruct,
	"eos_token": "<\|im_end\|>",
	"eos_token_id": 151645, // <\|im_end\|> for Qwen2.5-0.5B-Instruct,
	// "tokenizer_path": "/mnt/data4/zhangxueyao/SpeechGenerationYC_ckpts/ckpts/vevo2/pretrained/Qwen2.5-0.5B-Instruct-add_prosody_contentstyle"
	},
	"model": {
	// "pretrained_model_path": "/mnt/data4/zhangxueyao/SpeechGenerationYC_ckpts/ckpts/vevo2/pretrained/Qwen2.5-0.5B-Instruct-add_prosody_contentstyle", // Qwen2.5 Model
	// "rl_init_model_path": "/mnt/data4/zhangxueyao/SpeechGenerationYC_ckpts/ckpts/vevo2/llm_dpo/dpo_qwen0.5B_intp2_highsim_3e-5/checkpoint_backup/epoch-0023_step-0027000_loss-0.000961", // DPO Model
	"use_intelligibility_reward": true,
	"use_chromagram_reward": true,
	"use_target_length_reward": true,
	"reward_combination_strategy": "advantage_first", // "reward_first" or "advantage_first"
	"coco_style": {
	"coco_type": "style", // content, style, or content_style
	"downsample_rate": 8, // The original frame rate is 50 Hz, downsample to 6.25 Hz
	"codebook_size": 512,
	"hidden_size": 1024, // Representations Dim
	"codebook_dim": 8,
	"encoder": {
	"vocos_dim": 384,
	"vocos_intermediate_dim": 2048,
	"vocos_num_layers": 12,
	},
	"decoder": {
	"vocos_dim": 384,
	"vocos_intermediate_dim": 2048,
	"vocos_num_layers": 12,
	},
	"use_normed_whisper": true,
	"whisper_stats_path": "models/svc/vevosing/config/whisper_stats.pt",
	"whisper_dim": 1024,
	"chromagram_dim": 24,
	},
	"coco_content_style": {
	"coco_type": "content_style", // content, style, or content_style
	"downsample_rate": 4, // The original frame rate is 50 Hz, downsample to 12.5 Hz
	"codebook_size": 16384,
	"hidden_size": 1024, // Representations Dim
	"codebook_dim": 8,
	"encoder": {
	"vocos_dim": 384,
	"vocos_intermediate_dim": 2048,
	"vocos_num_layers": 12,
	},
	"decoder": {
	"vocos_dim": 384,
	"vocos_intermediate_dim": 2048,
	"vocos_num_layers": 12,
	},
	"use_normed_whisper": true,
	"whisper_stats_path": "models/svc/vevosing/config/whisper_stats.pt",
	"whisper_dim": 1024,
	"chromagram_dim": 24,
	},
	},
	}