# SongGeneration-v2.0 / ckpt / config.yaml
# (repo-browser header captured with the file: committer "root",
#  commit message "init commit", commit ffd9215)
# ================ Train Config ================ #
lyric_processor:
max_dur: 270
min_dur: 30
prompt_len: 10
pad_to_max: true
# ================ Audio tokenizer ================ #
audio_tokenizer_checkpoint: Flow1dVAE1rvq_./ckpt/model_1rvq/model_2_fixed.safetensors
audio_tokenizer_frame_rate: 25
audio_tokenizer_code_depth: 1
sample_rate: 48000
mode: 'inference'
# ================ VAE ================ #
vae_config: ckpt/vae/stable_audio_1920_vae.json
vae_model: ckpt/vae/autoencoder_music_1320k.ckpt
# ================ LM Pretrain ================ #
lm_checkpoint: ckpt/songgeneration2
# ================== LM =========================== #
lm:
  lm_type: Llama # [Llama]
  dim: 2048
  intermediate_size: 11008
  num_heads: 16
  num_layers: 36
  code_depth: 1
  code_size: 16384
  dropout: 0.0
  activation: gelu
  norm_first: true
  bias_ff: false
  bias_attn: false
  bias_proj: false
  causal: true
  custom: false
  memory_efficient: true
  attention_as_float32: false
  layer_scale: null
  positional_embedding: sin
  xpos: false
  checkpointing: torch
  weight_init: gaussian
  depthwise_init: current
  zero_bias_init: true
  norm: layer_norm
  cross_attention: false
  qk_layer_norm: false
  qk_layer_norm_cross: false
  attention_dropout: null
  kv_repeat: 1
codebooks_pattern:
  modeling: delay
  delay:
    delays: [ 0 ]
    flatten_first: 0
    empty_initial: 0
# ================ Conditioners ===================== #
classifier_free_guidance:
  # drop all conditions simultaneously
  training_dropout: 0 # 0.15
  inference_coef: 1.5
attribute_dropout:
  # drop each condition separately
  args:
    active_on_eval: false
  text:
    description: 0.0
    type_info: 0.5
  audio:
    prompt_audio: 0.5
use_text_training: true
fuser:
  sum: []
  prepend: [ description, prompt_audio, type_info ] # this order is the SAME as the input concatenation order
conditioners:
  prompt_audio:
    model: qt_embedding
    qt_embedding:
      code_size: 16384
      code_depth: 1
      max_len: ${eval:${prompt_len}*${audio_tokenizer_frame_rate}+${audio_tokenizer_code_depth}+1} # 10*25 + 1 + 1 = 252
  description:
    model: QwTokenizer
    QwTokenizer:
      token_path: third_party/Qwen2-7B
      max_len: 600
      add_token_list: ${load_yaml:conf/vocab.yaml}
  type_info:
    model: QwTextTokenizer
    QwTextTokenizer:
      token_path: third_party/Qwen2-7B
      max_len: 100
vllm:
  cfg: true
  device_num: 1
  gpu_memory_utilization: 0.75 # set as large as possible
  guidance_scale: 1.8
  temp: 0.8
  top_k: 5000
  top_p: 0.0