# SongGeneration-v2.0 / ckpt / config.yaml
# (repo-browser header captured with the file: committer "root",
#  commit message "init commit", commit ffd9215)
# ================ Train Config ================ #
lyric_processor:
max_dur: 270
min_dur: 30
prompt_len: 10
pad_to_max: true
# ================ Audio tokenizer ================ #
audio_tokenizer_checkpoint: Flow1dVAE1rvq_./ckpt/model_1rvq/model_2_fixed.safetensors
audio_tokenizer_frame_rate: 25
audio_tokenizer_code_depth: 1
sample_rate: 48000
mode: 'inference'
# ================ VAE ================ #
vae_config: ckpt/vae/stable_audio_1920_vae.json
vae_model: ckpt/vae/autoencoder_music_1320k.ckpt
# ================ LM Pretrain ================ #
lm_checkpoint: ckpt/songgeneration2
# ================== LM =========================== #
lm:
  lm_type: Llama # [Llama]
  dim: 2048
  intermediate_size: 11008
  num_heads: 16
  num_layers: 36
  code_depth: 1
  code_size: 16384
  dropout: 0.0
  activation: gelu
  norm_first: true
  bias_ff: false
  bias_attn: false
  bias_proj: false
  causal: true
  custom: false
  memory_efficient: true
  attention_as_float32: false
  layer_scale: null
  positional_embedding: sin
  xpos: false
  checkpointing: torch
  weight_init: gaussian
  depthwise_init: current
  zero_bias_init: true
  norm: layer_norm
  cross_attention: false
  qk_layer_norm: false
  qk_layer_norm_cross: false
  attention_dropout: null
  kv_repeat: 1
codebooks_pattern:
  modeling: delay
  delay:
    delays: [ 0 ]
    flatten_first: 0
    empty_initial: 0
# ================ Conditioners ===================== #
classifier_free_guidance:
  # drop all conditions simultaneously
  training_dropout: 0 # 0.15
  inference_coef: 1.5
attribute_dropout:
  # drop each condition separately
  args:
    active_on_eval: false
  text:
    description: 0.0
    type_info: 0.5
  audio:
    prompt_audio: 0.5
use_text_training: true
fuser:
  sum: []
  prepend: [ description, prompt_audio, type_info ] # this order is the SAME as the input concatenation order
conditioners:
  prompt_audio:
    model: qt_embedding
    qt_embedding:
      code_size: 16384
      code_depth: 1
      max_len: ${eval:${prompt_len}*${audio_tokenizer_frame_rate}+${audio_tokenizer_code_depth}+1} # 10*25 + 1 + 1 = 252
  description:
    model: QwTokenizer
    QwTokenizer:
      token_path: third_party/Qwen2-7B
      max_len: 600
      add_token_list: ${load_yaml:conf/vocab.yaml}
  type_info:
    model: QwTextTokenizer
    QwTextTokenizer:
      token_path: third_party/Qwen2-7B
      max_len: 100
vllm:
  cfg: true
  device_num: 1
  gpu_memory_utilization: 0.75 # set as large as possible
  guidance_scale: 1.8
  temp: 0.8
  top_k: 5000
  top_p: 0.0