---
# ================ Train Config ================ #
lyric_processor:
max_dur: 150
min_dur: 30
prompt_len: 10
pad_to_max: true
# ================ Audio tokenizer ================ #
audio_tokenizer_checkpoint: Flow1dVAE1rvq_./ckpt/model_1rvq/model_2_fixed.safetensors
audio_tokenizer_frame_rate: 25
audio_tokenizer_code_depth: 1
sample_rate: 48000
audio_tokenizer_checkpoint_sep: Flow1dVAESeparate_./ckpt/model_septoken/model_2.safetensors
audio_tokenizer_frame_rate_sep: 25
audio_tokenizer_code_depth_sep: 2
sample_rate_sep: 48000
# ================ VAE ================ #
vae_config: ./ckpt/vae/stable_audio_1920_vae.json
vae_model: ./ckpt/vae/autoencoder_music_1320k.ckpt
# ================== LM =========================== #
lm:
lm_type: Llama # [Llama]
dim: 1536
intermediate_size: 8960
num_heads: 12
num_layers: 28
num_layers_sub: 12
code_depth: 3
code_size: 16384
max_position_embeddings: 8196
max_position_embeddings_sub: 10000
rope_theta: 100000.0
rope_theta_sub: 500000.0
dropout: 0.0
use_flash_attn_2: true
activation: gelu
norm_first: true
bias_ff: false
bias_attn: false
causal: true
custom: false
memory_efficient: true
attention_as_float32: false
layer_scale: null
positional_embedding: sin
xpos: false
checkpointing: torch
weight_init: gaussian
depthwise_init: current
zero_bias_init: true
norm: layer_norm
cross_attention: false
qk_layer_norm: false
qk_layer_norm_cross: false
attention_dropout: null
kv_repeat: 1
codebooks_pattern:
modeling: delay
delay:
delays: [ 0, 250, 250 ]
flatten_first: 0
empty_initial: 0
# ================ Conditioners ===================== #
classifier_free_guidance:
# drop all conditions simultaneously
training_dropout: 0.15
inference_coef: 1.5
attribute_dropout:
# drop each condition separately
args:
active_on_eval: false
text:
description: 0.0
type_info: 0.5
audio:
prompt_audio: 0.0
use_text_training: true
fuser:
sum: []
prepend: [ description, prompt_audio, type_info ] # this order is the SAME with the input concatenation order
conditioners:
prompt_audio:
model: qt_embedding
qt_embedding:
code_size: 16384
code_depth: 3
max_len: ${eval:${prompt_len}*${audio_tokenizer_frame_rate}+2} # prompt_len * frame_rate + 2 = 10*25+2 = 252
description:
model: QwTokenizer
QwTokenizer:
token_path: third_party/Qwen2-7B
max_len: 300
add_token_list: ${load_yaml:conf/vocab.yaml}
type_info:
model: QwTextTokenizer
QwTextTokenizer:
token_path: third_party/Qwen2-7B
max_len: 50
offload:
audiolm:
offload_module: self
cpu_mem_gb: 0
pre_copy_step: 1
clean_cache_after_forward: false
dtype: torch.float16
offload_layer_dict:
transformer: 4
transformer2: 4
ignore_layer_list: []
clean_cache_wrapper:
module: self
method_name: _sample_next_token
diff_mem_gb_thre: 2
debug: false
wav_tokenizer_diffusion:
offload_module: self.model.model
pre_copy_step: 1
clean_cache_after_forward: false
cpu_mem_gb: -1
dtype: null
offload_layer_dict:
cfm_wrapper: 5
hubert: 4
ignore_layer_list: []
clean_cache_wrapper:
module: self.model.model.cfm_wrapper.estimator
method_name: forward
diff_mem_gb_thre: 1
debug: false