File size: 3,349 Bytes
0730c18
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
# ================ Train Config ================ #
# `lyric_processor` has no value of its own. The four keys after it are NOT its
# children: they sit at column 0, and `prompt_len` is referenced further down as
# `${prompt_len}` with no `lyric_processor.` prefix. The null is written out so
# this line is not mistaken for a section header.
lyric_processor: null
# NOTE(review): units are not stated here; presumably seconds of audio - confirm.
max_dur: 150
min_dur: 30
# Multiplied by `audio_tokenizer_frame_rate` in
# conditioners.prompt_audio.qt_embedding.max_len.
prompt_len: 10
pad_to_max: true


# ================ Audio tokenizer ================ #
# First tokenizer: checkpoint, frame rate, code depth and sample rate.
# NOTE(review): the checkpoint value looks like "<tokenizer type>_<path>"
# (here "Flow1dVAE1rvq" + "_" + "./ckpt/..."); confirm against the loader.
audio_tokenizer_checkpoint: Flow1dVAE1rvq_./ckpt/model_1rvq/model_2_fixed.safetensors
# Referenced as ${audio_tokenizer_frame_rate} by
# conditioners.prompt_audio.qt_embedding.max_len.
audio_tokenizer_frame_rate: 25
audio_tokenizer_code_depth: 1
sample_rate: 48000

# Second tokenizer: the same four settings with a `_sep` suffix. Frame rate and
# sample rate equal the first tokenizer's; code depth is 2 instead of 1.
# NOTE(review): 1 + 2 equals lm.code_depth (3); presumably related - confirm.
audio_tokenizer_checkpoint_sep: Flow1dVAESeparate_./ckpt/model_septoken/model_2.safetensors
audio_tokenizer_frame_rate_sep: 25
audio_tokenizer_code_depth_sep: 2
sample_rate_sep: 48000

# ================ VAE ================ #
# VAE files: a JSON config and a checkpoint, both under ./ckpt/vae/.
# NOTE(review): relative paths; presumably resolved from the process working
# directory - confirm.
vae_config: ./ckpt/vae/stable_audio_1920_vae.json
vae_model: ./ckpt/vae/autoencoder_music_1320k.ckpt

# ================== LM =========================== #
# Language-model hyper-parameters.
# NOTE(review): keys ending in `_sub` mirror an unsuffixed key (num_layers,
# max_position_embeddings, rope_theta); presumably they configure a second
# stack - confirm against the model code.
lm:
  lm_type: Llama # [Llama]
  # 1536 / 12 heads = 128; presumably the per-head width - confirm.
  dim: 1536
  intermediate_size: 8960
  num_heads: 12
  num_layers: 28
  num_layers_sub: 12
  # code_depth and code_size carry the same values as
  # conditioners.prompt_audio.qt_embedding; presumably they must stay in sync.
  code_depth: 3
  code_size: 16384
  # NOTE(review): 8196 is not the power of two 8192 - confirm it is intentional
  # before changing it.
  max_position_embeddings: 8196
  max_position_embeddings_sub: 10000
  rope_theta: 100000.0
  rope_theta_sub: 500000.0
  dropout: 0.0
  use_flash_attn_2: true
  activation: gelu
  norm_first: true
  bias_ff: false
  bias_attn: false
  causal: true
  custom: false
  memory_efficient: true
  attention_as_float32: false
  layer_scale: null
  # NOTE(review): `sin` is set here while rope_theta values are set above;
  # confirm which positional scheme the Llama path actually reads.
  positional_embedding: sin
  xpos: false
  checkpointing: torch
  weight_init: gaussian
  depthwise_init: current
  zero_bias_init: true
  norm: layer_norm
  cross_attention: false
  qk_layer_norm: false
  qk_layer_norm_cross: false
  attention_dropout: null
  kv_repeat: 1

# Codebook layout. `modeling` names the pattern in use; the only pattern
# section defined in this file is `delay`.
codebooks_pattern:
  modeling: delay
  delay:
    # Three entries, the same count as lm.code_depth (3).
    # NOTE(review): 250 equals prompt_len * audio_tokenizer_frame_rate (10 * 25);
    # confirm whether that link is intended.
    delays:
      - 0
      - 250
      - 250
    flatten_first: 0
    empty_initial: 0

# ================ Conditioners ===================== #
# Classifier-free guidance settings: one value used in training, one at inference.
classifier_free_guidance:
  # drop all conditions simultaneously
  # NOTE(review): presumably a per-sample probability - confirm.
  training_dropout: 0.15
  inference_coef: 1.5

# Per-condition dropout, grouped by modality (`text`, `audio`). The condition
# names used here (description, type_info, prompt_audio) are the same names
# that appear under `conditioners` and in `fuser.prepend`.
attribute_dropout:
  # drop each condition separately
  args:
    active_on_eval: false
  text:
    description: 0.0
    type_info: 0.5
  audio:
    prompt_audio: 0.0


# Boolean flag, written in lowercase like every other boolean in this file.
# NOTE(review): its effect is defined by the consuming code, not shown here.
use_text_training: true
# How conditions are combined: none are summed, three are prepended.
fuser:
  sum: []
  # The order of this list is the same as the input concatenation order.
  prepend:
    - description
    - prompt_audio
    - type_info

# One entry per conditioning input. The entry names (prompt_audio, description,
# type_info) are the same names listed in fuser.prepend and attribute_dropout.
# Each entry names a `model` and holds a sub-section keyed by that model name.
conditioners:
  prompt_audio:
    model: qt_embedding
    qt_embedding:
      # Same values as lm.code_size / lm.code_depth.
      code_size: 16384
      code_depth: 3
      # Built from the top-level prompt_len and audio_tokenizer_frame_rate.
      # NOTE(review): the inline note used to read "25*10+2+1", which does not
      # match the "+2" in the expression; confirm whether an extra +1 is needed.
      # The arithmetic assumes the project's `eval` resolver evaluates it.
      max_len: ${eval:${prompt_len}*${audio_tokenizer_frame_rate}+2} # 10*25+2 = 252 with this file's values
  description:
    model: QwTokenizer
    QwTokenizer:
      token_path: third_party/Qwen2-7B
      max_len: 300
      # Filled in by the `load_yaml` resolver from conf/vocab.yaml; the resolver
      # itself is defined outside this file.
      add_token_list: ${load_yaml:conf/vocab.yaml}
  type_info:
    model: QwTextTokenizer
    QwTextTokenizer:
      # Same token_path as `description`, with a shorter max_len (50 vs 300).
      token_path: third_party/Qwen2-7B
      max_len: 50

# Offload profiles. Two profiles are defined (audiolm, wav_tokenizer_diffusion)
# and both use the same set of keys.
# NOTE(review): values such as `self` and `self.model.model` look like attribute
# paths resolved on the object being offloaded - confirm against the offload code.
offload:
  audiolm:
    offload_module: self
    # NOTE(review): meaning of 0 here versus -1 in the other profile is not
    # stated in this file - confirm.
    cpu_mem_gb: 0
    pre_copy_step: 1
    clean_cache_after_forward: false
    dtype: torch.float16
    # Maps a sub-module name to an integer.
    # NOTE(review): presumably a layer count or granularity - confirm.
    offload_layer_dict:
      transformer: 4
      transformer2: 4
    ignore_layer_list: []
    # Names a module and one of its methods, plus a threshold in GB (per the
    # key name `diff_mem_gb_thre`).
    clean_cache_wrapper:
      module: self
      method_name: _sample_next_token
      diff_mem_gb_thre: 2
    debug: false

  wav_tokenizer_diffusion:
    offload_module: self.model.model
    pre_copy_step: 1
    clean_cache_after_forward: false
    cpu_mem_gb: -1
    # Unlike the audiolm profile (torch.float16), no dtype is set here.
    dtype: null
    offload_layer_dict:
      cfm_wrapper: 5
      hubert: 4
    ignore_layer_list: []
    clean_cache_wrapper:
      module: self.model.model.cfm_wrapper.estimator
      method_name: forward
      diff_mem_gb_thre: 1
    debug: false