File size: 6,265 Bytes
248479c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
# Hugging Face optimized configuration
# This config is optimized for training on HF Spaces with limited resources

# Random seed shared by every RNG below. It is defined once so that editing it
# (or overriding `seed` when the config is loaded) keeps all four generators
# in agreement instead of relying on four separate literals.
seed: 1986
# `!apply:` calls the named function with the listed arguments while the
# config is being loaded; each `__set_seed*` key holds that call's result.
__set_seed1: !apply:random.seed [!ref <seed>]
__set_seed2: !apply:numpy.random.seed [!ref <seed>]
__set_seed3: !apply:torch.manual_seed [!ref <seed>]
__set_seed4: !apply:torch.cuda.manual_seed_all [!ref <seed>]

# fixed params - optimized for HF
# Scalars shared across this file; sections below read them via `!ref <name>`.
# Referenced by extract_reference_mel, resample and feat_extractor.
sample_rate: 24000
# Input/output widths passed to the Qwen2LM `llm` section.
# NOTE(review): presumably these must match the hidden size of the model
# loaded from qwen_pretrain_path - confirm 512 is valid for that checkpoint.
llm_input_size: 512 # Reduced from 896
llm_output_size: 512 # Reduced from 896
# Referenced by speaker_encoder_config.output_dim, llm and flow.
spk_embed_dim: 128 # Reduced from 192
# Empty in this file; referenced by the Qwen2Encoder pretrain_path and by
# get_tokenizer.token_path.
# NOTE(review): presumably supplied as an override at load time - confirm.
qwen_pretrain_path: ''
# Referenced by flow.input_frame_rate.
token_frame_rate: 25
# Referenced by compute_fbank.
token_mel_ratio: 2
# Referenced by flow and multiplied into the flow estimator's
# static_chunk_size.
token_latent_ratio: 3
# Forwarded to llm, flow and padding.
use_speaker_encoder: True
# Referenced by flow.speaker_encoder_path.
speaker_encoder_path: '/tmp/checkpoints/llm/best_speaker_encoder.pt'

# stream related params
# chunk_size feeds static_chunk_size in the flow encoder and, multiplied by
# token_latent_ratio, in the flow estimator.
chunk_size: 16 # Reduced from 25
# Passed through to the flow estimator.
# NOTE(review): -1 presumably means "use all left chunks" - confirm.
num_decoding_left_chunks: -1

# Settings for the speaker encoder. Nothing visible in this file references
# this mapping, so it is presumably read directly by the training code -
# verify against the consumer.
speaker_encoder_config:
  # Same value as feat_extractor.num_mels (80).
  # NOTE(review): presumably the two must stay equal - confirm.
  mel_dim: 80
  model_dim: 256 # Reduced from 512
  # Follows the shared spk_embed_dim (128).
  output_dim: !ref <spk_embed_dim>
  num_blocks: 4 # Reduced from 6
  num_heads: 4 # Reduced from 8
  kernel_size: 1
  dropout: 0.1
  # The llm section sets the same key to the same value (2).
  max_conditioning_inputs: 2 # Reduced from 3

# Smaller LLM model for HF
# `!new:` instantiates the named class with the nested keys as keyword
# arguments (HyperPyYAML).
llm: !new:cosyvoice.llm.llm.Qwen2LM
  llm_input_size: !ref <llm_input_size>
  llm_output_size: !ref <llm_output_size>
  # Same value as flow.vocab_size (6561).
  speech_token_size: 6561
  length_normalized_loss: True
  lsm_weight: 0
  mix_ratio: [3, 10] # Reduced from [5, 15]
  use_speaker_encoder: !ref <use_speaker_encoder>
  spk_embed_dim: !ref <spk_embed_dim>
  # NOTE(review): literal copy of speaker_encoder_config.max_conditioning_inputs
  # (also 2); presumably the two must be kept in sync - confirm.
  max_conditioning_inputs: 2
  # Text LLM backbone; pretrain_path follows qwen_pretrain_path, which is
  # empty in this file.
  llm: !new:cosyvoice.llm.llm.Qwen2Encoder
    pretrain_path: !ref <qwen_pretrain_path>
  # `!name:` passes the function itself, with the keyword arguments below
  # bound, rather than calling it at load time.
  sampling: !name:cosyvoice.utils.common.ras_sampling
    top_p: 0.8
    top_k: 25
    win_size: 8 # Reduced from 10
    tau_r: 0.1

# Reference-mel stage used by data_pipeline. `!name:` binds the keyword
# arguments below to the processor function without calling it. The tag sits
# on the key line, like every other tagged entry in this file.
extract_reference_mel: !name:cosyvoice.dataset.processor.extract_reference_mel_from_speech
  # Shares the mel extractor defined under the top-level feat_extractor key.
  feat_extractor: !ref <feat_extractor>
  min_length: 0.5
  max_length: 3.0 # Reduced from 4.0
  num_crops: 1
  training: True
  sample_rate: !ref <sample_rate>

# Smaller Flow model for HF
# `!new:` instantiates each named class with the nested keys as keyword
# arguments; the encoder and decoder objects are built first and passed in.
flow: !new:cosyvoice.flow.flow.CausalMaskedDiffWithXvec
  # Same value as the encoder's input_size and output_size below (256).
  input_size: 256 # Reduced from 512
  # Same value as the estimator's out_channels below (64).
  output_size: 64
  spk_embed_dim: !ref <spk_embed_dim>
  output_type: 'mel'
  # Same value as llm.speech_token_size (6561).
  vocab_size: 6561
  input_frame_rate: !ref <token_frame_rate>
  only_mask_loss: True
  token_latent_ratio: !ref <token_latent_ratio>
  pre_lookahead_len: 2 # Reduced from 3
  use_speaker_encoder: !ref <use_speaker_encoder>
  freeze_speaker_encoder: True
  speaker_encoder_path: !ref <speaker_encoder_path>
  encoder: !new:cosyvoice.transformer.upsample_encoder.UpsampleConformerEncoder
    output_size: 256 # Reduced from 512
    attention_heads: 4 # Reduced from 8
    linear_units: 1024 # Reduced from 2048
    num_blocks: 4 # Reduced from 6
    dropout_rate: 0.1
    positional_dropout_rate: 0.1
    attention_dropout_rate: 0.1
    normalize_before: True
    input_layer: 'linear'
    pos_enc_layer_type: 'rel_pos_espnet'
    selfattention_layer_type: 'rel_selfattn'
    input_size: 256 # Reduced from 512
    use_cnn_module: False
    macaron_style: False
    # Follows the shared chunk_size (16).
    static_chunk_size: !ref <chunk_size>
  decoder: !new:cosyvoice.flow.flow_matching.CausalConditionalCFM
    in_channels: 240
    n_spks: 1
    spk_emb_dim: 80
    # Flow-matching options, wrapped in an omegaconf DictConfig built from the
    # `content` mapping.
    cfm_params: !new:omegaconf.DictConfig
      content:
        # Written with an explicit decimal point so YAML 1.1 loaders (e.g.
        # PyYAML) also resolve it as a float; they read a bare `1e-06` as a
        # string.
        sigma_min: 1.0e-06
        solver: 'euler'
        t_scheduler: 'cosine'
        training_cfg_rate: 0.1 # Reduced from 0.2
        inference_cfg_rate: 0.5 # Reduced from 0.7
        reg_loss_type: 'l1'
        use_immiscible: True
        immiscible_k: 4 # Reduced from 8
        use_contrastive_fm: True
        contrastive_lambda: 0.03 # Reduced from 0.05
    estimator: !new:cosyvoice.flow.decoder.CausalConditionalDecoder
      in_channels: 320
      out_channels: 64
      channels: [128] # Reduced from [256]
      dropout: 0.0
      attention_head_dim: 32 # Reduced from 64
      n_blocks: 3 # Reduced from 4
      num_mid_blocks: 8 # Reduced from 12
      num_heads: 4 # Reduced from 8
      act_fn: 'gelu'
      # Arithmetic on references: 16 * 3 = 48 with the values in this file.
      static_chunk_size: !ref <chunk_size> * <token_latent_ratio>
      num_decoding_left_chunks: !ref <num_decoding_left_chunks>

# Processor functions (callables unchanged; reduced limits are marked inline)
# Each `!name:` entry names a function together with the keyword arguments
# listed beneath it; the function is not called at load time.
# First entry of data_pipeline.
individual_file_opener: !name:cosyvoice.dataset.processor.individual_file_opener
# Defined here but not referenced by data_pipeline in this file.
parquet_opener: !name:cosyvoice.dataset.processor.parquet_opener
# token_path follows qwen_pretrain_path, which is empty in this file.
get_tokenizer: !name:cosyvoice.tokenizer.tokenizer.get_qwen_tokenizer
  token_path: !ref <qwen_pretrain_path>
  skip_special_tokens: True
# Referenced by tokenize below.
allowed_special: 'all'
tokenize: !name:cosyvoice.dataset.processor.tokenize
  get_tokenizer: !ref <get_tokenizer>
  allowed_special: !ref <allowed_special>
# Length limits for the filter stage of data_pipeline.
# NOTE(review): the units of max_length/min_length are not visible here -
# confirm against the processor implementation.
filter: !name:cosyvoice.dataset.processor.filter
  max_length: 20480 # Reduced from 40960
  min_length: 100
  token_max_length: 150 # Reduced from 200
  token_min_length: 1
# resample_rate follows the shared sample_rate (24000).
resample: !name:cosyvoice.dataset.processor.resample
  resample_rate: !ref <sample_rate>
# Defined here but not referenced by data_pipeline in this file.
truncate: !name:cosyvoice.dataset.processor.truncate
  truncate_length: 12240 # Reduced from 24480
# Mel-spectrogram function shared by extract_reference_mel and compute_fbank.
# NOTE(review): sample_rate / hop_size = 24000 / 480 = 50, which equals
# token_frame_rate * token_mel_ratio (25 * 2); presumably these must stay
# consistent - confirm before changing any of them.
feat_extractor: !name:matcha.utils.audio.mel_spectrogram
  n_fft: 1920
  # Same value as speaker_encoder_config.mel_dim (80).
  num_mels: 80
  sampling_rate: !ref <sample_rate>
  hop_size: 480
  win_size: 1920
  fmin: 0
  fmax: 8000
  center: False
# Feature stage of data_pipeline; reuses feat_extractor and token_mel_ratio.
compute_fbank: !name:cosyvoice.dataset.processor.compute_fbank
  feat_extractor: !ref <feat_extractor>
  token_mel_ratio: !ref <token_mel_ratio>
# Last four entries of data_pipeline: shuffle, sort, batch, padding.
shuffle: !name:cosyvoice.dataset.processor.shuffle
  shuffle_size: 500 # Reduced from 1000
sort: !name:cosyvoice.dataset.processor.sort
  sort_size: 250 # Reduced from 500
batch: !name:cosyvoice.dataset.processor.batch
  batch_type: 'dynamic'
  max_frames_in_batch: 2500 # Reduced from 5000
# use_speaker_encoder follows the shared flag (True in this file).
padding: !name:cosyvoice.dataset.processor.padding
  use_speaker_encoder: !ref <use_speaker_encoder>

# dataset processor pipeline
# Sequence of the processor entries defined in this file, one stage per line
# (presumably applied top to bottom). `parquet_opener` and `truncate` are
# defined in this file but are not part of this list.
data_pipeline:
  - !ref <individual_file_opener>
  - !ref <tokenize>
  - !ref <filter>
  - !ref <resample>
  - !ref <extract_reference_mel>
  - !ref <compute_fbank>
  - !ref <shuffle>
  - !ref <sort>
  - !ref <batch>
  - !ref <padding>

# HF optimized training configuration
# Plain mapping of training options; it contains no object tags or references.
train_conf:
  optim: adamw
  optim_conf:
    # Written with an explicit decimal point so YAML 1.1 loaders (e.g. PyYAML)
    # also resolve it as a float; they read a bare `3e-5` as a string.
    lr: 3.0e-5 # Reduced from 5e-5
  scheduler: constantlr
  scheduler_conf:
    # NOTE(review): confirm that the `constantlr` scheduler actually reads
    # warmup_steps; a constant schedule may ignore this value.
    warmup_steps: 200 # Reduced from 500
  max_epoch: 50 # Reduced from 2000
  grad_clip: 1
  accum_grad: 2 # Added gradient accumulation
  log_interval: 10 # Increased from 5
  save_per_step: 1000 # Reduced from 2000
  total_iters: 100000 # Reduced from 1000000000