Spaces:
Sleeping
Sleeping
File size: 6,265 Bytes
# Hugging Face optimized configuration
# This config is optimized for training on HF Spaces with limited resources
# Random seed shared by every generator seeded below. Holding it in a single
# key keeps the four calls in agreement and lets the value be overridden in
# one place.
seed: 1986

# Each `!apply:` entry calls the named function with the argument list that
# follows it: Python's `random`, NumPy, and PyTorch (plus all CUDA devices).
__set_seed1: !apply:random.seed [!ref <seed>]
__set_seed2: !apply:numpy.random.seed [!ref <seed>]
__set_seed3: !apply:torch.manual_seed [!ref <seed>]
__set_seed4: !apply:torch.cuda.manual_seed_all [!ref <seed>]
# Fixed parameters, sized down for HF. Later sections pull these values in
# with `!ref <name>`.
sample_rate: 24000

# Reduced from 896
llm_input_size: 512
# Reduced from 896
llm_output_size: 512
# Reduced from 192
spk_embed_dim: 128

# Empty in this file; read by the LLM encoder (`pretrain_path`) and by the
# tokenizer (`token_path`).
# NOTE(review): presumably supplied as an override at launch - confirm.
qwen_pretrain_path: ''

token_frame_rate: 25
token_mel_ratio: 2
token_latent_ratio: 3

use_speaker_encoder: True
speaker_encoder_path: '/tmp/checkpoints/llm/best_speaker_encoder.pt'

# Streaming parameters
# Reduced from 25
chunk_size: 16
num_decoding_left_chunks: -1
# Speaker encoder hyperparameters. `output_dim` follows the shared
# `spk_embed_dim` value.
speaker_encoder_config:
  mel_dim: 80
  # Reduced from 512
  model_dim: 256
  output_dim: !ref <spk_embed_dim>
  # Reduced from 6
  num_blocks: 4
  # Reduced from 8
  num_heads: 4
  kernel_size: 1
  dropout: 0.1
  # Reduced from 3
  max_conditioning_inputs: 2
# Smaller LLM for HF: a Qwen2LM that receives a nested Qwen2Encoder and a
# sampling function as constructor arguments.
# NOTE(review): llm_input_size / llm_output_size resolve to 512 here; confirm
# they match the hidden size of the model loaded from `qwen_pretrain_path`.
llm: !new:cosyvoice.llm.llm.Qwen2LM
  llm_input_size: !ref <llm_input_size>
  llm_output_size: !ref <llm_output_size>
  speech_token_size: 6561
  length_normalized_loss: True
  lsm_weight: 0
  # Reduced from [5, 15]
  mix_ratio: [3, 10]
  use_speaker_encoder: !ref <use_speaker_encoder>
  spk_embed_dim: !ref <spk_embed_dim>
  max_conditioning_inputs: 2

  # Nested encoder, pointed at the shared pretrained-model path.
  llm: !new:cosyvoice.llm.llm.Qwen2Encoder
    pretrain_path: !ref <qwen_pretrain_path>

  # `!name:` hands over the function itself with these keyword arguments
  # already bound.
  sampling: !name:cosyvoice.utils.common.ras_sampling
    top_p: 0.8
    top_k: 25
    # Reduced from 10
    win_size: 8
    tau_r: 0.1
# Reference-mel extraction step; listed in `data_pipeline`. The
# `feat_extractor` it references is defined further down in this file.
extract_reference_mel: !name:cosyvoice.dataset.processor.extract_reference_mel_from_speech
  feat_extractor: !ref <feat_extractor>
  sample_rate: !ref <sample_rate>
  min_length: 0.5
  # Reduced from 4.0
  max_length: 3.0
  num_crops: 1
  training: True
# Smaller Flow model for HF
# Builds a CausalMaskedDiffWithXvec; the `encoder` and `decoder` entries
# further down are themselves `!new:` objects.
# NOTE(review): this section has no leading indentation in this copy, so the
# nesting noted in the comments below is inferred from key order - confirm
# against the original file before relying on it.
flow: !new:cosyvoice.flow.flow.CausalMaskedDiffWithXvec
input_size: 256 # Reduced from 512
output_size: 64
spk_embed_dim: !ref <spk_embed_dim>
output_type: 'mel'
vocab_size: 6561
input_frame_rate: !ref <token_frame_rate>
only_mask_loss: True
token_latent_ratio: !ref <token_latent_ratio>
pre_lookahead_len: 2 # Reduced from 3
# Speaker-encoder switches; the flag and checkpoint path come from the
# fixed parameters at the top of the file.
use_speaker_encoder: !ref <use_speaker_encoder>
freeze_speaker_encoder: True
speaker_encoder_path: !ref <speaker_encoder_path>
# Encoder object. Its input_size and output_size are both 256, the same
# value as the flow-level input_size above.
encoder: !new:cosyvoice.transformer.upsample_encoder.UpsampleConformerEncoder
output_size: 256 # Reduced from 512
attention_heads: 4 # Reduced from 8
linear_units: 1024 # Reduced from 2048
num_blocks: 4 # Reduced from 6
dropout_rate: 0.1
positional_dropout_rate: 0.1
attention_dropout_rate: 0.1
normalize_before: True
input_layer: 'linear'
pos_enc_layer_type: 'rel_pos_espnet'
selfattention_layer_type: 'rel_selfattn'
input_size: 256 # Reduced from 512
use_cnn_module: False
macaron_style: False
# Resolves to the top-level chunk_size (16).
static_chunk_size: !ref <chunk_size>
# Decoder object (CausalConditionalCFM).
# NOTE(review): spk_emb_dim is the literal 80 here, while the flow-level
# spk_embed_dim resolves to 128 - confirm the two are meant to differ.
decoder: !new:cosyvoice.flow.flow_matching.CausalConditionalCFM
in_channels: 240
n_spks: 1
spk_emb_dim: 80
# Settings wrapped in an omegaconf DictConfig via its `content` argument.
cfm_params: !new:omegaconf.DictConfig
content:
sigma_min: 1e-06
solver: 'euler'
t_scheduler: 'cosine'
training_cfg_rate: 0.1 # Reduced from 0.2
inference_cfg_rate: 0.5 # Reduced from 0.7
reg_loss_type: 'l1'
# NOTE(review): the next four keys sit between `reg_loss_type` and
# `estimator`; whether they belong inside `cfm_params.content` or are direct
# decoder arguments cannot be determined from this copy - confirm.
use_immiscible: True
immiscible_k: 4 # Reduced from 8
use_contrastive_fm: True
contrastive_lambda: 0.03 # Reduced from 0.05
# Estimator object (CausalConditionalDecoder).
# NOTE(review): in_channels is 320 while out_channels is 64 (same as the
# flow-level output_size) - confirm 320 matches what the estimator is fed.
estimator: !new:cosyvoice.flow.decoder.CausalConditionalDecoder
in_channels: 320
out_channels: 64
channels: [128] # Reduced from [256]
dropout: 0.0
attention_head_dim: 32 # Reduced from 64
n_blocks: 3 # Reduced from 4
num_mid_blocks: 8 # Reduced from 12
num_heads: 4 # Reduced from 8
act_fn: 'gelu'
# Product of the two referenced values: 16 * 3 = 48 with this file's settings.
static_chunk_size: !ref <chunk_size> * <token_latent_ratio>
num_decoding_left_chunks: !ref <num_decoding_left_chunks>
# Processor functions. Entries whose values were changed for HF carry a
# "Reduced from" note next to the value.

# File openers. Only `individual_file_opener` is listed in `data_pipeline`.
individual_file_opener: !name:cosyvoice.dataset.processor.individual_file_opener
parquet_opener: !name:cosyvoice.dataset.processor.parquet_opener

# Shared by `tokenize` below through `!ref <allowed_special>`.
allowed_special: 'all'

# Tokenizer factory; reads the same path as the LLM encoder's
# `pretrain_path`.
get_tokenizer: !name:cosyvoice.tokenizer.tokenizer.get_qwen_tokenizer
  token_path: !ref <qwen_pretrain_path>
  skip_special_tokens: True

tokenize: !name:cosyvoice.dataset.processor.tokenize
  get_tokenizer: !ref <get_tokenizer>
  allowed_special: !ref <allowed_special>
# Length bounds handed to the filter step.
filter: !name:cosyvoice.dataset.processor.filter
  min_length: 100
  # Reduced from 40960
  max_length: 20480
  token_min_length: 1
  # Reduced from 200
  token_max_length: 150

# Resamples to the shared `sample_rate`.
resample: !name:cosyvoice.dataset.processor.resample
  resample_rate: !ref <sample_rate>

# Defined here but not listed in `data_pipeline` in this file.
truncate: !name:cosyvoice.dataset.processor.truncate
  # Reduced from 24480
  truncate_length: 12240
# Mel-spectrogram extractor, referenced by `extract_reference_mel` and
# `compute_fbank`. With sample_rate 24000, a hop_size of 480 gives
# 24000 / 480 = 50 frames per second, which equals
# token_frame_rate (25) * token_mel_ratio (2).
feat_extractor: !name:matcha.utils.audio.mel_spectrogram
  sampling_rate: !ref <sample_rate>
  num_mels: 80
  n_fft: 1920
  win_size: 1920
  hop_size: 480
  fmin: 0
  fmax: 8000
  center: False

compute_fbank: !name:cosyvoice.dataset.processor.compute_fbank
  feat_extractor: !ref <feat_extractor>
  token_mel_ratio: !ref <token_mel_ratio>
# Shuffle / sort buffers, batching and padding steps; all four are listed in
# `data_pipeline`.
shuffle: !name:cosyvoice.dataset.processor.shuffle
  # Reduced from 1000
  shuffle_size: 500

sort: !name:cosyvoice.dataset.processor.sort
  # Reduced from 500
  sort_size: 250

batch: !name:cosyvoice.dataset.processor.batch
  batch_type: 'dynamic'
  # Reduced from 5000
  max_frames_in_batch: 2500

padding: !name:cosyvoice.dataset.processor.padding
  use_speaker_encoder: !ref <use_speaker_encoder>
# Dataset processor pipeline: the processor entries defined in this file, in
# the order listed.
data_pipeline:
  - !ref <individual_file_opener>
  - !ref <tokenize>
  - !ref <filter>
  - !ref <resample>
  - !ref <extract_reference_mel>
  - !ref <compute_fbank>
  - !ref <shuffle>
  - !ref <sort>
  - !ref <batch>
  - !ref <padding>
# Training configuration tuned for HF.
train_conf:
  # Optimizer
  optim: adamw
  optim_conf:
    # Reduced from 5e-5
    lr: 3e-5

  # Learning-rate schedule
  # NOTE(review): confirm that the `constantlr` scheduler actually consumes
  # `warmup_steps`; if it does not, the value below has no effect.
  scheduler: constantlr
  scheduler_conf:
    # Reduced from 500
    warmup_steps: 200

  # Run length and gradient handling
  # Reduced from 2000
  max_epoch: 50
  # Reduced from 1000000000
  total_iters: 100000
  grad_clip: 1
  # Added gradient accumulation
  accum_grad: 2

  # Logging and checkpointing
  # Increased from 5
  log_interval: 10
  # Reduced from 2000
  save_per_step: 1000
|