# Source: Hugging Face Space mnhatdaous/learnable-speech
# Space status when captured: Sleeping
# File size: 6,265 bytes
# Revision: 248479c

# Hugging Face optimized configuration
# This config is optimized for training on HF Spaces with limited resources

# Seed the Python, NumPy and PyTorch (CPU and all CUDA devices) RNGs with
# the same value. Each `!apply:` entry calls the named function with the
# listed positional argument.
__set_seed1: !apply:random.seed
  - 1986
__set_seed2: !apply:numpy.random.seed
  - 1986
__set_seed3: !apply:torch.manual_seed
  - 1986
__set_seed4: !apply:torch.cuda.manual_seed_all
  - 1986

# Global hyper-parameters referenced (via `!ref`) by the sections below.
# "was N" marks a value that was lowered for Hugging Face Spaces.
sample_rate: 24000
llm_input_size: 512  # was 896
llm_output_size: 512  # was 896
spk_embed_dim: 128  # was 192
# Empty string: no Qwen checkpoint path is set in this config.
qwen_pretrain_path: ''
token_frame_rate: 25
token_mel_ratio: 2
token_latent_ratio: 3
use_speaker_encoder: true
speaker_encoder_path: '/tmp/checkpoints/llm/best_speaker_encoder.pt'

# Streaming / chunking parameters.
# chunk_size was 25 in the full-size configuration.
chunk_size: 16
# NOTE(review): presumably -1 means "no limit on left chunks" - confirm
# against the encoder/decoder implementation.
num_decoding_left_chunks: -1

# Speaker-encoder hyper-parameters; "was N" marks values lowered for HF.
speaker_encoder_config:
  mel_dim: 80
  model_dim: 256  # was 512
  # Kept equal to the global speaker-embedding width.
  output_dim: !ref <spk_embed_dim>
  num_blocks: 4  # was 6
  num_heads: 4  # was 8
  kernel_size: 1
  dropout: 0.1
  max_conditioning_inputs: 2  # was 3

# Language model, sized down for Hugging Face Spaces.
llm: !new:cosyvoice.llm.llm.Qwen2LM
  # Input/output widths follow the global settings.
  llm_input_size: !ref <llm_input_size>
  llm_output_size: !ref <llm_output_size>
  speech_token_size: 6561
  length_normalized_loss: true
  lsm_weight: 0
  mix_ratio: [3, 10]  # was [5, 15]
  use_speaker_encoder: !ref <use_speaker_encoder>
  spk_embed_dim: !ref <spk_embed_dim>
  # NOTE(review): repeats speaker_encoder_config.max_conditioning_inputs
  # (also 2); presumably the two must stay equal - confirm.
  max_conditioning_inputs: 2
  # Inner encoder built from the configured Qwen path.
  llm: !new:cosyvoice.llm.llm.Qwen2Encoder
    pretrain_path: !ref <qwen_pretrain_path>
  # Sampling callable with its keyword arguments pre-bound.
  sampling: !name:cosyvoice.utils.common.ras_sampling
    top_p: 0.8
    top_k: 25
    win_size: 8  # was 10
    tau_r: 0.1

# Reference-mel extraction step used by the data pipeline.
# NOTE(review): `feat_extractor` is defined further down in this file;
# confirm the loader resolves forward references.
extract_reference_mel: !name:cosyvoice.dataset.processor.extract_reference_mel_from_speech
  feat_extractor: !ref <feat_extractor>
  min_length: 0.5
  max_length: 3.0  # was 4.0
  num_crops: 1
  training: true
  sample_rate: !ref <sample_rate>

# Flow-matching model, sized down for Hugging Face Spaces.
# "was N" marks a value that was lowered relative to the full-size config.
flow: !new:cosyvoice.flow.flow.CausalMaskedDiffWithXvec
  input_size: 256  # was 512
  output_size: 64
  spk_embed_dim: !ref <spk_embed_dim>
  output_type: 'mel'
  vocab_size: 6561
  input_frame_rate: !ref <token_frame_rate>
  only_mask_loss: true
  token_latent_ratio: !ref <token_latent_ratio>
  pre_lookahead_len: 2  # was 3
  use_speaker_encoder: !ref <use_speaker_encoder>
  freeze_speaker_encoder: true
  speaker_encoder_path: !ref <speaker_encoder_path>
  encoder: !new:cosyvoice.transformer.upsample_encoder.UpsampleConformerEncoder
    output_size: 256  # was 512
    attention_heads: 4  # was 8
    linear_units: 1024  # was 2048
    num_blocks: 4  # was 6
    dropout_rate: 0.1
    positional_dropout_rate: 0.1
    attention_dropout_rate: 0.1
    normalize_before: true
    input_layer: 'linear'
    pos_enc_layer_type: 'rel_pos_espnet'
    selfattention_layer_type: 'rel_selfattn'
    input_size: 256  # was 512
    use_cnn_module: false
    macaron_style: false
    static_chunk_size: !ref <chunk_size>
  decoder: !new:cosyvoice.flow.flow_matching.CausalConditionalCFM
    # NOTE(review): in_channels / spk_emb_dim here (240 / 80) and the
    # estimator's in_channels / out_channels (320 / 64) are literal values,
    # not derived from output_size or spk_embed_dim - confirm they are
    # consistent with the reduced model.
    in_channels: 240
    n_spks: 1
    spk_emb_dim: 80
    cfm_params: !new:omegaconf.DictConfig
      content:
        # Written with a decimal point so YAML 1.1 loaders (which require
        # one in the mantissa) also read it as a float, not a string.
        sigma_min: 1.0e-06
        solver: 'euler'
        t_scheduler: 'cosine'
        training_cfg_rate: 0.1  # was 0.2
        inference_cfg_rate: 0.5  # was 0.7
        reg_loss_type: 'l1'
        use_immiscible: true
        immiscible_k: 4  # was 8
        use_contrastive_fm: true
        contrastive_lambda: 0.03  # was 0.05
    estimator: !new:cosyvoice.flow.decoder.CausalConditionalDecoder
      in_channels: 320
      out_channels: 64
      channels: [128]  # was [256]
      dropout: 0.0
      attention_head_dim: 32  # was 64
      n_blocks: 3  # was 4
      num_mid_blocks: 8  # was 12
      num_heads: 4  # was 8
      act_fn: 'gelu'
      # chunk_size (16) * token_latent_ratio (3) = 48 with the values
      # currently set in this file.
      static_chunk_size: !ref <chunk_size> * <token_latent_ratio>
      num_decoding_left_chunks: !ref <num_decoding_left_chunks>

# Dataset processor callables (targets unchanged from the base config).
# Each `!name:` entry binds the listed keyword arguments to the callable.
# parquet_opener is defined here but is not listed in data_pipeline.
individual_file_opener: !name:cosyvoice.dataset.processor.individual_file_opener
parquet_opener: !name:cosyvoice.dataset.processor.parquet_opener
# Tokenizer factory; reads from the configured Qwen path.
get_tokenizer: !name:cosyvoice.tokenizer.tokenizer.get_qwen_tokenizer
  token_path: !ref <qwen_pretrain_path>
  skip_special_tokens: true
allowed_special: 'all'
tokenize: !name:cosyvoice.dataset.processor.tokenize
  get_tokenizer: !ref <get_tokenizer>
  allowed_special: !ref <allowed_special>
# Length limits applied to samples; "was N" marks values lowered for HF.
filter: !name:cosyvoice.dataset.processor.filter
  max_length: 20480  # was 40960
  min_length: 100
  token_max_length: 150  # was 200
  token_min_length: 1
# Resample audio to the global sample rate.
resample: !name:cosyvoice.dataset.processor.resample
  resample_rate: !ref <sample_rate>
# Defined here but not listed in data_pipeline.
truncate: !name:cosyvoice.dataset.processor.truncate
  truncate_length: 12240  # was 24480
# Mel-spectrogram extractor shared by compute_fbank and
# extract_reference_mel.
# With sample_rate 24000 and hop_size 480 there are 24000 / 480 = 50 hops
# per second, which equals token_frame_rate (25) * token_mel_ratio (2).
feat_extractor: !name:matcha.utils.audio.mel_spectrogram
  n_fft: 1920
  num_mels: 80
  sampling_rate: !ref <sample_rate>
  hop_size: 480
  win_size: 1920
  fmin: 0
  fmax: 8000
  center: false
# Feature computation step; reuses the extractor above.
compute_fbank: !name:cosyvoice.dataset.processor.compute_fbank
  feat_extractor: !ref <feat_extractor>
  token_mel_ratio: !ref <token_mel_ratio>
# Shuffling, sorting, batching and padding steps.
# Buffer and batch sizes are half of the full-size configuration.
shuffle: !name:cosyvoice.dataset.processor.shuffle
  shuffle_size: 500  # was 1000
sort: !name:cosyvoice.dataset.processor.sort
  sort_size: 250  # was 500
batch: !name:cosyvoice.dataset.processor.batch
  batch_type: 'dynamic'
  max_frames_in_batch: 2500  # was 5000
padding: !name:cosyvoice.dataset.processor.padding
  use_speaker_encoder: !ref <use_speaker_encoder>

# Ordered list of dataset processing steps; the sequence order is kept
# exactly as listed.
data_pipeline:
  - !ref <individual_file_opener>
  - !ref <tokenize>
  - !ref <filter>
  - !ref <resample>
  - !ref <extract_reference_mel>
  - !ref <compute_fbank>
  - !ref <shuffle>
  - !ref <sort>
  - !ref <batch>
  - !ref <padding>

# Training settings tuned for Hugging Face Spaces.
# "was N" marks a value changed relative to the full-size configuration.
train_conf:
  optim: adamw
  optim_conf:
    # was 5e-5. Written with a decimal point so YAML 1.1 loaders (which
    # require one in the mantissa) also read it as a float, not a string.
    lr: 3.0e-5
  scheduler: constantlr
  # NOTE(review): a constant-LR scheduler may not consume warmup_steps;
  # confirm against the trainer whether this setting has any effect.
  scheduler_conf:
    warmup_steps: 200  # was 500
  max_epoch: 50  # was 2000
  grad_clip: 1
  accum_grad: 2  # gradient accumulation added for this config
  log_interval: 10  # was 5
  save_per_step: 1000  # was 2000
  total_iters: 100000  # was 1000000000