---
# Training configuration for the F5TTS_PPG_ONLY experiment.
# NOTE(review): the original file was collapsed onto a single line (invalid
# YAML); the nesting below is reconstructed from key order and semantics —
# confirm section boundaries against the config loader.

# Audio / mel-spectrogram front-end and dataset locations.
dataset:
  target_sample_rate: 24000
  n_mel_channels: 100
  hop_length: 256
  win_length: 1024
  n_fft: 1024
  mel_spec_type: vocos
  tokenizer: pinyin
  tokenizer_path: data/HFDatasetNew_pinyin/vocab.txt
  name: /apdcephfs_cq10/share_1297902/user/nenali/project/chukewang/data/Emilia-Dataset
  type: HFDatasetNew
  cache_dir: /apdcephfs_cq10/share_1297902/user/nenali/project/chukewang/data/Emilia-Dataset-Cache
  duration_path: scripts/duration.json
  valid_text_path: scripts/valid.json

# Optimizer / scheduler / checkpointing hyperparameters.
training:
  exp_name: F5TTS_PPG_ONLY
  learning_rate: 7.5e-05
  batch_size_per_gpu: 4096  # measured in frames, per batch_size_type below
  batch_size_type: frame
  max_samples: 64
  grad_accumulation_steps: 4
  max_grad_norm: 1.0
  epochs: 11
  num_warmup_updates: 20000
  save_per_updates: 20000
  last_per_steps: 5000
  checkpoint_path: ckpts/ppg_only_cq
  # Per-condition dropout probabilities; 1 drops that condition always.
  combined_cond_drop_prob:
    - 1
    - 0
    - 0
    - 0
  repeat_space_token: false
  mix_condition: false

# Model architecture selection and per-architecture hyperparameters.
model:
  # NOTE(review): plain `None` is loaded as the *string* "None", not YAML
  # null. If the trainer expects a real null here, change this to `null` —
  # left as-is because the consumer may compare against the string.
  wandb_resume_id: None
  cls: DiT
  cfg:
    DiT:
      dim: 1024
      depth: 22
      heads: 16
      ff_mult: 2
      text_dim: 512
      conv_layers: 4
    UNetT:
      dim: 1024
      depth: 24
      heads: 16
      ff_mult: 4
  # NOTE(review): assumed to be a model-level flag (followed cfg in the
  # flattened dump) — confirm nesting against the loader.
  ppg_input: true

# PPG (phonetic posteriorgram) extractor settings.
# NOTE(review): assumed top-level section like dataset/training/model —
# confirm it is not expected under `model` by the loader.
ppg:
  model_path: pretrained_models/ppg/33.pt
  config: pretrained_models/ppg/train.yaml
  frame_length: 20
  mel_frame_shift: 10
  dim: 256
  output_type: map
  map:
    map_mix_ratio: 1.0
    global_phn_center_path: pretrained_models/ppg/7layer_20ms_33pt/phn_center.npy
    para_softmax_path: pretrained_models/ppg/7layer_20ms_33pt/ce_layer.pkl