# ─── GLOBAL ─────────────────────────────────────────────────────────
log_dir: logs/pod_90h_30k_second_v2
device: "cuda"
batch_size: 12  # 40 GB A100, fp16
# Earlier note on this value read "≈ 8 s (200 × 40 ms)".
# NOTE(review): that does not match 300. If max_len counts mel frames,
# 300 frames × 12.5 ms (hop_length 300 at sr 24000) ≈ 3.75 s — confirm.
max_len: 300
epochs_1st: 25  # first-stage schedule
epochs_2nd: 20  # second-stage schedule (later)
save_freq: 1
log_interval: 50

# leave blank on first run
pretrained_model: "/workspace/styletts2/logs/pod_90h_30k_second_v2/epoch_2nd_00005.pth"
second_stage_load_pretrained: true
load_only_params: false

# ─── PRE-PROCESS ────────────────────────────────────────────────────
preprocess_params:
  sr: 24000
  spect_params:  # required by Mel extractor
    n_fft: 2048
    win_length: 1200
    hop_length: 300

# ─── DATA ───────────────────────────────────────────────────────────
data_params:
  root_path: /workspace
  train_data: /workspace/styletts2/data/train_list.txt
  val_data: /workspace/styletts2/data/val_list.txt
  # sample until texts with this size are obtained for OOD texts
  min_length: 50
  OOD_data: /workspace/styletts2/data/OOD_texts.txt

# ─── LOSS SCHEDULE ──────────────────────────────────────────────────
loss_params:
  lambda_mel: 5.0  # mel reconstruction loss
  lambda_gen: 1.0  # generator loss
  lambda_slm: 1.0  # slm feature matching loss

  lambda_mono: 1.0  # monotonic alignment loss (1st stage, TMA)
  lambda_s2s: 1.0  # sequence-to-sequence loss (1st stage, TMA)
  TMA_epoch: 14  # TMA starting epoch (1st stage)

  lambda_F0: 1.0  # F0 reconstruction loss (2nd stage)
  lambda_norm: 1.0  # norm reconstruction loss (2nd stage)
  lambda_dur: 1.0  # duration loss (2nd stage)
  lambda_ce: 20.0  # duration predictor probability output CE loss (2nd stage)
  lambda_sty: 1.0  # style reconstruction loss (2nd stage)
  lambda_diff: 1.0  # score matching loss (2nd stage)

  diff_epoch: 1  # style diffusion starting epoch (2nd stage)
  joint_epoch: 5  # joint training starting epoch (2nd stage)

# ─── OPTIMISER ──────────────────────────────────────────────────────
optimizer_params:
  lr: 0.0001
  bert_lr: 0.00001
  ft_lr: 0.00001
  # NOTE(review): placed here because it follows ft_lr in the optimiser
  # section — confirm the training code reads it from optimizer_params.
  grad_accum_steps: 2

# ─── MODEL (core network & sub-modules) ─────────────────────────────
model_params:
  multispeaker: true  # speaker-ID column present

  dim_in: 64
  hidden_dim: 512
  max_conv_dim: 512
  n_layer: 3
  n_mels: 80

  n_token: 178  # 178 phonemes
  max_dur: 50
  style_dim: 128

  dropout: 0.2

  decoder:
    type: hifigan
    resblock_kernel_sizes: [3, 7, 11]
    upsample_rates: [10, 5, 3, 2]
    upsample_initial_channel: 512
    resblock_dilation_sizes: [[1, 3, 5], [1, 3, 5], [1, 3, 5]]
    upsample_kernel_sizes: [20, 10, 6, 4]

  slm:
    model: microsoft/wavlm-base-plus
    sr: 16000
    hidden: 768
    nlayers: 13
    initial_channel: 64

  diffusion:
    embedding_mask_proba: 0.1

    transformer:
      num_layers: 3
      num_heads: 8
      head_features: 64
      multiplier: 2

    dist:
      # ← placeholder; code will overwrite if estimate_sigma_data: true
      # NOTE(review): no `estimate_sigma_data` key is set in this file —
      # confirm whether the training code expects one under `dist`.
      sigma_data: 0.2
      mean: -3.0
      std: 1.0

# ─── EXTERNAL CHECKPOINTS ───────────────────────────────────────────
F0_path: "Utils/JDC/bst.t7"
ASR_config: "Utils/ASR/config.yml"
ASR_path: "Utils/ASR/epoch_00080.pth"
PLBERT_dir: 'Utils/PLBERT/'
# filled automatically after this run
first_stage_path: "/workspace/styletts2/stage1_final.pth"

# ─── SLM ADVERSARIAL (ignored in stage-1, kept default) ─────────────
slmadv_params:
  min_len: 400
  max_len: 500
  batch_percentage: 0.5
  iter: 20
  thresh: 5
  scale: 0.01
  sig: 1.5