Upload folder using huggingface_hub
Browse files
pretrained_models/ddt_spk_base_emilia_latent_ctc/config.yaml
ADDED
|
@@ -0,0 +1,62 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
hydra:
|
| 2 |
+
run:
|
| 3 |
+
dir: ckpts/${model.name}_${model.autoencoder.name}_${model.tokenizer}_${datasets.name}/${now:%Y-%m-%d}/${now:%H-%M-%S}
|
| 4 |
+
|
| 5 |
+
datasets:
|
| 6 |
+
name: Emilia_ZH_EN_latent_spk
|
| 7 |
+
dataset_type: CustomLatentDataset
|
| 8 |
+
# 3750 frames ~ 300s
|
| 9 |
+
batch_size_per_gpu: 3750 # 8 GPUs, 8 * 3750 = 30000 frames per update
|
| 10 |
+
batch_size_type: frame # frame | sample
|
| 11 |
+
max_samples: 32 # max sequences per batch if use frame-wise batch_size
|
| 12 |
+
num_workers: 32
|
| 13 |
+
config:
|
| 14 |
+
target_sample_rate: 24000
|
| 15 |
+
hop_length: 1920
|
| 16 |
+
|
| 17 |
+
optim:
|
| 18 |
+
epochs: 11 # only suitable for LibriHeavy, if you want to train it on LibriTTS, set epoch 686
|
| 19 |
+
learning_rate: 1.0e-4
|
| 20 |
+
num_warmup_updates: 1000 # warmup updates
|
| 21 |
+
grad_accumulation_steps: 1 # note: updates = steps / grad_accumulation_steps
|
| 22 |
+
max_grad_norm: 1.0 # gradient clipping
|
| 23 |
+
bnb_optimizer: false # use bnb 8bit AdamW optimizer or not
|
| 24 |
+
|
| 25 |
+
model:
|
| 26 |
+
name: ddt_spk_base_emilia_latent_ctc
|
| 27 |
+
tokenizer: pinyin
|
| 28 |
+
tokenizer_path: null # if 'custom' tokenizer, define the path want to use (should be vocab.txt)
|
| 29 |
+
autoencoder:
|
| 30 |
+
name: vae_24khz_f1920c64_1.0
|
| 31 |
+
cfm:
|
| 32 |
+
_target_: architts.model.latent_cfm.LatentCFM_REPA_CTC
|
| 33 |
+
num_channels: 64
|
| 34 |
+
timestep_scheduler: logit_normal
|
| 35 |
+
ctc_idx: 8
|
| 36 |
+
# static_chunk_size: 25 # =2s of 24kHz audio
|
| 37 |
+
net:
|
| 38 |
+
_target_: architts.model.backbone_latent.ddt_v2.DDT
|
| 39 |
+
dim: 1024
|
| 40 |
+
mel_dim: 64
|
| 41 |
+
spk_embed_dim: 192
|
| 42 |
+
text_dim: 512
|
| 43 |
+
encoder_depth: 18
|
| 44 |
+
decoder_depth: 4
|
| 45 |
+
conv_layers: 4
|
| 46 |
+
encoder_share_mod: true
|
| 47 |
+
decoder_share_mod: true
|
| 48 |
+
text_encoder_depth: 6
|
| 49 |
+
heads: 16
|
| 50 |
+
ff_mult: 2
|
| 51 |
+
pe_attn_head: 1
|
| 52 |
+
qk_norm: rms_norm
|
| 53 |
+
checkpoint_activations: true # recompute activations and save memory for extra compute
|
| 54 |
+
|
| 55 |
+
ckpts:
|
| 56 |
+
logger: wandb # wandb | tensorboard | null
|
| 57 |
+
log_samples: true # infer random sample per save checkpoint. wip, normal to fail with extra long samples
|
| 58 |
+
save_per_updates: 50000 # save checkpoint per updates
|
| 59 |
+
keep_last_n_checkpoints: -1 # -1 to keep all, 0 to not save intermediate, > 0 to keep last N checkpoints
|
| 60 |
+
last_per_updates: 5000 # save last checkpoint per updates
|
| 61 |
+
demo_per_updates: 5000
|
| 62 |
+
save_dir: ckpts/${model.name}_${model.autoencoder.name}_${model.tokenizer}_${datasets.name}
|
pretrained_models/ddt_spk_base_emilia_latent_ctc/model_800000.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:ed4aff282bc01683f2f0a7c7217d5a1c273e9b65aa7278a78a4ad93a341ce1a7
|
| 3 |
+
size 4687951573
|
pretrained_models/vae_24khz_f1920c64_1.0/config.yaml
ADDED
|
@@ -0,0 +1,93 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
model_type: autoencoder
|
| 2 |
+
sample_size: 61440
|
| 3 |
+
sample_rate: 24000
|
| 4 |
+
audio_channels: 1
|
| 5 |
+
model:
|
| 6 |
+
encoder:
|
| 7 |
+
type: oobleck
|
| 8 |
+
config:
|
| 9 |
+
in_channels: ${audio_channels}
|
| 10 |
+
channels: 96
|
| 11 |
+
c_mults: [1, 2, 4, 8, 16]
|
| 12 |
+
strides: [2, 4, 5, 6, 8]
|
| 13 |
+
latent_dim: 65
|
| 14 |
+
use_snake: true
|
| 15 |
+
decoder:
|
| 16 |
+
type: oobleck
|
| 17 |
+
config:
|
| 18 |
+
out_channels: ${audio_channels}
|
| 19 |
+
channels: 128
|
| 20 |
+
c_mults: [1, 2, 4, 4, 8, 16]
|
| 21 |
+
strides: [2, 2, 3, 4, 5, 8]
|
| 22 |
+
latent_dim: ${model.latent_dim}
|
| 23 |
+
use_snake: true
|
| 24 |
+
inject_noise: true
|
| 25 |
+
final_tanh: false
|
| 26 |
+
bottleneck:
|
| 27 |
+
type: vae2
|
| 28 |
+
latent_dim: 64
|
| 29 |
+
downsampling_ratio: 1920
|
| 30 |
+
io_channels: ${audio_channels}
|
| 31 |
+
training:
|
| 32 |
+
learning_rate: 1.5e-4
|
| 33 |
+
# warmup_steps: 25000
|
| 34 |
+
# encoder_freeze_on_warmup: false
|
| 35 |
+
use_ema: true
|
| 36 |
+
optimizer_configs:
|
| 37 |
+
autoencoder:
|
| 38 |
+
optimizer:
|
| 39 |
+
type: AdamW
|
| 40 |
+
config:
|
| 41 |
+
betas: [0.8, 0.99]
|
| 42 |
+
lr: 1.5e-4
|
| 43 |
+
weight_decay: 1.0e-3
|
| 44 |
+
scheduler:
|
| 45 |
+
type: InverseLR
|
| 46 |
+
config:
|
| 47 |
+
inv_gamma: 200000
|
| 48 |
+
power: 0.5
|
| 49 |
+
warmup: 0.999
|
| 50 |
+
discriminator:
|
| 51 |
+
optimizer:
|
| 52 |
+
type: AdamW
|
| 53 |
+
config:
|
| 54 |
+
betas: [0.8, 0.99]
|
| 55 |
+
lr: 3.0e-4
|
| 56 |
+
weight_decay: 1.0e-3
|
| 57 |
+
scheduler:
|
| 58 |
+
type: InverseLR
|
| 59 |
+
config:
|
| 60 |
+
inv_gamma: 200000
|
| 61 |
+
power: 0.5
|
| 62 |
+
warmup: 0.999
|
| 63 |
+
loss_configs:
|
| 64 |
+
discriminator:
|
| 65 |
+
type: encodec
|
| 66 |
+
config:
|
| 67 |
+
filters: 64
|
| 68 |
+
n_ffts: [2048, 1024, 512, 256, 128]
|
| 69 |
+
hop_lengths: [512, 256, 128, 64, 32]
|
| 70 |
+
win_lengths: [2048, 1024, 512, 256, 128]
|
| 71 |
+
weights:
|
| 72 |
+
adversarial: 0.1
|
| 73 |
+
feature_matching: 5.0
|
| 74 |
+
spectral:
|
| 75 |
+
type: mrstft
|
| 76 |
+
config:
|
| 77 |
+
fft_sizes: [2048, 1024, 512, 256, 128, 64, 32]
|
| 78 |
+
hop_sizes: [512, 256, 128, 64, 32, 16, 8]
|
| 79 |
+
win_lengths: [2048, 1024, 512, 256, 128, 64, 32]
|
| 80 |
+
perceptual_weighting: true
|
| 81 |
+
weights:
|
| 82 |
+
mrstft: 1.0
|
| 83 |
+
time:
|
| 84 |
+
type: l1
|
| 85 |
+
weights:
|
| 86 |
+
l1: 0.0
|
| 87 |
+
bottleneck:
|
| 88 |
+
type: kl
|
| 89 |
+
weights:
|
| 90 |
+
kl: 1.0e-4
|
| 91 |
+
demo:
|
| 92 |
+
demo_every: 2000
|
| 93 |
+
|
pretrained_models/vae_24khz_f1920c64_1.0/model.ckpt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:2b2971abf5448077e7d7e2bf960d813e928533331e5cb24fb54ca9941512216f
|
| 3 |
+
size 502995634
|