Upload folder using huggingface_hub
Browse files
pretrained_models/ddt_spk_base_emilia_latent_ctc/config.yaml
ADDED
|
@@ -0,0 +1,62 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
hydra:
|
| 2 |
+
run:
|
| 3 |
+
dir: ckpts/${model.name}_${model.autoencoder.name}_${model.tokenizer}_${datasets.name}/${now:%Y-%m-%d}/${now:%H-%M-%S}
|
| 4 |
+
|
| 5 |
+
datasets:
|
| 6 |
+
name: Emilia_ZH_EN_latent_spk
|
| 7 |
+
dataset_type: CustomLatentDataset
|
| 8 |
+
# 3750 frames ~ 300s
|
| 9 |
+
batch_size_per_gpu: 3750 # 8 GPUs, 8 * 3750 = 30000 frames per update
|
| 10 |
+
batch_size_type: frame # frame | sample
|
| 11 |
+
max_samples: 32 # max sequences per batch if use frame-wise batch_size
|
| 12 |
+
num_workers: 32
|
| 13 |
+
config:
|
| 14 |
+
target_sample_rate: 24000
|
| 15 |
+
hop_length: 1920
|
| 16 |
+
|
| 17 |
+
optim:
|
| 18 |
+
epochs: 11 # only suitable for LibriHeavy, if you want to train it on LibriTTS, set epoch 686
|
| 19 |
+
learning_rate: 1.0e-4
|
| 20 |
+
num_warmup_updates: 1000 # warmup updates
|
| 21 |
+
grad_accumulation_steps: 1 # note: updates = steps / grad_accumulation_steps
|
| 22 |
+
max_grad_norm: 1.0 # gradient clipping
|
| 23 |
+
bnb_optimizer: false # use bnb 8bit AdamW optimizer or not
|
| 24 |
+
|
| 25 |
+
model:
|
| 26 |
+
name: ddt_spk_base_emilia_latent_ctc
|
| 27 |
+
tokenizer: pinyin
|
| 28 |
+
tokenizer_path: null # if 'custom' tokenizer, define the path want to use (should be vocab.txt)
|
| 29 |
+
autoencoder:
|
| 30 |
+
name: vae_24khz_f1920c64_1.0
|
| 31 |
+
cfm:
|
| 32 |
+
_target_: architts.model.latent_cfm.LatentCFM_REPA_CTC
|
| 33 |
+
num_channels: 64
|
| 34 |
+
timestep_scheduler: logit_normal
|
| 35 |
+
ctc_idx: 8
|
| 36 |
+
# static_chunk_size: 25 # =2s of 24kHz audio
|
| 37 |
+
net:
|
| 38 |
+
_target_: architts.model.backbone_latent.ddt_v2.DDT
|
| 39 |
+
dim: 1024
|
| 40 |
+
mel_dim: 64
|
| 41 |
+
spk_embed_dim: 192
|
| 42 |
+
text_dim: 512
|
| 43 |
+
encoder_depth: 18
|
| 44 |
+
decoder_depth: 4
|
| 45 |
+
conv_layers: 4
|
| 46 |
+
encoder_share_mod: true
|
| 47 |
+
decoder_share_mod: true
|
| 48 |
+
text_encoder_depth: 6
|
| 49 |
+
heads: 16
|
| 50 |
+
ff_mult: 2
|
| 51 |
+
pe_attn_head: 1
|
| 52 |
+
qk_norm: rms_norm
|
| 53 |
+
checkpoint_activations: true # recompute activations and save memory for extra compute
|
| 54 |
+
|
| 55 |
+
ckpts:
|
| 56 |
+
logger: wandb # wandb | tensorboard | null
|
| 57 |
+
log_samples: true # infer random sample per save checkpoint. wip, normal to fail with extra long samples
|
| 58 |
+
save_per_updates: 50000 # save checkpoint per updates
|
| 59 |
+
keep_last_n_checkpoints: -1 # -1 to keep all, 0 to not save intermediate, > 0 to keep last N checkpoints
|
| 60 |
+
last_per_updates: 5000 # save last checkpoint per updates
|
| 61 |
+
demo_per_updates: 5000
|
| 62 |
+
save_dir: ckpts/${model.name}_${model.autoencoder.name}_${model.tokenizer}_${datasets.name}
|
pretrained_models/ddt_spk_base_emilia_latent_ctc/model_800000.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:ed4aff282bc01683f2f0a7c7217d5a1c273e9b65aa7278a78a4ad93a341ce1a7
|
| 3 |
+
size 4687951573
|
pretrained_models/vae_24khz_f1920c64_1.0/config.yaml
ADDED
|
@@ -0,0 +1,93 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
model_type: autoencoder
|
| 2 |
+
sample_size: 61440
|
| 3 |
+
sample_rate: 24000
|
| 4 |
+
audio_channels: 1
|
| 5 |
+
model:
|
| 6 |
+
encoder:
|
| 7 |
+
type: oobleck
|
| 8 |
+
config:
|
| 9 |
+
in_channels: ${audio_channels}
|
| 10 |
+
channels: 96
|
| 11 |
+
c_mults: [1, 2, 4, 8, 16]
|
| 12 |
+
strides: [2, 4, 5, 6, 8]
|
| 13 |
+
latent_dim: 65
|
| 14 |
+
use_snake: true
|
| 15 |
+
decoder:
|
| 16 |
+
type: oobleck
|
| 17 |
+
config:
|
| 18 |
+
out_channels: ${audio_channels}
|
| 19 |
+
channels: 128
|
| 20 |
+
c_mults: [1, 2, 4, 4, 8, 16]
|
| 21 |
+
strides: [2, 2, 3, 4, 5, 8]
|
| 22 |
+
latent_dim: ${model.latent_dim}
|
| 23 |
+
use_snake: true
|
| 24 |
+
inject_noise: true
|
| 25 |
+
final_tanh: false
|
| 26 |
+
bottleneck:
|
| 27 |
+
type: vae2
|
| 28 |
+
latent_dim: 64
|
| 29 |
+
downsampling_ratio: 1920
|
| 30 |
+
io_channels: ${audio_channels}
|
| 31 |
+
training:
|
| 32 |
+
learning_rate: 1.5e-4
|
| 33 |
+
# warmup_steps: 25000
|
| 34 |
+
# encoder_freeze_on_warmup: false
|
| 35 |
+
use_ema: true
|
| 36 |
+
optimizer_configs:
|
| 37 |
+
autoencoder:
|
| 38 |
+
optimizer:
|
| 39 |
+
type: AdamW
|
| 40 |
+
config:
|
| 41 |
+
betas: [0.8, 0.99]
|
| 42 |
+
lr: 1.5e-4
|
| 43 |
+
weight_decay: 1.0e-3
|
| 44 |
+
scheduler:
|
| 45 |
+
type: InverseLR
|
| 46 |
+
config:
|
| 47 |
+
inv_gamma: 200000
|
| 48 |
+
power: 0.5
|
| 49 |
+
warmup: 0.999
|
| 50 |
+
discriminator:
|
| 51 |
+
optimizer:
|
| 52 |
+
type: AdamW
|
| 53 |
+
config:
|
| 54 |
+
betas: [0.8, 0.99]
|
| 55 |
+
lr: 3.0e-4
|
| 56 |
+
weight_decay: 1.0e-3
|
| 57 |
+
scheduler:
|
| 58 |
+
type: InverseLR
|
| 59 |
+
config:
|
| 60 |
+
inv_gamma: 200000
|
| 61 |
+
power: 0.5
|
| 62 |
+
warmup: 0.999
|
| 63 |
+
loss_configs:
|
| 64 |
+
discriminator:
|
| 65 |
+
type: encodec
|
| 66 |
+
config:
|
| 67 |
+
filters: 64
|
| 68 |
+
n_ffts: [2048, 1024, 512, 256, 128]
|
| 69 |
+
hop_lengths: [512, 256, 128, 64, 32]
|
| 70 |
+
win_lengths: [2048, 1024, 512, 256, 128]
|
| 71 |
+
weights:
|
| 72 |
+
adversarial: 0.1
|
| 73 |
+
feature_matching: 5.0
|
| 74 |
+
spectral:
|
| 75 |
+
type: mrstft
|
| 76 |
+
config:
|
| 77 |
+
fft_sizes: [2048, 1024, 512, 256, 128, 64, 32]
|
| 78 |
+
hop_sizes: [512, 256, 128, 64, 32, 16, 8]
|
| 79 |
+
win_lengths: [2048, 1024, 512, 256, 128, 64, 32]
|
| 80 |
+
perceptual_weighting: true
|
| 81 |
+
weights:
|
| 82 |
+
mrstft: 1.0
|
| 83 |
+
time:
|
| 84 |
+
type: l1
|
| 85 |
+
weights:
|
| 86 |
+
l1: 0.0
|
| 87 |
+
bottleneck:
|
| 88 |
+
type: kl
|
| 89 |
+
weights:
|
| 90 |
+
kl: 1.0e-4
|
| 91 |
+
demo:
|
| 92 |
+
demo_every: 2000
|
| 93 |
+
|
pretrained_models/vae_24khz_f1920c64_1.0/model.ckpt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:2b2971abf5448077e7d7e2bf960d813e928533331e5cb24fb54ca9941512216f
|
| 3 |
+
size 502995634
|