archimickey committed
Commit 8cab328 · verified · Parent: 5bfaafe

Upload folder using huggingface_hub

pretrained_models/ddt_spk_base_emilia_latent_ctc/config.yaml ADDED
@@ -0,0 +1,62 @@
+ hydra:
+   run:
+     dir: ckpts/${model.name}_${model.autoencoder.name}_${model.tokenizer}_${datasets.name}/${now:%Y-%m-%d}/${now:%H-%M-%S}
+
+ datasets:
+   name: Emilia_ZH_EN_latent_spk
+   dataset_type: CustomLatentDataset
+   # 3750 frames ~ 300 s
+   batch_size_per_gpu: 3750 # frames; with 8 GPUs, 8 * 3750 = 30000 frames per step
+   batch_size_type: frame # frame | sample
+   max_samples: 32 # max sequences per batch when using frame-wise batch_size
+   num_workers: 32
+   config:
+     target_sample_rate: 24000
+     hop_length: 1920
+
+ optim:
+   epochs: 11 # suited to LibriHeavy; to train on LibriTTS, set epochs to 686
+   learning_rate: 1e-4
+   num_warmup_updates: 1000 # warmup updates
+   grad_accumulation_steps: 1 # note: updates = steps / grad_accumulation_steps
+   max_grad_norm: 1.0 # gradient clipping
+   bnb_optimizer: False # whether to use the bnb 8-bit AdamW optimizer
+
+ model:
+   name: ddt_spk_base_emilia_latent_ctc
+   tokenizer: pinyin
+   tokenizer_path: null # for a 'custom' tokenizer, set the path to use (should be vocab.txt)
+   autoencoder:
+     name: vae_24khz_f1920c64_1.0
+   cfm:
+     _target_: architts.model.latent_cfm.LatentCFM_REPA_CTC
+     num_channels: 64
+     timestep_scheduler: logit_normal
+     ctc_idx: 8
+     # static_chunk_size: 25 # = 2 s of 24 kHz audio
+   net:
+     _target_: architts.model.backbone_latent.ddt_v2.DDT
+     dim: 1024
+     mel_dim: 64
+     spk_embed_dim: 192
+     text_dim: 512
+     encoder_depth: 18
+     decoder_depth: 4
+     conv_layers: 4
+     encoder_share_mod: True
+     decoder_share_mod: True
+     text_encoder_depth: 6
+     heads: 16
+     ff_mult: 2
+     pe_attn_head: 1
+     qk_norm: rms_norm
+     checkpoint_activations: True # recompute activations to save memory at the cost of extra compute
+
+ ckpts:
+   logger: wandb # wandb | tensorboard | null
+   log_samples: True # infer a random sample per saved checkpoint; WIP, expected to fail on extra-long samples
+   save_per_updates: 50000 # save a checkpoint every N updates
+   keep_last_n_checkpoints: -1 # -1 to keep all, 0 to keep no intermediates, > 0 to keep the last N checkpoints
+   last_per_updates: 5000 # save the rolling last checkpoint every N updates
+   demo_per_updates: 5000
+   save_dir: ckpts/${model.name}_${model.autoencoder.name}_${model.tokenizer}_${datasets.name}
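
For reference, a minimal loading sketch for this config (assumptions: omegaconf/hydra are installed; `architts` is this repo's own package, so the `_target_` entries only instantiate inside that codebase; the wiring of `net` into the CFM is hypothetical):

# Minimal sketch, assuming the config is saved at the path shown.
from omegaconf import OmegaConf

cfg = OmegaConf.load("pretrained_models/ddt_spk_base_emilia_latent_ctc/config.yaml")
print(cfg.model.name)             # ddt_spk_base_emilia_latent_ctc
print(cfg.model.cfm["_target_"])  # architts.model.latent_cfm.LatentCFM_REPA_CTC
# Note: hydra.run.dir uses the ${now:...} resolver, which only resolves
# under an actual Hydra run, so avoid OmegaConf.resolve(cfg) outside one.

# Inside the repo, Hydra could build the model from the _target_ entries,
# e.g. (hypothetical wiring; the trainer may compose these differently):
# from hydra.utils import instantiate
# model = instantiate(cfg.model.cfm, net=instantiate(cfg.model.net))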
pretrained_models/ddt_spk_base_emilia_latent_ctc/model_800000.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:ed4aff282bc01683f2f0a7c7217d5a1c273e9b65aa7278a78a4ad93a341ce1a7
+ size 4687951573
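
This entry is a Git LFS pointer to a ~4.7 GB blob, not the weights themselves; fetch the real file first (`git lfs pull`, or a Hub download). A hedged loading sketch, since the checkpoint's key layout is not documented here:

# Sketch only: run `git lfs pull` (or download the file from the Hub)
# before loading; the pointer file itself is just three lines of text.
import torch

ckpt = torch.load(
    "pretrained_models/ddt_spk_base_emilia_latent_ctc/model_800000.pt",
    map_location="cpu",  # torch>=2.6 may also need weights_only=False for non-tensor entries
)
# The exact contents (raw state_dict, EMA weights, optimizer state, ...)
# depend on the repo's trainer, so inspect before use:
print(ckpt.keys() if isinstance(ckpt, dict) else type(ckpt))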
pretrained_models/vae_24khz_f1920c64_1.0/config.yaml ADDED
@@ -0,0 +1,93 @@
+ model_type: autoencoder
+ sample_size: 61440
+ sample_rate: 24000
+ audio_channels: 1
+ model:
+   encoder:
+     type: oobleck
+     config:
+       in_channels: ${audio_channels}
+       channels: 96
+       c_mults: [1, 2, 4, 8, 16]
+       strides: [2, 4, 5, 6, 8]
+       latent_dim: 65
+       use_snake: true
+   decoder:
+     type: oobleck
+     config:
+       out_channels: ${audio_channels}
+       channels: 128
+       c_mults: [1, 2, 4, 4, 8, 16]
+       strides: [2, 2, 3, 4, 5, 8]
+       latent_dim: ${model.latent_dim}
+       use_snake: true
+       inject_noise: true
+       final_tanh: false
+   bottleneck:
+     type: vae2
+   latent_dim: 64
+   downsampling_ratio: 1920
+   io_channels: ${audio_channels}
+ training:
+   learning_rate: 1.5e-4
+   # warmup_steps: 25000
+   # encoder_freeze_on_warmup: false
+   use_ema: true
+   optimizer_configs:
+     autoencoder:
+       optimizer:
+         type: AdamW
+         config:
+           betas: [0.8, 0.99]
+           lr: 1.5e-4
+           weight_decay: 1e-3
+       scheduler:
+         type: InverseLR
+         config:
+           inv_gamma: 200000
+           power: 0.5
+           warmup: 0.999
+     discriminator:
+       optimizer:
+         type: AdamW
+         config:
+           betas: [0.8, 0.99]
+           lr: 3e-4
+           weight_decay: 1e-3
+       scheduler:
+         type: InverseLR
+         config:
+           inv_gamma: 200000
+           power: 0.5
+           warmup: 0.999
+   loss_configs:
+     discriminator:
+       type: encodec
+       config:
+         filters: 64
+         n_ffts: [2048, 1024, 512, 256, 128]
+         hop_lengths: [512, 256, 128, 64, 32]
+         win_lengths: [2048, 1024, 512, 256, 128]
+       weights:
+         adversarial: 0.1
+         feature_matching: 5.0
+     spectral:
+       type: mrstft
+       config:
+         fft_sizes: [2048, 1024, 512, 256, 128, 64, 32]
+         hop_sizes: [512, 256, 128, 64, 32, 16, 8]
+         win_lengths: [2048, 1024, 512, 256, 128, 64, 32]
+         perceptual_weighting: true
+       weights:
+         mrstft: 1.0
+     time:
+       type: l1
+       weights:
+         l1: 0.0
+     bottleneck:
+       type: kl
+       weights:
+         kl: 1e-4
+   demo:
+     demo_every: 2000
+
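
The encoder strides compound to exactly the 1920x `downsampling_ratio` declared above, which in turn matches `hop_length: 1920` in the DDT config; a pure-arithmetic check:

# Sanity check of the stride/ratio bookkeeping in this config.
from math import prod

enc_strides = [2, 4, 5, 6, 8]
dec_strides = [2, 2, 3, 4, 5, 8]
assert prod(enc_strides) == prod(dec_strides) == 1920  # == downsampling_ratio

print(24000 / 1920)  # 12.5 latent frames/s, so 3750 frames ~ 300 s of audio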
pretrained_models/vae_24khz_f1920c64_1.0/model.ckpt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:2b2971abf5448077e7d7e2bf960d813e928533331e5cb24fb54ca9941512216f
+ size 502995634
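
Since each LFS pointer records the SHA-256 of the actual blob, the fetched files can be verified locally; a small sketch:

# Optional integrity check: after `git lfs pull`, each file should hash
# to the oid recorded in its pointer.
import hashlib

def sha256_of(path: str, chunk: int = 1 << 20) -> str:
    h = hashlib.sha256()
    with open(path, "rb") as f:
        while block := f.read(chunk):
            h.update(block)
    return h.hexdigest()

print(sha256_of("pretrained_models/vae_24khz_f1920c64_1.0/model.ckpt"))
# expected: 2b2971abf5448077e7d7e2bf960d813e928533331e5cb24fb54ca9941512216f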