Upload indonesian_language_gpt_v1.yml with huggingface_hub
Browse files- indonesian_language_gpt_v1.yml +170 -0
indonesian_language_gpt_v1.yml
ADDED
|
@@ -0,0 +1,170 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
name: indonesian_language_gpt_v1
|
| 2 |
+
model: extensibletrainer
|
| 3 |
+
scale: 1
|
| 4 |
+
gpu_ids: [0] # <-- unless you have multiple gpus, use this
|
| 5 |
+
start_step: 0 # -1 causes 0.pth to be saved!
|
| 6 |
+
checkpointing_enabled: true # <-- Gradient checkpointing. Enable for huge GPU memory savings. Disable for distributed training.
|
| 7 |
+
fp16: false # TODO: why does enabling this with 8bit slow down perf??
|
| 8 |
+
use_8bit: true
|
| 9 |
+
wandb: true # <-- enable to log to wandb. tensorboard logging is always enabled.
|
| 10 |
+
wandb_project_name: tortoise
|
| 11 |
+
use_tb_logger: true
|
| 12 |
+
|
| 13 |
+
datasets:
|
| 14 |
+
train:
|
| 15 |
+
name: train_dataset
|
| 16 |
+
n_workers: 8 # idk what this does
|
| 17 |
+
batch_size: 128 # This leads to ~16GB of vram usage on my 3090.
|
| 18 |
+
mode: paired_voice_audio
|
| 19 |
+
path: ../../dataset-v1/train.txt
|
| 20 |
+
fetcher_mode: ['lj'] # CHANGEME if your dataset isn't in LJSpeech format
|
| 21 |
+
phase: train
|
| 22 |
+
max_wav_length: 255995
|
| 23 |
+
max_text_length: 200
|
| 24 |
+
sample_rate: 22050
|
| 25 |
+
load_conditioning: True
|
| 26 |
+
num_conditioning_candidates: 2
|
| 27 |
+
conditioning_length: 44000
|
| 28 |
+
use_bpe_tokenizer: True
|
| 29 |
+
load_aligned_codes: False
|
| 30 |
+
tokenizer_vocab: ../indonesia_tokenizer_v1.json
|
| 31 |
+
val:
|
| 32 |
+
name: val_dataset
|
| 33 |
+
n_workers: 8
|
| 34 |
+
batch_size: 128 # this could be higher probably
|
| 35 |
+
mode: paired_voice_audio
|
| 36 |
+
path: ../../dataset-v1/val.txt
|
| 37 |
+
fetcher_mode: ['lj']
|
| 38 |
+
phase: val # might be broken idk
|
| 39 |
+
max_wav_length: 255995
|
| 40 |
+
max_text_length: 200
|
| 41 |
+
sample_rate: 22050
|
| 42 |
+
load_conditioning: True
|
| 43 |
+
num_conditioning_candidates: 2
|
| 44 |
+
conditioning_length: 44000
|
| 45 |
+
use_bpe_tokenizer: True
|
| 46 |
+
load_aligned_codes: False
|
| 47 |
+
tokenizer_vocab: ../indonesia_tokenizer_v1.json
|
| 48 |
+
|
| 49 |
+
steps:
|
| 50 |
+
gpt_train:
|
| 51 |
+
training: gpt
|
| 52 |
+
loss_log_buffer: 500 # no idea what this does
|
| 53 |
+
|
| 54 |
+
# Generally follows the recipe from the DALLE paper.
|
| 55 |
+
optimizer: adamw # this should be adamw_zero if you're using distributed training
|
| 56 |
+
#optimizer: lion
|
| 57 |
+
optimizer_params:
|
| 58 |
+
lr: !!float 1e-5 # CHANGEME: this was originally 1e-4; I reduced it to 1e-5 because it's fine-tuning, but **you should experiment with this value**
|
| 59 |
+
#lr: !!float 2e-6 # USE LOWER LR for LION
|
| 60 |
+
triton: false # ONLY RELEVANT FOR LION
|
| 61 |
+
weight_decay: !!float 1e-2
|
| 62 |
+
beta1: 0.9
|
| 63 |
+
beta2: 0.96
|
| 64 |
+
clip_grad_eps: 4
|
| 65 |
+
|
| 66 |
+
injectors: # TODO: replace this entire sequence with the GptVoiceLatentInjector
|
| 67 |
+
paired_to_mel:
|
| 68 |
+
type: torch_mel_spectrogram
|
| 69 |
+
mel_norm_file: ../experiments/clips_mel_norms.pth
|
| 70 |
+
in: wav
|
| 71 |
+
out: paired_mel
|
| 72 |
+
paired_cond_to_mel:
|
| 73 |
+
type: for_each
|
| 74 |
+
subtype: torch_mel_spectrogram
|
| 75 |
+
mel_norm_file: ../experiments/clips_mel_norms.pth
|
| 76 |
+
in: conditioning
|
| 77 |
+
out: paired_conditioning_mel
|
| 78 |
+
to_codes:
|
| 79 |
+
type: discrete_token
|
| 80 |
+
in: paired_mel
|
| 81 |
+
out: paired_mel_codes
|
| 82 |
+
dvae_config: "../experiments/train_diffusion_vocoder_22k_level.yml" # EXTREMELY IMPORTANT
|
| 83 |
+
paired_fwd_text:
|
| 84 |
+
type: generator
|
| 85 |
+
generator: gpt
|
| 86 |
+
in: [paired_conditioning_mel, padded_text, text_lengths, paired_mel_codes, wav_lengths]
|
| 87 |
+
out: [loss_text_ce, loss_mel_ce, logits]
|
| 88 |
+
losses:
|
| 89 |
+
text_ce:
|
| 90 |
+
type: direct
|
| 91 |
+
weight: .01
|
| 92 |
+
key: loss_text_ce
|
| 93 |
+
mel_ce:
|
| 94 |
+
type: direct
|
| 95 |
+
weight: 1
|
| 96 |
+
key: loss_mel_ce
|
| 97 |
+
|
| 98 |
+
networks:
|
| 99 |
+
gpt:
|
| 100 |
+
type: generator
|
| 101 |
+
which_model_G: unified_voice2 # none of the unified_voice*.py files exactly match the tortoise inference code: versions 3 and 4 add an "alignment_head" (purpose unclear), and version 2 lacks the types=1 parameter.
|
| 102 |
+
kwargs:
|
| 103 |
+
layers: 30 # WAS 8
|
| 104 |
+
model_dim: 1024 # WAS 512
|
| 105 |
+
heads: 16 # WAS 8
|
| 106 |
+
max_text_tokens: 402 # WAS 120
|
| 107 |
+
max_mel_tokens: 604 # WAS 250
|
| 108 |
+
max_conditioning_inputs: 2 # WAS 1
|
| 109 |
+
mel_length_compression: 1024
|
| 110 |
+
number_text_tokens: 256 # supposed to be 255 for newer unified_voice files
|
| 111 |
+
number_mel_codes: 8194
|
| 112 |
+
start_mel_token: 8192
|
| 113 |
+
stop_mel_token: 8193
|
| 114 |
+
start_text_token: 255
|
| 115 |
+
train_solo_embeddings: False # missing in uv3/4
|
| 116 |
+
use_mel_codes_as_input: True # ditto
|
| 117 |
+
checkpointing: True
|
| 118 |
+
tortoise_compat: True
|
| 119 |
+
#types: 1 # this is MISSING, but in my analysis 1 is equivalent to not having it.
|
| 120 |
+
#only_alignment_head: False # uv3/4
|
| 121 |
+
|
| 122 |
+
path:
|
| 123 |
+
pretrain_model_gpt: '../experiments/autoregressive.pth' # CHANGEME: copy this from tortoise cache
|
| 124 |
+
strict_load: true
|
| 125 |
+
#resume_state: ../experiments/train_imgnet_vqvae_stage1/training_state/0.state # <-- Set this to resume from a previous training state.
|
| 126 |
+
|
| 127 |
+
# As far as I know, all units here are measured in **steps** (i.e. processing one batch of batch_size samples counts as 1 unit)
|
| 128 |
+
train: # CHANGEME: ALL OF THESE PARAMETERS SHOULD BE EXPERIMENTED WITH
|
| 129 |
+
niter: 50000
|
| 130 |
+
warmup_iter: -1
|
| 131 |
+
mega_batch_factor: 1 # <-- Gradient accumulation factor. If you are running OOM, increase this to [2,4,8].
|
| 132 |
+
val_freq: 318 # TODO set this to epoch size * something
|
| 133 |
+
|
| 134 |
+
default_lr_scheme: MultiStepLR
|
| 135 |
+
gen_lr_steps: [5000, 10000, 15000, 20000] #[50000, 100000, 140000, 180000]
|
| 136 |
+
lr_gamma: 0.5
|
| 137 |
+
ema_enabled: false
|
| 138 |
+
#manual_seed: 1337 # add this if you want reproducibility
|
| 139 |
+
|
| 140 |
+
eval:
|
| 141 |
+
pure: true # see train.py
|
| 142 |
+
|
| 143 |
+
logger:
|
| 144 |
+
print_freq: 318 # TODO: set this to epoch size
|
| 145 |
+
save_checkpoint_freq: 500 # CHANGEME: you should especially increase this; saving a checkpoint is really slow
|
| 146 |
+
visuals: [gen, mel] #TODO: figure this out
|
| 147 |
+
visual_debug_rate: 500
|
| 148 |
+
is_mel_spectrogram: true
|
| 149 |
+
disable_state_saving: true # CHANGEME if you plan to halt training in between
|
| 150 |
+
|
| 151 |
+
upgrades:
|
| 152 |
+
# Variable: number_of_checkpoints_to_save
|
| 153 |
+
# Description: Define how many checkpoints should be saved on disk (1 checkpoint = pth+ =~ 6.8 GB)
|
| 154 |
+
# Type: integer
|
| 155 |
+
# Value: should be the same value as for number_of_states_to_save
|
| 156 |
+
# smaller than 1 - turn off this option; there is no max value. For Colab use 1 or 2.
|
| 157 |
+
# For Colab use 1 or 2 for gDrive and 5 for instance drive
|
| 158 |
+
# 1 == Leave last saved checkpoint + last saved state (about 6.8 GB).
|
| 159 |
+
# 2 == Leave last 2 saved checkpoints + last saved states (about 2 * ~6.8 GB =~ 13.6 GB).
|
| 160 |
+
number_of_checkpoints_to_save: 1
|
| 161 |
+
# Variable: number_of_states_to_save
|
| 162 |
+
# Description: Define how many states should be saved on disk (1 state =~ 3.4 GB)
|
| 163 |
+
# if disable_state_saving is set to true, this option will be inactive
|
| 164 |
+
# Type: integer
|
| 165 |
+
# Value: should be the same value as for number_of_checkpoints_to_save
|
| 166 |
+
# smaller than 1 - turn off this option; there is no max value.
|
| 167 |
+
# For Colab use 1 or 2 for gDrive and 5 for instance drive
|
| 168 |
+
# 1 == Leave last saved state (about 3.4 GB).
|
| 169 |
+
# 2 == Leave last 2 saved states (about 2 * ~3.4 GB =~ 6.8 GB).
|
| 170 |
+
number_of_states_to_save: 1
|