{
"codec": {
"mlp_in":{
"in_features": 320,
"hidden_features": 768,
"out_features": 1024,
"compute_dtype": "float32"
},
"mlp_out":{
"in_features": 1024,
"hidden_features": 768,
"out_features": 320,
"compute_dtype": "float32"
},
"decoder": {
"n_layers": 8,
"n_embd": 1024,
"n_hidden": 4096,
"n_heads": 16,
"head_dim": 64,
"compute_dtype": "float32",
"window_size": 15,
"dropout_rate": 0.1
},
"encoder": {
"n_layers": 8,
"n_embd": 1024,
"n_hidden": 4096,
"n_heads": 16,
"head_dim": 64,
"compute_dtype": "float32",
"window_size": 15,
"dropout_rate": 0.1
},
"rvq": {
"num_codebooks": 8,
"codebook_size": 1024,
"embedding_dim": 16,
"latent_dim": 16,
"updown_linears": false,
"codebook_weight_dtype": "float32"
}
},
"w2v":{
"mlp_in":{
"in_features": 320,
"hidden_features": 768,
"out_features": 1024,
"compute_dtype": "float32"
},
"encoder": {
"n_layers": 8,
"n_embd": 1024,
"n_hidden": 4096,
"n_heads": 16,
"head_dim": 64,
"compute_dtype": "float32",
"window_size": 15,
"dropout_rate": 0.1
},
"rvq": {
"num_codebooks": 8,
"codebook_size": 1024,
"embedding_dim": 1024,
"latent_dim": 1024,
"updown_linears": false,
"codebook_weight_dtype": "float32"
},
"training": {
"noise_masking": 0.1,
"noise_augmentation": 0.1
}
},
"training":{
"resume": false,
"loss_type": "cossim",
"strict_model": true,
"load_discriminator": false,
"learning_rate": 1e-4,
"weight_decay": 1e-2,
"discriminator_start_steps": 100,
"discriminator_segment_duration": 1.28,
"apply_apa": true,
"warmup_steps": 1000,
"min_lr": 1e-6,
"num_epochs": 100000,
"use_continuous": 0.1,
"max_grad_norm": 1000.0,
"batch_size": 300,
"gradient_accumulation_steps": 1,
"num_workers": 6,
"use_phaseaug": true,
"init_dataset": false,
"profile": false,
"verbose_grad_norm": false,
"verbose_norm_threshold_max": 5.0,
"verbose_norm_threshold_min": 0.001,
"verbose_paramter_norm": false,
"use_discriminator": false,
"codebook_reset_interval": 1000
},
"loss":{
"recon_loss_weight": 1
},
"data": {
"audio_dir": "/data",
"sample_rate": 16000,
"segment_duration": 10.24,
"cache_dir": "/data/dataloader/v9"
},
"logging": {
"log_interval": 100,
"save_interval": 500,
"eval_interval": 3000,
"experiment_dir": "/data/jhcodec/sw2v/{experiment_name}",
"checkpoint_dir": "/data/jhcodec/sw2v/{experiment_name}/checkpoints",
"tensorboard_dir": "/data/jhcodec/sw2v/{experiment_name}/tensorboard",
"n_samples": 3
}
}