breadlicker45 committed on
Commit
f8b640f
·
verified ·
1 Parent(s): c817048

Delete wavtokenizer_mediumdata_music_audio_frame75_3s_nq1_code4096_dim512_kmeans200_attn.yaml

Browse files
wavtokenizer_mediumdata_music_audio_frame75_3s_nq1_code4096_dim512_kmeans200_attn.yaml DELETED
@@ -1,93 +0,0 @@
1
- seed_everything: 3407
2
-
3
- data:
4
- class_path: vocos.dataset.VocosDataModule
5
- init_args:
6
- train_params:
7
- filelist_path: /cpfs_speech/jishengpeng/Code/dataprocess/path/WavTokenizer/medium_train_audio_music
8
- sampling_rate: 24000
9
- num_samples: 72000
10
- batch_size: 39 #18
11
- num_workers: 8
12
-
13
- val_params:
14
- filelist_path: /cpfs_speech/jishengpeng/Code/dataprocess/path/WavTokenizer/medium_test_audio_music
15
- sampling_rate: 24000
16
- num_samples: 72000
17
- batch_size: 2 # 10
18
- num_workers: 8
19
-
20
- model:
21
- class_path: vocos.experiment.VocosEncodecExp
22
- init_args:
23
- sample_rate: 24000
24
- initial_learning_rate: 2e-4
25
- mel_loss_coeff: 45
26
- mrd_loss_coeff: 1.0
27
- num_warmup_steps: 0 # Optimizers warmup steps
28
- pretrain_mel_steps: 0 # 0 means GAN objective from the first iteration
29
-
30
- # automatic evaluation
31
- evaluate_utmos: true
32
- evaluate_pesq: true
33
- evaluate_periodicty: true
34
-
35
- resume: true
36
- resume_config: /cpfs_speech/jishengpeng/Code/WavTokenizer/configs/wavtokenizer_smalldata_frame75_3s_nq1_code16384_dim512_kmeans800_attn.yaml
37
- resume_model: /cpfs_speech/jishengpeng/Code/WavTokenizer/result/train/wavtokenizer_mediumdata_music_audio_frame75_3s_nq1_code4096_dim512_kmeans200_attn/lightning_logs/version_2/checkpoints/vocos_checkpoint_epoch=1_step=45240_val_loss=8.8358.ckpt
38
-
39
- feature_extractor:
40
- class_path: vocos.feature_extractors.EncodecFeatures
41
- init_args:
42
- encodec_model: encodec_24khz
43
- bandwidths: [6.6, 6.6, 6.6, 6.6]
44
- train_codebooks: true
45
- num_quantizers: 1
46
- dowmsamples: [8, 5, 4, 2]
47
- vq_bins: 4096
48
- vq_kmeans: 200
49
-
50
- backbone:
51
- class_path: vocos.models.VocosBackbone
52
- init_args:
53
- input_channels: 512
54
- dim: 768
55
- intermediate_dim: 2304
56
- num_layers: 12
57
- adanorm_num_embeddings: 4 # len(bandwidths)
58
-
59
- head:
60
- class_path: vocos.heads.ISTFTHead
61
- init_args:
62
- dim: 768
63
- n_fft: 1280 #4*hop_length
64
- hop_length: 320 # 8*5*4*2
65
- padding: same
66
-
67
- trainer:
68
- logger:
69
- class_path: pytorch_lightning.loggers.TensorBoardLogger
70
- init_args:
71
- save_dir: /cpfs_speech/jishengpeng/Code/WavTokenizer/result/train/wavtokenizer_mediumdata_music_audio_frame75_3s_nq1_code4096_dim512_kmeans200_attn/
72
- callbacks:
73
- - class_path: pytorch_lightning.callbacks.LearningRateMonitor
74
- - class_path: pytorch_lightning.callbacks.ModelSummary
75
- init_args:
76
- max_depth: 2
77
- - class_path: pytorch_lightning.callbacks.ModelCheckpoint
78
- init_args:
79
- monitor: val_loss
80
- filename: vocos_checkpoint_{epoch}_{step}_{val_loss:.4f}
81
- save_top_k: 10
82
- save_last: true
83
- - class_path: vocos.helpers.GradNormCallback
84
-
85
- # Lightning calculates max_steps across all optimizer steps (rather than number of batches)
86
- # With max_steps: 20000000 this equals 10M steps per generator and 10M per discriminator
87
- max_steps: 20000000
88
- # You might want to limit val batches when evaluating all the metrics, as they are time-consuming
89
- limit_val_batches: 200
90
- accelerator: gpu
91
- strategy: ddp
92
- devices: [0,1,2,3,4,5,6,7]
93
- log_every_n_steps: 1000