Beilong commited on
Commit
19a585d
·
verified ·
1 Parent(s): 810ffe8

Upload laura_tse_librispeech_dm_e_100_config.yaml

Browse files
laura_tse_librispeech_dm_e_100_config.yaml ADDED
@@ -0,0 +1,160 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
##
## Same as config_log_mel_aux_5s.yaml. This config exists only to create a new
## experiment for fine-tuning from epoch 100.
##

# init_param: ["/public/home/qinxy/bltang/LLM_TTS/egs/exp/audio_codec-encodec-zh_en-general-16k-nq32ds640-pytorch/model.pth:quantizer.rq.model:quantizer_codebook"]

# # For inference need
# codec_model_file: /public/home/qinxy/bltang/LLM_TTS/egs/exp/audio_codec-encodec-zh_en-general-16k-nq32ds640-pytorch/model.pth
# codec_config_file: /public/home/qinxy/bltang/LLM_TTS/egs/exp/audio_codec-encodec-zh_en-general-16k-nq32ds640-pytorch/config.yaml

# train_shape_file: ["/public/home/qinxy/bltang/data/LibriMix/Libri2Mix/wav16k/min/libri2mix_tse_data_funcodec/s1/train/all_shape.scp"]
# valid_shape_file: ["/public/home/qinxy/bltang/data/LibriMix/Libri2Mix/wav16k/min/libri2mix_tse_data_funcodec/s1/dev/all_shape.scp"]
# train_data_path_and_name_and_type: [
#   [
#     "/public/home/qinxy/bltang/data/LibriMix/Libri2Mix/wav16k/min/lists/train/all/mix.scp",
#     "raw",
#     "sound"
#   ],
#   [
#     "/public/home/qinxy/bltang/data/LibriMix/Libri2Mix/wav16k/min/lists/train/all/aux_s1.scp",
#     "raw_aux",
#     "sound"
#   ],
#   [
#     "/public/home/qinxy/bltang/data/LibriMix/Libri2Mix/wav16k/min/libri2mix_tse_data_funcodec/s1/train/all.scp",
#     "codec",
#     "npy"
#   ]
# ]
# valid_data_path_and_name_and_type: [
#   [
#     "/public/home/qinxy/bltang/data/LibriMix/Libri2Mix/wav16k/min/lists/dev/mix.scp",
#     "raw",
#     "sound"
#   ],
#   [
#     "/public/home/qinxy/bltang/data/LibriMix/Libri2Mix/wav16k/min/lists/dev/aux_s1.scp",
#     "raw_aux",
#     "sound"
#   ],
#   [
#     "/public/home/qinxy/bltang/data/LibriMix/Libri2Mix/wav16k/min/libri2mix_tse_data_funcodec/s1/dev/all.scp",
#     "codec",
#     "npy"
#   ]
# ]

grad_clip: 5
seed: 1234
init: null

# input related
input_size: 128 # NOTE(review): original comment said the Mel-spectrogram input size "should be 80", but the configured value is 128 — confirm which is intended
use_preprocessor: False
audio_max_duration: 60
codec_token_rate: 25

# network architecture
# encoder related
text_encoder: conformer
text_encoder_conf:
  output_size: 512 # dimension of attention
  attention_heads: 8
  linear_units: 2048 # the number of units of position-wise feed forward
  num_blocks: 6 # the number of encoder blocks
  dropout_rate: 0.1
  positional_dropout_rate: 0.1
  attention_dropout_rate: 0.0
  input_layer: linear # encoder architecture type
  normalize_before: true
  rel_pos_type: latest
  pos_enc_layer_type: rel_pos
  selfattention_layer_type: rel_selfattn
  use_cnn_module: false

# decoder related
codec_encoder: conformer
codec_encoder_conf:
  output_size: 512 # dimension of attention
  attention_heads: 8
  linear_units: 2048 # the number of units of position-wise feed forward
  num_blocks: 6 # the number of encoder blocks
  dropout_rate: 0.1
  positional_dropout_rate: 0.1
  attention_dropout_rate: 0.0
  input_layer: linear # encoder architecture type
  normalize_before: true
  rel_pos_type: latest
  pos_enc_layer_type: rel_pos
  selfattention_layer_type: rel_selfattn
  use_cnn_module: false

# model related
model: laura_gen_model
model_conf:
  codec_sampling_ratio: 0.5
  lsm_weight: 0.0
  length_normalized_loss: true
  predict_nq: 2
  codec_conf:
    num_quantizers: 32
    codebook_size: 1024
    codebook_dim: 128
  codec_lm_conf:
    name: transformer
    pos_enc: rel_pos
    selfattention_layer_type: rel_selfattn
    embed_unit: 128
    att_unit: 512
    head: 8
    unit: 2048
    # layer: 12
    layer: 10
    dropout_rate: 0.1
    pe_type: uni
    bidirectional_inputs: true
  codec_groups: 1

### Training related
batch_type: length
batch_bins: 15360 # increase the batch_bins a bit
batch_size: 40 # This does not matter here
sort_in_batch: descending
sort_batch: descending
num_workers: 8
max_cache_size: 0.0
max_cache_fd: 32
train_dtype: float32
## Add for argument type checking
allow_variable_data_keys: true
drop_last: false
fold_length: []

### Mel config ###
mel_config:
  n_fft: 512
  hop_size: 256
  log_mel: True

### Max aux length ###
max_aux_ds: 5

optim:
  type: Adam
  args:
    lr: 1.0e-4

scheduler: warmuplr
scheduler_conf:
  warmup_steps: 10000

best_field: loss
best_save_type: descend
max_ckpt: 1
log_interval: 10
epoch: 50 # NOTE(review): filename and header mention epoch 100, but this caps training at 50 epochs — confirm

# training process
# num_iters_per_epoch: 10000