slyne deng commited on
Commit
f0e0360
·
1 Parent(s): 6cd09ad

init model

Browse files

Signed-off-by: slyne deng <slyned@nvidia.com>

README.md ADDED
@@ -0,0 +1,70 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Setup Environment
2
+
3
+ ```
4
+ git clone https://github.com/Slyne/FunCodec
5
+ cd FunCodec && git checkout slyne_fix && cd ..
6
+ ```
7
+
8
+ The steps below were tested in the docker image `nvcr.io/nvidia/pytorch:24.04-py3`; alternatively, a conda environment should work as well.
9
+
10
+ ```
11
+ # mount the current directory to /ws; You can put your data in your current
12
+ # directory as well.
13
+ docker run --gpus all -it -v $PWD:/ws nvcr.io/nvidia/pytorch:24.04-py3
14
+
15
+ Or
16
+
17
+ conda create -n funcodec python=3.10
18
+ ```
19
+ ### Install packages
20
+ ```
21
+ cd /ws/FunCodec;
22
+ pip install --editable ./ ; pip install torchaudio;
23
+ ```
24
+
25
+ ### Prepare dataset
26
+ Please prepare your dataset as files named like `${sampling_rate}_wav.scp` and put them in `/ws/test_wavscp/`
27
+ ```
28
+ 44100_wav.scp
29
+ 48000_wav.scp
30
+ 16000_wav.scp
31
+ ```
32
+
33
+ Each `wav.scp` file looks like below:
34
+ ```
35
+ <wavid> <absolute_path>
36
+ WAbHmvQ9zME_00002 /raid/slyne/codec_evaluation/Codec-SUPERB/data/vox1_test_wav/wav/id10302/WAbHmvQ9zME/00002.wav
37
+ ```
38
+
39
+ **Example**
40
+ Please follow [here](https://github.com/voidful/Codec-SUPERB/tree/SLT_Challenge?tab=readme-ov-file#2-data-download) to download `Codec-SUPERB` test datasets.
41
+
42
+ ```
43
+ # suppose the unzip data dir is /ws/data
44
+ python3 generate_wavscp.py --input_dir=/ws/data
45
+ ```
46
+
47
+ ### Download models
48
+
49
+ Download models from [TO UPDATE](xxxx). And put them under `FunCodec/egs/codecSuperb/models`
50
+
51
+
52
+ ### Do inference
53
+ Please refer to `FunCodec/egs/codecSuperb/do_codecSuperb_infer.sh` to do inference.
54
+
55
+
56
+ ```
57
+ # set model to the default model trained with 16khz data
58
+ model_dir=models/16k/
59
+ model_name=8epoch.pth
60
+ sample_rates=(16000 44100 48000) # the input wavscp sample rate can be 16khz, 44.1khz or 48khz
61
+
62
+ ```
63
+
64
+ Run:
65
+ ```
66
+ cd FunCodec/egs/codecSuperb/
67
+ # modify the ref_audio_dir and syn_audio_dir
68
+ bash do_codecSuperb_infer.sh
69
+ ```
70
+
models/16k/12epoch.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d1ce997aff7f5a865ca535b2a1bac873d7851a682b14cb028dc80b26fe87174a
3
+ size 265939202
models/16k/8epoch.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a3a972f9fc4ea30e6c97fa4ddb38d5aebfde250012fe8606189e01c61995f4b3
3
+ size 265933318
models/16k/config.yaml ADDED
@@ -0,0 +1,203 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ config: conf/encodec_16k_n32_600k_step_ds640.yaml
2
+ print_config: false
3
+ log_level: INFO
4
+ dry_run: false
5
+ iterator_type: sequence
6
+ output_dir: ./exp/encodec_16k_n32_600k_step_ds640
7
+ ngpu: 4
8
+ seed: 0
9
+ num_workers: 8
10
+ num_att_plot: 0
11
+ dist_backend: nccl
12
+ dist_init_method: file:///raid/slyne/FunCodec/egs/codecSuperb/exp/encodec_16k_n32_600k_step_ds640/ddp_init
13
+ dist_world_size: 4
14
+ dist_rank: 0
15
+ local_rank: 0
16
+ dist_master_addr: null
17
+ dist_master_port: null
18
+ dist_launcher: null
19
+ multiprocessing_distributed: true
20
+ unused_parameters: true
21
+ sharded_ddp: false
22
+ cudnn_enabled: true
23
+ cudnn_benchmark: false
24
+ cudnn_deterministic: false
25
+ collect_stats: false
26
+ write_collected_feats: false
27
+ max_epoch: 60
28
+ max_update: 9223372036854775807
29
+ patience: null
30
+ val_scheduler_criterion:
31
+ - valid
32
+ - loss
33
+ early_stopping_criterion:
34
+ - valid
35
+ - loss
36
+ - min
37
+ best_model_criterion:
38
+ - - valid
39
+ - generator_multi_spectral_recon_loss
40
+ - min
41
+ keep_nbest_models: 60
42
+ nbest_averaging_interval: 0
43
+ grad_clip: -1
44
+ grad_clip_type: 2.0
45
+ grad_noise: false
46
+ accum_grad: 1
47
+ no_forward_run: false
48
+ resume: true
49
+ train_dtype: float32
50
+ use_amp: false
51
+ log_interval: 50
52
+ use_tensorboard: true
53
+ use_wandb: false
54
+ wandb_project: null
55
+ wandb_id: null
56
+ wandb_entity: null
57
+ wandb_name: null
58
+ wandb_model_log_interval: -1
59
+ detect_anomaly: false
60
+ pretrain_path: null
61
+ init_param:
62
+ - /raid/slyne/FunCodec/egs/codecSuperb/exp/1_encodec_16k_n32_600k_step_ds640/29epoch.pth
63
+ ignore_init_mismatch: true
64
+ freeze_param: []
65
+ num_iters_per_epoch: 38418
66
+ batch_size: 64
67
+ valid_batch_size: null
68
+ batch_bins: 4000000
69
+ valid_batch_bins: null
70
+ drop_last: true
71
+ train_shape_file:
72
+ - ./exp/codecSuperb_states/train/speech_shape
73
+ valid_shape_file:
74
+ - ./exp/codecSuperb_states/dev/speech_shape
75
+ batch_type: unsorted
76
+ valid_batch_type: null
77
+ speech_length_min: -1
78
+ speech_length_max: -1
79
+ fold_length: []
80
+ sort_in_batch: descending
81
+ sort_batch: descending
82
+ multiple_iterator: false
83
+ chunk_length: 500
84
+ chunk_shift_ratio: 0.5
85
+ num_cache_chunks: 1024
86
+ dataset_type: small
87
+ dataset_conf: {}
88
+ train_data_file: null
89
+ valid_data_file: null
90
+ train_data_path_and_name_and_type:
91
+ - - ./dump/codecSuperb/train/wav.scp
92
+ - speech
93
+ - kaldi_ark
94
+ valid_data_path_and_name_and_type:
95
+ - - ./dump/codecSuperb/dev/wav.scp
96
+ - speech
97
+ - kaldi_ark
98
+ allow_variable_data_keys: false
99
+ max_cache_size: 0.0
100
+ max_cache_fd: 32
101
+ valid_max_cache_size: null
102
+ save_ckpt_every_steps: -1
103
+ optim: adam
104
+ optim_conf:
105
+ lr: 0.0003
106
+ betas:
107
+ - 0.5
108
+ - 0.9
109
+ scheduler: null
110
+ scheduler_conf: {}
111
+ optim2: adam
112
+ optim2_conf:
113
+ lr: 0.0003
114
+ betas:
115
+ - 0.5
116
+ - 0.9
117
+ scheduler2: null
118
+ scheduler2_conf: {}
119
+ use_pai: false
120
+ simple_ddp: false
121
+ num_worker_count: 1
122
+ access_key_id: null
123
+ access_key_secret: null
124
+ endpoint: null
125
+ bucket_name: null
126
+ oss_bucket: null
127
+ generator_first: false
128
+ input_size: 1
129
+ cmvn_file: null
130
+ disc_grad_clip: -1
131
+ disc_grad_clip_type: 2.0
132
+ gen_train_interval: 1
133
+ disc_train_interval: 1
134
+ stat_flops: false
135
+ use_preprocessor: true
136
+ speech_volume_normalize: null
137
+ speech_rms_normalize: false
138
+ speech_max_length: 40960
139
+ sampling_rate: 16000
140
+ valid_max_length: 40960
141
+ frontend: null
142
+ frontend_conf: {}
143
+ normalize: null
144
+ normalize_conf: {}
145
+ encoder: encodec_seanet_encoder
146
+ encoder_conf:
147
+ ratios:
148
+ - 8
149
+ - 5
150
+ - 4
151
+ - 2
152
+ - 2
153
+ norm: time_group_norm
154
+ causal: false
155
+ quantizer: costume_quantizer
156
+ quantizer_conf:
157
+ codebook_size: 1024
158
+ num_quantizers: 32
159
+ ema_decay: 0.99
160
+ kmeans_init: true
161
+ sampling_rate: 16000
162
+ quantize_dropout: true
163
+ rand_num_quant:
164
+ - 2
165
+ - 4
166
+ - 8
167
+ - 16
168
+ - 32
169
+ use_ddp: true
170
+ encoder_hop_length: 640
171
+ decoder: encodec_seanet_decoder
172
+ decoder_conf:
173
+ ratios:
174
+ - 8
175
+ - 5
176
+ - 4
177
+ - 2
178
+ - 2
179
+ norm: time_group_norm
180
+ causal: false
181
+ model: encodec
182
+ model_conf:
183
+ odim: 128
184
+ multi_spectral_window_powers_of_two:
185
+ - 5
186
+ - 6
187
+ - 7
188
+ - 8
189
+ - 9
190
+ - 10
191
+ target_sample_hz: 16000
192
+ audio_normalize: true
193
+ use_power_spec_loss: true
194
+ segment_dur: null
195
+ overlap_ratio: null
196
+ discriminator: multiple_disc
197
+ discriminator_conf:
198
+ disc_conf_list:
199
+ - filters: 32
200
+ name: encodec_multi_scale_stft_discriminator
201
+ gpu_id: 0
202
+ distributed: true
203
+ version: 0.2.0
models/48k/25epoch.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1231b674ad208973eff571241218631ac7e4720382b1e3a275021ccff529a199
3
+ size 265939202
models/48k/config.yaml ADDED
@@ -0,0 +1,203 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ config: conf/encodec_16k_n32_600k_step_ds640_48khz.yaml
2
+ print_config: false
3
+ log_level: INFO
4
+ dry_run: false
5
+ iterator_type: sequence
6
+ output_dir: ./exp/encodec_16k_n32_600k_step_ds640_48khz
7
+ ngpu: 8
8
+ seed: 0
9
+ num_workers: 8
10
+ num_att_plot: 0
11
+ dist_backend: nccl
12
+ dist_init_method: file:///ws/FunCodec/egs/codecSuperb/exp/encodec_16k_n32_600k_step_ds640_48khz/ddp_init
13
+ dist_world_size: 8
14
+ dist_rank: 0
15
+ local_rank: 0
16
+ dist_master_addr: null
17
+ dist_master_port: null
18
+ dist_launcher: null
19
+ multiprocessing_distributed: true
20
+ unused_parameters: true
21
+ sharded_ddp: false
22
+ cudnn_enabled: true
23
+ cudnn_benchmark: false
24
+ cudnn_deterministic: false
25
+ collect_stats: false
26
+ write_collected_feats: false
27
+ max_epoch: 60
28
+ max_update: 9223372036854775807
29
+ patience: null
30
+ val_scheduler_criterion:
31
+ - valid
32
+ - loss
33
+ early_stopping_criterion:
34
+ - valid
35
+ - loss
36
+ - min
37
+ best_model_criterion:
38
+ - - valid
39
+ - generator_multi_spectral_recon_loss
40
+ - min
41
+ keep_nbest_models: 5
42
+ nbest_averaging_interval: 0
43
+ grad_clip: -1
44
+ grad_clip_type: 2.0
45
+ grad_noise: false
46
+ accum_grad: 1
47
+ no_forward_run: false
48
+ resume: true
49
+ train_dtype: float32
50
+ use_amp: false
51
+ log_interval: 50
52
+ use_tensorboard: true
53
+ use_wandb: false
54
+ wandb_project: null
55
+ wandb_id: null
56
+ wandb_entity: null
57
+ wandb_name: null
58
+ wandb_model_log_interval: -1
59
+ detect_anomaly: false
60
+ pretrain_path: null
61
+ init_param:
62
+ - exp/encodec_16k_n32_600k_step_ds640_8gpus/30epoch.pth
63
+ ignore_init_mismatch: true
64
+ freeze_param: []
65
+ num_iters_per_epoch: 200
66
+ batch_size: 512
67
+ valid_batch_size: null
68
+ batch_bins: 8000000
69
+ valid_batch_bins: null
70
+ drop_last: true
71
+ train_shape_file:
72
+ - ./exp/codecSuperb_states/train/speech_shape
73
+ valid_shape_file:
74
+ - ./exp/codecSuperb_states/dev/speech_shape
75
+ batch_type: unsorted
76
+ valid_batch_type: null
77
+ speech_length_min: -1
78
+ speech_length_max: -1
79
+ fold_length: []
80
+ sort_in_batch: descending
81
+ sort_batch: descending
82
+ multiple_iterator: false
83
+ chunk_length: 500
84
+ chunk_shift_ratio: 0.5
85
+ num_cache_chunks: 1024
86
+ dataset_type: small
87
+ dataset_conf: {}
88
+ train_data_file: null
89
+ valid_data_file: null
90
+ train_data_path_and_name_and_type:
91
+ - - ./dump/codecSuperb_48k/train/wav.scp
92
+ - speech
93
+ - kaldi_ark
94
+ valid_data_path_and_name_and_type:
95
+ - - ./dump/codecSuperb_48k/dev/wav.scp
96
+ - speech
97
+ - kaldi_ark
98
+ allow_variable_data_keys: false
99
+ max_cache_size: 0.0
100
+ max_cache_fd: 32
101
+ valid_max_cache_size: null
102
+ save_ckpt_every_steps: -1
103
+ optim: adam
104
+ optim_conf:
105
+ lr: 0.0003
106
+ betas:
107
+ - 0.5
108
+ - 0.9
109
+ scheduler: null
110
+ scheduler_conf: {}
111
+ optim2: adam
112
+ optim2_conf:
113
+ lr: 0.0001
114
+ betas:
115
+ - 0.5
116
+ - 0.9
117
+ scheduler2: null
118
+ scheduler2_conf: {}
119
+ use_pai: false
120
+ simple_ddp: false
121
+ num_worker_count: 1
122
+ access_key_id: null
123
+ access_key_secret: null
124
+ endpoint: null
125
+ bucket_name: null
126
+ oss_bucket: null
127
+ generator_first: false
128
+ input_size: 1
129
+ cmvn_file: null
130
+ disc_grad_clip: -1
131
+ disc_grad_clip_type: 2.0
132
+ gen_train_interval: 1
133
+ disc_train_interval: 1
134
+ stat_flops: false
135
+ use_preprocessor: true
136
+ speech_volume_normalize: null
137
+ speech_rms_normalize: false
138
+ speech_max_length: 40960
139
+ sampling_rate: 48000
140
+ valid_max_length: 40960
141
+ frontend: null
142
+ frontend_conf: {}
143
+ normalize: null
144
+ normalize_conf: {}
145
+ encoder: encodec_seanet_encoder
146
+ encoder_conf:
147
+ ratios:
148
+ - 8
149
+ - 5
150
+ - 4
151
+ - 2
152
+ - 2
153
+ norm: time_group_norm
154
+ causal: false
155
+ quantizer: costume_quantizer
156
+ quantizer_conf:
157
+ codebook_size: 1024
158
+ num_quantizers: 32
159
+ ema_decay: 0.99
160
+ kmeans_init: true
161
+ sampling_rate: 48000
162
+ quantize_dropout: true
163
+ rand_num_quant:
164
+ - 2
165
+ - 4
166
+ - 8
167
+ - 16
168
+ - 32
169
+ use_ddp: true
170
+ encoder_hop_length: 640
171
+ decoder: encodec_seanet_decoder
172
+ decoder_conf:
173
+ ratios:
174
+ - 8
175
+ - 5
176
+ - 4
177
+ - 2
178
+ - 2
179
+ norm: time_group_norm
180
+ causal: false
181
+ model: encodec
182
+ model_conf:
183
+ odim: 128
184
+ multi_spectral_window_powers_of_two:
185
+ - 5
186
+ - 6
187
+ - 7
188
+ - 8
189
+ - 9
190
+ - 10
191
+ target_sample_hz: 48000
192
+ audio_normalize: true
193
+ use_power_spec_loss: true
194
+ segment_dur: null
195
+ overlap_ratio: null
196
+ discriminator: multiple_disc
197
+ discriminator_conf:
198
+ disc_conf_list:
199
+ - filters: 32
200
+ name: encodec_multi_scale_stft_discriminator
201
+ gpu_id: 0
202
+ distributed: true
203
+ version: 0.2.0