slyne deng commited on
Commit
f0e0360
·
1 Parent(s): 6cd09ad

init model

Browse files

Signed-off-by: slyne deng <slyned@nvidia.com>

README.md ADDED
@@ -0,0 +1,70 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Setup Environment
2
+
3
+ ```
4
+ git clone https://github.com/Slyne/FunCodec
5
+ cd FunCodec && git checkout slyne_fix && cd ..
6
+ ```
7
+
8
+ The steps below were tested in the docker image `nvcr.io/nvidia/pytorch:24.04-py3`; alternatively, a conda environment should work as well.
9
+
10
+ ```
11
+ # mount the current directory to /ws; You can put your data in your current
12
+ # directory as well.
13
+ docker run --gpus all -it -v $PWD:/ws nvcr.io/nvidia/pytorch:24.04-py3
14
+
15
+ Or
16
+
17
+ conda create -n funcodec python=3.10
18
+ ```
19
+ ### Install packages
20
+ ```
21
+ cd /ws/FunCodec;
22
+ pip install --editable ./ ; pip install torchaudio;
23
+ ```
24
+
25
+ ### Prepare dataset
26
+ Please prepare your dataset as files named like `${sampling_rate}_wav.scp` and put them in `/ws/test_wavscp/`
27
+ ```
28
+ 44100_wav.scp
29
+ 48000_wav.scp
30
+ 16000_wav.scp
31
+ ```
32
+
33
+ Each `wav.scp` file looks like below:
34
+ ```
35
+ <wavid> <absolute_path>
36
+ WAbHmvQ9zME_00002 /raid/slyne/codec_evaluation/Codec-SUPERB/data/vox1_test_wav/wav/id10302/WAbHmvQ9zME/00002.wav
37
+ ```
38
+
39
+ **Example**
40
+ Please follow [here](https://github.com/voidful/Codec-SUPERB/tree/SLT_Challenge?tab=readme-ov-file#2-data-download) to download `Codec-SUPERB` test datasets.
41
+
42
+ ```
43
+ # suppose the unzip data dir is /ws/data
44
+ python3 generate_wavscp.py --input_dir=/ws/data
45
+ ```
46
+
47
+ ### Download models
48
+
49
+ Download models from [TO UPDATE](xxxx). And put them under `FunCodec/egs/codecSuperb/models`
50
+
51
+
52
+ ### Do inference
53
+ Please refer to `FunCodec/egs/codecSuperb/do_codecSuperb_infer.sh` to do inference.
54
+
55
+
56
+ ```
57
+ # set model to the default model trained with 16khz data
58
+ model_dir=models/16k/
59
+ model_name=8epoch.pth
60
+ sample_rates=(16000 44100 48000) # the input wavscp sample rate can be 16khz, 44.1khz or 48khz
61
+
62
+ ```
63
+
64
+ Run:
65
+ ```
66
+ cd FunCodec/egs/codecSuperb/
67
+ # modify the ref_audio_dir and syn_audio_dir
68
+ bash do_codecSuperb_infer.sh
69
+ ```
70
+
models/16k/12epoch.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d1ce997aff7f5a865ca535b2a1bac873d7851a682b14cb028dc80b26fe87174a
3
+ size 265939202
models/16k/8epoch.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a3a972f9fc4ea30e6c97fa4ddb38d5aebfde250012fe8606189e01c61995f4b3
3
+ size 265933318
models/16k/config.yaml ADDED
@@ -0,0 +1,203 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ config: conf/encodec_16k_n32_600k_step_ds640.yaml
2
+ print_config: false
3
+ log_level: INFO
4
+ dry_run: false
5
+ iterator_type: sequence
6
+ output_dir: ./exp/encodec_16k_n32_600k_step_ds640
7
+ ngpu: 4
8
+ seed: 0
9
+ num_workers: 8
10
+ num_att_plot: 0
11
+ dist_backend: nccl
12
+ dist_init_method: file:///raid/slyne/FunCodec/egs/codecSuperb/exp/encodec_16k_n32_600k_step_ds640/ddp_init
13
+ dist_world_size: 4
14
+ dist_rank: 0
15
+ local_rank: 0
16
+ dist_master_addr: null
17
+ dist_master_port: null
18
+ dist_launcher: null
19
+ multiprocessing_distributed: true
20
+ unused_parameters: true
21
+ sharded_ddp: false
22
+ cudnn_enabled: true
23
+ cudnn_benchmark: false
24
+ cudnn_deterministic: false
25
+ collect_stats: false
26
+ write_collected_feats: false
27
+ max_epoch: 60
28
+ max_update: 9223372036854775807
29
+ patience: null
30
+ val_scheduler_criterion:
31
+ - valid
32
+ - loss
33
+ early_stopping_criterion:
34
+ - valid
35
+ - loss
36
+ - min
37
+ best_model_criterion:
38
+ - - valid
39
+ - generator_multi_spectral_recon_loss
40
+ - min
41
+ keep_nbest_models: 60
42
+ nbest_averaging_interval: 0
43
+ grad_clip: -1
44
+ grad_clip_type: 2.0
45
+ grad_noise: false
46
+ accum_grad: 1
47
+ no_forward_run: false
48
+ resume: true
49
+ train_dtype: float32
50
+ use_amp: false
51
+ log_interval: 50
52
+ use_tensorboard: true
53
+ use_wandb: false
54
+ wandb_project: null
55
+ wandb_id: null
56
+ wandb_entity: null
57
+ wandb_name: null
58
+ wandb_model_log_interval: -1
59
+ detect_anomaly: false
60
+ pretrain_path: null
61
+ init_param:
62
+ - /raid/slyne/FunCodec/egs/codecSuperb/exp/1_encodec_16k_n32_600k_step_ds640/29epoch.pth
63
+ ignore_init_mismatch: true
64
+ freeze_param: []
65
+ num_iters_per_epoch: 38418
66
+ batch_size: 64
67
+ valid_batch_size: null
68
+ batch_bins: 4000000
69
+ valid_batch_bins: null
70
+ drop_last: true
71
+ train_shape_file:
72
+ - ./exp/codecSuperb_states/train/speech_shape
73
+ valid_shape_file:
74
+ - ./exp/codecSuperb_states/dev/speech_shape
75
+ batch_type: unsorted
76
+ valid_batch_type: null
77
+ speech_length_min: -1
78
+ speech_length_max: -1
79
+ fold_length: []
80
+ sort_in_batch: descending
81
+ sort_batch: descending
82
+ multiple_iterator: false
83
+ chunk_length: 500
84
+ chunk_shift_ratio: 0.5
85
+ num_cache_chunks: 1024
86
+ dataset_type: small
87
+ dataset_conf: {}
88
+ train_data_file: null
89
+ valid_data_file: null
90
+ train_data_path_and_name_and_type:
91
+ - - ./dump/codecSuperb/train/wav.scp
92
+ - speech
93
+ - kaldi_ark
94
+ valid_data_path_and_name_and_type:
95
+ - - ./dump/codecSuperb/dev/wav.scp
96
+ - speech
97
+ - kaldi_ark
98
+ allow_variable_data_keys: false
99
+ max_cache_size: 0.0
100
+ max_cache_fd: 32
101
+ valid_max_cache_size: null
102
+ save_ckpt_every_steps: -1
103
+ optim: adam
104
+ optim_conf:
105
+ lr: 0.0003
106
+ betas:
107
+ - 0.5
108
+ - 0.9
109
+ scheduler: null
110
+ scheduler_conf: {}
111
+ optim2: adam
112
+ optim2_conf:
113
+ lr: 0.0003
114
+ betas:
115
+ - 0.5
116
+ - 0.9
117
+ scheduler2: null
118
+ scheduler2_conf: {}
119
+ use_pai: false
120
+ simple_ddp: false
121
+ num_worker_count: 1
122
+ access_key_id: null
123
+ access_key_secret: null
124
+ endpoint: null
125
+ bucket_name: null
126
+ oss_bucket: null
127
+ generator_first: false
128
+ input_size: 1
129
+ cmvn_file: null
130
+ disc_grad_clip: -1
131
+ disc_grad_clip_type: 2.0
132
+ gen_train_interval: 1
133
+ disc_train_interval: 1
134
+ stat_flops: false
135
+ use_preprocessor: true
136
+ speech_volume_normalize: null
137
+ speech_rms_normalize: false
138
+ speech_max_length: 40960
139
+ sampling_rate: 16000
140
+ valid_max_length: 40960
141
+ frontend: null
142
+ frontend_conf: {}
143
+ normalize: null
144
+ normalize_conf: {}
145
+ encoder: encodec_seanet_encoder
146
+ encoder_conf:
147
+ ratios:
148
+ - 8
149
+ - 5
150
+ - 4
151
+ - 2
152
+ - 2
153
+ norm: time_group_norm
154
+ causal: false
155
+ quantizer: costume_quantizer
156
+ quantizer_conf:
157
+ codebook_size: 1024
158
+ num_quantizers: 32
159
+ ema_decay: 0.99
160
+ kmeans_init: true
161
+ sampling_rate: 16000
162
+ quantize_dropout: true
163
+ rand_num_quant:
164
+ - 2
165
+ - 4
166
+ - 8
167
+ - 16
168
+ - 32
169
+ use_ddp: true
170
+ encoder_hop_length: 640
171
+ decoder: encodec_seanet_decoder
172
+ decoder_conf:
173
+ ratios:
174
+ - 8
175
+ - 5
176
+ - 4
177
+ - 2
178
+ - 2
179
+ norm: time_group_norm
180
+ causal: false
181
+ model: encodec
182
+ model_conf:
183
+ odim: 128
184
+ multi_spectral_window_powers_of_two:
185
+ - 5
186
+ - 6
187
+ - 7
188
+ - 8
189
+ - 9
190
+ - 10
191
+ target_sample_hz: 16000
192
+ audio_normalize: true
193
+ use_power_spec_loss: true
194
+ segment_dur: null
195
+ overlap_ratio: null
196
+ discriminator: multiple_disc
197
+ discriminator_conf:
198
+ disc_conf_list:
199
+ - filters: 32
200
+ name: encodec_multi_scale_stft_discriminator
201
+ gpu_id: 0
202
+ distributed: true
203
+ version: 0.2.0
models/48k/25epoch.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1231b674ad208973eff571241218631ac7e4720382b1e3a275021ccff529a199
3
+ size 265939202
models/48k/config.yaml ADDED
@@ -0,0 +1,203 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ config: conf/encodec_16k_n32_600k_step_ds640_48khz.yaml
2
+ print_config: false
3
+ log_level: INFO
4
+ dry_run: false
5
+ iterator_type: sequence
6
+ output_dir: ./exp/encodec_16k_n32_600k_step_ds640_48khz
7
+ ngpu: 8
8
+ seed: 0
9
+ num_workers: 8
10
+ num_att_plot: 0
11
+ dist_backend: nccl
12
+ dist_init_method: file:///ws/FunCodec/egs/codecSuperb/exp/encodec_16k_n32_600k_step_ds640_48khz/ddp_init
13
+ dist_world_size: 8
14
+ dist_rank: 0
15
+ local_rank: 0
16
+ dist_master_addr: null
17
+ dist_master_port: null
18
+ dist_launcher: null
19
+ multiprocessing_distributed: true
20
+ unused_parameters: true
21
+ sharded_ddp: false
22
+ cudnn_enabled: true
23
+ cudnn_benchmark: false
24
+ cudnn_deterministic: false
25
+ collect_stats: false
26
+ write_collected_feats: false
27
+ max_epoch: 60
28
+ max_update: 9223372036854775807
29
+ patience: null
30
+ val_scheduler_criterion:
31
+ - valid
32
+ - loss
33
+ early_stopping_criterion:
34
+ - valid
35
+ - loss
36
+ - min
37
+ best_model_criterion:
38
+ - - valid
39
+ - generator_multi_spectral_recon_loss
40
+ - min
41
+ keep_nbest_models: 5
42
+ nbest_averaging_interval: 0
43
+ grad_clip: -1
44
+ grad_clip_type: 2.0
45
+ grad_noise: false
46
+ accum_grad: 1
47
+ no_forward_run: false
48
+ resume: true
49
+ train_dtype: float32
50
+ use_amp: false
51
+ log_interval: 50
52
+ use_tensorboard: true
53
+ use_wandb: false
54
+ wandb_project: null
55
+ wandb_id: null
56
+ wandb_entity: null
57
+ wandb_name: null
58
+ wandb_model_log_interval: -1
59
+ detect_anomaly: false
60
+ pretrain_path: null
61
+ init_param:
62
+ - exp/encodec_16k_n32_600k_step_ds640_8gpus/30epoch.pth
63
+ ignore_init_mismatch: true
64
+ freeze_param: []
65
+ num_iters_per_epoch: 200
66
+ batch_size: 512
67
+ valid_batch_size: null
68
+ batch_bins: 8000000
69
+ valid_batch_bins: null
70
+ drop_last: true
71
+ train_shape_file:
72
+ - ./exp/codecSuperb_states/train/speech_shape
73
+ valid_shape_file:
74
+ - ./exp/codecSuperb_states/dev/speech_shape
75
+ batch_type: unsorted
76
+ valid_batch_type: null
77
+ speech_length_min: -1
78
+ speech_length_max: -1
79
+ fold_length: []
80
+ sort_in_batch: descending
81
+ sort_batch: descending
82
+ multiple_iterator: false
83
+ chunk_length: 500
84
+ chunk_shift_ratio: 0.5
85
+ num_cache_chunks: 1024
86
+ dataset_type: small
87
+ dataset_conf: {}
88
+ train_data_file: null
89
+ valid_data_file: null
90
+ train_data_path_and_name_and_type:
91
+ - - ./dump/codecSuperb_48k/train/wav.scp
92
+ - speech
93
+ - kaldi_ark
94
+ valid_data_path_and_name_and_type:
95
+ - - ./dump/codecSuperb_48k/dev/wav.scp
96
+ - speech
97
+ - kaldi_ark
98
+ allow_variable_data_keys: false
99
+ max_cache_size: 0.0
100
+ max_cache_fd: 32
101
+ valid_max_cache_size: null
102
+ save_ckpt_every_steps: -1
103
+ optim: adam
104
+ optim_conf:
105
+ lr: 0.0003
106
+ betas:
107
+ - 0.5
108
+ - 0.9
109
+ scheduler: null
110
+ scheduler_conf: {}
111
+ optim2: adam
112
+ optim2_conf:
113
+ lr: 0.0001
114
+ betas:
115
+ - 0.5
116
+ - 0.9
117
+ scheduler2: null
118
+ scheduler2_conf: {}
119
+ use_pai: false
120
+ simple_ddp: false
121
+ num_worker_count: 1
122
+ access_key_id: null
123
+ access_key_secret: null
124
+ endpoint: null
125
+ bucket_name: null
126
+ oss_bucket: null
127
+ generator_first: false
128
+ input_size: 1
129
+ cmvn_file: null
130
+ disc_grad_clip: -1
131
+ disc_grad_clip_type: 2.0
132
+ gen_train_interval: 1
133
+ disc_train_interval: 1
134
+ stat_flops: false
135
+ use_preprocessor: true
136
+ speech_volume_normalize: null
137
+ speech_rms_normalize: false
138
+ speech_max_length: 40960
139
+ sampling_rate: 48000
140
+ valid_max_length: 40960
141
+ frontend: null
142
+ frontend_conf: {}
143
+ normalize: null
144
+ normalize_conf: {}
145
+ encoder: encodec_seanet_encoder
146
+ encoder_conf:
147
+ ratios:
148
+ - 8
149
+ - 5
150
+ - 4
151
+ - 2
152
+ - 2
153
+ norm: time_group_norm
154
+ causal: false
155
+ quantizer: costume_quantizer
156
+ quantizer_conf:
157
+ codebook_size: 1024
158
+ num_quantizers: 32
159
+ ema_decay: 0.99
160
+ kmeans_init: true
161
+ sampling_rate: 48000
162
+ quantize_dropout: true
163
+ rand_num_quant:
164
+ - 2
165
+ - 4
166
+ - 8
167
+ - 16
168
+ - 32
169
+ use_ddp: true
170
+ encoder_hop_length: 640
171
+ decoder: encodec_seanet_decoder
172
+ decoder_conf:
173
+ ratios:
174
+ - 8
175
+ - 5
176
+ - 4
177
+ - 2
178
+ - 2
179
+ norm: time_group_norm
180
+ causal: false
181
+ model: encodec
182
+ model_conf:
183
+ odim: 128
184
+ multi_spectral_window_powers_of_two:
185
+ - 5
186
+ - 6
187
+ - 7
188
+ - 8
189
+ - 9
190
+ - 10
191
+ target_sample_hz: 48000
192
+ audio_normalize: true
193
+ use_power_spec_loss: true
194
+ segment_dur: null
195
+ overlap_ratio: null
196
+ discriminator: multiple_disc
197
+ discriminator_conf:
198
+ disc_conf_list:
199
+ - filters: 32
200
+ name: encodec_multi_scale_stft_discriminator
201
+ gpu_id: 0
202
+ distributed: true
203
+ version: 0.2.0