| | --- |
| | tags: |
| | - espnet |
| | - audio |
| | - codec |
| | language: multilingual |
| | datasets: |
| | - amuse |
| | license: cc-by-4.0 |
| | --- |
| | |
| | ## ESPnet2 Codec model |
| |
|
| | ### `espnet/owsm_dac_v2_16k` |
| | |
| | This model was trained by ftshijt using the amuse recipe in [ESPnet](https://github.com/espnet/espnet/). |
| | |
| | ### Demo: How to use in ESPnet2 |
| | |
| | Follow the [ESPnet installation instructions](https://espnet.github.io/espnet/installation.html) |
| | if you haven't done that already. |
| | |
| | ```bash |
| | cd espnet |
| | git checkout 280bfedf2c9a19038e79d3402472bde30397a02c |
| | pip install -e . |
| | cd egs2/amuse/codec1 |
| | ./run.sh --skip_data_prep false --skip_train true --download_model espnet/owsm_dac_v2_16k |
| | ``` |
| | |
| | |
| | |
| | ## Codec config |
| | |
| | <details><summary>expand</summary> |
| | |
| | ```yaml |
| | config: conf/train_dac_large_v2.yaml |
| | print_config: false |
| | log_level: INFO |
| | drop_last_iter: false |
| | dry_run: false |
| | iterator_type: chunk |
| | valid_iterator_type: null |
| | output_dir: exp/codec_train_dac_large_v2_raw_fs16000 |
| | ngpu: 1 |
| | seed: 777 |
| | num_workers: 1 |
| | num_att_plot: 0 |
| | dist_backend: nccl |
| | dist_init_method: env:// |
| | dist_world_size: 2 |
| | dist_rank: 0 |
| | local_rank: 0 |
| | dist_master_addr: localhost |
| | dist_master_port: 45173 |
| | dist_launcher: null |
| | multiprocessing_distributed: true |
| | unused_parameters: true |
| | sharded_ddp: false |
| | use_deepspeed: false |
| | deepspeed_config: null |
| | cudnn_enabled: true |
| | cudnn_benchmark: false |
| | cudnn_deterministic: false |
| | use_tf32: false |
| | collect_stats: false |
| | write_collected_feats: false |
| | max_epoch: 360 |
| | patience: null |
| | val_scheduler_criterion: |
| | - valid |
| | - loss |
| | early_stopping_criterion: |
| | - valid |
| | - loss |
| | - min |
| | best_model_criterion: |
| | - - valid |
| | - mel_loss |
| | - min |
| | - - train |
| | - mel_loss |
| | - min |
| | - - train |
| | - total_count |
| | - max |
| | keep_nbest_models: 5 |
| | nbest_averaging_interval: 0 |
| | grad_clip: -1 |
| | grad_clip_type: 2.0 |
| | grad_noise: false |
| | accum_grad: 1 |
| | no_forward_run: false |
| | resume: true |
| | train_dtype: float32 |
| | use_amp: false |
| | log_interval: 50 |
| | use_matplotlib: true |
| | use_tensorboard: true |
| | create_graph_in_tensorboard: false |
| | use_wandb: false |
| | wandb_project: null |
| | wandb_id: null |
| | wandb_entity: null |
| | wandb_name: null |
| | wandb_model_log_interval: -1 |
| | detect_anomaly: false |
| | use_adapter: false |
| | adapter: lora |
| | save_strategy: all |
| | adapter_conf: {} |
| | pretrain_path: null |
| | init_param: [] |
| | ignore_init_mismatch: false |
| | freeze_param: [] |
| | num_iters_per_epoch: 5000 |
| | batch_size: 64 |
| | valid_batch_size: null |
| | batch_bins: 1000000 |
| | valid_batch_bins: null |
| | category_sample_size: 10 |
| | train_shape_file: |
| | - exp/codec_stats_raw/train/audio_shape |
| | valid_shape_file: |
| | - exp/codec_stats_raw/valid/audio_shape |
| | batch_type: unsorted |
| | valid_batch_type: null |
| | fold_length: |
| | - 256000 |
| | sort_in_batch: descending |
| | shuffle_within_batch: false |
| | sort_batch: descending |
| | multiple_iterator: false |
| | chunk_length: 32000 |
| | chunk_shift_ratio: 0.5 |
| | num_cache_chunks: 256 |
| | chunk_excluded_key_prefixes: [] |
| | chunk_default_fs: null |
| | chunk_max_abs_length: null |
| | chunk_discard_short_samples: true |
| | train_data_path_and_name_and_type: |
| | - - dump/raw/owsm_all/wav.scp |
| | - audio |
| | - kaldi_ark |
| | valid_data_path_and_name_and_type: |
| | - - dump/raw/dev-small/wav.scp |
| | - audio |
| | - kaldi_ark |
| | multi_task_dataset: false |
| | allow_variable_data_keys: false |
| | max_cache_size: 0.0 |
| | max_cache_fd: 32 |
| | allow_multi_rates: false |
| | valid_max_cache_size: null |
| | exclude_weight_decay: false |
| | exclude_weight_decay_conf: {} |
| | optim: adamw |
| | optim_conf: |
| | lr: 0.0002 |
| | betas: |
| | - 0.5 |
| | - 0.9 |
| | eps: 1.0e-09 |
| | weight_decay: 0.0 |
| | scheduler: exponentiallr |
| | scheduler_conf: |
| | gamma: 0.999875 |
| | optim2: adamw |
| | optim2_conf: |
| | lr: 0.0002 |
| | betas: |
| | - 0.5 |
| | - 0.9 |
| | eps: 1.0e-09 |
| | weight_decay: 0.0 |
| | scheduler2: exponentiallr |
| | scheduler2_conf: |
| | gamma: 0.999875 |
| | generator_first: true |
| | skip_discriminator_prob: 0.0 |
| | model_conf: {} |
| | use_preprocessor: true |
| | codec: dac |
| | codec_conf: |
| | sampling_rate: 16000 |
| | generator_params: |
| | hidden_dim: 512 |
| | codebook_dim: 512 |
| | encdec_channels: 1 |
| | encdec_n_filters: 32 |
| | encdec_n_residual_layers: 3 |
| | encdec_ratios: |
| | - 8 |
| | - 5 |
| | - 4 |
| | - 2 |
| | encdec_activation: Snake |
| | encdec_norm: weight_norm |
| | encdec_kernel_size: 7 |
| | encdec_residual_kernel_size: 7 |
| | encdec_last_kernel_size: 7 |
| | encdec_dilation_base: 2 |
| | encdec_causal: false |
| | encdec_pad_mode: reflect |
| | encdec_true_skip: false |
| | encdec_compress: 2 |
| | encdec_lstm: 2 |
| | decoder_trim_right_ratio: 1.0 |
| | decoder_final_activation: null |
| | decoder_final_activation_params: null |
| | quantizer_n_q: 8 |
| | quantizer_bins: 1024 |
| | quantizer_decay: 0.99 |
| | quantizer_kmeans_init: true |
| | quantizer_kmeans_iters: 50 |
| | quantizer_threshold_ema_dead_code: 2 |
| | quantizer_target_bandwidth: |
| | - 0.5 |
| | - 1 |
| | - 2 |
| | - 4 |
| | quantizer_dropout: true |
| | sample_rate: 16000 |
| | discriminator_params: |
| | msmpmb_discriminator_params: |
| | rates: [] |
| | sample_rate: 16000 |
| | fft_sizes: |
| | - 1024 |
| | - 512 |
| | - 256 |
| | - 128 |
| | periods: |
| | - 2 |
| | - 3 |
| | - 5 |
| | - 7 |
| | - 11 |
| | period_discriminator_params: |
| | in_channels: 1 |
| | out_channels: 1 |
| | kernel_sizes: |
| | - 5 |
| | - 3 |
| | channels: 32 |
| | downsample_scales: |
| | - 3 |
| | - 3 |
| | - 3 |
| | - 3 |
| | - 1 |
| | max_downsample_channels: 1024 |
| | bias: true |
| | nonlinear_activation: LeakyReLU |
| | nonlinear_activation_params: |
| | negative_slope: 0.1 |
| | use_weight_norm: true |
| | use_spectral_norm: false |
| | band_discriminator_params: |
| | hop_factor: 0.25 |
| | sample_rate: 16000 |
| | bands: |
| | - - 0.0 |
| | - 0.1 |
| | - - 0.1 |
| | - 0.25 |
| | - - 0.25 |
| | - 0.5 |
| | - - 0.5 |
| | - 0.75 |
| | - - 0.75 |
| | - 1.0 |
| | channel: 32 |
| | generator_adv_loss_params: |
| | average_by_discriminators: false |
| | loss_type: mse |
| | discriminator_adv_loss_params: |
| | average_by_discriminators: false |
| | loss_type: mse |
| | use_feat_match_loss: true |
| | feat_match_loss_params: |
| | average_by_discriminators: false |
| | average_by_layers: false |
| | include_final_outputs: true |
| | use_mel_loss: true |
| | mel_loss_params: |
| | range_start: 6 |
| | range_end: 11 |
| | window: hann |
| | n_mels: 80 |
| | fmin: 0 |
| | fmax: null |
| | log_base: null |
| | fs: 16000 |
| | lambda_quantization: 0.25 |
| | lambda_commit: 1.0 |
| | lambda_reconstruct: 1.0 |
| | lambda_adv: 1.0 |
| | lambda_mel: 45.0 |
| | lambda_feat_match: 2.0 |
| | cache_generator_outputs: true |
| | required: |
| | - output_dir |
| | version: '202402' |
| | distributed: true |
| | ``` |
| | |
| | </details> |
| |
|
| |
|
| |
|
| | ### Citing ESPnet |
| |
|
| | ```bibtex |
| | @inproceedings{watanabe2018espnet, |
| | author={Shinji Watanabe and Takaaki Hori and Shigeki Karita and Tomoki Hayashi and Jiro Nishitoba and Yuya Unno and Nelson Yalta and Jahn Heymann and Matthew Wiesner and Nanxin Chen and Adithya Renduchintala and Tsubasa Ochiai}, |
| | title={{ESPnet}: End-to-End Speech Processing Toolkit}, |
| | year={2018}, |
| | booktitle={Proceedings of Interspeech}, |
| | pages={2207--2211}, |
| | doi={10.21437/Interspeech.2018-1456}, |
| | url={http://dx.doi.org/10.21437/Interspeech.2018-1456} |
| | } |
| | ``` |
| |
|
| | Or, to cite the arXiv version: |
| |
|
| | ```bibtex |
| | @misc{watanabe2018espnet, |
| | title={ESPnet: End-to-End Speech Processing Toolkit}, |
| | author={Shinji Watanabe and Takaaki Hori and Shigeki Karita and Tomoki Hayashi and Jiro Nishitoba and Yuya Unno and Nelson Yalta and Jahn Heymann and Matthew Wiesner and Nanxin Chen and Adithya Renduchintala and Tsubasa Ochiai}, |
| | year={2018}, |
| | eprint={1804.00015}, |
| | archivePrefix={arXiv}, |
| | primaryClass={cs.CL} |
| | } |
| | ``` |
| |
|