LSCodec inference bundle: vocoder + codebook + pre-computed UK/EN prompt embeds

Browse files

Files changed (6) hide show

codebook.npy +3 -0
lscodec_inference_src.tar.gz +3 -0
lscodec_vocoder.pt +3 -0
prompt_emb_en.pt +3 -0
prompt_emb_uk.pt +3 -0
vocoder_config.yml +220 -0

codebook.npy ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:64a97b3ce91abc2704083025edf0a5d585396632747b47f6b4cfdc2babe28494
+size 262272

lscodec_inference_src.tar.gz ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:f9c29757dd913e3373459aae6e1df6ff34ccbc2550e0f1b0bce0cf26b1176c25
+size 61739

lscodec_vocoder.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:11d6eab6ee4a529bbb47a7b40a5e31ef1fd14dfe5ca80ad169eb8ec4635b1fd0
+size 125569137

prompt_emb_en.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:0cbaee2f330026c277ea65da312dcd8447dc76f4c9c004f34b0be93abf2a1d63
+size 595588

prompt_emb_uk.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:79c8442da361224555503a20f28227db294cbf7f662c2c93e3887722e80b51d5
+size 890500

vocoder_config.yml ADDED Viewed

	@@ -0,0 +1,220 @@

+allow_cache: false
+batch_frames: 10000
+config: conf/ctxv2w.v1.yaml
+crop_max_frames: 100
+dev_aux_scp: feats/normed_ppe/dev_all/feats.scp
+dev_mel_scp: feats/normed_fbank/dev_all/feats.scp
+dev_num_frames: data/dev_all/utt2num_frames
+dev_prompt_scp: feats/wavlm_l6/dev_all/feats.scp
+dev_segments: null
+dev_vqidx_scp: feats/vqidx/dev_all/feats.scp
+dev_wav_scp: data/dev_all/wav.scp
+dev_xvector_scp: null
+discriminator_adv_loss_params:
+  average_by_discriminators: false
+discriminator_grad_norm: -1
+discriminator_optimizer_params:
+  betas:
+  - 0.5
+  - 0.9
+  lr: 0.0002
+  weight_decay: 0.0
+discriminator_optimizer_type: Adam
+discriminator_params:
+  follow_official_norm: true
+  period_discriminator_params:
+    bias: true
+    channels: 32
+    downsample_scales:
+    - 3
+    - 3
+    - 3
+    - 3
+    - 1
+    in_channels: 1
+    kernel_sizes:
+    - 5
+    - 3
+    max_downsample_channels: 1024
+    nonlinear_activation: LeakyReLU
+    nonlinear_activation_params:
+      negative_slope: 0.1
+    out_channels: 1
+    use_spectral_norm: false
+    use_weight_norm: true
+  periods:
+  - 2
+  - 3
+  - 5
+  - 7
+  - 11
+  scale_discriminator_params:
+    bias: true
+    channels: 128
+    downsample_scales:
+    - 4
+    - 4
+    - 4
+    - 4
+    - 1
+    in_channels: 1
+    kernel_sizes:
+    - 15
+    - 41
+    - 5
+    - 3
+    max_downsample_channels: 1024
+    max_groups: 16
+    nonlinear_activation: LeakyReLU
+    nonlinear_activation_params:
+      negative_slope: 0.1
+    out_channels: 1
+  scale_downsample_pooling: AvgPool1d
+  scale_downsample_pooling_params:
+    kernel_size: 4
+    padding: 2
+    stride: 2
+  scales: 3
+discriminator_scheduler_params:
+  gamma: 0.5
+  milestones:
+  - 200000
+  - 400000
+  - 600000
+  - 800000
+discriminator_scheduler_type: MultiStepLR
+discriminator_train_start_steps: 0
+discriminator_type: HiFiGANMultiScaleMultiPeriodDiscriminator
+distributed: true
+dropout_features: 0.0
+eval_interval_steps: 100000
+feat_match_loss_params:
+  average_by_discriminators: false
+  average_by_layers: false
+  include_final_outputs: false
+frontend_mel_prediction_stop_steps: 200000
+frontend_params:
+  conformer_params:
+    activation_type: swish
+    attention_dim: 184
+    attention_dropout_rate: 0.2
+    attention_heads: 2
+    cnn_module_kernel: 31
+    concat_after: false
+    dropout_rate: 0.2
+    linear_units: 1536
+    macaron_style: true
+    normalize_before: true
+    num_blocks: 2
+    pos_enc_layer_type: rel_pos
+    positional_dropout_rate: 0.2
+    positionwise_conv_kernel_size: 3
+    positionwise_layer_type: conv1d
+    selfattention_layer_type: rel_selfattn
+    use_cnn_module: true
+  prompt_channels: 1024
+  vqvec_channels: 64
+generator_adv_loss_params:
+  average_by_discriminators: false
+generator_grad_norm: -1
+generator_optimizer_params:
+  betas:
+  - 0.5
+  - 0.9
+  lr: 0.0002
+  weight_decay: 0.0
+generator_optimizer_type: Adam
+generator_params:
+  bias: true
+  channels: 512
+  in_channels: 184
+  kernel_size: 7
+  nonlinear_activation: LeakyReLU
+  nonlinear_activation_params:
+    negative_slope: 0.1
+  out_channels: 1
+  resblock_dilations:
+  - - 1
+    - 3
+    - 5
+  - - 1
+    - 3
+    - 5
+  - - 1
+    - 3
+    - 5
+  resblock_kernel_sizes:
+  - 3
+  - 7
+  - 11
+  upsample_kernel_sizes:
+  - 16
+  - 10
+  - 8
+  - 6
+  upsample_scales:
+  - 8
+  - 5
+  - 4
+  - 3
+  use_additional_convs: true
+  use_weight_norm: true
+generator_scheduler_params:
+  gamma: 0.5
+  milestones:
+  - 200000
+  - 400000
+  - 600000
+  - 800000
+generator_scheduler_type: MultiStepLR
+generator_train_start_steps: 1
+generator_type: HiFiGANGenerator
+hop_size: 480
+lambda_adv: 1.0
+lambda_aux: 45.0
+lambda_feat_match: 2.0
+lambda_frontend_mel_prediction: 60
+length_tolerance: 5
+log_interval_steps: 1000
+max_num_frames: 3000
+mel_loss_params:
+  fft_size: 2048
+  fmax: 8000
+  fmin: 40
+  fs: 24000
+  hop_size: 300
+  log_base: null
+  num_mels: 80
+  win_length: 1200
+  window: hann
+min_num_frames: 600
+num_mels: 80
+num_save_intermediate_results: 4
+num_workers: 64
+outdir: exp/train_all_ctxv2w.v1
+pin_memory: true
+pretrain: ''
+prompt_net_type: ConvPromptPrenet
+rank: 0
+resume: ''
+sampling_rate: 24000
+save_interval_steps: 10000
+train_aux_scp: feats/normed_ppe/train_all/feats.scp
+train_max_steps: 1000000
+train_mel_scp: feats/normed_fbank/train_all/feats.scp
+train_num_frames: data/train_all/utt2num_frames
+train_prompt_scp: feats/wavlm_l6/train_all/feats.scp
+train_segments: null
+train_vqidx_scp: feats/vqidx/train_all/feats.scp
+train_wav_scp: data/train_all/wav.scp
+train_xvector_scp: null
+use_feat_match_loss: true
+use_mel_loss: true
+use_stft_loss: false
+verbose: 1
+version: 0.5.3
+vq_codebook: pretrained/codebook_25hz.npy
+win_length: 1394
+world_size: 2
+repeat_input_tokens: true