Upload folder using huggingface_hub

Browse files

Files changed (7) hide show

russian_train_1/vc_wrapper.yaml +105 -0
russian_train_2/train.log +31 -0
russian_train_2/vc_wrapper.yaml +105 -0
russian_train_3/AR_epoch_00000_step_10000.pth +3 -0
russian_train_3/CFM_epoch_00000_step_10000.pth +3 -0
russian_train_3/train.log +0 -0
russian_train_3/vc_wrapper.yaml +105 -0

russian_train_1/vc_wrapper.yaml ADDED Viewed

	@@ -0,0 +1,105 @@

+_target_: modules.v2.vc_wrapper.VoiceConversionWrapper  # root object of this config (Hydra-style _target_ path)
+sr: 22050  # same value as mel_fn.sampling_rate
+hop_size: 256  # same value as mel_fn.hop_size
+mel_fn:
+  _target_: modules.audio.mel_spectrogram
+  _partial_: true  # Hydra-style partial; remaining arguments are presumably supplied by the caller — TODO confirm
+  n_fft: 1024
+  win_size: 1024
+  hop_size: 256
+  num_mels: 80  # same value as cfm.estimator.in_channels and style_encoder.feat_dim
+  sampling_rate: 22050
+  fmin: 0
+  fmax: null  # explicit null; how mel_spectrogram treats it is not shown here
+  center: False
+cfm:
+  _target_: modules.v2.cfm.CFM
+  estimator:
+    _target_: modules.v2.dit_wrapper.DiT
+    time_as_token: true
+    style_as_token: true
+    uvit_skip_connection: false
+    block_size: 8192
+    depth: 13
+    num_heads: 8
+    hidden_dim: 512
+    in_channels: 80  # same value as mel_fn.num_mels
+    content_dim: 512  # same value as cfm_length_regulator.channels
+    style_encoder_dim: 192  # same value as style_encoder.embedding_size
+    class_dropout_prob: 0.1
+    dropout_rate: 0.0
+    attn_dropout_rate: 0.0
+cfm_length_regulator:
+  _target_: modules.v2.length_regulator.InterpolateRegulator
+  channels: 512
+  is_discrete: true
+  codebook_size: 2048  # same value as content_extractor_wide.quantizer.codebook_size
+  sampling_ratios: [ 1, 1, 1, 1 ]
+  f0_condition: false
+ar:
+  _target_: modules.v2.ar.NaiveWrapper
+  model:
+    _target_: modules.v2.ar.NaiveTransformer
+    config:
+      _target_: modules.v2.ar.NaiveModelArgs
+      dropout: 0.0
+      rope_base: 10000.0
+      dim: 768  # equals n_head * head_dim (12 * 64); same value as ar_length_regulator.channels
+      head_dim: 64
+      n_local_heads: 2
+      intermediate_size: 2304  # 3 * dim
+      n_head: 12
+      n_layer: 12
+      vocab_size: 2049  # 2048 + 1 for eos; 2048 is presumably the wide codebook_size — TODO confirm
+ar_length_regulator:
+  _target_: modules.v2.length_regulator.InterpolateRegulator
+  channels: 768
+  is_discrete: true
+  codebook_size: 32  # same value as content_extractor_narrow.quantizer.codebook_size
+  sampling_ratios: [ ]
+  f0_condition: false
+style_encoder:
+  _target_: modules.campplus.DTDNN.CAMPPlus
+  feat_dim: 80
+  embedding_size: 192
+content_extractor_narrow:
+  _target_: modules.astral_quantization.default_model.AstralQuantizer
+  tokenizer_name: "openai/whisper-small"
+  ssl_model_name: "facebook/hubert-large-ll60k"
+  ssl_output_layer: 18
+  skip_ssl: true  # set only here; content_extractor_wide has no skip_ssl key
+  encoder: &bottleneck_encoder  # anchor; content_extractor_wide.encoder aliases this mapping
+    _target_: modules.astral_quantization.convnext.ConvNeXtV2Stage
+    dim: 512
+    num_blocks: 12
+    intermediate_dim: 1536
+    dilation: 1
+    input_dim: 1024
+  quantizer:
+    _target_: modules.astral_quantization.bsq.BinarySphericalQuantize
+    codebook_size: 32  # codebook size, must be a power of 2
+    dim: 512
+    entropy_loss_weight: 0.1
+    diversity_gamma: 1.0
+    spherical: True
+    enable_entropy_loss: True
+    soft_entropy_loss: True
+content_extractor_wide:
+  _target_: modules.astral_quantization.default_model.AstralQuantizer
+  tokenizer_name: "openai/whisper-small"
+  ssl_model_name: "facebook/hubert-large-ll60k"
+  ssl_output_layer: 18
+  encoder: *bottleneck_encoder  # alias: same encoder settings as content_extractor_narrow.encoder
+  quantizer:
+    _target_: modules.astral_quantization.bsq.BinarySphericalQuantize
+    codebook_size: 2048  # codebook size, must be a power of 2
+    dim: 512
+    entropy_loss_weight: 0.1
+    diversity_gamma: 1.0
+    spherical: True
+    enable_entropy_loss: True
+    soft_entropy_loss: True
+vocoder:
+  _target_: modules.bigvgan.bigvgan.BigVGAN.from_pretrained
+  pretrained_model_name_or_path: "nvidia/bigvgan_v2_22khz_80band_256x"  # name reads as 22 kHz / 80 bands / 256x, in line with sr, num_mels, hop_size
+  use_cuda_kernel: false

russian_train_2/train.log ADDED Viewed

	@@ -0,0 +1,31 @@

+Epoch 0, Iteration 0, Loss: 7.4123, Loss AR: 6.8261, Loss CFM: 0.5862, Grad Norm: 5.8548, LR: 0.000000
+Epoch 0, Iteration 10, Loss: 7.1086, Loss AR: 6.5280, Loss CFM: 0.5806, Grad Norm: 7.3128, LR: 0.000020
+Epoch 0, Iteration 20, Loss: 6.7630, Loss AR: 6.1732, Loss CFM: 0.5898, Grad Norm: 5.3300, LR: 0.000020
+Epoch 0, Iteration 30, Loss: 6.6540, Loss AR: 6.0648, Loss CFM: 0.5893, Grad Norm: 7.9873, LR: 0.000020
+Epoch 0, Iteration 40, Loss: 6.4128, Loss AR: 5.9118, Loss CFM: 0.5010, Grad Norm: 6.9926, LR: 0.000020
+Epoch 0, Iteration 50, Loss: 6.3736, Loss AR: 5.8265, Loss CFM: 0.5471, Grad Norm: 6.1079, LR: 0.000020
+Epoch 0, Iteration 60, Loss: 6.3835, Loss AR: 5.7620, Loss CFM: 0.6215, Grad Norm: 5.3433, LR: 0.000020
+Epoch 0, Iteration 70, Loss: 6.3277, Loss AR: 5.7967, Loss CFM: 0.5310, Grad Norm: 5.4081, LR: 0.000020
+Epoch 0, Iteration 80, Loss: 6.1539, Loss AR: 5.5468, Loss CFM: 0.6071, Grad Norm: 6.1189, LR: 0.000020
+Epoch 0, Iteration 90, Loss: 6.1051, Loss AR: 5.5220, Loss CFM: 0.5830, Grad Norm: 5.2862, LR: 0.000020
+Epoch 0, Iteration 100, Loss: 6.0231, Loss AR: 5.3957, Loss CFM: 0.6274, Grad Norm: 5.5014, LR: 0.000020
+Epoch 0, Iteration 110, Loss: 5.9073, Loss AR: 5.3184, Loss CFM: 0.5889, Grad Norm: 6.4655, LR: 0.000020
+Epoch 0, Iteration 120, Loss: 5.7812, Loss AR: 5.3116, Loss CFM: 0.4697, Grad Norm: 4.8613, LR: 0.000020
+Epoch 0, Iteration 130, Loss: 5.7230, Loss AR: 5.2267, Loss CFM: 0.4963, Grad Norm: 4.9150, LR: 0.000020
+Epoch 0, Iteration 140, Loss: 5.9952, Loss AR: 5.3835, Loss CFM: 0.6117, Grad Norm: 4.8557, LR: 0.000020
+Epoch 0, Iteration 150, Loss: 5.7789, Loss AR: 5.1951, Loss CFM: 0.5839, Grad Norm: 5.9602, LR: 0.000020
+Epoch 0, Iteration 160, Loss: 5.8195, Loss AR: 5.2459, Loss CFM: 0.5736, Grad Norm: 6.8558, LR: 0.000020
+Epoch 0, Iteration 170, Loss: 5.6152, Loss AR: 5.0701, Loss CFM: 0.5452, Grad Norm: 4.2240, LR: 0.000020
+Epoch 0, Iteration 180, Loss: 5.8292, Loss AR: 5.3408, Loss CFM: 0.4884, Grad Norm: 4.1121, LR: 0.000020
+Epoch 0, Iteration 190, Loss: 6.0036, Loss AR: 5.3866, Loss CFM: 0.6170, Grad Norm: 6.9337, LR: 0.000020
+Epoch 0, Iteration 200, Loss: 5.6125, Loss AR: 5.1298, Loss CFM: 0.4827, Grad Norm: 6.0064, LR: 0.000020
+Epoch 0, Iteration 210, Loss: 5.8327, Loss AR: 5.2733, Loss CFM: 0.5593, Grad Norm: 5.4030, LR: 0.000020
+Epoch 0, Iteration 220, Loss: 5.5699, Loss AR: 5.0621, Loss CFM: 0.5078, Grad Norm: 5.9616, LR: 0.000020
+Epoch 0, Iteration 230, Loss: 5.8119, Loss AR: 5.1636, Loss CFM: 0.6483, Grad Norm: 8.4473, LR: 0.000020
+Epoch 0, Iteration 240, Loss: 5.7326, Loss AR: 5.1655, Loss CFM: 0.5671, Grad Norm: 7.5353, LR: 0.000020
+Epoch 0, Iteration 250, Loss: 5.7567, Loss AR: 5.2142, Loss CFM: 0.5425, Grad Norm: 5.6969, LR: 0.000020
+Epoch 0, Iteration 260, Loss: 5.4423, Loss AR: 4.9438, Loss CFM: 0.4985, Grad Norm: 5.0690, LR: 0.000020
+Epoch 0, Iteration 270, Loss: 5.6098, Loss AR: 5.0514, Loss CFM: 0.5585, Grad Norm: 5.7998, LR: 0.000020
+Epoch 0, Iteration 280, Loss: 5.6351, Loss AR: 5.0776, Loss CFM: 0.5575, Grad Norm: 5.4499, LR: 0.000020
+Epoch 0, Iteration 290, Loss: 5.6632, Loss AR: 5.0598, Loss CFM: 0.6034, Grad Norm: 5.3381, LR: 0.000020
+Epoch 0, Iteration 300, Loss: 5.5276, Loss AR: 5.0196, Loss CFM: 0.5081, Grad Norm: 6.4791, LR: 0.000020

russian_train_2/vc_wrapper.yaml ADDED Viewed

	@@ -0,0 +1,105 @@

+_target_: modules.v2.vc_wrapper.VoiceConversionWrapper  # root object of this config (Hydra-style _target_ path)
+sr: 22050  # same value as mel_fn.sampling_rate
+hop_size: 256  # same value as mel_fn.hop_size
+mel_fn:
+  _target_: modules.audio.mel_spectrogram
+  _partial_: true  # Hydra-style partial; remaining arguments are presumably supplied by the caller — TODO confirm
+  n_fft: 1024
+  win_size: 1024
+  hop_size: 256
+  num_mels: 80  # same value as cfm.estimator.in_channels and style_encoder.feat_dim
+  sampling_rate: 22050
+  fmin: 0
+  fmax: null  # explicit null; how mel_spectrogram treats it is not shown here
+  center: False
+cfm:
+  _target_: modules.v2.cfm.CFM
+  estimator:
+    _target_: modules.v2.dit_wrapper.DiT
+    time_as_token: true
+    style_as_token: true
+    uvit_skip_connection: false
+    block_size: 8192
+    depth: 13
+    num_heads: 8
+    hidden_dim: 512
+    in_channels: 80  # same value as mel_fn.num_mels
+    content_dim: 512  # same value as cfm_length_regulator.channels
+    style_encoder_dim: 192  # same value as style_encoder.embedding_size
+    class_dropout_prob: 0.1
+    dropout_rate: 0.0
+    attn_dropout_rate: 0.0
+cfm_length_regulator:
+  _target_: modules.v2.length_regulator.InterpolateRegulator
+  channels: 512
+  is_discrete: true
+  codebook_size: 2048  # same value as content_extractor_wide.quantizer.codebook_size
+  sampling_ratios: [ 1, 1, 1, 1 ]
+  f0_condition: false
+ar:
+  _target_: modules.v2.ar.NaiveWrapper
+  model:
+    _target_: modules.v2.ar.NaiveTransformer
+    config:
+      _target_: modules.v2.ar.NaiveModelArgs
+      dropout: 0.0
+      rope_base: 10000.0
+      dim: 768  # equals n_head * head_dim (12 * 64); same value as ar_length_regulator.channels
+      head_dim: 64
+      n_local_heads: 2
+      intermediate_size: 2304  # 3 * dim
+      n_head: 12
+      n_layer: 12
+      vocab_size: 2049  # 2048 + 1 for eos; 2048 is presumably the wide codebook_size — TODO confirm
+ar_length_regulator:
+  _target_: modules.v2.length_regulator.InterpolateRegulator
+  channels: 768
+  is_discrete: true
+  codebook_size: 32  # same value as content_extractor_narrow.quantizer.codebook_size
+  sampling_ratios: [ ]
+  f0_condition: false
+style_encoder:
+  _target_: modules.campplus.DTDNN.CAMPPlus
+  feat_dim: 80
+  embedding_size: 192
+content_extractor_narrow:
+  _target_: modules.astral_quantization.default_model.AstralQuantizer
+  tokenizer_name: "openai/whisper-small"
+  ssl_model_name: "facebook/hubert-large-ll60k"
+  ssl_output_layer: 18
+  skip_ssl: true  # set only here; content_extractor_wide has no skip_ssl key
+  encoder: &bottleneck_encoder  # anchor; content_extractor_wide.encoder aliases this mapping
+    _target_: modules.astral_quantization.convnext.ConvNeXtV2Stage
+    dim: 512
+    num_blocks: 12
+    intermediate_dim: 1536
+    dilation: 1
+    input_dim: 1024
+  quantizer:
+    _target_: modules.astral_quantization.bsq.BinarySphericalQuantize
+    codebook_size: 32  # codebook size, must be a power of 2
+    dim: 512
+    entropy_loss_weight: 0.1
+    diversity_gamma: 1.0
+    spherical: True
+    enable_entropy_loss: True
+    soft_entropy_loss: True
+content_extractor_wide:
+  _target_: modules.astral_quantization.default_model.AstralQuantizer
+  tokenizer_name: "openai/whisper-small"
+  ssl_model_name: "facebook/hubert-large-ll60k"
+  ssl_output_layer: 18
+  encoder: *bottleneck_encoder  # alias: same encoder settings as content_extractor_narrow.encoder
+  quantizer:
+    _target_: modules.astral_quantization.bsq.BinarySphericalQuantize
+    codebook_size: 2048  # codebook size, must be a power of 2
+    dim: 512
+    entropy_loss_weight: 0.1
+    diversity_gamma: 1.0
+    spherical: True
+    enable_entropy_loss: True
+    soft_entropy_loss: True
+vocoder:
+  _target_: modules.bigvgan.bigvgan.BigVGAN.from_pretrained
+  pretrained_model_name_or_path: "nvidia/bigvgan_v2_22khz_80band_256x"  # name reads as 22 kHz / 80 bands / 256x, in line with sr, num_mels, hop_size
+  use_cuda_kernel: false

russian_train_3/AR_epoch_00000_step_10000.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:bb478588c963544c0e665841923f397eb44ca05eae421ad062c9603768e97750
+size 333673060

russian_train_3/CFM_epoch_00000_step_10000.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:699797b76af298ed84d4126b72535f9f4f7adf7f62bbed9e11bc748259bb02c2
+size 352130283

russian_train_3/train.log ADDED Viewed

The diff for this file is too large to render. See raw diff

russian_train_3/vc_wrapper.yaml ADDED Viewed

	@@ -0,0 +1,105 @@

+_target_: modules.v2.vc_wrapper.VoiceConversionWrapper  # root object of this config (Hydra-style _target_ path)
+sr: 22050  # same value as mel_fn.sampling_rate
+hop_size: 256  # same value as mel_fn.hop_size
+mel_fn:
+  _target_: modules.audio.mel_spectrogram
+  _partial_: true  # Hydra-style partial; remaining arguments are presumably supplied by the caller — TODO confirm
+  n_fft: 1024
+  win_size: 1024
+  hop_size: 256
+  num_mels: 80  # same value as cfm.estimator.in_channels and style_encoder.feat_dim
+  sampling_rate: 22050
+  fmin: 0
+  fmax: null  # explicit null; how mel_spectrogram treats it is not shown here
+  center: False
+cfm:
+  _target_: modules.v2.cfm.CFM
+  estimator:
+    _target_: modules.v2.dit_wrapper.DiT
+    time_as_token: true
+    style_as_token: true
+    uvit_skip_connection: false
+    block_size: 8192
+    depth: 13
+    num_heads: 8
+    hidden_dim: 512
+    in_channels: 80  # same value as mel_fn.num_mels
+    content_dim: 512  # same value as cfm_length_regulator.channels
+    style_encoder_dim: 192  # same value as style_encoder.embedding_size
+    class_dropout_prob: 0.1
+    dropout_rate: 0.0
+    attn_dropout_rate: 0.0
+cfm_length_regulator:
+  _target_: modules.v2.length_regulator.InterpolateRegulator
+  channels: 512
+  is_discrete: true
+  codebook_size: 2048  # same value as content_extractor_wide.quantizer.codebook_size
+  sampling_ratios: [ 1, 1, 1, 1 ]
+  f0_condition: false
+ar:
+  _target_: modules.v2.ar.NaiveWrapper
+  model:
+    _target_: modules.v2.ar.NaiveTransformer
+    config:
+      _target_: modules.v2.ar.NaiveModelArgs
+      dropout: 0.0
+      rope_base: 10000.0
+      dim: 768  # equals n_head * head_dim (12 * 64); same value as ar_length_regulator.channels
+      head_dim: 64
+      n_local_heads: 2
+      intermediate_size: 2304  # 3 * dim
+      n_head: 12
+      n_layer: 12
+      vocab_size: 2049  # 2048 + 1 for eos; 2048 is presumably the wide codebook_size — TODO confirm
+ar_length_regulator:
+  _target_: modules.v2.length_regulator.InterpolateRegulator
+  channels: 768
+  is_discrete: true
+  codebook_size: 32  # same value as content_extractor_narrow.quantizer.codebook_size
+  sampling_ratios: [ ]
+  f0_condition: false
+style_encoder:
+  _target_: modules.campplus.DTDNN.CAMPPlus
+  feat_dim: 80
+  embedding_size: 192
+content_extractor_narrow:
+  _target_: modules.astral_quantization.default_model.AstralQuantizer
+  tokenizer_name: "openai/whisper-small"
+  ssl_model_name: "facebook/hubert-large-ll60k"
+  ssl_output_layer: 18
+  skip_ssl: true  # set only here; content_extractor_wide has no skip_ssl key
+  encoder: &bottleneck_encoder  # anchor; content_extractor_wide.encoder aliases this mapping
+    _target_: modules.astral_quantization.convnext.ConvNeXtV2Stage
+    dim: 512
+    num_blocks: 12
+    intermediate_dim: 1536
+    dilation: 1
+    input_dim: 1024
+  quantizer:
+    _target_: modules.astral_quantization.bsq.BinarySphericalQuantize
+    codebook_size: 32  # codebook size, must be a power of 2
+    dim: 512
+    entropy_loss_weight: 0.1
+    diversity_gamma: 1.0
+    spherical: True
+    enable_entropy_loss: True
+    soft_entropy_loss: True
+content_extractor_wide:
+  _target_: modules.astral_quantization.default_model.AstralQuantizer
+  tokenizer_name: "openai/whisper-small"
+  ssl_model_name: "facebook/hubert-large-ll60k"
+  ssl_output_layer: 18
+  encoder: *bottleneck_encoder  # alias: same encoder settings as content_extractor_narrow.encoder
+  quantizer:
+    _target_: modules.astral_quantization.bsq.BinarySphericalQuantize
+    codebook_size: 2048  # codebook size, must be a power of 2
+    dim: 512
+    entropy_loss_weight: 0.1
+    diversity_gamma: 1.0
+    spherical: True
+    enable_entropy_loss: True
+    soft_entropy_loss: True
+vocoder:
+  _target_: modules.bigvgan.bigvgan.BigVGAN.from_pretrained
+  pretrained_model_name_or_path: "nvidia/bigvgan_v2_22khz_80band_256x"  # name reads as 22 kHz / 80 bands / 256x, in line with sr, num_mels, hop_size
+  use_cuda_kernel: false