Upload folder using huggingface_hub

Browse files

Files changed (5) hide show

kmeans_v200.npy +3 -0
vae-gslm/hp.yaml +278 -0
vae-gslm/last-cpt.ckpt +3 -0
vocoder/hp.yaml +104 -0
vocoder/last-cpt.ckpt +3 -0

kmeans_v200.npy ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:148b638a3f392906bb0ba92c398d2e6c252f186e99db4fa38501e428c58c1de9
+size 819328

vae-gslm/hp.yaml ADDED Viewed

	@@ -0,0 +1,278 @@

+data:  # Dataset settings, split into a `train` and a `val` section.
+  train:
+    batch_size: 48
+    bits_per_second: 18500
+    min_audio_length: 3.0
+    num_workers: 6
+    path: /usr3/liweiche/libri-light/libri-light/data_preparation/vad_20s/tokens_v200.txt  # NOTE(review): absolute, machine-specific path; override before reuse.
+    post_pad:
+      mel:
+        length: 12.8
+      tokens:
+        num_tokens: 640  # 640 = 12.8 x 50, i.e. post_pad.mel.length x hubert.sample_rate; presumably intentional, confirm.
+    preprocess_mels: /usr3/liweiche/libri-light/libri-light/data_preparation/vad_20s/mels  # NOTE(review): absolute, machine-specific path.
+    preprocess_mels_recursive_dir: true
+    random_crop_mel_utt:
+      max_seg_sec: 4.0
+      min_seg_sec: 2.0
+    sample_rate: 16000
+    sampler:
+      shuffle: true
+      type: standard
+    token_segment_size: 640  # Same value as post_pad.tokens.num_tokens.
+    wavdir: /usr3/liweiche/libri-light/libri-light/data_preparation/vad_20s/  # NOTE(review): absolute, machine-specific path.
+    with_text: false
+    with_tokens: true
+  val:
+    batch_size: 8
+    bits_per_second: 18500
+    min_audio_length: 3.2
+    num_workers: 2
+    pad:
+      mode: constant
+      multiple_of: 400
+    path: /usr3/liweiche/LibriSpeech-960/dev/tokens_v200_libri-light.txt  # NOTE(review): absolute, machine-specific path.
+    random_crop_mel_utt:
+      max_seg_sec: 5.0
+      min_seg_sec: 1.0
+    sample_rate: 16000
+    sampler:
+      shuffle: true  # NOTE(review): shuffling is enabled for validation as well; confirm this is intended.
+      type: standard
+    token_segment_size: 150
+    wavdir: /usr3/liweiche/LibriSpeech-960/dev  # NOTE(review): absolute, machine-specific path.
+    with_text: false
+    with_tokens: true
+hubert:
+  sample_rate: 50  # NOTE(review): presumably token frames per second, not audio Hz (data.*.sample_rate is 16000); confirm.
+logging:  # Logging and sample-generation settings.
+  log_dir: outputs/libri-light/lvtr  # Relative path.
+  num_samples: 10
+  plot_attn: false
+  sample_length: 7.0  # NOTE(review): unit is not stated here; presumably seconds, confirm.
+  sample_prior_length: 2.0  # NOTE(review): unit is not stated here; presumably seconds, confirm.
+  temperature: 1.0
+model:  # Architecture settings: decoder, encoder, latent_dim, tokens, transformer, utterance_encoder.
+  decoder:
+    cond_unet:
+      time_embedding:
+        activation:
+          identifier: SiLU
+        dim: 256
+        maxpos: 1000
+      unet:
+        condition_dim: 32
+        conditional:  # One flag per layer (6 entries): false for the first and last, true for the middle four.
+        - false
+        - true
+        - true
+        - true
+        - true
+        - false
+        connection_type: concat
+        final_norm: true
+        hidden_channels:
+        - 2048
+        - 2048
+        - 2048
+        - 2048
+        - 2048
+        - 2048
+        init_channel: 512
+        layer:
+          activation:
+            identifier: SiLU
+          aux_in_channels: 32
+          causal_padding: true
+          condition_type: concat
+          hidden_channels: 2048
+          in_channels: 512
+          in_dim: 32
+          kernel_size: 7
+          norm:
+            eps: 1.0e-06
+            identifier: InstanceNorm
+          time_dim: 256
+        num_layers: 6  # Matches the 6 entries of each per-layer list in this section.
+        out_channels:
+        - 512
+        - 512
+        - 512
+        - 512
+        - 512
+        - 512
+        resample_ksize:
+        - 1
+        - 1
+        - 1
+        - 1
+        - 1
+        - 1
+        resample_rates:
+        - 1
+        - 1
+        - 1
+        - 1
+        - 1
+        - 1
+        skip_connection:  # null for the first three layers; the last three hold 2, 1, 0, presumably source-layer indices (confirm).
+        - null
+        - null
+        - null
+        - 2
+        - 1
+        - 0
+        time_dim: 256  # Same value as time_embedding.dim.
+        upward_layer:
+          activation:
+            identifier: SiLU
+          aux_in_channels: 0
+          boundary: 3
+          condition_type: concat
+          future_padding: true
+          hidden_channels: 2048
+          in_channels: 512
+          in_dim: 32
+          kernel_size: 7
+          norm:
+            eps: 1.0e-06
+            identifier: InstanceNorm
+          time_dim: 256
+    diffusion:
+      beta_schedule:
+        identifier: cosine
+      clamp_range:
+      - -3.0
+      - 1.2
+      ddim_sampling_eta: 1.0
+      identifier: ConditionalBottleNeckUNet
+      input_scale: 5.0
+      loss_type: l1
+      objective: pred_noise
+      timesteps: 1000  # Same value as cond_unet.time_embedding.maxpos.
+  encoder:
+    final_norm: true
+    hidden_channels:
+    - 2048
+    - 2048
+    - 2048
+    identifier: BottleNeckResNet
+    init_channel: 512
+    layer:
+      activation:
+        identifier: ReLU
+      aux_in_channels: 0
+      causal_padding: true
+      hidden_channels: 2048
+      in_channels: 512
+      kernel_size: 7
+      norm:
+        eps: 1.0e-06
+        identifier: InstanceNorm
+    num_layers: 3  # Matches the 3 entries of each per-layer list in this section.
+    out_channels:
+    - 512
+    - 512
+    - 512
+    resample_ksize:
+    - 1
+    - 1
+    - 1
+    resample_rates:
+    - 1
+    - 1
+    - 1
+  latent_dim: 4
+  tokens:
+    embedding_dim: 64
+    vocab_size: 200  # Matches the "v200" in the token and k-means file names; presumably 200 k-means clusters (confirm).
+  transformer:
+    bias: false
+    flow:
+      conditional: true
+      layer:
+        activation:
+          identifier: GELU
+        hidden_dim: 64
+        mean_only: false
+        norm:
+          eps: 1.0e-06
+          identifier: LayerNorm
+        scale_range:
+        - 0.5
+        - 2.0
+      num_layers: 4
+    layer:
+      activation:
+        identifier: GELU
+      dim: 1024
+      ffd_size: 4096  # 4 x layer.dim (1024).
+      norm:
+        eps: 1.0e-06
+        identifier: RMSNorm
+      self_attn:
+        causal: true
+        nheads: 16  # 1024 / 16 = 64 per head if dim is split evenly across heads; TODO confirm.
+    num_layers: 16
+    rpe:
+      identifier: ALiBi
+      maxpos: 1024
+  utterance_encoder:
+    embedding_dim: 128
+    init_channel: 64
+    layer:
+      activation:
+        identifier: ReLU
+      in_channels: 256
+      kernel_size: 4
+      norm:
+        eps: 1.0e-06
+        identifier: InstanceNorm
+      out_channels: 512
+      stride: -2  # NOTE(review): negative stride; presumably this codebase's way of marking downsampling by 2, confirm.
+    num_layers: 3  # Matches the 3 entries of each per-layer list in this section.
+    out_channels:
+    - 128
+    - 256
+    - 512
+    resample_ksize:
+    - 4
+    - 4
+    - 4
+    resample_rates:  # NOTE(review): negative like layer.stride above; presumably downsampling by 2 per layer, confirm.
+    - -2
+    - -2
+    - -2
+trainer:  # Run-level settings: compilation, distribution, precision, step budget, validation cadence.
+  compile:
+    mode: default
+  ddp_strategy: ddp
+  distributed: true
+  identifier: trainers.speech.lvtr.LVTRTrainer  # Dotted path; presumably resolved to a trainer class by the training code (confirm).
+  limit_val_batches: 500
+  precision: 16-mixed  # Plain scalar that loads as a string, not a number.
+  save_every_n_epoch: 1
+  total_steps: 1200000
+  val_check_interval: 10000
+training:  # Optimisation settings: loss weights, mel rescaling, optimizer, LR scheduler.
+  fixed_beta: 0.04
+  gradient_accumulation: 2
+  mel_rescale:
+    mean: -1.5
+    std: 2.0
+  optimizer:
+    beta1: 0.9
+    beta2: 0.98
+    exclude_norm_and_bias_from_weight_decay: true
+    identifier: AdamW
+    lr: 0.0005
+    weight_decay: 0.1
+  scale_rec_beta: false
+  scheduler:
+    flat_steps: 30000
+    identifier: cosine
+    min_lr: 5.0e-05  # 0.1 x optimizer.lr (0.0005).
+    warmup_kld: 30000  # Same value as flat_steps.
+  token_kld_weight: 0.5
+vocoder:
+  path: ./vocoder_ckpt  # NOTE(review): relative path; this upload stores the vocoder files under vocoder/, so adjust when loading from this repository.

vae-gslm/last-cpt.ckpt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:01e59383ebf8c510af9f2390aae9485696fac16c4e83e0d0b460731270095310
+size 908011386

vocoder/hp.yaml ADDED Viewed

	@@ -0,0 +1,104 @@

+trainer:  # Run-level settings for the vocoder trainer.
+    identifier: "trainers.vocoder.hfgan.HiFiGANTrainer"  # Dotted path; presumably resolved to a trainer class by the training code (confirm).
+    total_steps: 1600000 # Total Steps * 2 (GANs)
+    check_val_every_n_epoch: 2
+    save_every_n_epoch: 2  # Same cadence as check_val_every_n_epoch.
+    limit_val_batches: 500
+    precision: "32"  # Quoted, so it loads as the string "32" rather than an integer.
+    distributed: false
+logging:
+    log_dir: "outputs/hfgan_50hz_librispeech"  # Relative path.
+    num_samples: 10
+feature:  # Spectral feature-extraction parameters.
+    sample_rate: 16000
+    n_fft: 1025  # NOTE(review): one more than win_length (1024); confirm this is intentional and not a typo. Do not change without retraining.
+    win_length: 1024
+    hop_length: 320  # 16000 / 320 = 50 frames per second.
+    n_mels: 80
+    f_min: 0
+    f_max: 8000  # sample_rate / 2.
+    power: 1.0
+    log_scale: true
+model:  # Generator plus two discriminator sections (mrd, mpd).
+    generator:
+        weight_norm: true
+        upsample_rates: [5, 4, 2, 2, 2, 2]  # Product is 320, equal to feature.hop_length.
+        upsample_kernel_sizes: [10, 8, 4, 4, 4, 4]  # Each entry is 2 x the matching upsample rate.
+        upsample_initial_channel: 512
+        resblock_kernel_sizes: [3, 7, 11]
+        resblock_dilation_sizes:  # One dilation list per entry of resblock_kernel_sizes (3 each).
+            - [1, 3, 5]
+            - [1, 3, 5]
+            - [1, 3, 5]
+        in_channels: 80  # Equal to feature.n_mels.
+        kernel_size: 7
+    mrd:
+        weight_norm: true
+        resolutions:  # NOTE(review): triple order is not stated here; presumably [n_fft, hop_length, win_length], confirm.
+            - [1024, 120, 600]
+            - [2048, 240, 1200]
+            - [512, 50, 240]
+    mpd:
+        weight_norm: true
+        periods: [2, 3, 5, 7, 11]
+training:  # Optimizer/scheduler settings; the generator and discriminator sections hold identical values.
+    generator:
+        optimizer:
+            identifier: "Adam"
+            lr: 0.0001
+            beta1: 0.8
+            beta2: 0.98
+        scheduler:
+            identifier: "triangle"
+            warmup_steps: 0
+            flat_steps: 100000
+    discriminator:
+        optimizer:
+            identifier: "Adam"
+            lr: 0.0001
+            beta1: 0.8
+            beta2: 0.98
+        scheduler:
+            identifier: "triangle"
+            warmup_steps: 0
+            flat_steps: 100000
+    mel_loss_weight: 40.0
+data:  # Dataset settings, split into a `train` and a `val` section.
+    train:
+        path: "/usr2/liweiche/LibriSpeech-960/train/metadata.txt"  # NOTE(review): absolute, machine-specific path; override before reuse.
+        wavdir: "/usr2/liweiche/LibriSpeech-960/train"  # NOTE(review): absolute, machine-specific path.
+        segment_size: 1.0  # NOTE(review): unit is not stated here; presumably seconds, confirm.
+        sample_rate: 16000  # Same value as feature.sample_rate.
+        dither: true
+        with_text: false
+        num_workers: 32
+        batch_size: 24
+        min_audio_length: 1.5
+        bits_per_second: 18500
+        sampler:
+            type: "standard"
+            shuffle: true
+    val:
+        path: "/usr2/liweiche/LibriSpeech-960/dev/metadata.txt"  # NOTE(review): absolute, machine-specific path.
+        wavdir: "/usr2/liweiche/LibriSpeech-960/dev"  # NOTE(review): absolute, machine-specific path.
+        sample_rate: 16000
+        segment_size: 7.0
+        with_text: false
+        num_workers: 8
+        batch_size: 4
+        min_audio_length: 4.0
+        bits_per_second: 18500
+        sampler:
+            type: "standard"
+            shuffle: false

vocoder/last-cpt.ckpt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:0497a846126b5b0a7df7f2004f303bb17010f11ba424ec0132d5b14cfffbac8c
+size 51877178