Spaces:

mnhatdaous
/

learnable-speech

Sleeping

App Files Files Community

primepake committed on Jul 3

Commit

997d9c0

1 Parent(s): 8387742

release DAC-VAE continuous latent space

Browse files

Files changed (4) hide show

README.md +2 -1
dac-vae/config.yml +128 -0
dac-vae/inference.py +8 -5
dac-vae/model.py +1 -0

README.md CHANGED Viewed

@@ -66,7 +66,8 @@ pip install -r requirements.txt
 2. **Extracting DAC-VAE latent**
    ```bash
-   python inference.py
    ```
 3. **Stage 1: Auto Regressive Transformer**

 2. **Extracting DAC-VAE latent**
    ```bash
+   cd dac-vae
+   python inference.py --checkpoint checkpoint.pt --config config.yml
    ```
 3. **Stage 1: Auto Regressive Transformer**

dac-vae/config.yml ADDED Viewed

	@@ -0,0 +1,128 @@

+# Model setup
+vae:
+  sample_rate: 24000
+  encoder_dim: 64
+  latent_dim: 64
+  encoder_rates: [2, 4, 5, 8]
+  decoder_dim: 1536
+  decoder_rates: [8, 5, 4, 2]
+  d_in: 1
+  d_out: 1
+  weight_init: xavier
+  activation: snake
+  gain: 1.0
+discriminator:
+  sample_rate: 24000
+  d_in: 1
+  rates: []
+  periods: [2, 3, 5, 7, 11]
+  fft_sizes: [2048, 1024, 512]
+  bands:
+  - [0.0, 0.1]
+  - [0.1, 0.25]
+  - [0.25, 0.5]
+  - [0.5, 0.75]
+  - [0.75, 1.0]
+max_norm: 1000
+max_norm_d: 10
+initial_norm: 1000
+initial_norm_d: 10
+amp: false
+batch_size: 64
+val_batch_size: 4
+num_workers: 0
+device: cuda
+num_samples: 530000
+gan_start_step: 0
+num_iters: 500000
+save_iters: 1000
+valid_freq: 1000
+sample_freq: 2000
+val_idx: [0, 1, 2, 3, 4, 5, 6, 7]
+seed: 0
+lambdas:
+  mel/loss: 15.0
+  adv/feat_loss: 2.0
+  adv/gen_loss: 1.0
+  kl/loss: 0.1
+  stft/loss: 0.0
+  waveform/loss: 0.0
+  logs_penalty: 0.0 #0.02
+  grad_penalty: 0.0 #1.0
+  lipschitz_penalty: 0.0 #0.001
+VolumeNorm.db: [lufs, -18]
+# Transforms
+build_transform.preprocess:
+  - Identity
+build_transform.augment_prob: 0.0
+build_transform.augment:
+  - Identity
+build_transform.postprocess:
+  - Identity
+  - Identity
+  - Identity
+# Loss setup
+MultiScaleSTFTLoss:
+  window_lengths: [1024, 2048]
+MelSpectrogramLoss:
+  n_mels: [5, 10, 20, 40, 80, 160, 320]
+  window_lengths: [32, 64, 128, 256, 512, 1024, 2048]
+  mel_fmin: [0, 0, 0, 0, 0, 0, 0]
+  mel_fmax: [null, null, null, null, null, null, null]
+  pow: 1.0
+  clamp_eps: 1.0e-5
+  mag_weight: 0.0
+# optimizer
+optimizer:
+  type: Adamw
+  weight_decay: 0.001
+  lr: 0.0001
+  scheduler: linearlr # or constantlr
+  warmup_steps: 500
+disc_optimizer:
+  type: Adamw
+  weight_decay: 0.001
+  lr: 0.0001
+  scheduler: linearlr # or constantlr
+  warmup_steps: 500
+# Data
+train:
+  duration: 0.38
+  n_examples: 10000000
+  without_replacement: true
+  shuffle_loaders: true
+val:
+  duration: 5.0
+  n_examples: 100
+  without_replacement: true
+  shuffle_loaders: false
+test:
+  duration: 10.0
+  n_examples: 1000
+  without_replacement: true
+  shuffle_loaders: false
+train_folders:
+  Emilia_EN:
+    - /home/masuser/minimax-audio/dataset/Emilia/EN
+val_folders:
+  Emilia_EN:
+    - /home/masuser/minimax-audio/dataset/libritts
+test_folders:
+  Emilia_EN:
+    - /home/masuser/minimax-audio/dataset/libritts

dac-vae/inference.py CHANGED Viewed

@@ -137,6 +137,9 @@ class DACVAEInference:
         # Forward pass through model
         print("Processing through DACVAE...")
         out = self.model(audio_tensor, self.sample_rate)
         # Extract outputs
@@ -146,7 +149,7 @@ class DACVAEInference:
         z = out['z']
         mu = out['mu']
         logs = out['logs']
         # Clamp output
         recons_audio = np.clip(recons_audio, -1.0, 1.0)
@@ -167,13 +170,13 @@ class DACVAEInference:
 def main():
     parser = argparse.ArgumentParser(description="DACVAE Audio Inference")
-    parser.add_argument('--checkpoint', type=str, required=True,
                         help='Path to model checkpoint')
-    parser.add_argument('--config', type=str, default=None,
                         help='Path to config YAML (optional if config is in checkpoint)')
-    parser.add_argument('--input', type=str, required=True,
                         help='Path to input audio file')
-    parser.add_argument('--output', type=str, default=None,
                         help='Path to save output audio (default: input_reconstructed.wav)')
     parser.add_argument('--device', type=str, default='cuda',
                         choices=['cuda', 'cpu'], help='Device to run on')

         # Forward pass through model
         print("Processing through DACVAE...")
+        audio_tensor = audio_tensor[:, :, :9120]
+        print('audio_tensor shape: ', audio_tensor.shape)
         out = self.model(audio_tensor, self.sample_rate)
         # Extract outputs
         z = out['z']
         mu = out['mu']
         logs = out['logs']
+        print('z shape: ', z.shape)
         # Clamp output
         recons_audio = np.clip(recons_audio, -1.0, 1.0)
 def main():
     parser = argparse.ArgumentParser(description="DACVAE Audio Inference")
+    parser.add_argument('--checkpoint', type=str, required=False, default="/mnt/nvme/ckpts/24khz/364k_20250702_043748/checkpoint.pt",
                         help='Path to model checkpoint')
+    parser.add_argument('--config', type=str, default="./config.yml",
                         help='Path to config YAML (optional if config is in checkpoint)')
+    parser.add_argument('--input', type=str, required=False, default='./output.wav',
                         help='Path to input audio file')
+    parser.add_argument('--output', type=str, default='./test.wav',
                         help='Path to save output audio (default: input_reconstructed.wav)')
     parser.add_argument('--device', type=str, default='cuda',
                         choices=['cuda', 'cpu'], help='Device to run on')

dac-vae/model.py CHANGED Viewed

@@ -474,6 +474,7 @@ class DACVAE(BaseModel, CodecMixin):
         x = self.encoder(audio_data)
         x = F.leaky_relu(x)
         x = self.en_conv_post(x)
         m, logs = torch.split(x, self.latent_dim, dim=1)
         logs = torch.clamp(logs, min=-14.0, max=14.0)

         x = self.encoder(audio_data)
         x = F.leaky_relu(x)
         x = self.en_conv_post(x)
+        print('x shape: ', x.shape)
         m, logs = torch.split(x, self.latent_dim, dim=1)
         logs = torch.clamp(logs, min=-14.0, max=14.0)