Commit 2279ae0 by primepake
Parent: 997d9c0

add reconstruction for audio
Changed files:
- flowae/audio_dito_inference.py +332 -0
- flowae/configs/datasets/dae.yaml +12 -21
- flowae/configs/experiments/dito-B-audio.yaml +10 -6
- flowae/datasets/__init__.py +2 -2
- flowae/datasets/class_folder.py +2 -0
- flowae/datasets/class_folder_audio.py +196 -0
- flowae/datasets/wrapper_audio_cae.py +89 -0
- flowae/datasets/wrapper_cae.py +1 -193
- flowae/{reconstruction.py → image_dito_inference.py} +0 -0
- flowae/models/diffusion/fm.py +24 -6
- flowae/models/ldm/dac/layers.py +1 -1
- flowae/models/ldm/dac/model.py +3 -1
- flowae/models/ldm/dac/utils.py +11 -11
- flowae/models/ldm/dito.py +142 -1
- flowae/models/ldm/ldm_base.py +224 -0
- flowae/models/networks/__init__.py +2 -1
- flowae/models/networks/consistency_audio_decoder_unet.py +322 -0
- flowae/models/networks/consistency_decoder_unet.py +1 -0
- flowae/run.sh +2 -0
- flowae/upload.sh +2 -0
flowae/audio_dito_inference.py
ADDED
@@ -0,0 +1,332 @@
import torch
import torch.nn as nn
import torchaudio
import numpy as np
from pathlib import Path
import argparse
import soundfile as sf
from omegaconf import OmegaConf
import matplotlib.pyplot as plt

# Import models
import models
from models.ldm.dac.audiotools import AudioSignal


class AudioDiToInference:
    def __init__(self, checkpoint_path, device='cuda'):
        """Initialize Audio DiTo model from checkpoint"""
        self.device = device

        # Load checkpoint
        print(f"Loading checkpoint from {checkpoint_path}")
        ckpt = torch.load(checkpoint_path, map_location='cpu')

        # Extract config
        self.config = OmegaConf.create(ckpt['config'])

        # Create model
        self.model = models.make(self.config['model'])

        # Load state dict
        self.model.load_state_dict(ckpt['model']['sd'])

        # Move to device and set to eval
        self.model = self.model.to(device)
        self.model.eval()

        # Get audio parameters from config
        self.sample_rate = self.config.get('sample_rate', 24000)
        self.mono = self.config.get('mono', True)

        print(f"Model loaded successfully!")
        print(f"Sample rate: {self.sample_rate} Hz")
        print(f"Mono: {self.mono}")

    def load_audio(self, audio_path, duration=None, offset=0.0):
        """Load audio file using AudioSignal

        Args:
            audio_path: Path to audio file
            duration: Duration in seconds (None for full audio)
            offset: Start offset in seconds
        """
        # Load audio using AudioSignal
        if duration is not None:
            signal = AudioSignal(
                str(audio_path),
                duration=duration,
                offset=offset,
            )
        else:
            # Load full audio
            signal = AudioSignal(str(audio_path))

        # Convert to mono if needed
        if self.mono and signal.num_channels > 1:
            signal = signal.to_mono()

        # Resample to model sample rate
        if signal.sample_rate != self.sample_rate:
            signal = signal.resample(self.sample_rate)

        # Normalize
        signal = signal.normalize()

        # Clamp to [-1, 1]
        signal.audio_data = signal.audio_data.clamp(-1.0, 1.0)

        return signal

    def save_audio(self, reconstructed, output_path):
        """Save AudioSignal to file"""
        # Get audio data
        print('shape of reconstructed: ', reconstructed.shape)
        sf.write(output_path, reconstructed, self.sample_rate)
        print(f"Saved audio to {output_path}")

    def reconstruct_audio(self, audio_path, num_steps=50, save_latent=False):
        """Reconstruct entire audio file at once

        Args:
            audio_path: Path to audio file
            num_steps: Number of diffusion steps
            save_latent: Whether to return the latent representation
        """
        # Load full audio without duration limit
        signal = self.load_audio(audio_path, duration=None, offset=0.0)

        # Get audio tensor
        audio_tensor = signal.audio_data  # [channels, samples]
        if audio_tensor.dim() == 2:
            audio_tensor = audio_tensor.squeeze(0)  # [samples] for mono

        # Add batch dimension
        audio_tensor = audio_tensor.to(self.device)  # [1, samples]

        print(f"Input shape: {audio_tensor.shape}")
        print(f"Full audio duration: {audio_tensor.shape[-1] / self.sample_rate:.2f}s")

        with torch.no_grad():
            # Prepare data dict
            data = {'inp': audio_tensor}

            # Step 1: Encode to latent
            print('shape of audio_tensor: ', audio_tensor.shape)
            z = self.model.encode(audio_tensor)
            print(f"Latent shape: {z.shape}")

            # Step 2: Decode latent (if model has separate decode step)
            if hasattr(self.model, 'decode'):
                z_dec = self.model.decode(z)
            else:
                z_dec = z
            print(f"Decoded latent shape: {z_dec.shape}")

            # Step 3: Prepare dummy coordinates (based on training code)
            b, *_ = audio_tensor.shape

            # Step 4: Render using diffusion
            if hasattr(self.model, 'render'):
                # Render expects z_dec, coord, scale
                print('using render diffusion model')
                reconstructed = self.model.render(z_dec)
            else:
                # Alternative: direct decode if render not available
                reconstructed = self.model(data, mode='pred')

            # Remove batch dimension
            reconstructed = reconstructed.squeeze(0).squeeze(0).cpu().numpy()  # [samples]

            print('shape of reconstructed: ', reconstructed.shape)

        if save_latent:
            return reconstructed, z.cpu()
        else:
            return reconstructed

    def save_reconstruction(self, audio_path, output_path, num_steps=50):
        """Reconstruct and save entire audio file"""
        reconstructed = self.reconstruct_audio(audio_path, num_steps)
        self.save_audio(reconstructed, output_path)

    def compare_reconstruction(self, audio_path, output_path, num_steps=50):
        """Save original and reconstruction concatenated"""
        # Load original full audio
        original = self.load_audio(audio_path, duration=None, offset=0.0)

        # Get reconstruction of full audio
        reconstructed = self.reconstruct_audio(audio_path, num_steps)

        # Add 0.5 second silence between clips
        silence_samples = int(0.5 * self.sample_rate)
        silence_data = torch.zeros(1, silence_samples)

        # Concatenate: original -> silence -> reconstruction
        concat_data = torch.cat([
            original.audio_data.cpu(),
            silence_data,
            reconstructed.audio_data.cpu()
        ], dim=1)

        # Create concatenated signal
        comparison = AudioSignal(
            concat_data,
            sample_rate=self.sample_rate
        )

        self.save_audio(comparison, output_path)
        print(f"Saved comparison (original + reconstruction) to {output_path}")

    def visualize_latent(self, audio_path, output_path):
        """Visualize the latent representation of full audio"""
        # Get latent
        _, z = self.reconstruct_audio(audio_path, save_latent=True)

        z_np = z.squeeze(0).numpy()  # Remove batch dimension

        # Create visualization
        if z_np.ndim == 2:  # [channels, frames]
            n_channels = z_np.shape[0]
            fig, axes = plt.subplots(n_channels, 1, figsize=(12, 2*n_channels))

            if n_channels == 1:
                axes = [axes]

            for i in range(n_channels):
                im = axes[i].imshow(
                    z_np[i:i+1],
                    aspect='auto',
                    cmap='coolwarm',
                    interpolation='nearest'
                )
                axes[i].set_title(f'Latent Channel {i+1}')
                axes[i].set_xlabel('Time Frames')
                axes[i].set_ylabel('Feature')
                plt.colorbar(im, ax=axes[i])
        else:  # 1D latent
            plt.figure(figsize=(12, 4))
            plt.plot(z_np.T)
            plt.title('Latent Representation')
            plt.xlabel('Time Frames')
            plt.ylabel('Value')

        plt.tight_layout()
        plt.savefig(output_path, dpi=150)
        plt.close()

        print(f"Saved latent visualization to {output_path}")

    def batch_reconstruct(self, audio_folder, output_folder, max_files=None, num_steps=50):
        """Reconstruct all audio files in a folder (full audio)"""
        audio_folder = Path(audio_folder)
        output_folder = Path(output_folder)
        output_folder.mkdir(exist_ok=True, parents=True)

        # Get all audio files
        audio_extensions = ['.wav', '.mp3', '.flac', '.m4a', '.ogg']
        audio_paths = []
        for ext in audio_extensions:
            audio_paths.extend(audio_folder.glob(f'*{ext}'))
            audio_paths.extend(audio_folder.glob(f'*{ext.upper()}'))

        if max_files:
            audio_paths = audio_paths[:max_files]

        print(f"Processing {len(audio_paths)} audio files...")

        for audio_path in audio_paths:
            output_path = output_folder / f"recon_{audio_path.stem}.wav"
            try:
                self.save_reconstruction(
                    str(audio_path), str(output_path),
                    num_steps=num_steps
                )
            except Exception as e:
                print(f"Error processing {audio_path}: {e}")
                continue

        print("Batch reconstruction complete!")


def main():
    parser = argparse.ArgumentParser(description='Audio DiTo Inference')
    parser.add_argument('--checkpoint', type=str, required=True,
                        help='Path to Audio DiTo checkpoint')
    parser.add_argument('--input', type=str, required=True,
                        help='Input audio path or folder')
    parser.add_argument('--output', type=str, required=True,
                        help='Output path')
    parser.add_argument('--compare', action='store_true',
                        help='Save comparison with original')
    parser.add_argument('--batch', action='store_true',
                        help='Process entire folder')
    parser.add_argument('--visualize', action='store_true',
                        help='Visualize latent representation')
    parser.add_argument('--steps', type=int, default=50,
                        help='Number of diffusion steps')
    parser.add_argument('--device', type=str, default='cuda',
                        help='Device to use (cuda/cpu)')
    parser.add_argument('--max-files', type=int, default=None,
                        help='Maximum files to process in batch mode')

    args = parser.parse_args()

    # Initialize model
    audio_dito = AudioDiToInference(args.checkpoint, device=args.device)

    # Process based on mode
    if args.batch:
        # Batch processing
        audio_dito.batch_reconstruct(
            args.input, args.output,
            max_files=args.max_files,
            num_steps=args.steps
        )
    elif args.visualize:
        # Visualize latent
        audio_dito.visualize_latent(
            args.input, args.output
        )
    elif args.compare:
        # Save comparison
        audio_dito.compare_reconstruction(
            args.input, args.output,
            num_steps=args.steps
        )
    else:
        # Single reconstruction
        audio_dito.save_reconstruction(
            args.input, args.output,
            num_steps=args.steps
        )


# Example usage function for direct Python use
def reconstruct_single_audio(checkpoint_path, audio_path, output_path):
    """Simple function to reconstruct a single audio file"""
    audio_dito = AudioDiToInference(checkpoint_path)
    audio_dito.save_reconstruction(audio_path, output_path)


if __name__ == "__main__":
    main()


# Usage examples:
# 1. Single audio reconstruction (full audio):
# python audio_dito_inference.py --checkpoint ckpt-best.pth --input audio.wav --output recon.wav
#
# 2. Save comparison (original + reconstruction):
# python audio_dito_inference.py --checkpoint ckpt-best.pth --input audio.wav --output compare.wav --compare
#
# 3. Batch processing (reconstruct all audio files in folder):
# python audio_dito_inference.py --checkpoint ckpt-best.pth --input audio_folder/ --output output_folder/ --batch
#
# 4. Visualize latent representation:
# python audio_dito_inference.py --checkpoint ckpt-best.pth --input audio.wav --output latent.png --visualize
#
# 5. Use fewer diffusion steps for faster inference:
# python audio_dito_inference.py --checkpoint ckpt-best.pth --input audio.wav --output recon.wav --steps 25
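For quick experiments outside the CLI, the same class can also be driven directly from Python. A minimal sketch, assuming a trained checkpoint at ./save/dito-B-audio/ckpt-best.pth and an input clip sample.wav (both paths are placeholders, not files shipped with this commit):

from audio_dito_inference import AudioDiToInference

# Build the model once, then reuse it for the different utilities.
dito = AudioDiToInference('./save/dito-B-audio/ckpt-best.pth', device='cuda')

# Plain reconstruction written to disk.
dito.save_reconstruction('sample.wav', 'sample_recon.wav', num_steps=50)

# Original + 0.5 s silence + reconstruction in one file, for A/B listening.
dito.compare_reconstruction('sample.wav', 'sample_compare.wav', num_steps=50)

# Heatmap of the latent sequence produced by the encoder.
dito.visualize_latent('sample.wav', 'sample_latent.png')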
flowae/configs/datasets/dae.yaml
CHANGED
@@ -4,22 +4,19 @@ datasets:
     name: wrapper_audio_cae
     args:
       dataset:
-        name:
+        name: class_folder_audio
         args:
-
-          Emilia_EN: ["/home/masuser/minimax-audio/dataset/Emilia/EN"]
+          root_path: "/home/masuser/minimax-audio/dataset/Emilia/EN"
           sample_rate: 24000
           duration: 0.38
-          n_examples: 10000000
           shuffle: true
-
+          num_channels: 1
       sample_rate: 24000
       duration: 0.38
       mono: true
       normalize: true
-      return_coords: true
     loader:
-      batch_size:
+      batch_size: 52
       num_workers: 8
       drop_last: true
 
@@ -27,20 +24,17 @@ datasets:
     name: wrapper_audio_cae
     args:
       dataset:
-        name:
+        name: class_folder_audio
         args:
-
-          Emilia_EN: ["/home/masuser/minimax-audio/dataset/libritts"]
+          root_path: "/home/masuser/minimax-audio/dataset/libritts"
           sample_rate: 24000
           duration: 5.0
-          n_examples: 100
           shuffle: false
-
+          num_channels: 1
       sample_rate: 24000
       duration: 5.0
       mono: true
       normalize: true
-      return_coords: true
     loader:
       batch_size: 4
       num_workers: 8
@@ -50,20 +44,17 @@ datasets:
     name: wrapper_audio_cae
     args:
       dataset:
-        name:
+        name: class_folder_audio
        args:
-
-          Emilia_EN: ["/home/masuser/minimax-audio/dataset/libritts"]
+          root_path: "/home/masuser/minimax-audio/dataset/libritts"
           sample_rate: 24000
-          duration:
-          n_examples: 1000
+          duration: 5.0
           shuffle: false
-
+          num_channels: 1
       sample_rate: 24000
-      duration:
+      duration: 5.0
       mono: true
       normalize: true
-      return_coords: true
     loader:
       batch_size: 1
       num_workers: 8
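The wrapper resolves its nested dataset through the repo's registry, so the name/args blocks above map one-to-one onto constructor calls. A rough sketch of what the training harness is assumed to do with the train entry (the driver code itself is not part of this commit; run from inside flowae/ so that the local datasets package is imported):

import datasets  # flowae's local package, exposes register/make
from torch.utils.data import DataLoader

spec = {
    'name': 'wrapper_audio_cae',
    'args': {
        'dataset': {'name': 'class_folder_audio',
                    'args': {'root_path': '/home/masuser/minimax-audio/dataset/Emilia/EN',
                             'sample_rate': 24000, 'duration': 0.38,
                             'shuffle': True, 'num_channels': 1}},
        'sample_rate': 24000, 'duration': 0.38, 'mono': True, 'normalize': True,
    },
}
train_set = datasets.make(spec)  # the wrapper builds the inner AudioFolder itself
loader = DataLoader(train_set, batch_size=52, num_workers=8, drop_last=True)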
flowae/configs/experiments/dito-B-audio.yaml
CHANGED
@@ -8,12 +8,16 @@ model:
   # Encoder
   encoder:
     name: dac_encoder
-    args: {config_name:
+    args: {config_name: snake}
 
   # Latent configuration - now fully convolutional
   z_channels: 64 # Number of latent channels
-
-
+
+  zaug_p: 0.1
+  zaug_decoding_loss_type: suffix
+  zaug_zdm_diffusion:
+    name: fm
+    args: {timescale: 1000.0}
 
   # Decoder (identity for DiTo)
   decoder:
@@ -21,10 +25,10 @@ model:
 
   # Renderer - Fully convolutional for dynamic duration
   renderer:
-    name:
+    name: fixres_renderer_wrapper
     args:
       net:
-        name:
+        name: audio_diffusion_unet
         args:
           in_channels: 1
           z_dec_channels: 64
@@ -39,6 +43,6 @@ model:
     name: fm
     args: {timescale: 1000.0}
 
-  render_sampler: {name:
+  render_sampler: {name: fm_euler_sampler_audio}
   render_n_steps: 50
 
flowae/datasets/__init__.py
CHANGED
@@ -1,3 +1,3 @@
 from .datasets import register, make
-from . import image_folder, class_folder, webdataset
-from . import wrapper_cae
+from . import image_folder, class_folder, webdataset, class_folder_audio
+from . import wrapper_cae, wrapper_audio_cae
flowae/datasets/class_folder.py
CHANGED
@@ -6,6 +6,8 @@ from datasets import register
 from torch.utils.data import Dataset
 from torchvision import transforms
 
+import os
+import random
 
 Image.MAX_IMAGE_PIXELS = 933120000
 ImageFile.LOAD_TRUNCATED_IMAGES = True
flowae/datasets/class_folder_audio.py
ADDED
|
@@ -0,0 +1,196 @@
| 1 |
+
import os
|
| 2 |
+
import random
|
| 3 |
+
from PIL import Image, ImageFile
|
| 4 |
+
|
| 5 |
+
from datasets import register
|
| 6 |
+
from torch.utils.data import Dataset
|
| 7 |
+
from torchvision import transforms
|
| 8 |
+
|
| 9 |
+
import os
|
| 10 |
+
import random
|
| 11 |
+
from pathlib import Path
|
| 12 |
+
from typing import Optional, Callable
|
| 13 |
+
|
| 14 |
+
from models.ldm.dac.audiotools import AudioSignal
|
| 15 |
+
from models.ldm.dac.audiotools.core import util
|
| 16 |
+
# Audio file extensions (from audiotools)
|
| 17 |
+
AUDIO_EXTS = ('.wav', '.WAV', '.flac', '.FLAC', '.mp3', '.MP3', '.mp4', '.MP4', '.m4a', '.M4A')
|
| 18 |
+
|
| 19 |
+
@register('class_folder_audio')
|
| 20 |
+
class AudioFolder(Dataset):
|
| 21 |
+
"""
|
| 22 |
+
Audio dataset that loads audio files from a folder structure.
|
| 23 |
+
Similar to ClassFolder but for audio files.
|
| 24 |
+
|
| 25 |
+
Expected folder structure:
|
| 26 |
+
root_path/
|
| 27 |
+
├── class1/
|
| 28 |
+
│ ├── audio1.wav
|
| 29 |
+
│ ├── audio2.wav
|
| 30 |
+
│ └── ...
|
| 31 |
+
├── class2/
|
| 32 |
+
│ ├── audio1.wav
|
| 33 |
+
│ └── ...
|
| 34 |
+
└── ...
|
| 35 |
+
|
| 36 |
+
Or for single class (no subfolders):
|
| 37 |
+
root_path/
|
| 38 |
+
├── audio1.wav
|
| 39 |
+
├── audio2.wav
|
| 40 |
+
└── ...
|
| 41 |
+
"""
|
| 42 |
+
|
| 43 |
+
def __init__(
|
| 44 |
+
self,
|
| 45 |
+
root_path: str,
|
| 46 |
+
sample_rate: int = 24000,
|
| 47 |
+
duration: float = 2.0,
|
| 48 |
+
num_channels: int = 1,
|
| 49 |
+
random_crop: bool = True,
|
| 50 |
+
loudness_cutoff: float = -40,
|
| 51 |
+
audio_only: bool = False,
|
| 52 |
+
drop_label_p: float = 0.0,
|
| 53 |
+
shuffle: bool = True,
|
| 54 |
+
shuffle_state: int = 0,
|
| 55 |
+
transform: Optional[Callable] = None,
|
| 56 |
+
normalize: bool = True,
|
| 57 |
+
trim_silence: bool = False,
|
| 58 |
+
):
|
| 59 |
+
"""
|
| 60 |
+
Args:
|
| 61 |
+
root_path: Path to audio files
|
| 62 |
+
sample_rate: Target sample rate for audio
|
| 63 |
+
duration: Duration in seconds for audio clips
|
| 64 |
+
num_channels: Number of channels (1 for mono, 2 for stereo)
|
| 65 |
+
random_crop: Whether to randomly crop audio (vs deterministic)
|
| 66 |
+
loudness_cutoff: Minimum loudness threshold for audio selection
|
| 67 |
+
audio_only: If True, return only audio signal. If False, return dict with labels
|
| 68 |
+
drop_label_p: Probability of dropping labels (for unconditional training)
|
| 69 |
+
shuffle: Whether to shuffle files
|
| 70 |
+
shuffle_state: Random state for shuffling
|
| 71 |
+
transform: Additional audio transforms
|
| 72 |
+
normalize: Whether to normalize audio amplitude
|
| 73 |
+
trim_silence: Whether to trim silence from audio
|
| 74 |
+
"""
|
| 75 |
+
self.root_path = root_path
|
| 76 |
+
self.sample_rate = sample_rate
|
| 77 |
+
self.duration = duration
|
| 78 |
+
self.num_channels = num_channels
|
| 79 |
+
self.random_crop = random_crop
|
| 80 |
+
self.loudness_cutoff = loudness_cutoff
|
| 81 |
+
self.audio_only = audio_only
|
| 82 |
+
self.drop_label_p = drop_label_p
|
| 83 |
+
self.transform = transform
|
| 84 |
+
self.normalize = normalize
|
| 85 |
+
self.trim_silence = trim_silence
|
| 86 |
+
|
| 87 |
+
print(f'Audio root_path: {root_path}')
|
| 88 |
+
|
| 89 |
+
# Find audio files and labels
|
| 90 |
+
self.files = []
|
| 91 |
+
|
| 92 |
+
# Fin all audio in recursive in root_path
|
| 93 |
+
for root, dirs, files in os.walk(self.root_path):
|
| 94 |
+
for file in files:
|
| 95 |
+
if file.lower().endswith(AUDIO_EXTS):
|
| 96 |
+
self.files.append(os.path.join(root, file))
|
| 97 |
+
|
| 98 |
+
|
| 99 |
+
print(f'Found {len(self.files)} audio files')
|
| 100 |
+
|
| 101 |
+
# Shuffle files if requested
|
| 102 |
+
if shuffle:
|
| 103 |
+
state = util.random_state(shuffle_state)
|
| 104 |
+
combined = self.files
|
| 105 |
+
state.shuffle(combined)
|
| 106 |
+
self.files = combined
|
| 107 |
+
|
| 108 |
+
def __len__(self):
|
| 109 |
+
return len(self.files)
|
| 110 |
+
|
| 111 |
+
def __getitem__(self, idx):
|
| 112 |
+
try:
|
| 113 |
+
file_path = self.files[idx]
|
| 114 |
+
|
| 115 |
+
# Load audio using AudioSignal
|
| 116 |
+
if self.random_crop:
|
| 117 |
+
# Use salient excerpt for random cropping with loudness filtering
|
| 118 |
+
signal = AudioSignal.salient_excerpt(
|
| 119 |
+
str(file_path),
|
| 120 |
+
duration=self.duration,
|
| 121 |
+
loudness_cutoff=self.loudness_cutoff,
|
| 122 |
+
)
|
| 123 |
+
else:
|
| 124 |
+
# Load from beginning or deterministic offset
|
| 125 |
+
signal = AudioSignal(
|
| 126 |
+
str(file_path),
|
| 127 |
+
duration=self.duration,
|
| 128 |
+
offset=0.0,
|
| 129 |
+
)
|
| 130 |
+
|
| 131 |
+
# Convert to mono/stereo as needed
|
| 132 |
+
if self.num_channels == 1:
|
| 133 |
+
signal = signal.to_mono()
|
| 134 |
+
|
| 135 |
+
# Resample to target sample rate
|
| 136 |
+
signal = signal.resample(self.sample_rate)
|
| 137 |
+
|
| 138 |
+
# Ensure duration by padding or trimming
|
| 139 |
+
target_samples = int(self.duration * self.sample_rate)
|
| 140 |
+
if signal.length < target_samples:
|
| 141 |
+
signal = signal.zero_pad_to(target_samples)
|
| 142 |
+
elif signal.length > target_samples:
|
| 143 |
+
signal = signal.truncate_samples(target_samples)
|
| 144 |
+
|
| 145 |
+
# Optional audio processing
|
| 146 |
+
if self.trim_silence:
|
| 147 |
+
signal = signal.trim_silence()
|
| 148 |
+
# Re-pad if trimming made it too short
|
| 149 |
+
if signal.length < target_samples:
|
| 150 |
+
signal = signal.zero_pad_to(target_samples)
|
| 151 |
+
|
| 152 |
+
if self.normalize:
|
| 153 |
+
signal = signal.normalize()
|
| 154 |
+
|
| 155 |
+
# Clamp audio to [-1, 1] range
|
| 156 |
+
signal.audio_data = signal.audio_data.clamp(-1.0, 1.0)
|
| 157 |
+
|
| 158 |
+
# Apply additional transforms if provided
|
| 159 |
+
if self.transform is not None:
|
| 160 |
+
# Create a random state for transforms
|
| 161 |
+
state = util.random_state(idx)
|
| 162 |
+
transform_args = self.transform.instantiate(state, signal=signal)
|
| 163 |
+
signal = self.transform(signal, **transform_args)
|
| 164 |
+
|
| 165 |
+
# print('before process: ', signal.audio_data.shape)
|
| 166 |
+
# Store metadata
|
| 167 |
+
signal.metadata.update(
|
| 168 |
+
{
|
| 169 |
+
'file_path': str(file_path),
|
| 170 |
+
'original_sr': signal.sample_rate,
|
| 171 |
+
'duration': self.duration,
|
| 172 |
+
}
|
| 173 |
+
)
|
| 174 |
+
|
| 175 |
+
if self.audio_only:
|
| 176 |
+
return signal
|
| 177 |
+
else:
|
| 178 |
+
return {
|
| 179 |
+
'signal': signal,
|
| 180 |
+
'file_path': str(file_path),
|
| 181 |
+
'idx': idx,
|
| 182 |
+
}
|
| 183 |
+
|
| 184 |
+
except Exception as e:
|
| 185 |
+
print(f'Error loading audio file {self.files[idx]}: {e}')
|
| 186 |
+
# Return next file on error to avoid crashing training
|
| 187 |
+
return self.__getitem__((idx + 1) % len(self))
|
| 188 |
+
|
| 189 |
+
def collate(self, batch):
|
| 190 |
+
"""Collate function for DataLoader"""
|
| 191 |
+
if self.audio_only:
|
| 192 |
+
# Batch AudioSignals
|
| 193 |
+
return AudioSignal.batch(batch)
|
| 194 |
+
else:
|
| 195 |
+
# Collate dictionary batch
|
| 196 |
+
return util.collate(batch)
|
flowae/datasets/wrapper_audio_cae.py
ADDED
@@ -0,0 +1,89 @@
import random
from PIL import Image

import torch
from torch.utils.data import Dataset, IterableDataset

from datasets import register
import datasets

class BaseWrapperAudioCAE:
    """Base wrapper for audio Convolutional Autoencoder (CAE) training.

    Similar to the image wrapper, but for audio data.
    """

    def __init__(
        self,
        dataset,
        sample_rate=24000,
        duration=0.38,  # Duration in seconds
        n_samples=None,  # Alternative: specify exact number of samples
        return_gt=True,
        gt_sample_rate=None,  # Ground truth sample rate (if different)
        mono=True,
        normalize=True,
        return_coords=True,  # Whether to return coordinate grids
    ):
        self.dataset = datasets.make(dataset)
        self.sample_rate = sample_rate
        self.duration = duration
        self.n_samples = int(duration * sample_rate)
        self.return_gt = return_gt
        self.gt_sample_rate = gt_sample_rate or sample_rate
        self.mono = mono
        self.normalize = normalize
        self.return_coords = return_coords

    def process(self, audio_data):
        """Process audio data for DiTo training.

        Args:
            audio_data: Dictionary with 'signal' key containing AudioSignal
                or AudioSignal directly
        """
        ret = {}

        # Extract AudioSignal
        if isinstance(audio_data, dict):
            signal = audio_data['signal']
        else:
            signal = audio_data

        # Normalize audio
        audio_tensor = signal.audio_data  # Shape: [channels, samples]

        audio_tensor = audio_tensor.squeeze(0)

        # Create input tensor
        ret['inp'] = audio_tensor

        if not self.return_gt:
            return ret

        ret['gt'] = audio_tensor
        # print('audio_tensor shape: ', audio_tensor.shape)

        return ret


@register('wrapper_audio_cae')
class WrapperAudioCAE(BaseWrapperAudioCAE, Dataset):
    """Dataset wrapper for audio CAE training."""

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        data = self.dataset[idx]
        return self.process(data)


@register('wrapper_audio_cae_iterable')
class WrapperAudioCAEIterable(BaseWrapperAudioCAE, IterableDataset):
    """Iterable dataset wrapper for audio CAE training."""

    def __iter__(self):
        for data in self.dataset:
            yield self.process(data)
flowae/datasets/wrapper_cae.py
CHANGED
|
@@ -113,196 +113,4 @@ class WrapperCAE(BaseWrapperCAE, IterableDataset):
|
|
| 113 |
ret.update(data)
|
| 114 |
yield ret
|
| 115 |
else:
|
| 116 |
-
yield self.process(data)
|
| 117 |
-
|
| 118 |
-
|
| 119 |
-
|
| 120 |
-
|
| 121 |
-
|
| 122 |
-
|
| 123 |
-
class BaseWrapperAudioCAE:
|
| 124 |
-
"""Base wrapper for audio Convolutional Autoencoder (CAE) training.
|
| 125 |
-
|
| 126 |
-
Similar to the image wrapper, but for audio data.
|
| 127 |
-
"""
|
| 128 |
-
|
| 129 |
-
def __init__(
|
| 130 |
-
self,
|
| 131 |
-
dataset,
|
| 132 |
-
sample_rate=24000,
|
| 133 |
-
duration=0.38, # Duration in seconds
|
| 134 |
-
n_samples=None, # Alternative: specify exact number of samples
|
| 135 |
-
return_gt=True,
|
| 136 |
-
gt_sample_rate=None, # Ground truth sample rate (if different)
|
| 137 |
-
mono=True,
|
| 138 |
-
normalize=True,
|
| 139 |
-
return_coords=True, # Whether to return coordinate grids
|
| 140 |
-
):
|
| 141 |
-
self.dataset = dataset
|
| 142 |
-
self.sample_rate = sample_rate
|
| 143 |
-
self.duration = duration
|
| 144 |
-
self.n_samples = n_samples or int(duration * sample_rate)
|
| 145 |
-
self.return_gt = return_gt
|
| 146 |
-
self.gt_sample_rate = gt_sample_rate or sample_rate
|
| 147 |
-
self.mono = mono
|
| 148 |
-
self.normalize = normalize
|
| 149 |
-
self.return_coords = return_coords
|
| 150 |
-
|
| 151 |
-
def process(self, audio_data):
|
| 152 |
-
"""Process audio data for DiTo training.
|
| 153 |
-
|
| 154 |
-
Args:
|
| 155 |
-
audio_data: Dictionary with 'signal' key containing AudioSignal
|
| 156 |
-
or AudioSignal directly
|
| 157 |
-
"""
|
| 158 |
-
ret = {}
|
| 159 |
-
|
| 160 |
-
# Extract AudioSignal
|
| 161 |
-
if isinstance(audio_data, dict):
|
| 162 |
-
signal = audio_data['signal']
|
| 163 |
-
else:
|
| 164 |
-
signal = audio_data
|
| 165 |
-
|
| 166 |
-
# Convert to mono if needed
|
| 167 |
-
if self.mono and signal.num_channels > 1:
|
| 168 |
-
signal = signal.to_mono()
|
| 169 |
-
|
| 170 |
-
# Resample to target sample rate
|
| 171 |
-
if signal.sample_rate != self.sample_rate:
|
| 172 |
-
signal = signal.resample(self.sample_rate)
|
| 173 |
-
|
| 174 |
-
# Extract fixed duration
|
| 175 |
-
if signal.duration < self.duration:
|
| 176 |
-
# Pad if too short
|
| 177 |
-
signal = signal.zero_pad_to(self.n_samples)
|
| 178 |
-
else:
|
| 179 |
-
# Take random excerpt if too long
|
| 180 |
-
max_start = signal.num_samples - self.n_samples
|
| 181 |
-
if max_start > 0:
|
| 182 |
-
start_idx = random.randint(0, max_start)
|
| 183 |
-
signal = signal[..., start_idx:start_idx + self.n_samples]
|
| 184 |
-
else:
|
| 185 |
-
signal = signal[..., :self.n_samples]
|
| 186 |
-
|
| 187 |
-
# Normalize audio
|
| 188 |
-
audio_tensor = signal.audio_data # Shape: [channels, samples]
|
| 189 |
-
if self.normalize:
|
| 190 |
-
# Normalize to [-1, 1]
|
| 191 |
-
max_val = audio_tensor.abs().max()
|
| 192 |
-
if max_val > 0:
|
| 193 |
-
audio_tensor = audio_tensor / max_val
|
| 194 |
-
|
| 195 |
-
# Create input tensor
|
| 196 |
-
ret['inp'] = audio_tensor
|
| 197 |
-
|
| 198 |
-
if not self.return_gt:
|
| 199 |
-
return ret
|
| 200 |
-
|
| 201 |
-
|
| 202 |
-
ret['gt'] = audio_tensor
|
| 203 |
-
|
| 204 |
-
return ret
|
| 205 |
-
|
| 206 |
-
|
| 207 |
-
@register('wrapper_audio_cae')
|
| 208 |
-
class WrapperAudioCAE(BaseWrapperAudioCAE, Dataset):
|
| 209 |
-
"""Dataset wrapper for audio CAE training."""
|
| 210 |
-
|
| 211 |
-
def __len__(self):
|
| 212 |
-
return len(self.dataset)
|
| 213 |
-
|
| 214 |
-
def __getitem__(self, idx):
|
| 215 |
-
data = self.dataset[idx]
|
| 216 |
-
return self.process(data)
|
| 217 |
-
|
| 218 |
-
|
| 219 |
-
@register('wrapper_audio_cae_iterable')
|
| 220 |
-
class WrapperAudioCAEIterable(BaseWrapperAudioCAE, IterableDataset):
|
| 221 |
-
"""Iterable dataset wrapper for audio CAE training."""
|
| 222 |
-
|
| 223 |
-
def __iter__(self):
|
| 224 |
-
for data in self.dataset:
|
| 225 |
-
yield self.process(data)
|
| 226 |
-
|
| 227 |
-
|
| 228 |
-
# Example usage with your existing AudioDataset
|
| 229 |
-
def create_dito_audio_dataset(config):
|
| 230 |
-
"""Create DiTo audio dataset from config."""
|
| 231 |
-
|
| 232 |
-
# Create base audio dataset using audiotools
|
| 233 |
-
|
| 234 |
-
# Setup audio loaders
|
| 235 |
-
train_folders = config.get("train_folders", {})
|
| 236 |
-
|
| 237 |
-
loader = AudioLoader(
|
| 238 |
-
sources=list(train_folders.values()),
|
| 239 |
-
transform=tfm.Compose(
|
| 240 |
-
tfm.VolumeNorm(("uniform", -20, -10)),
|
| 241 |
-
tfm.RescaleAudio(),
|
| 242 |
-
),
|
| 243 |
-
ext=['.wav', '.flac', '.mp3'],
|
| 244 |
-
)
|
| 245 |
-
|
| 246 |
-
# Create base dataset
|
| 247 |
-
base_dataset = AudioDataset(
|
| 248 |
-
loaders=loader,
|
| 249 |
-
sample_rate=config['sample_rate'],
|
| 250 |
-
duration=config['duration'],
|
| 251 |
-
n_examples=config['n_examples'],
|
| 252 |
-
num_channels=1 if config.get('mono', True) else 2,
|
| 253 |
-
)
|
| 254 |
-
|
| 255 |
-
# Wrap with DiTo wrapper
|
| 256 |
-
dito_dataset = WrapperAudioCAE(
|
| 257 |
-
dataset=base_dataset,
|
| 258 |
-
sample_rate=config['sample_rate'],
|
| 259 |
-
duration=config['duration'],
|
| 260 |
-
mono=config.get('mono', True),
|
| 261 |
-
normalize=True,
|
| 262 |
-
return_coords=True,
|
| 263 |
-
)
|
| 264 |
-
|
| 265 |
-
return dito_dataset
|
| 266 |
-
|
| 267 |
-
|
| 268 |
-
# For your training config, you would use it like:
|
| 269 |
-
"""
|
| 270 |
-
datasets:
|
| 271 |
-
train:
|
| 272 |
-
name: wrapper_audio_cae
|
| 273 |
-
args:
|
| 274 |
-
dataset:
|
| 275 |
-
name: audio_dataset # Your base audio dataset
|
| 276 |
-
args:
|
| 277 |
-
sources: ["/path/to/audio/files"]
|
| 278 |
-
sample_rate: 44100
|
| 279 |
-
duration: 2.0
|
| 280 |
-
n_examples: 10000
|
| 281 |
-
sample_rate: 44100
|
| 282 |
-
duration: 2.0
|
| 283 |
-
mono: true
|
| 284 |
-
normalize: true
|
| 285 |
-
return_coords: true
|
| 286 |
-
loader:
|
| 287 |
-
batch_size: 16
|
| 288 |
-
num_workers: 8
|
| 289 |
-
|
| 290 |
-
val:
|
| 291 |
-
name: wrapper_audio_cae
|
| 292 |
-
args:
|
| 293 |
-
dataset:
|
| 294 |
-
name: audio_dataset
|
| 295 |
-
args:
|
| 296 |
-
sources: ["/path/to/val/audio/files"]
|
| 297 |
-
sample_rate: 44100
|
| 298 |
-
duration: 2.0
|
| 299 |
-
n_examples: 1000
|
| 300 |
-
sample_rate: 44100
|
| 301 |
-
duration: 2.0
|
| 302 |
-
mono: true
|
| 303 |
-
normalize: true
|
| 304 |
-
return_coords: true
|
| 305 |
-
loader:
|
| 306 |
-
batch_size: 16
|
| 307 |
-
num_workers: 8
|
| 308 |
-
"""
|
|
|
|
| 113 |
ret.update(data)
|
| 114 |
yield ret
|
| 115 |
else:
|
| 116 |
+
yield self.process(data)
|
flowae/{reconstruction.py → image_dito_inference.py}
RENAMED
File without changes
flowae/models/diffusion/fm.py
CHANGED
@@ -22,6 +22,21 @@ class FM:
 
     def B(self, t):
         return -(1.0 - self.sigma_min)
+
+    def _get_reduction_dims(self, x):
+        """Get appropriate dimensions for loss reduction based on tensor shape"""
+        if x.dim() == 4:
+            # Images: [batch, channels, height, width]
+            return [1, 2, 3]
+        elif x.dim() == 3:
+            # Audio: [batch, channels, samples] or [batch, latent_dim, time_frames]
+            return [1, 2]
+        elif x.dim() == 2:
+            # 1D signals: [batch, samples]
+            return [1]
+        else:
+            # Fallback: reduce over all non-batch dimensions
+            return list(range(1, x.dim()))
 
     def get_betas(self, n_timesteps):
         return torch.zeros(n_timesteps)  # Not VP and not supported
@@ -38,17 +53,20 @@ class FM:
 
         if t is None:
             t = torch.rand(x.shape[0], device=x.device)
-        print('x shape: ', x.shape)
+        # print('x shape: ', x.shape)
         x_t, noise = self.add_noise(x, t)
-        print('x_t shape: ', x_t.shape)
+        # print('x_t shape: ', x_t.shape)
         pred = net(x_t, t=t * self.timescale, **net_kwargs)
-        print('pred shape: ', pred.shape)
+        # print('pred shape: ', pred.shape)
 
         target = self.A(t) * x + self.B(t) * noise # -dxt/dt
-        print('target shape: ', target.shape)
-        print('return_loss_unreduced: ', return_loss_unreduced, 'return_all: ', return_all)
+        # print('target shape: ', target.shape)
+        # print('return_loss_unreduced: ', return_loss_unreduced, 'return_all: ', return_all)
         if return_loss_unreduced:
-            loss = ((pred.float() - target.float()) ** 2).mean(dim=[1, 2, 3])
+            print('pred shape: ', pred.shape, 'target shape: ', target.shape)
+            reduce_dims = self._get_reduction_dims(x)
+            loss = ((pred.float() - target.float()) ** 2).mean(dim=reduce_dims)
+            # loss = ((pred.float() - target.float()) ** 2).mean(dim=[1, 2, 3])
             if return_all:
                 return loss, t, x_t, pred
             else:
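The behavioural change in FM.loss is that the squared error is now averaged over whichever non-batch dimensions the input actually has, so the same diffusion class serves both image latents and 1-D audio. A small illustration of the helper (the shapes below are arbitrary examples, not values from the repo):

import torch

x_audio = torch.randn(8, 1, 9120)    # [batch, channels, samples]
x_image = torch.randn(8, 3, 32, 32)  # [batch, channels, height, width]

# FM._get_reduction_dims(x_audio) -> [1, 2]; FM._get_reduction_dims(x_image) -> [1, 2, 3]
# Either way the mean leaves one loss value per batch element.
err = torch.randn_like(x_audio)
per_sample = (err ** 2).mean(dim=[1, 2])  # what the new code computes for audio
assert per_sample.shape == (8,)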
flowae/models/ldm/dac/layers.py
CHANGED
@@ -74,7 +74,7 @@ def get_activation(activation, channels, alpha):
         return nn.LeakyReLU()
     elif activation == "tanh":
         return nn.Tanh()
-    elif activation == "
+    elif activation == "snakebeta":
         return SnakeBeta(channels, alpha)
     else:
         raise ValueError(f"Activation {activation} not supported")
flowae/models/ldm/dac/model.py
CHANGED
@@ -236,7 +236,8 @@ class Encoder(nn.Module):
 
     def forward(self, x):
         x = F.leaky_relu(x)
-
+        x = self.block(x)
+        return x
 
 
 class DecoderBlock(nn.Module):
@@ -478,6 +479,7 @@ class DACVAE(BaseModel, CodecMixin):
     ):
         x = self.encoder(audio_data)
         x = self.en_conv_post(x)
+        print('x shape: ', x.shape)
         m, logs = torch.split(x, self.latent_dim, dim=1)
         logs = torch.clamp(logs, min=-14.0, max=14.0)
 
flowae/models/ldm/dac/utils.py
CHANGED
@@ -7,16 +7,16 @@ from .model import Encoder, Decoder, WNConv1d
 
 default_configs = {
     'snake': dict(
-
-
-
+        d_model=64,
+        strides=[2, 4, 5, 8],
+        d_latent=64,
         d_in=1,
         activation='snake',
     ),
-    '
-
-
-
+    'snakebeta': dict(
+        d_model=64,
+        strides=[2, 4, 5, 8],
+        d_latent=64,
         d_in=1,
         activation='snakebeta',
     ),
@@ -27,10 +27,10 @@ default_configs = {
 def make_dac_encoder(config_name, **kwargs):
     encoder_kwargs = default_configs[config_name]
     encoder_kwargs.update(kwargs)
-
+    d_model = encoder_kwargs['d_model']
     return nn.Sequential(
         Encoder(**encoder_kwargs),
-        WNConv1d(
+        WNConv1d(d_model, d_model, kernel_size=1),
     )
 
 
@@ -38,8 +38,8 @@ def make_dac_encoder(config_name, **kwargs):
 def make_vqgan_decoder(config_name, **kwargs):
     decoder_kwargs = default_configs[config_name]
     decoder_kwargs.update(kwargs)
-
+    d_model = decoder_kwargs['d_model']
     return nn.Sequential(
-        WNConv1d(
+        WNConv1d(d_model, d_model, kernel_size=1),
         Decoder(**decoder_kwargs),
     )
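The new 'snake'/'snakebeta' defaults give the DAC encoder strides [2, 4, 5, 8], i.e. an overall hop of 320 input samples per latent frame; that is presumably the same factor hard-coded as n_frames = z_dec.size(2) * 320 in DiToAudio.render below. A one-line check of that assumption:

# 320 samples per latent time step at 24 kHz -> 75 latent frames per second
assert 2 * 4 * 5 * 8 == 320 and 24000 / 320 == 75.0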
flowae/models/ldm/dito.py
CHANGED
@@ -6,7 +6,8 @@ import torch
 import models
 from omegaconf import OmegaConf
 from models import register
-
+
+from models.ldm.ldm_base import LDMBase, LDMBaseAudio
 from models.ldm.vqgan.lpips import LPIPS
 
 
@@ -178,3 +179,143 @@
         dae_loss_w = loss_config.get('dae_loss', 1)
         ret['loss'] = ret['loss'] + dae_loss * dae_loss_w
         return ret
+
+
+
+@register('dito_audio')
+class DiToAudio(LDMBaseAudio):
+
+    def __init__(self, render_diffusion, render_sampler, render_n_steps, renderer_guidance=1, **kwargs):
+        super().__init__(**kwargs)
+        self.render_diffusion = models.make(render_diffusion)
+
+        if OmegaConf.is_config(render_sampler):
+            render_sampler = OmegaConf.to_container(render_sampler, resolve=True)
+        render_sampler = copy.deepcopy(render_sampler)
+        if render_sampler.get('args') is None:
+            render_sampler['args'] = {}
+        render_sampler['args']['diffusion'] = self.render_diffusion
+        self.render_sampler = models.make(render_sampler)
+        self.render_n_steps = render_n_steps
+        self.renderer_guidance = renderer_guidance
+
+        self.t_loss_monitor_v = [0 for _ in range(10)]
+        self.t_loss_monitor_n = [0 for _ in range(10)]
+        self.t_loss_monitor_decay = 0.99
+
+
+    def render(self, z_dec):
+        net_kwargs = {'z_dec': z_dec}
+        n_frames = z_dec.size(2) * 320
+        shape = (z_dec.size(0), z_dec.size(0), n_frames)
+
+        if self.renderer_guidance > 1:
+            uncond_z_dec = self.drop_z_emb.unsqueeze(0).expand(z_dec.shape[0], -1, -1, -1)
+            uncond_net_kwargs = {'z_dec': uncond_z_dec}
+        else:
+            uncond_net_kwargs = None
+
+        ret = self.render_sampler.sample(
+            net=self.renderer,
+            n_steps=self.render_n_steps,
+            shape=shape,
+            net_kwargs=net_kwargs,
+            uncond_net_kwargs=uncond_net_kwargs,
+            guidance=self.renderer_guidance,
+        )
+
+        # if self.use_ema_renderer:
+        #     self.swap_ema_renderer()
+
+        return ret
+
+    def forward(self, data, mode, has_optimizer=None):
+        if mode in ['z', 'z_dec']:
+            ret_z, _ = super().forward(data, mode=mode, has_optimizer=has_optimizer)
+            return ret_z
+
+        grad = self.get_grad_plan(has_optimizer)
+        loss_config = self.loss_config
+        if mode == 'pred':
+            z_dec, ret = super().forward(data, mode='z_dec', has_optimizer=has_optimizer)
+
+            gt_patch = data['gt']
+
+            if grad['renderer']:
+                return self.render(z_dec)
+            else:
+                with torch.no_grad():
+                    return self.render(z_dec)
+
+        elif mode == 'loss':
+            if not grad['renderer']:  # Only training zdm
+                _, ret = super().forward(data, mode='z', has_optimizer=has_optimizer)
+                return ret
+
+            gt_patch = data['gt']
+
+
+            z_dec, ret = super().forward(data, mode='z_dec', has_optimizer=has_optimizer)
+            net_kwargs = {'z_dec': z_dec}
+
+            # print('latent z_dec shape: ', z_dec.shape)
+
+            t = torch.rand(gt_patch.shape[0], device=gt_patch.device)
+
+
+            # print('self.zaug_p:', self.zaug_p)
+            # print('self.training:', self.training)
+
+            if (self.zaug_p is not None) and self.training:
+                tz = self._tz
+                mask_aug = self._mask_aug
+
+                typ = self.zaug_decoding_loss_type
+                if typ == 'all':
+                    tmin = torch.ones_like(tz) * 0
+                    tmax = torch.ones_like(tz) * 1
+                elif typ == 'suffix':
+                    tmin = tz
+                    tmax = torch.ones_like(tz) * 1
+                elif typ == 'tz':
+                    tmin = tz
+                    tmax = tz
+                elif typ == 'tmax':
+                    tmin = torch.ones_like(tz) * 1
+                    tmax = torch.ones_like(tz) * 1
+                else:
+                    raise NotImplementedError
+                t_aug = tmin + (tmax - tmin) * torch.rand_like(tmin)
+
+                t = mask_aug * t_aug + (1 - mask_aug) * t
+
+            loss, t = self.render_diffusion.loss(
+                net=self.renderer,
+                x=gt_patch,
+                t=t,
+                net_kwargs=net_kwargs,
+                return_loss_unreduced=True
+            )
+
+            # Visualize diffusion network loss for different timesteps #
+            if self.training:
+                m = len(self.t_loss_monitor_v)
+                for i in range(len(loss)):
+                    q = min(math.floor(t[i].item() * m), m - 1)
+                    self.t_loss_monitor_v[q] = self.t_loss_monitor_v[q] * self.t_loss_monitor_decay + loss[i].item() * (1 - self.t_loss_monitor_decay)
+                    self.t_loss_monitor_n[q] += 1
+                for q in range(m):
+                    if self.t_loss_monitor_n[q] > 0:
+                        if self.t_loss_monitor_n[q] < 500:
+                            r = 1 - math.pow(self.t_loss_monitor_decay, self.t_loss_monitor_n[q])
+                        else:
+                            r = 1
+                        ret[f'_loss_t{q}'] = self.t_loss_monitor_v[q] / r
+            # - #
+
+            dae_loss = loss.mean()
+
+            ret['dae_loss'] = dae_loss.item()
+            dae_loss_w = loss_config.get('dae_loss', 1)
+            ret['loss'] = ret['loss'] + dae_loss * dae_loss_w
+            return ret
flowae/models/ldm/ldm_base.py
CHANGED
|
@@ -47,6 +47,39 @@ class LDMBase(nn.Module):
|
|
| 47 |
use_ema_decoder=False,
|
| 48 |
use_ema_renderer=False,
|
| 49 |
):
|
|
|
|
|
| 50 |
super().__init__()
|
| 51 |
self.loss_config = loss_config if loss_config is not None else dict()
|
| 52 |
|
|
@@ -442,3 +475,194 @@ class DiagonalGaussianDistribution(object):
|
|
| 442 |
|
| 443 |
def mode(self):
|
| 444 |
return self.mean
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 47 |
use_ema_decoder=False,
|
| 48 |
use_ema_renderer=False,
|
| 49 |
):
|
| 50 |
+
print('print all the args ')
|
| 51 |
+
print("encoder: ", encoder)
|
| 52 |
+
print("z_shape: ",z_shape)
|
| 53 |
+
print("decoder: ",decoder)
|
| 54 |
+
print("renderer: ",renderer)
|
| 55 |
+
print("encoder_ema_rate: ",encoder_ema_rate)
|
| 56 |
+
print("decoder_ema_rate: ",decoder_ema_rate)
|
| 57 |
+
print("renderer_ema_rate: ",renderer_ema_rate)
|
| 58 |
+
print("z_gaussian: ",z_gaussian)
|
| 59 |
+
print("z_gaussian_sample: ",z_gaussian_sample)
|
| 60 |
+
print("z_quantizer: ",z_quantizer)
|
| 61 |
+
print("z_quantizer_n_embed: ",z_quantizer_n_embed)
|
| 62 |
+
print("z_quantizer_beta: ",z_quantizer_beta)
|
| 63 |
+
print("z_layernorm: ",z_layernorm)
|
| 64 |
+
print("zaug_p: ",zaug_p)
|
| 65 |
+
print("zaug_tmax: ",zaug_tmax)
|
| 66 |
+
print("zaug_tmax_always: ",zaug_tmax_always)
|
| 67 |
+
print("zaug_decoding_loss_type: ",zaug_decoding_loss_type)
|
| 68 |
+
print("zaug_zdm_diffusion: ",zaug_zdm_diffusion)
|
| 69 |
+
print("gt_noise_lb: ",gt_noise_lb)
|
| 70 |
+
print("drop_z_p: ",drop_z_p)
|
| 71 |
+
print("zdm_net: ",zdm_net)
|
| 72 |
+
print("zdm_diffusion: ",zdm_diffusion)
|
| 73 |
+
print("zdm_sampler: ",zdm_sampler)
|
| 74 |
+
print("zdm_n_steps: ",zdm_n_steps)
|
| 75 |
+
print("zdm_ema_rate: ",zdm_ema_rate)
|
| 76 |
+
print("zdm_train_normalize: ",zdm_train_normalize)
|
| 77 |
+
print("zdm_class_cond: ",zdm_class_cond)
|
| 78 |
+
print("zdm_force_guidance: ",zdm_force_guidance)
|
| 79 |
+
print("loss_config: ",loss_config)
|
| 80 |
+
print("use_ema_encoder: ",use_ema_encoder)
|
| 81 |
+
print("use_ema_decoder: ",use_ema_decoder)
|
| 82 |
+
print("use_ema_renderer: ",use_ema_renderer)
|
| 83 |
super().__init__()
|
| 84 |
self.loss_config = loss_config if loss_config is not None else dict()
|
| 85 |
|
|
|
|
| 475 |
|
| 476 |
def mode(self):
|
| 477 |
return self.mean
|
+
+
+class LDMBaseAudio(nn.Module):
+    def __init__(
+        self,
+        encoder,
+        z_channels,
+        decoder,
+        renderer,
+        zaug_p=0.1,
+        zaug_tmax=1.0,
+        zaug_tmax_always=False,
+        zaug_decoding_loss_type='all',
+        zaug_zdm_diffusion={'name': 'fm', 'args': {'timescale': 1000.0}},
+        zdm_ema_rate=0.9999,
+        loss_config={},
+        encoder_ema_rate=None,
+        decoder_ema_rate=None,
+        renderer_ema_rate=None,
+    ):
+        super().__init__()
+        self.loss_config = loss_config
+
+        self.encoder = models.make(encoder)
+        self.decoder = models.make(decoder)
+        self.renderer = models.make(renderer)
+
+        self.z_layernorm = nn.LayerNorm(
+            z_channels,  # e.g., 64
+            elementwise_affine=False
+        )
+
+        self.zaug_p = zaug_p
+        self.zaug_tmax = zaug_tmax
+        self.zaug_tmax_always = zaug_tmax_always
+        self.zaug_decoding_loss_type = zaug_decoding_loss_type
+        if zaug_zdm_diffusion is not None:
+            self.zaug_zdm_diffusion = models.make(zaug_zdm_diffusion)
+
+        # EMA models #
+        self.encoder_ema_rate = encoder_ema_rate
+        if self.encoder_ema_rate is not None:
+            self.encoder_ema = copy.deepcopy(self.encoder)
+            for p in self.encoder_ema.parameters():
+                p.requires_grad = False
+
+        self.decoder_ema_rate = decoder_ema_rate
+        if self.decoder_ema_rate is not None:
+            self.decoder_ema = copy.deepcopy(self.decoder)
+            for p in self.decoder_ema.parameters():
+                p.requires_grad = False
+
+        self.renderer_ema_rate = renderer_ema_rate
+        if self.renderer_ema_rate is not None:
+            self.renderer_ema = copy.deepcopy(self.renderer)
+            for p in self.renderer_ema.parameters():
+                p.requires_grad = False
+        #
+
+    def get_grad_plan(self, has_optimizer):
+        if has_optimizer is None:
+            has_optimizer = dict()
+        grad = dict()
+        grad['encoder'] = has_optimizer.get('encoder', False)
+        grad['decoder'] = grad['encoder'] or has_optimizer.get('decoder', False)
+        grad['renderer'] = grad['decoder'] or has_optimizer.get('renderer', False)
+        return grad
+
+    def normalize_latents(self, z):
+        # z shape: [batch, latent_dim, n_frames] - n_frames can vary!
+        z = z.transpose(-2, -1)  # [batch, n_frames, latent_dim]
+        z = self.z_layernorm(z)  # normalize over latent_dim for each time step
+        z = z.transpose(-2, -1)  # [batch, latent_dim, n_frames]
+        return z
+
+    def update_ema(self):
+        if self.encoder_ema_rate is not None:
+            self.update_ema_fn(self.encoder_ema, self.encoder, self.encoder_ema_rate)
+        if self.decoder_ema_rate is not None:
+            self.update_ema_fn(self.decoder_ema, self.decoder, self.decoder_ema_rate)
+        if self.renderer_ema_rate is not None:
+            self.update_ema_fn(self.renderer_ema, self.renderer, self.renderer_ema_rate)
+
+    def get_parameters(self, name):
+        if name == 'encoder':
+            return self.encoder.parameters()
+        elif name == 'decoder':
+            p = list(self.decoder.parameters())
+            if self.z_quantizer is not None:
+                p += list(self.z_quantizer.parameters())
+            return p
+        elif name == 'renderer':
+            return self.renderer.parameters()
+        elif name == 'zdm':
+            return self.zdm_net.parameters()
+
+    def encode(self, x):
+        z = self.encoder(x)
+        z = self.normalize_latents(z)
+
+        if (self.zaug_p is not None) and self.training:
+            assert self.z_layernorm is not None  # ensure 0 mean 1 std
+            if self.zaug_tmax_always:
+                tz = torch.ones(z.shape[0], device=z.device) * self.zaug_tmax
+            else:
+                tz = torch.rand(z.shape[0], device=z.device) * self.zaug_tmax
+
+            zt, _ = self.zaug_zdm_diffusion.add_noise(z, tz)
+            mask_aug = (torch.rand(z.shape[0], device=z.device) < self.zaug_p).float()
+            if z.dim() == 4:  # Image: [batch, channels, height, width]
+                mask_shape = (-1, 1, 1, 1)
+            elif z.dim() == 3:  # Audio: [batch, channels, n_frames]
+                mask_shape = (-1, 1, 1)
+            else:
+                raise ValueError(f"Unsupported tensor dimension: {z.dim()}")
+
+            z = mask_aug.view(*mask_shape) * zt + (1 - mask_aug).view(*mask_shape) * z
+            # z = mask_aug.view(-1, 1, 1, 1) * zt + (1 - mask_aug).view(-1, 1, 1, 1) * z
+            self._tz = tz
+            self._mask_aug = mask_aug
+
+        return z
+
+    def decode(self, z):
+        z_dec = self.decoder(z)
+        return z_dec
+
+    def render(self, z_dec):
+        raise NotImplementedError
+
+    def forward(self, data, mode, has_optimizer=None):
+        loss = torch.tensor(0., device=data['inp'].device)
+        ret = dict()
+        z = self.encode(data['inp'])
+
+        z_dec = self.decode(z)
+
+        ret['loss'] = loss
+        return z_dec, ret
+
+    def generate_samples(
+        self,
+        batch_size,
+        n_steps,
+        net_kwargs=None,
+        uncond_net_kwargs=None,
+        ema=False,
+        guidance=1.0,
+        noise=None,
+        return_z=False,
+    ):
+        if self.zdm_force_guidance is not None:
+            guidance = self.zdm_force_guidance
+
+        shape = (batch_size,) + self.z_shape
+        net = self.zdm_net if not ema else self.zdm_net_ema
+
+        z = self.zdm_sampler.sample(
+            net,
+            shape,
+            n_steps,
+            net_kwargs=net_kwargs,
+            uncond_net_kwargs=uncond_net_kwargs,
+            guidance=guidance,
+            noise=noise,
+        )
+
+        if return_z:
+            return z
+
+        if (self.zaug_p is not None) and self.zaug_tmax_always:
+            tz = torch.ones(z.shape[0], device=z.device) * self.zaug_tmax
+            z, _ = self.zaug_zdm_diffusion.add_noise(z, tz)
+
+        z = self.denormalize_for_zdm(z)
+        z_dec = self.decode(z)
+
+        return self.render(z_dec)
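For intuition: normalize_latents applies a LayerNorm over the channel axis independently at every latent frame, which is what lets the zaug branch assume roughly zero-mean, unit-variance latents before adding diffusion noise. A small sketch of that normalization on a dummy latent (shapes and values are illustrative only):

import torch
import torch.nn as nn

z = torch.randn(2, 64, 173)                       # [batch, latent_dim, n_frames]; n_frames can vary
ln = nn.LayerNorm(64, elementwise_affine=False)   # normalizes over the last dimension
z_norm = ln(z.transpose(-2, -1)).transpose(-2, -1)
print(z_norm.mean(dim=1).abs().max())             # close to 0 for every frame
print(z_norm.std(dim=1).mean())                   # close to 1 on average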
flowae/models/networks/__init__.py
CHANGED
@@ -1,2 +1,3 @@
 from . import consistency_decoder_unet
-from . import dit
+from . import dit
+from . import consistency_audio_decoder_unet
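The new import matters because it triggers the @register('audio_diffusion_unet') decorator at package import time, which is how configs can refer to the network by name. As a rough illustration of that pattern only (a generic sketch, not the actual flowae/models implementation):

_registry = {}

def register(name):
    def decorator(cls):
        _registry[name] = cls     # map a config name to a class
        return cls
    return decorator

def make(spec):
    # spec is a dict like {'name': 'audio_diffusion_unet', 'args': {...}}
    return _registry[spec['name']](**spec.get('args', {}))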
flowae/models/networks/consistency_audio_decoder_unet.py
ADDED
@@ -0,0 +1,322 @@
+# https://gist.github.com/mrsteyk/74ad3ec2f6f823111ae4c90e168505ac
+
+import torch
+import torch.nn.functional as F
+import torch.nn as nn
+
+from models import register
+
+class PositionalEmbedding(nn.Module):
+    def __init__(self, pe_dim=320, out_dim=1280, max_positions=10000, endpoint=True):
+        super().__init__()
+        self.num_channels = pe_dim
+        self.max_positions = max_positions
+        self.endpoint = endpoint
+        self.f_1 = nn.Linear(pe_dim, out_dim)
+        self.f_2 = nn.Linear(out_dim, out_dim)
+
+    def forward(self, x):
+        freqs = torch.arange(start=0, end=self.num_channels//2, dtype=torch.float32, device=x.device)
+        freqs = freqs / (self.num_channels // 2 - (1 if self.endpoint else 0))
+        freqs = (1 / self.max_positions) ** freqs
+        x = x.ger(freqs.to(x.dtype))
+        x = torch.cat([x.cos(), x.sin()], dim=1)
+
+        x = self.f_1(x)
+        x = F.silu(x)
+        return self.f_2(x)
+
+
+class AudioEmbedding(nn.Module):
+    """1D convolution for audio input embedding"""
+    def __init__(self, in_channels, out_channels=320, kernel_size=3) -> None:
+        super().__init__()
+        self.f = nn.Conv1d(in_channels, out_channels, kernel_size=kernel_size, padding=kernel_size//2)
+
+    def forward(self, x) -> torch.Tensor:
+        return self.f(x)
+
+class AudioUnembedding(nn.Module):
+    """1D convolution for audio output"""
+    def __init__(self, in_channels=320, out_channels=1, kernel_size=3) -> None:
+        super().__init__()
+        self.gn = nn.GroupNorm(32, in_channels)
+        self.f = nn.Conv1d(in_channels, out_channels, kernel_size=kernel_size, padding=kernel_size//2)
+
+    def forward(self, x) -> torch.Tensor:
+        return self.f(F.silu(self.gn(x)))
+
+
+class AudioConvResblock(nn.Module):
+    """1D residual block for audio"""
+    def __init__(self, in_features, out_features, t_dim, kernel_size=3) -> None:
+        super().__init__()
+        self.f_t = nn.Linear(t_dim, out_features * 2)
+
+        self.gn_1 = nn.GroupNorm(32, in_features)
+        self.f_1 = nn.Conv1d(in_features, out_features, kernel_size=kernel_size, padding=kernel_size//2)
+
+        self.gn_2 = nn.GroupNorm(32, out_features)
+        self.f_2 = nn.Conv1d(out_features, out_features, kernel_size=kernel_size, padding=kernel_size//2)
+
+        skip_conv = in_features != out_features
+        self.f_s = (
+            nn.Conv1d(in_features, out_features, kernel_size=1, padding=0)
+            if skip_conv
+            else nn.Identity()
+        )
+
+    def forward(self, x, t):
+        x_skip = x
+        t = self.f_t(F.silu(t))
+        t = t.chunk(2, dim=1)
+        t_1 = t[0].unsqueeze(dim=2) + 1  # [batch, channels, 1]
+        t_2 = t[1].unsqueeze(dim=2)      # [batch, channels, 1]
+
+        gn_1 = F.silu(self.gn_1(x))
+        f_1 = self.f_1(gn_1)
+
+        gn_2 = self.gn_2(f_1)
+
+        return self.f_s(x_skip) + self.f_2(F.silu(gn_2 * t_1 + t_2))
+
+class AudioDownsample(nn.Module):
+    """1D downsampling for audio"""
+    def __init__(self, in_channels, t_dim, downsample_factor=2) -> None:
+        super().__init__()
+        self.f_t = nn.Linear(t_dim, in_channels * 2)
+        self.downsample_factor = downsample_factor
+
+        self.gn_1 = nn.GroupNorm(32, in_channels)
+        self.f_1 = nn.Conv1d(in_channels, in_channels, kernel_size=3, padding=1)
+        self.gn_2 = nn.GroupNorm(32, in_channels)
+
+        self.f_2 = nn.Conv1d(in_channels, in_channels, kernel_size=3, padding=1)
+
+    def forward(self, x, t) -> torch.Tensor:
+        x_skip = x
+
+        t = self.f_t(F.silu(t))
+        t_1, t_2 = t.chunk(2, dim=1)
+        t_1 = t_1.unsqueeze(2) + 1
+        t_2 = t_2.unsqueeze(2)
+
+        gn_1 = F.silu(self.gn_1(x))
+        # 1D average pooling
+        avg_pool1d = F.avg_pool1d(gn_1, kernel_size=self.downsample_factor)
+        f_1 = self.f_1(avg_pool1d)
+        gn_2 = self.gn_2(f_1)
+
+        f_2 = self.f_2(F.silu(t_2 + (t_1 * gn_2)))
+
+        return f_2 + F.avg_pool1d(x_skip, kernel_size=self.downsample_factor)
+
+class AudioUpsample(nn.Module):
+    """1D upsampling for audio"""
+    def __init__(self, in_channels, t_dim, upsample_factor=2) -> None:
+        super().__init__()
+        self.f_t = nn.Linear(t_dim, in_channels * 2)
+        self.upsample_factor = upsample_factor
+
+        self.gn_1 = nn.GroupNorm(32, in_channels)
+        self.f_1 = nn.Conv1d(in_channels, in_channels, kernel_size=3, padding=1)
+        self.gn_2 = nn.GroupNorm(32, in_channels)
+
+        self.f_2 = nn.Conv1d(in_channels, in_channels, kernel_size=3, padding=1)
+
+    def forward(self, x, t) -> torch.Tensor:
+        x_skip = x
+
+        t = self.f_t(F.silu(t))
+        t_1, t_2 = t.chunk(2, dim=1)
+        t_1 = t_1.unsqueeze(2) + 1
+        t_2 = t_2.unsqueeze(2)
+
+        gn_1 = F.silu(self.gn_1(x))
+        # 1D interpolation upsampling
+        upsample = F.interpolate(gn_1, scale_factor=self.upsample_factor, mode='nearest')
+        f_1 = self.f_1(upsample)
+        gn_2 = self.gn_2(f_1)
+
+        f_2 = self.f_2(F.silu(t_2 + (t_1 * gn_2)))
+
+        return f_2 + F.interpolate(x_skip, scale_factor=self.upsample_factor, mode='nearest')
+
+
+@register('audio_diffusion_unet')
+class AudioDiffusionUNet(nn.Module):
+    """
+    1D UNet for audio diffusion with dynamic latent conditioning
+
+    Handles:
+    - x: [batch, 1, samples] - audio waveform (dynamic length)
+    - z_dec: [batch, 64, n_frames] - latent conditioning (dynamic length)
+    """
+
+    def __init__(
+        self,
+        in_channels=1,            # Audio channels (mono=1, stereo=2)
+        z_dec_channels=64,        # Latent conditioning channels
+        c0=128, c1=256, c2=512,   # Channel progression (smaller than image version)
+        pe_dim=320,
+        t_dim=1280,
+        kernel_size=3
+    ) -> None:
+        super().__init__()
+
+        # Store for dynamic conditioning
+        self.z_dec_channels = z_dec_channels
+
+        # Audio input embedding
+        self.embed_audio = AudioEmbedding(
+            in_channels=in_channels,
+            out_channels=c0,
+            kernel_size=kernel_size
+        )
+
+        # Time embedding
+        self.embed_time = PositionalEmbedding(pe_dim=pe_dim, out_dim=t_dim)
+
+        # Latent conditioning projection
+        if z_dec_channels is not None:
+            self.z_dec_proj = nn.Conv1d(z_dec_channels, c0, kernel_size=1)
+
+        # Downsampling path
+        down_0 = nn.ModuleList([
+            AudioConvResblock(c0, c0, t_dim, kernel_size),
+            AudioConvResblock(c0, c0, t_dim, kernel_size),
+            AudioConvResblock(c0, c0, t_dim, kernel_size),
+            AudioDownsample(c0, t_dim),
+        ])
+        down_1 = nn.ModuleList([
+            AudioConvResblock(c0, c1, t_dim, kernel_size),
+            AudioConvResblock(c1, c1, t_dim, kernel_size),
+            AudioConvResblock(c1, c1, t_dim, kernel_size),
+            AudioDownsample(c1, t_dim),
+        ])
+        down_2 = nn.ModuleList([
+            AudioConvResblock(c1, c2, t_dim, kernel_size),
+            AudioConvResblock(c2, c2, t_dim, kernel_size),
+            AudioConvResblock(c2, c2, t_dim, kernel_size),
+            AudioDownsample(c2, t_dim),
+        ])
+        down_3 = nn.ModuleList([
+            AudioConvResblock(c2, c2, t_dim, kernel_size),
+            AudioConvResblock(c2, c2, t_dim, kernel_size),
+            AudioConvResblock(c2, c2, t_dim, kernel_size),
+        ])
+        self.down = nn.ModuleList([down_0, down_1, down_2, down_3])
+
+        # Middle layers
+        self.mid = nn.ModuleList([
+            AudioConvResblock(c2, c2, t_dim, kernel_size),
+            AudioConvResblock(c2, c2, t_dim, kernel_size),
+        ])
+
+        # Upsampling path
+        up_3 = nn.ModuleList([
+            AudioConvResblock(c2 * 2, c2, t_dim, kernel_size),
+            AudioConvResblock(c2 * 2, c2, t_dim, kernel_size),
+            AudioConvResblock(c2 * 2, c2, t_dim, kernel_size),
+            AudioConvResblock(c2 * 2, c2, t_dim, kernel_size),
+            AudioUpsample(c2, t_dim),
+        ])
+        up_2 = nn.ModuleList([
+            AudioConvResblock(c2 * 2, c2, t_dim, kernel_size),
+            AudioConvResblock(c2 * 2, c2, t_dim, kernel_size),
+            AudioConvResblock(c2 * 2, c2, t_dim, kernel_size),
+            AudioConvResblock(c2 + c1, c2, t_dim, kernel_size),
+            AudioUpsample(c2, t_dim),
+        ])
+        up_1 = nn.ModuleList([
+            AudioConvResblock(c2 + c1, c1, t_dim, kernel_size),
+            AudioConvResblock(c1 * 2, c1, t_dim, kernel_size),
+            AudioConvResblock(c1 * 2, c1, t_dim, kernel_size),
+            AudioConvResblock(c0 + c1, c1, t_dim, kernel_size),
+            AudioUpsample(c1, t_dim),
+        ])
+        up_0 = nn.ModuleList([
+            AudioConvResblock(c0 + c1, c0, t_dim, kernel_size),
+            AudioConvResblock(c0 * 2, c0, t_dim, kernel_size),
+            AudioConvResblock(c0 * 2, c0, t_dim, kernel_size),
+            AudioConvResblock(c0 * 2, c0, t_dim, kernel_size),
+        ])
+        self.up = nn.ModuleList([up_0, up_1, up_2, up_3])
+
+        # Output layer
+        self.output = AudioUnembedding(in_channels=c0, out_channels=in_channels)
+
+    def get_last_layer_weight(self):
+        return self.output.f.weight
+
+    def condition_with_latents(self, x, z_dec):
+        """
+        Add latent conditioning to audio features
+
+        Args:
+            x: [batch, c0, audio_samples] - audio features
+            z_dec: [batch, 64, n_frames] - latent conditioning
+
+        Returns:
+            x: [batch, c0, audio_samples] - conditioned features
+        """
+        if z_dec is None:
+            return x
+
+        # Project latents to same channel dimension as audio features
+        z_proj = self.z_dec_proj(z_dec)  # [batch, c0, n_frames]
+
+        # Interpolate latents to match audio length
+        if z_proj.shape[-1] != x.shape[-1]:
+            z_proj = F.interpolate(
+                z_proj,
+                size=x.shape[-1],
+                mode='nearest'  # or 'linear' for smoother interpolation
+            )
+
+        # Add latent conditioning to audio features
+        return x + z_proj
+
+    def forward(self, x, t=None, z_dec=None) -> torch.Tensor:
+        """
+        Forward pass
+
+        Args:
+            x: [batch, 1, samples] - audio waveform (any length)
+            t: [batch] - diffusion timesteps
+            z_dec: [batch, 64, n_frames] - latent conditioning (any length)
+        """
+        # Embed audio input
+        x = self.embed_audio(x)  # [batch, c0, samples]
+
+        # Add latent conditioning
+        if z_dec is not None:
+            x = self.condition_with_latents(x, z_dec)
+
+        # Embed timestep
+        if t is None:
+            t = torch.zeros(x.shape[0], device=x.device)
+        t = self.embed_time(t)  # [batch, t_dim]
+
+        # Downsampling with skip connections
+        skips = [x]
+        for down in self.down:
+            for block in down:
+                x = block(x, t)
+                skips.append(x)
+
+        # Middle layers
+        for mid in self.mid:
+            x = mid(x, t)
+
+        # Upsampling with skip connections
+        for up in self.up[::-1]:
+            for block in up:
+                if isinstance(block, AudioConvResblock):
+                    x = torch.cat([x, skips.pop()], dim=1)
+                x = block(x, t)
+
+        # Output
+        return self.output(x)
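Because the network stacks three AudioDownsample stages that each halve the temporal axis, the input waveform length should be divisible by 8 so the skip connections line up on the way back up. A quick smoke-test sketch (batch size, sample count, and frame count are arbitrary example values):

import torch
from models.networks.consistency_audio_decoder_unet import AudioDiffusionUNet

net = AudioDiffusionUNet(in_channels=1, z_dec_channels=64)
x = torch.randn(2, 1, 16384)     # [batch, 1, samples], divisible by 8
t = torch.rand(2)                # diffusion timesteps in [0, 1)
z_dec = torch.randn(2, 64, 32)   # latent conditioning; interpolated to 16384 frames internally
y = net(x, t=t, z_dec=z_dec)
print(y.shape)                   # expected: torch.Size([2, 1, 16384])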
flowae/models/networks/consistency_decoder_unet.py
CHANGED
@@ -239,6 +239,7 @@ class ConsistencyDecoderUNet(nn.Module):

     def forward(self, x, t=None, z_dec=None) -> torch.Tensor:
         if z_dec is not None:
+            print('shape of x and z_dec: ', x.shape, z_dec.shape)
             if z_dec.shape[-2] != x.shape[-2] or z_dec.shape[-1] != x.shape[-1]:
                 assert x.shape[-2] // z_dec.shape[-2] == x.shape[-1] // z_dec.shape[-1]
                 z_dec = F.upsample_nearest(z_dec, scale_factor=x.shape[-2] // z_dec.shape[-2])
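A side note on the line that rescales z_dec: F.upsample_nearest has long been deprecated in PyTorch in favor of F.interpolate. If the deprecation warnings become noisy, an equivalent call (a suggestion, not part of this commit) would be:

z_dec = F.interpolate(z_dec, scale_factor=x.shape[-2] // z_dec.shape[-2], mode='nearest')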
flowae/run.sh
ADDED
@@ -0,0 +1,2 @@
+torchrun --nnodes=1 --nproc-per-node=1 run.py --config configs/experiments/dito-B-f8c4-noise-sync.yaml --save-root /mnt/nvme/dito
+torchrun --nnodes=1 --nproc-per-node=1 run.py --config configs/experiments/dito-B-audio.yaml --save-root /mnt/nvme/dito
flowae/upload.sh
ADDED
@@ -0,0 +1,2 @@
+az storage blob upload-batch \
+    --connection-string ""
|