matbee committed
Commit 820e270 · verified · 1 Parent(s): 56a0ef4

Upload onnx_inference.py with huggingface_hub

Files changed (1): onnx_inference.py +439 -0

onnx_inference.py ADDED
@@ -0,0 +1,439 @@
+ #!/usr/bin/env python3
+ """
+ SAM Audio ONNX Runtime Inference Example
+
+ This script demonstrates how to use the exported ONNX models for audio source
+ separation inference. It shows the complete pipeline from text input to
+ separated audio output.
+
+ Usage:
+     python onnx_inference.py --audio input.wav --text "a person speaking"
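+
+ Expected files in --model-dir (exactly the paths loaded in
+ SAMAudioONNXPipeline.__init__ below): dacvae_encoder.onnx,
+ dacvae_decoder.onnx, t5_encoder.onnx, dit_single_step.onnx, plus a
+ tokenizer/ directory or a tokenizer_config.json whose "model_name"
+ names the Hugging Face tokenizer to download.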
+ """
+
+ import os
+ import argparse
+ import numpy as np
+ import json
+ from typing import Optional
+
+
+ def load_audio(path: str, target_sr: int = 44100) -> np.ndarray:
+     """Load an audio file and resample it to the target sample rate."""
+     try:
+         import librosa
+     except ImportError:
+         raise ImportError("Please install librosa: pip install librosa")
+     audio, _ = librosa.load(path, sr=target_sr, mono=True)
+     return audio.astype(np.float32)
+
+
+ def save_audio(audio: np.ndarray, path: str, sample_rate: int = 44100):
+     """Save audio to a WAV file."""
+     try:
+         import soundfile as sf
+     except ImportError:
+         raise ImportError("Please install soundfile: pip install soundfile")
+     sf.write(path, audio, sample_rate)
+     print(f"Saved audio to {path}")
+
+
+ class SAMAudioONNXPipeline:
+     """
+     ONNX-based SAM Audio inference pipeline.
+
+     This class orchestrates all the ONNX models to perform audio source separation.
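+
+     Pipeline stages (see separate()): encode_audio (DACVAE encoder) ->
+     encode_text (T5) -> ode_solve_midpoint (DiT velocity field) ->
+     decode_audio (DACVAE decoder).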
+     """
+
+     def __init__(
+         self,
+         model_dir: str = ".",
+         device: str = "cpu",
+         num_ode_steps: int = 16,
+     ):
+         import onnxruntime as ort
+
+         self.model_dir = model_dir
+         self.num_ode_steps = num_ode_steps
+         self.step_size = 1.0 / num_ode_steps
+
+         # Set up ONNX Runtime providers
+         if device == "cuda":
+             providers = ["CUDAExecutionProvider", "CPUExecutionProvider"]
+         else:
+             providers = ["CPUExecutionProvider"]
+
+         # Load models
+         print("Loading ONNX models...")
+
+         self.dacvae_encoder = ort.InferenceSession(
+             os.path.join(model_dir, "dacvae_encoder.onnx"),
+             providers=providers,
+         )
+         print(" ✓ DACVAE encoder loaded")
+
+         self.dacvae_decoder = ort.InferenceSession(
+             os.path.join(model_dir, "dacvae_decoder.onnx"),
+             providers=providers,
+         )
+         print(" ✓ DACVAE decoder loaded")
+
+         self.t5_encoder = ort.InferenceSession(
+             os.path.join(model_dir, "t5_encoder.onnx"),
+             providers=providers,
+         )
+         print(" ✓ T5 encoder loaded")
+
+         self.dit = ort.InferenceSession(
+             os.path.join(model_dir, "dit_single_step.onnx"),
+             providers=providers,
+         )
+         print(" ✓ DiT denoiser loaded")
+
+         # Load tokenizer
+         self._load_tokenizer()
+         print(" ✓ Tokenizer loaded")
+
+         print("All models loaded!")
+
+     def _load_tokenizer(self):
+         """Load the T5 tokenizer."""
+         from transformers import AutoTokenizer
+
+         tokenizer_path = os.path.join(self.model_dir, "tokenizer")
+         if os.path.exists(tokenizer_path):
+             self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)
+         else:
+             # Fall back to downloading from the Hugging Face Hub
+             with open(os.path.join(self.model_dir, "tokenizer_config.json")) as f:
+                 config = json.load(f)
+             self.tokenizer = AutoTokenizer.from_pretrained(
+                 config.get("model_name", "google-t5/t5-base")
+             )
+
+     def encode_audio(self, audio: np.ndarray) -> np.ndarray:
+         """
+         Encode an audio waveform to latent features.
+
+         Args:
+             audio: Audio waveform, shape (samples,), (channels, samples),
+                 or (batch, channels, samples)
+
+         Returns:
+             Latent features, shape (1, latent_dim, time_steps)
+         """
+         # Ensure correct shape (batch, channels, samples)
+         if audio.ndim == 1:
+             audio = audio.reshape(1, 1, -1)
+         elif audio.ndim == 2:
+             audio = audio.reshape(1, *audio.shape)
+
+         outputs = self.dacvae_encoder.run(
+             ["latent_features"],
+             {"audio": audio.astype(np.float32)},
+         )
+         return outputs[0]
+
+     def decode_audio(self, latent: np.ndarray) -> np.ndarray:
+         """
+         Decode latent features to an audio waveform.
+
+         Uses chunked decoding since the DACVAE decoder was exported with
+         fixed 25 time steps. Processes in chunks and concatenates.
+
+         Args:
+             latent: Latent features, shape (1, latent_dim, time_steps)
+
+         Returns:
+             Audio waveform, shape (samples,)
+         """
+         chunk_size = 25  # DACVAE decoder's fixed number of latent time steps
+         hop_length = 1920  # Output samples per latent time step
+
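+         # NOTE (assumption): chunks are decoded independently, so if the
+         # decoder uses context across time steps, boundary artifacts may
+         # appear every chunk_size * hop_length samples.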
+         _, _, time_steps = latent.shape
+
+         audio_chunks = []
+         for start_idx in range(0, time_steps, chunk_size):
+             end_idx = min(start_idx + chunk_size, time_steps)
+             chunk = latent[:, :, start_idx:end_idx]
+
+             # Pad last chunk if needed
+             actual_size = chunk.shape[2]
+             if actual_size < chunk_size:
+                 pad_size = chunk_size - actual_size
+                 chunk = np.pad(chunk, ((0, 0), (0, 0), (0, pad_size)), mode='constant')
+
+             # Decode chunk
+             chunk_audio = self.dacvae_decoder.run(
+                 ["waveform"],
+                 {"latent_features": chunk.astype(np.float32)},
+             )[0]
+
+             # Trim padded output
+             if actual_size < chunk_size:
+                 trim_samples = actual_size * hop_length
+                 chunk_audio = chunk_audio[:, :, :trim_samples]
+
+             audio_chunks.append(chunk_audio)
+
+         # Concatenate all chunks
+         full_audio = np.concatenate(audio_chunks, axis=2)
+         return full_audio.squeeze()
+
+     def encode_text(self, text: str) -> tuple[np.ndarray, np.ndarray]:
+         """
+         Encode a text prompt to features.
+
+         Args:
+             text: Text description of the audio to separate
+
+         Returns:
+             Tuple of (hidden_states, attention_mask)
+         """
+         tokens = self.tokenizer(
+             text,
+             return_tensors="np",
+             padding=True,
+             truncation=True,
+             max_length=77,
+         )
+
+         outputs = self.t5_encoder.run(
+             ["hidden_states"],
+             {
+                 "input_ids": tokens["input_ids"].astype(np.int64),
+                 "attention_mask": tokens["attention_mask"].astype(np.int64),
+             },
+         )
+
+         return outputs[0], tokens["attention_mask"]
+
+     def dit_step(
+         self,
+         noisy_audio: np.ndarray,
+         time: np.ndarray,
+         audio_features: np.ndarray,
+         text_features: np.ndarray,
+         text_mask: np.ndarray,
+         anchor_ids: Optional[np.ndarray] = None,
+         anchor_alignment: Optional[np.ndarray] = None,
+         audio_pad_mask: Optional[np.ndarray] = None,
+     ) -> np.ndarray:
+         """
+         Run one step of the DiT denoiser.
+
+         Args:
+             noisy_audio: Current noisy latent, shape (batch, seq_len, latent_dim*2)
+             time: Current time step, shape (batch,)
+             audio_features: Encoded audio features
+             text_features: Encoded text features
+             text_mask: Text attention mask
+             anchor_ids: Optional anchor IDs
+             anchor_alignment: Optional anchor alignment
+             audio_pad_mask: Optional audio padding mask
+
+         Returns:
+             Velocity prediction for the ODE step
+         """
+         batch_size, seq_len = noisy_audio.shape[:2]
+
+         # Create default values for optional inputs
+         if anchor_ids is None:
+             anchor_ids = np.zeros((batch_size, seq_len), dtype=np.int64)
+         if anchor_alignment is None:
+             anchor_alignment = np.zeros((batch_size, seq_len), dtype=np.int64)
+         if audio_pad_mask is None:
+             audio_pad_mask = np.ones((batch_size, seq_len), dtype=bool)
+
+         # Video features are zeros for audio-only inference
+         vision_dim = 1024
+         masked_video_features = np.zeros(
+             (batch_size, vision_dim, seq_len), dtype=np.float32
+         )
+
+         outputs = self.dit.run(
+             ["velocity"],
+             {
+                 "noisy_audio": noisy_audio.astype(np.float32),
+                 "time": time.astype(np.float32),
+                 "audio_features": audio_features.astype(np.float32),
+                 "text_features": text_features.astype(np.float32),
+                 "text_mask": text_mask.astype(bool),
+                 "masked_video_features": masked_video_features,
+                 "anchor_ids": anchor_ids,
+                 "anchor_alignment": anchor_alignment,
+                 "audio_pad_mask": audio_pad_mask,
+             },
+         )
+         return outputs[0]
+
+     def ode_solve_midpoint(
+         self,
+         initial: np.ndarray,
+         audio_features: np.ndarray,
+         text_features: np.ndarray,
+         text_mask: np.ndarray,
+     ) -> np.ndarray:
+         """
+         Solve the ODE using the midpoint method.
+
+         This implements the same midpoint solver as the PyTorch version,
+         unrolled for ONNX Runtime inference.
+
+         Args:
+             initial: Initial noisy latent (standard Gaussian noise in this
+                 pipeline; see separate())
+             audio_features: Encoded audio features
+             text_features: Encoded text features
+             text_mask: Text attention mask
+
+         Returns:
+             Final denoised latent
+         """
+         dt = self.step_size
+         x = initial.copy()
+
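+         # One RK2 (midpoint) step per iteration:
+         #     x <- x + dt * f(t + dt/2, x + (dt/2) * f(t, x))
+         # where f is the DiT velocity field evaluated by dit_step().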
+         for i in range(self.num_ode_steps):
+             t = np.array([i * dt], dtype=np.float32)
+             t_mid = np.array([t[0] + dt / 2], dtype=np.float32)
+
+             # Midpoint method: k1 = f(t, x)
+             k1 = self.dit_step(x, t, audio_features, text_features, text_mask)
+
+             # Midpoint: x_mid = x + dt/2 * k1
+             x_mid = x + (dt / 2) * k1
+
+             # k2 = f(t + dt/2, x_mid)
+             k2 = self.dit_step(x_mid, t_mid, audio_features, text_features, text_mask)
+
+             # Update: x = x + dt * k2
+             x = x + dt * k2
+
+             print(f" ODE step {i+1}/{self.num_ode_steps}")
+
+         return x
+
+     def separate(
+         self,
+         audio: np.ndarray,
+         text: str,
+         sample_rate: int = 44100,
+     ) -> np.ndarray:
+         """
+         Perform audio source separation.
+
+         Args:
+             audio: Input audio waveform at 44.1 kHz
+             text: Text description of the source to separate
+             sample_rate: Sample rate of the input audio (currently unused)
+
+         Returns:
+             Separated audio waveform
+         """
+         print(f"\nSeparating: '{text}'")
+
+         # 1. Encode audio to latent space
+         print("1. Encoding audio...")
+         audio_latent = self.encode_audio(audio)
+         print(f" Audio latent shape: {audio_latent.shape}")
+
+         # 2. Encode text
+         print("2. Encoding text...")
+         text_features, text_mask = self.encode_text(text)
+         print(f" Text features shape: {text_features.shape}")
+
+         # 3. Prepare initial state and audio features.
+         # SAMAudio._get_audio_features returns
+         # torch.cat([audio_features, audio_features], dim=2).
+         batch_size, latent_dim, time_steps = audio_latent.shape
+         mixture_features = audio_latent.transpose(0, 2, 1)  # (B, T, C=128)
+
+         # The audio features are the mixture latent duplicated
+         # (not [mixture, zeros]!)
+         audio_features = np.concatenate([
+             mixture_features,  # Mixture latent
+             mixture_features,  # Mixture latent (duplicate)
+         ], axis=-1)  # -> (B, T, 256)
+
+         # Initial state is random noise for ODE solving from t=0 to t=1
+         initial = np.random.randn(batch_size, time_steps, latent_dim * 2).astype(np.float32)
+
+         # 4. Run ODE solver
+         print("3. Running ODE solver...")
+         result = self.ode_solve_midpoint(
+             initial, audio_features, text_features, text_mask
+         )
+
+         # 5. Extract the separated (target) latent.
+         # SAMAudio convention: the target occupies the first latent_dim
+         # channels, the residual the second latent_dim channels.
+         separated_latent = result[:, :, :latent_dim].transpose(0, 2, 1)  # (B, C, T)
+         print(f" Separated latent shape: {separated_latent.shape}")
+
+         # 6. Decode to waveform
+         print("4. Decoding audio...")
+         separated_audio = self.decode_audio(separated_latent)
+         print(f" Output audio shape: {separated_audio.shape}")
+
+         return separated_audio
+
+
+ def main():
+     parser = argparse.ArgumentParser(
+         description="SAM Audio ONNX Runtime Inference"
+     )
+     parser.add_argument(
+         "--audio",
+         type=str,
+         required=True,
+         help="Path to input audio file",
+     )
+     parser.add_argument(
+         "--text",
+         type=str,
+         required=True,
+         help="Text description of the source to separate",
+     )
+     parser.add_argument(
+         "--output",
+         type=str,
+         default="separated.wav",
+         help="Path for output audio file",
+     )
+     parser.add_argument(
+         "--model-dir",
+         type=str,
+         default=".",
+         help="Directory containing ONNX models",
+     )
+     parser.add_argument(
+         "--device",
+         type=str,
+         default="cpu",
+         choices=["cpu", "cuda"],
+         help="Device to use for inference",
+     )
+     parser.add_argument(
+         "--ode-steps",
+         type=int,
+         default=16,
+         help="Number of ODE solver steps",
+     )
+
+     args = parser.parse_args()
+
+     # Load pipeline
+     pipeline = SAMAudioONNXPipeline(
+         model_dir=args.model_dir,
+         device=args.device,
+         num_ode_steps=args.ode_steps,
+     )
+
+     # Load input audio
+     print(f"\nLoading audio: {args.audio}")
+     audio = load_audio(args.audio, target_sr=44100)
+     print(f"Audio duration: {len(audio) / 44100:.2f} seconds")
+
+     # Run separation
+     separated = pipeline.separate(audio, args.text)
+
+     # Save output
+     save_audio(separated, args.output, sample_rate=44100)
+     print(f"\n✓ Done! Separated audio saved to {args.output}")
+
+
+ if __name__ == "__main__":
+     main()
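
For reference, a minimal programmatic usage sketch built only from the API
defined in this file (the model directory and audio file names here are
hypothetical placeholders):

    from onnx_inference import SAMAudioONNXPipeline, load_audio, save_audio

    # Load the four ONNX models plus the tokenizer from ./models
    pipeline = SAMAudioONNXPipeline(model_dir="models", num_ode_steps=16)

    # 44.1 kHz mono mixture in, text-prompted separation out
    mixture = load_audio("mixture.wav", target_sr=44100)
    voice = pipeline.separate(mixture, "a person speaking")
    save_audio(voice, "voice.wav", sample_rate=44100)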