Upload onnx_inference.py with huggingface_hub
onnx_inference.py  CHANGED  (+325 -205)

@@ -17,26 +17,78 @@ import json
-def load_audio(path: str, target_sr: int =
-    """Load audio file and resample to target sample rate."""
-        import
…
-def save_audio(audio: np.ndarray, path: str, sample_rate: int =

@@ -46,7 +98,7 @@ class SAMAudioONNXPipeline:
-        model_dir: str = "

@@ -89,6 +141,16 @@

@@ -96,17 +158,120 @@
-        """
…

@@ -186,189 +351,141 @@
…
-            return_tensors="np",
-            padding=True,
-            truncation=True,
-            max_length=77,
-        )
-            "input_ids":
-            "attention_mask":
-        return outputs[0],
-        time:
…
-        anchor_alignment: Optional[np.ndarray] = None,
-        audio_pad_mask: Optional[np.ndarray] = None,
-        """
…
-        anchor_ids
…
-        # Video features are zeros for audio-only inference
-        vision_dim = 1024
-        masked_video_features = np.zeros(
-            (batch_size, vision_dim, seq_len), dtype=np.float32
-        )
-
-        outputs = self.dit.run(
-            ["velocity"],
-            {
-                "noisy_audio": noisy_audio.astype(np.float32),
-                "time": time.astype(np.float32),
-                "audio_features": audio_features.astype(np.float32),
-                "text_features": text_features.astype(np.float32),
-                "text_mask": text_mask.astype(bool),
-                "masked_video_features": masked_video_features,
-                "anchor_ids": anchor_ids,
-                "anchor_alignment": anchor_alignment,
-                "audio_pad_mask": audio_pad_mask,
-            },
-        )
-
-    def ode_solve_midpoint(
-        self,
-        initial: np.ndarray,
-        audio_features: np.ndarray,
-        text_features: np.ndarray,
-        text_mask: np.ndarray,
-    ) -> np.ndarray:
-        """
-        Solve the ODE using midpoint method.
-
-        This implements the same midpoint solver as the PyTorch version,
-        unrolled for ONNX Runtime inference.
-
-        Args:
-            initial: Initial noisy latent (usually zeros or noise)
-            audio_features: Encoded audio features
-            text_features: Encoded text features
-            text_mask: Text attention mask
-
-        Returns:
-            Final denoised latent
-        """
-        dt = self.step_size
-        x = initial.copy()
-
-        for i in range(self.num_ode_steps):
-            t = np.array([i * dt], dtype=np.float32)
-            t_mid = np.array([t[0] + dt / 2], dtype=np.float32)
-
-            # Midpoint method: k1 = f(t, x)
-            k1 = self.dit_step(x, t, audio_features, text_features, text_mask)
-
-            # Midpoint: x_mid = x + dt/2 * k1
-            x_mid = x + (dt / 2) * k1
-
-            # k2 = f(t + dt/2, x_mid)
-            k2 = self.dit_step(x_mid, t_mid, audio_features, text_features, text_mask)
-
-            # Update: x = x + dt * k2
-            x = x + dt * k2
-
-            print(f" ODE step {i+1}/{self.num_ode_steps}")
-
-        return x
-        self,
-        audio: np.ndarray,
…
-        Perform
-            audio: Input
-            text: Text description of the source
…
-            Separated
…
-        # 1. Encode audio to latent space
…
-        # 2. Encode text
-        # 3.
…
-        return separated_audio

@@ -378,61 +495,64 @@ def main():
…
-        help="Path to input audio file",
-    )
-    parser.add_argument(
-        "--text",
-        type=str,
-        required=True,
-        help="Text description of the source to separate",
-    )
-    parser.add_argument(
-        "--output",
-        type=str,
-        default="separated.wav",
-        help="Path for output audio file",
-    )
-    parser.add_argument(
-        "--model-dir",
-        type=str,
-        default=".",
-        help="Directory containing ONNX models",
-    )
-    parser.add_argument(
-        "--device",
-        type=str,
-        default="cpu",
-        choices=["cpu", "cuda"],
-        help="Device to use for inference",
-    )
-    parser.add_argument(
-        "--ode-steps",
-        type=int,
-        default=16,
-        help="Number of ODE solver steps",
…
-        num_ode_steps=args.
…
-    print(f"Audio duration: {len(audio) / 44100:.2f} seconds")
…
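Both the removed ode_solve_midpoint helper above and the inline loop that replaces it in the new separate() implement the same explicit midpoint (RK2) update. As a generic sketch, independent of the ONNX-specific inputs:

    # one midpoint step: x <- x + dt * f(t + dt/2, x + (dt/2) * f(t, x))
    def midpoint_step(f, x, t, dt):
        k1 = f(t, x)                 # slope at the start of the step
        x_mid = x + (dt / 2.0) * k1  # half-step estimate
        k2 = f(t + dt / 2.0, x_mid)  # slope at the midpoint
        return x + dt * k2           # full step using the midpoint slope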
 17   from typing import Optional
 18
 19
 20 + def load_audio(path: str, target_sr: int = 48000) -> np.ndarray:
 21 +     """Load audio file and resample to target sample rate. Supports video files via torchaudio/librosa."""
 22 +     # Try torchaudio first as it handles video files well
 23       try:
 24 +         import torchaudio
 25 +         import torch
 26 +         wav, sr = torchaudio.load(path)
 27 +         if wav.shape[0] > 1:
 28 +             wav = wav.mean(0, keepdim=True)
 29 +         if sr != target_sr:
 30 +             resampler = torchaudio.transforms.Resample(sr, target_sr)
 31 +             wav = resampler(wav)
 32 +         return wav.squeeze().numpy().astype(np.float32)
 33 +     except Exception as e:
 34 +         # Fallback to librosa
 35 +         try:
 36 +             import librosa
 37 +             audio, sr = librosa.load(path, sr=target_sr, mono=True)
 38 +             return audio.astype(np.float32)
 39 +         except ImportError:
 40 +             raise ImportError("Please install torchaudio or librosa: pip install torchaudio librosa")
 41 +         except Exception as e2:
 42 +             raise RuntimeError(f"Failed to load audio from {path}: {e2}")
 43
 44
 45 + def save_audio(audio: np.ndarray, path: str, sample_rate: int = 48000):
 46       """Save audio to WAV file."""
 47       try:
 48           import soundfile as sf
 49 +         # Ensure audio is 1D for mono output
 50 +         if audio.ndim > 1:
 51 +             audio = audio.flatten()
 52           sf.write(path, audio, sample_rate)
 53           print(f"Saved audio to {path}")
 54       except ImportError:
 55           raise ImportError("Please install soundfile: pip install soundfile")
 56
 57
 58 + def save_video_with_audio(frames: np.ndarray, audio: np.ndarray, path: str, sample_rate: int = 48000, fps: float = 24.0):
 59 +     """Save masked video frames and separated audio to a movie file."""
 60 +     try:
 61 +         import torch
 62 +         import torchvision
 63 +         import torchaudio
 64 +
 65 +         # frames is [T, C, H, W] in 0-255 or -1 to 1?
 66 +         # load_video_frames returns [-1, 1], we want [0, 255]
 67 +         frames_uint8 = ((frames * 0.5 + 0.5) * 255).astype(np.uint8)
 68 +
 69 +         # torchvision.io.write_video expects [T, H, W, C]
 70 +         video_tensor = torch.from_numpy(frames_uint8).permute(0, 2, 3, 1)
 71 +
 72 +         # Prepare audio
 73 +         if audio.ndim == 1:
 74 +             audio = audio[None, :]  # [1, Samples]
 75 +         audio_tensor = torch.from_numpy(audio)
 76 +
 77 +         print(f"Saving merged video to {path}...")
 78 +         torchvision.io.write_video(
 79 +             path,
 80 +             video_tensor,
 81 +             fps=fps,
 82 +             video_codec="libx264",
 83 +             audio_array=audio_tensor,
 84 +             audio_fps=sample_rate,
 85 +             audio_codec="aac"
 86 +         )
 87 +         print(f" ✓ Video saved to {path}")
 88 +     except Exception as e:
 89 +         print(f"Warning: Failed to save video: {e}")
 90 +
 91 +
 92   class SAMAudioONNXPipeline:
 93       """
 94       ONNX-based SAM Audio inference pipeline.

 98
 99       def __init__(
100           self,
101 +         model_dir: str = "onnx_models",
102           device: str = "cpu",
103           num_ode_steps: int = 16,
104       ):
141           )
142           print(" ✓ DiT denoiser loaded")
143
144 +         # Load Vision Encoder if available
145 +         self.vision_encoder = None
146 +         vision_path = os.path.join(model_dir, "vision_encoder.onnx")
147 +         if os.path.exists(vision_path):
148 +             self.vision_encoder = ort.InferenceSession(
149 +                 vision_path,
150 +                 providers=providers,
151 +             )
152 +             print(" ✓ Vision encoder loaded")
153 +
154           # Load tokenizer
155           self._load_tokenizer()
156           print(" ✓ Tokenizer loaded")

158           print("All models loaded!")
159
160       def _load_tokenizer(self):
161 +         """
162 +         Load the T5 tokenizer using SentencePiece.
163 +         This avoids the dependency on the 'transformers' library.
164 +         """
165 +         try:
166 +             import sentencepiece as spm
167 +         except ImportError:
168 +             raise ImportError("Please install sentencepiece: pip install sentencepiece")
169 +
170 +         # Load the sentencepiece model file
171 +         sp_path = os.path.join(self.model_dir, "tokenizer", "spiece.model")
172 +         if not os.path.exists(sp_path):
173 +             sp_path = os.path.join(self.model_dir, "spiece.model")
174 +
175 +         if not os.path.exists(sp_path):
176 +             raise FileNotFoundError(f"SentencePiece model not found at {sp_path}")
177 +
178 +         # Create a T5-compatible tokenizer wrapper
179 +         class T5ONNXTokenizer:
180 +             def __init__(self, sp_path):
181 +                 self.sp = spm.SentencePieceProcessor()
182 +                 self.sp.load(sp_path)
183 +
184 +             def encode(self, text: str) -> np.ndarray:
185 +                 ids = self.sp.encode(text)
186 +                 if len(ids) > 0 and ids[-1] != 1:  # Ensure </s> (ID 1)
187 +                     ids.append(1)
188 +                 elif len(ids) == 0:
189 +                     ids = [1]
190 +                 return np.array(ids, dtype=np.int64).reshape(1, -1)
191 +
192 +             def decode(self, tokens: np.ndarray) -> str:
193 +                 if tokens.ndim > 1:
194 +                     tokens = tokens.flatten()
195 +                 return self.sp.decode(tokens.tolist())
196 +
197 +         self.tokenizer = T5ONNXTokenizer(sp_path)
198 +
199 +     def load_video_frames(self, path: str, num_steps: int, mask_path: Optional[str] = None) -> tuple[np.ndarray, np.ndarray, float]:
200 +         """
201 +         Load video frames and align them to audio latent steps.
202 +         Optionally applies a binary mask for visual prompting.
203 +         Returns (normalized_frames, visual_frames, fps).
204 +         """
205 +         try:
206 +             from torchcodec.decoders import VideoDecoder
207 +             import torch
208 +             import torch.nn.functional as F
209 +         except ImportError:
210 +             raise ImportError("Please install torchcodec and torch: pip install torchcodec torch")
211 +
212 +         decoder = VideoDecoder(path, dimension_order="NCHW")
213 +         all_data = decoder.get_frames_in_range(0, len(decoder))
214 +
215 +         # Audio feature steps are aligned to timestamps
216 +         # SAM Audio DACVAE: 48kHz, rates [2, 8, 10, 12] -> hop_length = 1536
217 +         hop_length = 1536
218 +         sample_rate = 48000
219 +         step_timestamps = np.arange(num_steps) * hop_length / sample_rate
220 +
221 +         # Get actual video framerate
222 +         metadata = decoder.metadata
223 +         fps = metadata.average_fps if metadata.average_fps is not None else 24.0
224 +
225 +         # Find nearest frame for each step
226 +         diffs = np.abs(all_data.pts_seconds.numpy()[:, None] - step_timestamps[None, :])
227 +         frame_idxs = np.argmin(diffs, axis=0)
228 +
229 +         frames = all_data.data[frame_idxs]  # [num_steps, 3, H, W]
230 +
231 +         # Apply mask if provided (SAM3 style masking)
232 +         if mask_path:
233 +             print(f" Applying mask from {mask_path}...")
234 +             mask_decoder = VideoDecoder(mask_path, dimension_order="NCHW")
235 +             mask_data = mask_decoder.get_frames_in_range(0, len(mask_decoder))
236 +
237 +             # Align mask frames same as video frames
238 +             m_diffs = np.abs(mask_data.pts_seconds.numpy()[:, None] - step_timestamps[None, :])
239 +             m_frame_idxs = np.argmin(m_diffs, axis=0)
240 +             masks = mask_data.data[m_frame_idxs]  # [num_steps, C, H, W]
241 +
242 +             # Convert to binary mask (any non-zero is 1)
243 +             # In SAM Audio, masking means zeroing out the object: v * (mask == 0)
244 +             binary_mask = (masks.float().mean(dim=1, keepdim=True) > 128).float()
245 +             frames = frames.float() * (1.0 - binary_mask)
246
247 +         # Resize and normalize as per PerceptionEncoder
248 +         image_size = 336
249 +         frames_resized = F.interpolate(frames.float(), size=(image_size, image_size), mode="bicubic")
250 +         frames_norm = (frames_resized / 255.0 - 0.5) / 0.5
251 +
252 +         return frames_norm.numpy(), frames_norm.numpy(), fps
253 +
254 +     def encode_video(self, frames: np.ndarray) -> np.ndarray:
255 +         """Run vision encoder on framed images."""
256 +         if self.vision_encoder is None:
257 +             raise RuntimeError("Vision encoder model not loaded")
258 +
259 +         # Vision encoder might have hardcoded batch size 1 from export
260 +         # We run it in a loop for each frame to be safe
261 +         all_features = []
262 +         for i in range(len(frames)):
263 +             frame = frames[i:i+1]  # [1, 3, H, W]
264 +             outputs = self.vision_encoder.run(
265 +                 ["vision_features"],
266 +                 {"video_frames": frame}
267 +             )
268 +             all_features.append(outputs[0])  # [1, 1024]
269 +
270 +         features = np.concatenate(all_features, axis=0)  # [N, 1024]
271 +
272 +         # DiT expects (B, 1024, T)
273 +         return features.transpose(1, 0)[None, :, :]
274 +
275
276       def encode_audio(self, audio: np.ndarray) -> np.ndarray:
277           """
351           Returns:
352               Tuple of (hidden_states, attention_mask)
353           """
354 +         input_ids = self.tokenizer.encode(text)
355 +         attention_mask = np.ones_like(input_ids)
356
357           outputs = self.t5_encoder.run(
358               ["hidden_states"],
359               {
360 +                 "input_ids": input_ids.astype(np.int64),
361 +                 "attention_mask": attention_mask.astype(np.int64),
362               },
363           )
364
365 +         return outputs[0], attention_mask
366
367       def dit_step(
368           self,
369           noisy_audio: np.ndarray,
370 +         time: float,
371           audio_features: np.ndarray,
372           text_features: np.ndarray,
373           text_mask: np.ndarray,
374 +         masked_video_features: Optional[np.ndarray] = None,
375       ) -> np.ndarray:
376 +         """Run a single DiT denoiser step."""
377 +         batch_size = noisy_audio.shape[0]
378 +         seq_len = noisy_audio.shape[1]
379 +
380 +         # Prepare placeholders for anchors if not used
381 +         # anchor_ids: <null>=0, <pad>=3. [B, 2]
382 +         anchor_ids = np.zeros((batch_size, 2), dtype=np.int64)
383 +         anchor_ids[:, 1] = 3
384 +
385 +         # anchor_alignment: 0 for active, 1 for pad. [B, T]
386 +         anchor_alignment = np.zeros((batch_size, seq_len), dtype=np.int64)
387 +
388 +         # audio_pad_mask: True/1 for valid, False/0 for pad. [B, T]
389 +         audio_pad_mask = np.ones((batch_size, seq_len), dtype=np.bool_)
390 +
391 +         # video features placeholder if not provided
392 +         if masked_video_features is None:
393 +             # Vision dimension is 1024 for small
394 +             vision_dim = 1024
395 +             masked_video_features = np.zeros((batch_size, vision_dim, seq_len), dtype=np.float32)
396
397 +         inputs = {
398 +             "noisy_audio": noisy_audio.astype(np.float32),
399 +             "time": np.array([time], dtype=np.float32),
400 +             "audio_features": audio_features.astype(np.float32),
401 +             "text_features": text_features.astype(np.float32),
402 +             "text_mask": text_mask.astype(np.bool_),
403 +             "masked_video_features": masked_video_features.astype(np.float32),
404 +             "anchor_ids": anchor_ids.astype(np.int64),
405 +             "anchor_alignment": anchor_alignment.astype(np.int64),
406 +             "audio_pad_mask": audio_pad_mask.astype(np.bool_),
407 +         }
408 +
409 +         outputs = self.dit.run(None, inputs)
410           return outputs[0]
411 +
412
413       def separate(
414 +         self,
415 +         audio: np.ndarray,
416           text: str,
417 +         video_path: Optional[str] = None,
418 +         mask_path: Optional[str] = None
419 +     ) -> tuple[np.ndarray, Optional[np.ndarray], float]:
420           """
421 +         Perform the full separation pipeline.
422
423           Args:
424 +             audio: Input mixture waveform
425 +             text: Text description of the target source
426 +             video_path: Optional path to a video for visual conditioning
427 +             mask_path: Optional path to a video/image mask for visual prompting
428
429           Returns:
430 +             Tuple of (Separated source waveform, Masked video frames if any, fps)
431           """
432 +         # 1. Encode audio to latents
433           print("1. Encoding audio...")
434 +         latent_features = self.encode_audio(audio)
435 +         # latent_features is (B, 128, T), DiT expects (B, T, 128)
436 +         latent_features = latent_features.transpose(0, 2, 1)
437 +
438 +         # Mixture features are duplicated (mixture, mixture) for conditioning
439 +         audio_features = np.concatenate([latent_features, latent_features], axis=2)
440 +         print(f" Audio latent shape: {latent_features.shape}")
441
442 +         # 2. Encode text to features
443           print("2. Encoding text...")
444           text_features, text_mask = self.encode_text(text)
445           print(f" Text features shape: {text_features.shape}")
446
447 +         # 3. Encode video if provided
448 +         masked_video_features = None
449 +         visual_frames = None
450 +         fps = 24.0
451 +         if video_path and self.vision_encoder:
452 +             print("3a. Loading and encoding video...")
453 +             norm_frames, visual_frames, fps = self.load_video_frames(video_path, latent_features.shape[1], mask_path)
454 +             masked_video_features = self.encode_video(norm_frames)  # This returns [B, 1024, T] (BCT)
455 +             print(f" Video features shape: {masked_video_features.shape}")
456 +
457 +         # 4. Run ODE solver (midpoint method)
458 +         print("3. Running ODE solver...")
459 +         # Start from random noise
460 +         # Note: audio_features is [B, T, 256], DiT output is [B, T, 256]
461 +         B, T, C = audio_features.shape
462 +         x = np.random.randn(B, T, C).astype(np.float32)
463
464 +         steps = self.num_ode_steps
465 +         dt = 1.0 / steps
466
467 +         for i in range(steps):
468 +             t = i * dt
469 +             print(f" ODE step {i+1}/{steps}", end="\r")
470 +
471 +             k1 = self.dit_step(x, t, audio_features, text_features, text_mask, masked_video_features)
472 +             x_mid = x + k1 * (dt / 2.0)
473 +             k2 = self.dit_step(x_mid, t + dt/2.0, audio_features, text_features, text_mask, masked_video_features)
474 +
475 +             x = x + k2 * dt
476
477 +         # Extract the target source (first 128 dimensions)
478 +         # The DiT model produces [B, T, 256] -> we want [B, T, 128]
479 +         separated_latent = x[:, :, :128].transpose(0, 2, 1)  # Back to [B, 128, T] for decoder
480 +         print(f"\n Separated latent shape: {separated_latent.shape}")
481 +
482
483           # 6. Decode to waveform
484           print("4. Decoding audio...")
485           separated_audio = self.decode_audio(separated_latent)
486           print(f" Output audio shape: {separated_audio.shape}")
487
488 +         return separated_audio, visual_frames, fps
489
490
491   def main():
495       parser.add_argument(
496           "--audio",
497           type=str,
498 +         help="Path to input audio file (optional if --video is provided)",
499       )
500 +     parser.add_argument("--text", type=str, default="", help="Text description of the target source (optional if --video is provided)")
501 +     parser.add_argument("--video", type=str, help="Optional path to video file for conditional separation")
502 +     parser.add_argument("--mask", type=str, help="Optional path to mask file (visual prompting)")
503 +     parser.add_argument("--output", type=str, default="separated.wav", help="Output WAV file path")
504 +     parser.add_argument("--output-video", type=str, help="Optional path to save masked video with separated audio")
505 +     parser.add_argument("--model-dir", type=str, default="onnx_models", help="Directory containing ONNX models")
506 +     parser.add_argument("--steps", type=int, default=16, help="Number of ODE solver steps")
507 +     parser.add_argument("--device", type=str, default="cuda", choices=["cpu", "cuda"], help="Inference device")
508
509       args = parser.parse_args()
510
511 +     # 0. Initialize pipeline
512       pipeline = SAMAudioONNXPipeline(
513           model_dir=args.model_dir,
514           device=args.device,
515 +         num_ode_steps=args.steps,
516       )
517
518 +     # 1. Resolve audio/video paths
519 +     if not args.audio and not args.video:
520 +         parser.error("At least one of --audio or --video must be provided.")
521
522 +     # If no text is provided but a mask is, that's a pure visual prompt
523 +     if not args.text and not args.video:
524 +         parser.error("--text is required for audio-only separation.")
525 +
526 +     audio_path = args.audio if args.audio else args.video
527 +
528 +     # 1. Load audio
529 +     print(f"\nLoading audio from: {audio_path}")
530 +     audio = load_audio(audio_path, target_sr=48000)
531 +     print(f"Audio duration: {len(audio)/48000:.2f} seconds")
532
533 +     # 3. Run separation
534 +     try:
535 +         # Separate
536 +         separated_audio, masked_frames, fps = pipeline.separate(
537 +             audio,
538 +             args.text,
539 +             video_path=args.video if args.video else None,
540 +             mask_path=args.mask
541 +         )
542 +
543 +         # Save output audio
544 +         save_audio(separated_audio, args.output, sample_rate=48000)
545 +
546 +         # Save output video if requested
547 +         if args.output_video and masked_frames is not None:
548 +             save_video_with_audio(masked_frames, separated_audio, args.output_video, sample_rate=48000, fps=fps)
549 +
550 +         print(f"\n✓ Done! Separated audio saved to {args.output}")
551 +
552 +     except Exception as e:
553 +         print(f"\nError during separation: {e}")
554 +         import traceback
555 +         traceback.print_exc()
556
557
558   if __name__ == "__main__":
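For reference, a minimal usage sketch of the updated script, based only on the interfaces visible in this diff. The file names mixture.wav / dog.wav and the prompt text are illustrative; the exported ONNX models and the spiece.model file are assumed to sit under onnx_models/ as in the new defaults.

    # Command line, matching the new argparse flags:
    #   python onnx_inference.py --audio mixture.wav --text "a dog barking" \
    #       --model-dir onnx_models --steps 16 --device cpu --output dog.wav

    # Or driving the pipeline from Python:
    from onnx_inference import SAMAudioONNXPipeline, load_audio, save_audio

    pipeline = SAMAudioONNXPipeline(model_dir="onnx_models", device="cpu", num_ode_steps=16)
    mixture = load_audio("mixture.wav", target_sr=48000)        # mono float32 at 48 kHz
    separated, frames, fps = pipeline.separate(mixture, "a dog barking")  # frames is None without a video
    save_audio(separated, "dog.wav", sample_rate=48000)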