Upload folder using huggingface_hub

Browse files

Files changed (7) hide show

onnx_export/__init__.py +1 -0
onnx_export/export_all.py +115 -0
onnx_export/export_dacvae.py +425 -0
onnx_export/export_dit.py +543 -0
onnx_export/export_peaframe.py +288 -0
onnx_export/export_t5.py +315 -0
onnx_export/standalone_config.py +116 -0

onnx_export/__init__.py ADDED Viewed

	@@ -0,0 +1 @@


1	+ # ONNX Export utilities for SAM Audio

onnx_export/export_all.py ADDED Viewed

	@@ -0,0 +1,115 @@

+#!/usr/bin/env python3
+"""
+Export all SAM Audio components to ONNX format.
+This script exports:
+1. DACVAE encoder and decoder (audio codec)
+2. T5 text encoder
+3. DiT transformer (single-step for ODE solving)
+Usage:
+    python -m onnx_export.export_all --output-dir onnx_models --verify
+"""
+import os
+import argparse
+import subprocess
+import sys
+def run_export(module: str, args: list[str]) -> bool:
+    """Run an export module with the given arguments."""
+    cmd = [sys.executable, "-m", module] + args
+    print(f"\n{'='*60}")
+    print(f"Running: {' '.join(cmd)}")
+    print(f"{'='*60}\n")
+    result = subprocess.run(cmd, cwd=os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+    return result.returncode == 0
+def main():
+    parser = argparse.ArgumentParser(description="Export all SAM Audio components to ONNX")
+    parser.add_argument(
+        "--output-dir",
+        type=str,
+        default="onnx_models",
+        help="Output directory for ONNX models",
+    )
+    parser.add_argument(
+        "--verify",
+        action="store_true",
+        help="Verify ONNX output matches PyTorch",
+    )
+    parser.add_argument(
+        "--skip-dacvae",
+        action="store_true",
+        help="Skip DACVAE export",
+    )
+    parser.add_argument(
+        "--skip-t5",
+        action="store_true",
+        help="Skip T5 export",
+    )
+    parser.add_argument(
+        "--skip-dit",
+        action="store_true",
+        help="Skip DiT export",
+    )
+    args = parser.parse_args()
+    os.makedirs(args.output_dir, exist_ok=True)
+    results = {}
+    # Export DACVAE
+    if not args.skip_dacvae:
+        export_args = ["--output-dir", args.output_dir]
+        if args.verify:
+            export_args.append("--verify")
+        results["DACVAE"] = run_export("onnx_export.export_dacvae", export_args)
+    # Export T5
+    if not args.skip_t5:
+        export_args = ["--output-dir", args.output_dir]
+        if args.verify:
+            export_args.append("--verify")
+        results["T5"] = run_export("onnx_export.export_t5", export_args)
+    # Export DiT
+    if not args.skip_dit:
+        export_args = ["--output-dir", args.output_dir]
+        if args.verify:
+            export_args.append("--verify")
+        results["DiT"] = run_export("onnx_export.export_dit", export_args)
+    # Print summary
+    print(f"\n{'='*60}")
+    print("Export Summary")
+    print(f"{'='*60}")
+    all_success = True
+    for name, success in results.items():
+        status = "✓" if success else "✗"
+        print(f"  {status} {name}")
+        if not success:
+            all_success = False
+    # List exported files
+    print(f"\nExported files in {args.output_dir}:")
+    for f in sorted(os.listdir(args.output_dir)):
+        path = os.path.join(args.output_dir, f)
+        if os.path.isfile(path):
+            size_mb = os.path.getsize(path) / (1024 * 1024)
+            print(f"  {f}: {size_mb:.1f} MB")
+    if all_success:
+        print("\n✓ All exports completed successfully!")
+    else:
+        print("\n✗ Some exports failed")
+        sys.exit(1)
+if __name__ == "__main__":
+    main()

onnx_export/export_dacvae.py ADDED Viewed

	@@ -0,0 +1,425 @@

+#!/usr/bin/env python3
+"""
+Export DACVAE (audio codec) to ONNX format.
+This exports the encoder and decoder separately:
+- Encoder: audio waveform → latent features
+- Decoder: latent features → audio waveform
+Usage:
+    python -m onnx_export.export_dacvae --output-dir onnx_models --verify
+"""
+import os
+import argparse
+import torch
+import torch.nn as nn
+import dacvae
+from huggingface_hub import hf_hub_download
+# Default DACVAE configuration (matches SAM Audio)
+# Every entry is forwarded as a keyword argument to dacvae.DACVAE in
+# create_dacvae_model().
+DEFAULT_CONFIG = {
+    "encoder_dim": 64,
+    # The product of these rates (2 * 8 * 10 * 12 = 1920) is used throughout
+    # this file as the hop length (audio samples per latent time step).
+    "encoder_rates": [2, 8, 10, 12],
+    "latent_dim": 1024,
+    "decoder_dim": 1536,
+    "decoder_rates": [12, 10, 8, 2],
+    "n_codebooks": 16,
+    "codebook_size": 1024,
+    # NOTE(review): the wrappers in this file work with 128-channel latents,
+    # presumably this value — confirm against the dacvae library.
+    "codebook_dim": 128,
+    "quantizer_dropout": False,
+    # Hz; also used to size the one-second dummy inputs for export.
+    "sample_rate": 48000,
+}
+class DACVAEEncoderWrapper(nn.Module):
+    """Wrapper for DACVAE encoder that outputs continuous latent features."""
+    def __init__(self, encoder, quantizer):
+        super().__init__()
+        self.encoder = encoder
+        self.in_proj = quantizer.in_proj
+    def forward(self, audio: torch.Tensor) -> torch.Tensor:
+        """
+        Encode audio to latent features.
+        Args:
+            audio: Input waveform, shape (batch, 1, samples)
+        Returns:
+            latent_features: Continuous latent mean, shape (batch, 128, time_steps)
+        """
+        x = self.encoder(audio)
+        # in_proj outputs 256 dim, chunk into mean and variance, use only mean
+        mean, _ = self.in_proj(x).chunk(2, dim=1)
+        return mean
+class DACVAEDecoderWrapper(nn.Module):
+    """Wrapper for DACVAE decoder that takes continuous latent features."""
+    def __init__(self, decoder, quantizer):
+        super().__init__()
+        self.decoder = decoder
+        self.out_proj = quantizer.out_proj
+    def forward(self, latent_features: torch.Tensor) -> torch.Tensor:
+        """
+        Decode latent features to audio.
+        Args:
+            latent_features: Continuous latent, shape (batch, 128, time_steps)
+        Returns:
+            audio: Output waveform, shape (batch, 1, samples)
+        """
+        x = self.out_proj(latent_features)
+        return self.decoder(x)
+def create_dacvae_model(model_id: str = "facebook/sam-audio-small") -> dacvae.DACVAE:
+    """
+    Create and load DACVAE model with weights from SAM Audio checkpoint.
+    This uses the standalone dacvae library, avoiding loading the full SAM Audio
+    model and its dependencies (vision encoder, imagebind, etc).
+    """
+    print(f"Creating DACVAE model...")
+    model = dacvae.DACVAE(
+        encoder_dim=DEFAULT_CONFIG["encoder_dim"],
+        encoder_rates=DEFAULT_CONFIG["encoder_rates"],
+        latent_dim=DEFAULT_CONFIG["latent_dim"],
+        decoder_dim=DEFAULT_CONFIG["decoder_dim"],
+        decoder_rates=DEFAULT_CONFIG["decoder_rates"],
+        n_codebooks=DEFAULT_CONFIG["n_codebooks"],
+        codebook_size=DEFAULT_CONFIG["codebook_size"],
+        codebook_dim=DEFAULT_CONFIG["codebook_dim"],
+        quantizer_dropout=DEFAULT_CONFIG["quantizer_dropout"],
+        sample_rate=DEFAULT_CONFIG["sample_rate"],
+    ).eval()
+    # Load weights from SAM Audio checkpoint
+    print(f"Downloading checkpoint from {model_id}...")
+    checkpoint_path = hf_hub_download(
+        repo_id=model_id,
+        filename="checkpoint.pt",
+    )
+    print("Loading DACVAE weights from checkpoint...")
+    state_dict = torch.load(
+        checkpoint_path,
+        map_location="cpu",
+        weights_only=True,
+        mmap=True,  # Memory-efficient loading
+    )
+    # Extract only DACVAE weights (prefixed with "audio_codec.")
+    dacvae_state_dict = {}
+    for k, v in state_dict.items():
+        if k.startswith("audio_codec."):
+            new_key = k.replace("audio_codec.", "")
+            dacvae_state_dict[new_key] = v.clone()
+    # Load weights
+    model.load_state_dict(dacvae_state_dict, strict=False)
+    # Clear large checkpoint from memory
+    del state_dict
+    print(f"  ✓ Loaded {len(dacvae_state_dict)} DACVAE weight tensors")
+    # Calculate hop_length for reference
+    import numpy as np
+    hop_length = int(np.prod(DEFAULT_CONFIG["encoder_rates"]))
+    model.hop_length = hop_length
+    model.sample_rate = DEFAULT_CONFIG["sample_rate"]
+    return model
+def export_encoder(
+    dacvae_model: dacvae.DACVAE,
+    output_path: str,
+    opset_version: int = 18,
+    device: str = "cpu",
+) -> None:
+    """Export DACVAE encoder to ONNX."""
+    print(f"Exporting DACVAE encoder to {output_path}...")
+    wrapper = DACVAEEncoderWrapper(
+        dacvae_model.encoder,
+        dacvae_model.quantizer
+    ).eval().to(device)
+    # Sample input: 1 second of audio at 48kHz
+    sample_rate = DEFAULT_CONFIG["sample_rate"]
+    dummy_audio = torch.randn(1, 1, sample_rate, device=device)
+    torch.onnx.export(
+        wrapper,
+        (dummy_audio,),
+        output_path,
+        input_names=["audio"],
+        output_names=["latent_features"],
+        dynamic_axes={
+            "audio": {0: "batch", 2: "samples"},
+            "latent_features": {0: "batch", 2: "time_steps"},
+        },
+        opset_version=opset_version,
+        do_constant_folding=True,
+        dynamo=True,
+        external_data=True,
+    )
+    print(f"  ✓ Encoder exported successfully")
+    # Validate
+    import onnx
+    model = onnx.load(output_path)
+    onnx.checker.check_model(model)
+    print(f"  ✓ ONNX model validation passed")
+def export_decoder(
+    dacvae_model: dacvae.DACVAE,
+    output_path: str,
+    opset_version: int = 18,
+    device: str = "cpu",
+) -> None:
+    """Export DACVAE decoder to ONNX."""
+    print(f"Exporting DACVAE decoder to {output_path}...")
+    wrapper = DACVAEDecoderWrapper(
+        dacvae_model.decoder,
+        dacvae_model.quantizer
+    ).eval().to(device)
+    # Sample input: 25 time steps (1 second at 48kHz with hop_length=1920)
+    hop_length = int(__import__("numpy").prod(DEFAULT_CONFIG["encoder_rates"]))
+    time_steps = DEFAULT_CONFIG["sample_rate"] // hop_length
+    dummy_latent = torch.randn(1, 128, time_steps, device=device)
+    torch.onnx.export(
+        wrapper,
+        (dummy_latent,),
+        output_path,
+        input_names=["latent_features"],
+        output_names=["waveform"],
+        dynamic_axes={
+            "latent_features": {0: "batch", 2: "time_steps"},
+            "waveform": {0: "batch", 2: "samples"},
+        },
+        opset_version=opset_version,
+        do_constant_folding=True,
+        dynamo=True,
+        external_data=True,
+    )
+    print(f"  ✓ Decoder exported successfully")
+    # Validate
+    import onnx
+    model = onnx.load(output_path)
+    onnx.checker.check_model(model)
+    print(f"  ✓ ONNX model validation passed")
+def verify_encoder(
+    dacvae_model: dacvae.DACVAE,
+    onnx_path: str,
+    device: str = "cpu",
+    tolerance: float = 1e-4,
+) -> bool:
+    """Verify ONNX encoder output matches PyTorch."""
+    import onnxruntime as ort
+    import numpy as np
+    print("Verifying encoder output...")
+    wrapper = DACVAEEncoderWrapper(
+        dacvae_model.encoder,
+        dacvae_model.quantizer
+    ).eval().to(device)
+    # Test with random audio
+    sample_rate = DEFAULT_CONFIG["sample_rate"]
+    test_audio = torch.randn(1, 1, sample_rate * 2, device=device)  # 2 seconds
+    # PyTorch output
+    with torch.no_grad():
+        pytorch_output = wrapper(test_audio).cpu().numpy()
+    # ONNX Runtime output
+    sess = ort.InferenceSession(onnx_path, providers=["CPUExecutionProvider"])
+    onnx_output = sess.run(
+        ["latent_features"],
+        {"audio": test_audio.cpu().numpy()}
+    )[0]
+    # Compare
+    max_diff = np.abs(pytorch_output - onnx_output).max()
+    mean_diff = np.abs(pytorch_output - onnx_output).mean()
+    print(f"  Max diff: {max_diff:.2e}, Mean diff: {mean_diff:.2e}")
+    if max_diff > tolerance:
+        print(f"  ✗ Verification failed (tolerance: {tolerance})")
+        return False
+    print(f"  ✓ Verification passed (tolerance: {tolerance})")
+    return True
+def verify_decoder(
+    dacvae_model: dacvae.DACVAE,
+    onnx_path: str,
+    device: str = "cpu",
+    tolerance: float = 1e-3,
+) -> bool:
+    """Verify ONNX decoder output matches PyTorch."""
+    import onnxruntime as ort
+    import numpy as np
+    print("Verifying decoder output...")
+    wrapper = DACVAEDecoderWrapper(
+        dacvae_model.decoder,
+        dacvae_model.quantizer
+    ).eval().to(device)
+    # Test with random latent
+    hop_length = int(np.prod(DEFAULT_CONFIG["encoder_rates"]))
+    time_steps = DEFAULT_CONFIG["sample_rate"] // hop_length  # 25 steps = 1 second
+    test_latent = torch.randn(1, 128, time_steps, device=device)
+    # PyTorch output
+    with torch.no_grad():
+        pytorch_output = wrapper(test_latent).cpu().numpy()
+    # ONNX Runtime output
+    sess = ort.InferenceSession(onnx_path, providers=["CPUExecutionProvider"])
+    onnx_output = sess.run(
+        ["waveform"],
+        {"latent_features": test_latent.cpu().numpy()}
+    )[0]
+    # Compare
+    max_diff = np.abs(pytorch_output - onnx_output).max()
+    mean_diff = np.abs(pytorch_output - onnx_output).mean()
+    print(f"  Max diff: {max_diff:.2e}, Mean diff: {mean_diff:.2e}")
+    if max_diff > tolerance:
+        print(f"  ✗ Verification failed (tolerance: {tolerance})")
+        return False
+    print(f"  ✓ Verification passed (tolerance: {tolerance})")
+    return True
+def main():
+    parser = argparse.ArgumentParser(description="Export DACVAE to ONNX")
+    parser.add_argument(
+        "--model-id",
+        type=str,
+        default="facebook/sam-audio-small",
+        help="HuggingFace model ID (default: facebook/sam-audio-small)",
+    )
+    parser.add_argument(
+        "--output-dir",
+        type=str,
+        default="onnx_models",
+        help="Output directory for ONNX models",
+    )
+    parser.add_argument(
+        "--opset-version",
+        type=int,
+        default=18,
+        help="ONNX opset version (default: 18)",
+    )
+    parser.add_argument(
+        "--device",
+        type=str,
+        default="cpu",
+        help="Device to use for export (default: cpu)",
+    )
+    parser.add_argument(
+        "--verify",
+        action="store_true",
+        help="Verify ONNX output matches PyTorch",
+    )
+    parser.add_argument(
+        "--tolerance",
+        type=float,
+        default=1e-4,
+        help="Tolerance for verification (default: 1e-4)",
+    )
+    parser.add_argument(
+        "--encoder-only",
+        action="store_true",
+        help="Export only the encoder",
+    )
+    parser.add_argument(
+        "--decoder-only",
+        action="store_true",
+        help="Export only the decoder",
+    )
+    args = parser.parse_args()
+    # Create output directory
+    os.makedirs(args.output_dir, exist_ok=True)
+    # Load model
+    dacvae_model = create_dacvae_model(args.model_id)
+    print(f"\nDACVAE Configuration:")
+    print(f"  Model: {args.model_id}")
+    print(f"  Sample rate: {DEFAULT_CONFIG['sample_rate']} Hz")
+    print(f"  Hop length: {int(__import__('numpy').prod(DEFAULT_CONFIG['encoder_rates']))}")
+    print(f"  Latent dim: 128 (continuous)")
+    # Export encoder
+    if not args.decoder_only:
+        encoder_path = os.path.join(args.output_dir, "dacvae_encoder.onnx")
+        export_encoder(
+            dacvae_model,
+            encoder_path,
+            opset_version=args.opset_version,
+            device=args.device,
+        )
+        if args.verify:
+            verify_encoder(
+                dacvae_model,
+                encoder_path,
+                device=args.device,
+                tolerance=args.tolerance,
+            )
+    # Export decoder
+    if not args.encoder_only:
+        decoder_path = os.path.join(args.output_dir, "dacvae_decoder.onnx")
+        export_decoder(
+            dacvae_model,
+            decoder_path,
+            opset_version=args.opset_version,
+            device=args.device,
+        )
+        if args.verify:
+            verify_decoder(
+                dacvae_model,
+                decoder_path,
+                device=args.device,
+                tolerance=args.tolerance * 10,  # Decoder has higher tolerance
+            )
+    print(f"\n✓ Export complete! Models saved to {args.output_dir}/")
+if __name__ == "__main__":
+    main()

onnx_export/export_dit.py ADDED Viewed

	@@ -0,0 +1,543 @@

+#!/usr/bin/env python3
+"""
+Export DiT Transformer with unrolled ODE solver to ONNX format.
+The DiT transformer is the core denoising model in SAM Audio. It uses a flow-based
+generative model with an ODE solver. For ONNX export, we unroll the fixed-step
+midpoint ODE solver into a static computation graph.
+The default configuration uses:
+- method: "midpoint"
+- step_size: 2/32 (0.0625)
+- integration range: [0, 1]
+- total steps: 16
+This creates a single ONNX model that performs the complete denoising process,
+taking noise and conditioning as input and producing denoised audio features.
+Usage:
+    python -m onnx_export.export_dit --output-dir onnx_models --verify
+"""
+import os
+import math
+import argparse
+import torch
+import torch.nn as nn
+from typing import Optional
+class SinusoidalEmbedding(nn.Module):
+    """Sinusoidal timestep embedding (identical to SAMAudio implementation)."""
+    def __init__(self, dim, theta=10000):
+        super().__init__()
+        assert (dim % 2) == 0
+        half_dim = dim // 2
+        inv_freq = torch.exp(
+            -math.log(theta) * torch.arange(half_dim).float() / half_dim
+        )
+        self.register_buffer("inv_freq", inv_freq, persistent=False)
+    def forward(self, x, pos=None):
+        if pos is None:
+            seq_len, device = x.shape[1], x.device
+            pos = torch.arange(seq_len, device=device)
+        emb = torch.einsum("i, j -> i j", pos, self.inv_freq)
+        emb = torch.cat((emb.cos(), emb.sin()), dim=-1)
+        return emb
+class EmbedAnchors(nn.Module):
+    """Anchor embedding (identical to SAMAudio implementation)."""
+    def __init__(self, num_embeddings: int, embedding_dim: int, out_dim: int):
+        super().__init__()
+        self.embed = nn.Embedding(
+            num_embeddings + 1, embedding_dim, padding_idx=num_embeddings
+        )
+        self.gate = nn.Parameter(torch.tensor([0.0]))
+        self.proj = nn.Linear(embedding_dim, out_dim, bias=False)
+    def forward(
+        self,
+        x: torch.Tensor,
+        anchor_ids: Optional[torch.Tensor] = None,
+        anchor_alignment: Optional[torch.Tensor] = None,
+    ):
+        if anchor_ids is None:
+            return x
+        embs = self.embed(anchor_ids.gather(1, anchor_alignment))
+        proj = self.proj(embs)
+        return x + self.gate.tanh() * proj
+class DiTSingleStepWrapper(nn.Module):
+    """
+    Wrapper for DiT that performs a single forward pass (one ODE evaluation).
+    This mirrors the SAMAudio.forward() method exactly.
+    """
+    def __init__(
+        self,
+        transformer: nn.Module,
+        proj: nn.Module,
+        align_masked_video: nn.Module,
+        embed_anchors: nn.Module,
+        timestep_emb: nn.Module,
+        memory_proj: nn.Module,
+    ):
+        super().__init__()
+        self.transformer = transformer
+        self.proj = proj
+        self.align_masked_video = align_masked_video
+        self.embed_anchors = embed_anchors
+        self.timestep_emb = timestep_emb
+        self.memory_proj = memory_proj
+    def forward(
+        self,
+        noisy_audio: torch.Tensor,
+        time: torch.Tensor,
+        audio_features: torch.Tensor,
+        text_features: torch.Tensor,
+        text_mask: torch.Tensor,
+        masked_video_features: torch.Tensor,
+        anchor_ids: torch.Tensor,
+        anchor_alignment: torch.Tensor,
+        audio_pad_mask: torch.Tensor,
+    ) -> torch.Tensor:
+        """
+        Single forward pass of the DiT (one ODE function evaluation).
+        This exactly mirrors SAMAudio.forward() method.
+        """
+        # Align inputs (concatenate noisy_audio with audio_features)
+        # Same as SAMAudio.align_inputs()
+        x = torch.cat(
+            [
+                noisy_audio,
+                torch.zeros_like(audio_features),
+                audio_features,
+            ],
+            dim=2,
+        )
+        projected = self.proj(x)
+        aligned = self.align_masked_video(projected, masked_video_features)
+        aligned = self.embed_anchors(aligned, anchor_ids, anchor_alignment)
+        # Timestep embedding and memory
+        # Same as SAMAudio.forward()
+        timestep_emb_val = self.timestep_emb(time, pos=time).unsqueeze(1)
+        memory = self.memory_proj(text_features) + timestep_emb_val
+        # Transformer forward
+        output = self.transformer(
+            aligned,
+            time,
+            padding_mask=audio_pad_mask,
+            memory=memory,
+            memory_padding_mask=text_mask,
+        )
+        return output
+class UnrolledDiTWrapper(nn.Module):
+    """
+    DiT wrapper with unrolled midpoint ODE solver.
+    The midpoint method computes:
+        k1 = f(t, y)
+        k2 = f(t + h/2, y + h/2 * k1)
+        y_new = y + h * k2
+    With step_size=0.0625 and range [0,1], we have 16 steps.
+    """
+    def __init__(
+        self,
+        single_step: DiTSingleStepWrapper,
+        num_steps: int = 16,
+    ):
+        super().__init__()
+        self.single_step = single_step
+        self.num_steps = num_steps
+        self.step_size = 1.0 / num_steps
+    def forward(
+        self,
+        noise: torch.Tensor,
+        audio_features: torch.Tensor,
+        text_features: torch.Tensor,
+        text_mask: torch.Tensor,
+        masked_video_features: torch.Tensor,
+        anchor_ids: torch.Tensor,
+        anchor_alignment: torch.Tensor,
+        audio_pad_mask: torch.Tensor,
+    ) -> torch.Tensor:
+        """Complete denoising using unrolled midpoint ODE solver."""
+        B = noise.shape[0]
+        h = self.step_size
+        y = noise
+        t = torch.zeros(B, device=noise.device, dtype=noise.dtype)
+        for step in range(self.num_steps):
+            # k1 = f(t, y)
+            k1 = self.single_step(
+                y, t,
+                audio_features, text_features, text_mask,
+                masked_video_features, anchor_ids, anchor_alignment, audio_pad_mask
+            )
+            # k2 = f(t + h/2, y + h/2 * k1)
+            t_mid = t + h / 2
+            y_mid = y + (h / 2) * k1
+            k2 = self.single_step(
+                y_mid, t_mid,
+                audio_features, text_features, text_mask,
+                masked_video_features, anchor_ids, anchor_alignment, audio_pad_mask
+            )
+            # y = y + h * k2
+            y = y + h * k2
+            t = t + h
+        return y
+def load_sam_audio_components(model_id: str = "facebook/sam-audio-small", device: str = "cpu"):
+    """
+    Load SAM Audio components needed for DiT export.
+    Since we can't load the full SAMAudio model (missing perception_models),
+    we construct the components directly and load weights from checkpoint.
+    """
+    import json
+    import sys
+    import types
+    import importlib.util
+    from huggingface_hub import hf_hub_download
+    print(f"Loading SAM Audio components from {model_id}...")
+    # Download config
+    config_path = hf_hub_download(repo_id=model_id, filename="config.json")
+    with open(config_path) as f:
+        config = json.load(f)
+    # Download checkpoint
+    checkpoint_path = hf_hub_download(repo_id=model_id, filename="checkpoint.pt")
+    # Use our standalone config that doesn't have 'core' dependencies
+    from onnx_export.standalone_config import TransformerConfig
+    sam_audio_path = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
+    # Create fake module hierarchy so transformer.py's relative imports work
+    if 'sam_audio' not in sys.modules:
+        sam_audio_pkg = types.ModuleType('sam_audio')
+        sam_audio_pkg.__path__ = [os.path.join(sam_audio_path, 'sam_audio')]
+        sys.modules['sam_audio'] = sam_audio_pkg
+    if 'sam_audio.model' not in sys.modules:
+        model_pkg = types.ModuleType('sam_audio.model')
+        model_pkg.__path__ = [os.path.join(sam_audio_path, 'sam_audio', 'model')]
+        sys.modules['sam_audio.model'] = model_pkg
+    # Register our standalone config as sam_audio.model.config
+    if 'sam_audio.model.config' not in sys.modules:
+        import onnx_export.standalone_config as standalone_config
+        sys.modules['sam_audio.model.config'] = standalone_config
+    # Now import transformer module - it will use our standalone config
+    transformer_spec = importlib.util.spec_from_file_location(
+        "sam_audio.model.transformer",
+        os.path.join(sam_audio_path, "sam_audio", "model", "transformer.py")
+    )
+    transformer_module = importlib.util.module_from_spec(transformer_spec)
+    sys.modules['sam_audio.model.transformer'] = transformer_module
+    transformer_spec.loader.exec_module(transformer_module)
+    DiT = transformer_module.DiT
+    # Import align module
+    align_spec = importlib.util.spec_from_file_location(
+        "sam_audio.model.align",
+        os.path.join(sam_audio_path, "sam_audio", "model", "align.py")
+    )
+    align_module = importlib.util.module_from_spec(align_spec)
+    sys.modules['sam_audio.model.align'] = align_module
+    align_spec.loader.exec_module(align_module)
+    AlignModalities = align_module.AlignModalities
+    # Create transformer
+    transformer_config = TransformerConfig(**config.get("transformer", {}))
+    transformer = DiT(transformer_config)
+    # Calculate dimensions
+    in_channels = config.get("in_channels", 768)
+    num_anchors = config.get("num_anchors", 3)
+    anchor_embedding_dim = config.get("anchor_embedding_dim", 128)
+    # Get vision encoder dim for align_masked_video
+    vision_config = config.get("vision_encoder", {})
+    vision_dim = vision_config.get("dim", 768)
+    # Create components exactly as SAMAudio does
+    proj = nn.Linear(in_channels, transformer_config.d_model)
+    align_masked_video = AlignModalities(vision_dim, transformer_config.d_model)
+    embed_anchors = EmbedAnchors(num_anchors, anchor_embedding_dim, transformer_config.d_model)
+    timestep_emb = SinusoidalEmbedding(transformer_config.d_model)
+    # Memory projection for text features
+    text_encoder_config = config.get("text_encoder", {})
+    text_encoder_dim = text_encoder_config.get("dim", 1024)  # google/flan-t5-large
+    memory_proj = nn.Linear(text_encoder_dim, transformer_config.d_model)
+    # Load weights from checkpoint
+    print("Loading weights from checkpoint...")
+    state_dict = torch.load(checkpoint_path, map_location="cpu", mmap=True)
+    # Filter and load weights for each component
+    transformer_state = {}
+    proj_state = {}
+    align_state = {}
+    embed_anchors_state = {}
+    memory_proj_state = {}
+    for key, value in state_dict.items():
+        if key.startswith("transformer."):
+            new_key = key[len("transformer."):]
+            transformer_state[new_key] = value
+        elif key.startswith("proj."):
+            new_key = key[len("proj."):]
+            proj_state[new_key] = value
+        elif key.startswith("align_masked_video."):
+            new_key = key[len("align_masked_video."):]
+            align_state[new_key] = value
+        elif key.startswith("embed_anchors."):
+            new_key = key[len("embed_anchors."):]
+            embed_anchors_state[new_key] = value
+        elif key.startswith("memory_proj."):
+            new_key = key[len("memory_proj."):]
+            memory_proj_state[new_key] = value
+    transformer.load_state_dict(transformer_state)
+    proj.load_state_dict(proj_state)
+    align_masked_video.load_state_dict(align_state)
+    embed_anchors.load_state_dict(embed_anchors_state)
+    memory_proj.load_state_dict(memory_proj_state)
+    print(f"  ✓ Loaded transformer weights ({len(transformer_state)} tensors)")
+    print(f"  ✓ Loaded component weights")
+    # Create single step wrapper
+    single_step = DiTSingleStepWrapper(
+        transformer=transformer,
+        proj=proj,
+        align_masked_video=align_masked_video,
+        embed_anchors=embed_anchors,
+        timestep_emb=timestep_emb,
+        memory_proj=memory_proj,
+    ).eval().to(device)
+    return single_step, config
+def create_sample_inputs(batch_size: int = 1, seq_len: int = 25, device: str = "cpu"):
+    """Create sample inputs for tracing."""
+    latent_dim = 128
+    text_dim = 768  # T5-base hidden size (SAM Audio was trained with 768-dim text)
+    vision_dim = 1024  # Vision encoder dim from config
+    text_len = 77
+    return {
+        "noisy_audio": torch.randn(batch_size, seq_len, 2 * latent_dim, device=device),
+        "time": torch.zeros(batch_size, device=device),
+        "audio_features": torch.randn(batch_size, seq_len, 2 * latent_dim, device=device),
+        "text_features": torch.randn(batch_size, text_len, text_dim, device=device),
+        "text_mask": torch.ones(batch_size, text_len, dtype=torch.bool, device=device),
+        "masked_video_features": torch.zeros(batch_size, vision_dim, seq_len, device=device),
+        "anchor_ids": torch.zeros(batch_size, seq_len, dtype=torch.long, device=device),
+        "anchor_alignment": torch.zeros(batch_size, seq_len, dtype=torch.long, device=device),
+        "audio_pad_mask": torch.ones(batch_size, seq_len, dtype=torch.bool, device=device),
+    }
+def export_dit_single_step(
+    single_step: DiTSingleStepWrapper,
+    output_path: str,
+    opset_version: int = 18,
+    device: str = "cpu",
+):
+    """Export single-step DiT to ONNX (for runtime ODE solving)."""
+    import onnx
+    print(f"Exporting DiT single-step to {output_path}...")
+    sample_inputs = create_sample_inputs(device=device)
+    torch.onnx.export(
+        single_step,
+        tuple(sample_inputs.values()),
+        output_path,
+        input_names=list(sample_inputs.keys()),
+        output_names=["velocity"],
+        dynamic_axes={
+            "noisy_audio": {0: "batch_size", 1: "seq_len"},
+            "time": {0: "batch_size"},
+            "audio_features": {0: "batch_size", 1: "seq_len"},
+            "text_features": {0: "batch_size", 1: "text_len"},
+            "text_mask": {0: "batch_size", 1: "text_len"},
+            "masked_video_features": {0: "batch_size", 2: "seq_len"},
+            "anchor_ids": {0: "batch_size", 1: "seq_len"},
+            "anchor_alignment": {0: "batch_size", 1: "seq_len"},
+            "audio_pad_mask": {0: "batch_size", 1: "seq_len"},
+            "velocity": {0: "batch_size", 1: "seq_len"},
+        },
+        opset_version=opset_version,
+        do_constant_folding=True,
+        dynamo=True,
+        external_data=True,
+    )
+    print("  ✓ DiT single-step exported successfully")
+    model = onnx.load(output_path)
+    onnx.checker.check_model(model)
+    print("  ✓ ONNX model validation passed")
+    return True
+def verify_dit_single_step(
+    single_step: DiTSingleStepWrapper,
+    onnx_path: str,
+    device: str = "cpu",
+    tolerance: float = 1e-3,
+) -> bool:
+    """Verify single-step ONNX output matches PyTorch."""
+    import onnxruntime as ort
+    import numpy as np
+    print("Verifying DiT single-step output...")
+    sample_inputs = create_sample_inputs(device=device)
+    # PyTorch output
+    with torch.no_grad():
+        pytorch_output = single_step(**sample_inputs).cpu().numpy()
+    # ONNX Runtime output
+    sess = ort.InferenceSession(onnx_path, providers=["CPUExecutionProvider"])
+    onnx_inputs = {}
+    for name, tensor in sample_inputs.items():
+        if tensor.dtype == torch.bool:
+            onnx_inputs[name] = tensor.cpu().numpy().astype(bool)
+        elif tensor.dtype == torch.long:
+            onnx_inputs[name] = tensor.cpu().numpy().astype(np.int64)
+        else:
+            onnx_inputs[name] = tensor.cpu().numpy().astype(np.float32)
+    onnx_output = sess.run(["velocity"], onnx_inputs)[0]
+    # Compare
+    max_diff = np.abs(pytorch_output - onnx_output).max()
+    mean_diff = np.abs(pytorch_output - onnx_output).mean()
+    print(f"  Max difference: {max_diff:.2e}")
+    print(f"  Mean difference: {mean_diff:.2e}")
+    if max_diff < tolerance:
+        print(f"  ✓ Verification passed (tolerance: {tolerance})")
+        return True
+    else:
+        print(f"  ✗ Verification failed (tolerance: {tolerance})")
+        return False
+def main():
+    parser = argparse.ArgumentParser(description="Export DiT Transformer to ONNX")
+    parser.add_argument(
+        "--model-id",
+        type=str,
+        default="facebook/sam-audio-small",
+        help="SAM Audio model ID from HuggingFace",
+    )
+    parser.add_argument(
+        "--output-dir",
+        type=str,
+        default="onnx_models",
+        help="Output directory for ONNX models",
+    )
+    parser.add_argument(
+        "--num-steps",
+        type=int,
+        default=16,
+        help="Number of ODE solver steps (default: 16)",
+    )
+    parser.add_argument(
+        "--opset",
+        type=int,
+        default=18,
+        help="ONNX opset version (default: 18)",
+    )
+    parser.add_argument(
+        "--device",
+        type=str,
+        default="cpu",
+        help="Device to use for export (default: cpu)",
+    )
+    parser.add_argument(
+        "--verify",
+        action="store_true",
+        help="Verify ONNX output matches PyTorch",
+    )
+    parser.add_argument(
+        "--tolerance",
+        type=float,
+        default=1e-3,
+        help="Tolerance for verification (default: 1e-3)",
+    )
+    args = parser.parse_args()
+    # Create output directory
+    os.makedirs(args.output_dir, exist_ok=True)
+    # Load components
+    single_step, config = load_sam_audio_components(args.model_id, args.device)
+    print(f"\nDiT Configuration:")
+    print(f"  Model: {args.model_id}")
+    print(f"  ODE steps: {args.num_steps}")
+    print(f"  Step size: {1.0/args.num_steps:.4f}")
+    # Export single-step model
+    single_step_path = os.path.join(args.output_dir, "dit_single_step.onnx")
+    export_dit_single_step(
+        single_step,
+        single_step_path,
+        opset_version=args.opset,
+        device=args.device,
+    )
+    # Verify single-step
+    if args.verify:
+        verify_dit_single_step(
+            single_step,
+            single_step_path,
+            device=args.device,
+            tolerance=args.tolerance,
+        )
+    print(f"\n✓ Export complete! Model saved to {args.output_dir}")
+if __name__ == "__main__":
+    main()

onnx_export/export_peaframe.py ADDED Viewed

	@@ -0,0 +1,288 @@

+#!/usr/bin/env python3
+"""
+Export PE-A-Frame (Perception Encoder Audio Frame) span predictor to ONNX.
+The PE-A-Frame model is used for automatic anchor detection in SAM Audio.
+It analyzes audio features and predicts which segments correspond to the
+target audio source.
+Usage:
+    python -m onnx_export.export_peaframe --output-dir onnx_models --verify
+"""
+import os
+import argparse
+import torch
+import torch.nn as nn
+from typing import Optional
+class PEAFrameWrapper(nn.Module):
+    """
+    Wrapper for PE-A-Frame model for ONNX export.
+    Exposes the forward pass that takes audio features and returns
+    frame-level predictions.
+    """
+    def __init__(self, model: nn.Module):
+        super().__init__()
+        self.model = model
+    def forward(
+        self,
+        audio_features: torch.Tensor,
+        audio_mask: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        """
+        Forward pass for span prediction.
+        Args:
+            audio_features: Audio features [batch, seq_len, hidden_dim]
+            audio_mask: Optional attention mask [batch, seq_len]
+        Returns:
+            Frame-level predictions [batch, seq_len, num_classes]
+        """
+        return self.model(audio_features, attention_mask=audio_mask)
+def load_peaframe_model(config_name: str = "pe-a-frame-large", device: str = "cpu"):
+    """Load the PE-A-Frame model from perception_models."""
+    from core.audio_visual_encoder.pe import PEAudioFrame
+    print(f"Loading PE-A-Frame model: {config_name}...")
+    model = PEAudioFrame.from_config(config_name, pretrained=True)
+    model = model.eval().to(device)
+    num_params = sum(p.numel() for p in model.parameters())
+    print(f"  ✓ Model loaded: {num_params:,} parameters")
+    return model
+def get_tokenizer(model):
+    """Get the text tokenizer from the model config."""
+    from transformers import AutoTokenizer
+    text_model_name = model.config.text_model._name_or_path
+    return AutoTokenizer.from_pretrained(text_model_name)
+def create_sample_inputs(model, batch_size: int = 1, device: str = "cpu"):
+    """Create sample inputs for tracing."""
+    tokenizer = get_tokenizer(model)
+    # Sample text query
+    text = "a person speaking"
+    tokens = tokenizer(
+        [text] * batch_size,
+        return_tensors="pt",
+        padding=True,
+        truncation=True,
+        max_length=77,
+    )
+    # Sample audio (10 seconds at 16kHz)
+    # DAC encoder expects (batch, channels, samples) format
+    sample_rate = 16000
+    audio_len = sample_rate * 10
+    audio = torch.randn(batch_size, 1, audio_len, device=device)  # Added channel dimension
+    return {
+        "input_ids": tokens["input_ids"].to(device),
+        "attention_mask": tokens["attention_mask"].to(device),
+        "input_values": audio,
+    }
+def export_peaframe(
+    model: nn.Module,
+    output_path: str,
+    opset_version: int = 18,
+    device: str = "cpu",
+):
+    """Export PE-A-Frame to ONNX."""
+    import onnx
+    print(f"Exporting PE-A-Frame to {output_path}...")
+    sample_inputs = create_sample_inputs(model, device=device)
+    # Put model in eval mode
+    model = model.eval()
+    # Test forward pass first
+    with torch.no_grad():
+        try:
+            output = model(
+                input_ids=sample_inputs["input_ids"],
+                input_values=sample_inputs["input_values"],
+                attention_mask=sample_inputs["attention_mask"],
+                return_spans=False,  # Disable span return for ONNX (list output)
+            )
+            print(f"  Test forward pass: audio_embeds shape = {output.audio_embeds.shape}")
+            print(f"  Test forward pass: text_embeds shape = {output.text_embeds.shape}")
+        except Exception as e:
+            print(f"  Forward pass failed: {e}")
+            raise
+    # Create a wrapper that returns just the audio embeddings for simpler ONNX
+    class PEAFrameONNXWrapper(nn.Module):
+        def __init__(self, model):
+            super().__init__()
+            self.model = model
+        def forward(self, input_ids, input_values, attention_mask):
+            output = self.model(
+                input_ids=input_ids,
+                input_values=input_values,
+                attention_mask=attention_mask,
+                return_spans=False,
+            )
+            return output.audio_embeds, output.text_embeds
+    wrapper = PEAFrameONNXWrapper(model)
+    wrapper.eval()
+    torch.onnx.export(
+        wrapper,
+        (sample_inputs["input_ids"], sample_inputs["input_values"], sample_inputs["attention_mask"]),
+        output_path,
+        input_names=["input_ids", "input_values", "attention_mask"],
+        output_names=["audio_embeds", "text_embeds"],
+        dynamic_axes={
+            "input_ids": {0: "batch_size", 1: "seq_len"},
+            "input_values": {0: "batch_size", 1: "audio_len"},
+            "attention_mask": {0: "batch_size", 1: "seq_len"},
+            "audio_embeds": {0: "batch_size", 1: "num_frames"},
+            "text_embeds": {0: "batch_size"},
+        },
+        opset_version=opset_version,
+        do_constant_folding=True,
+        external_data=True,
+    )
+    print("  ✓ PE-A-Frame exported successfully")
+    # Validate
+    onnx_model = onnx.load(output_path)
+    onnx.checker.check_model(onnx_model)
+    print("  ✓ ONNX model validation passed")
+    return True
+def verify_peaframe(
+    model: nn.Module,
+    onnx_path: str,
+    device: str = "cpu",
+    tolerance: float = 1e-3,
+) -> bool:
+    """Verify ONNX output matches PyTorch."""
+    import onnxruntime as ort
+    import numpy as np
+    print("Verifying PE-A-Frame output...")
+    sample_inputs = create_sample_inputs(model, device=device)
+    # PyTorch output
+    model = model.eval()
+    with torch.no_grad():
+        pytorch_output = model(
+            input_ids=sample_inputs["input_ids"],
+            input_values=sample_inputs["input_values"],
+            attention_mask=sample_inputs["attention_mask"],
+            return_spans=False,
+        )
+        pytorch_audio_embeds = pytorch_output.audio_embeds.cpu().numpy()
+        pytorch_text_embeds = pytorch_output.text_embeds.cpu().numpy()
+    # ONNX Runtime output
+    sess = ort.InferenceSession(onnx_path, providers=["CPUExecutionProvider"])
+    onnx_inputs = {
+        "input_ids": sample_inputs["input_ids"].cpu().numpy().astype(np.int64),
+        "input_values": sample_inputs["input_values"].cpu().numpy().astype(np.float32),
+        "attention_mask": sample_inputs["attention_mask"].cpu().numpy().astype(np.int64),
+    }
+    onnx_outputs = sess.run(["audio_embeds", "text_embeds"], onnx_inputs)
+    onnx_audio_embeds = onnx_outputs[0]
+    onnx_text_embeds = onnx_outputs[1]
+    # Compare
+    audio_max_diff = np.abs(pytorch_audio_embeds - onnx_audio_embeds).max()
+    text_max_diff = np.abs(pytorch_text_embeds - onnx_text_embeds).max()
+    print(f"  Audio embeds max diff: {audio_max_diff:.2e}")
+    print(f"  Text embeds max diff: {text_max_diff:.2e}")
+    max_diff = max(audio_max_diff, text_max_diff)
+    if max_diff < tolerance:
+        print(f"  ✓ Verification passed (tolerance: {tolerance})")
+        return True
+    else:
+        print(f"  ✗ Verification failed (tolerance: {tolerance})")
+        return False
+def main():
+    parser = argparse.ArgumentParser(description="Export PE-A-Frame to ONNX")
+    parser.add_argument(
+        "--config",
+        type=str,
+        default="pe-a-frame-large",
+        help="PE-A-Frame config name",
+    )
+    parser.add_argument(
+        "--output-dir",
+        type=str,
+        default="onnx_models",
+        help="Output directory for ONNX models",
+    )
+    parser.add_argument(
+        "--opset",
+        type=int,
+        default=18,
+        help="ONNX opset version",
+    )
+    parser.add_argument(
+        "--device",
+        type=str,
+        default="cpu",
+        help="Device to use",
+    )
+    parser.add_argument(
+        "--verify",
+        action="store_true",
+        help="Verify ONNX output",
+    )
+    parser.add_argument(
+        "--tolerance",
+        type=float,
+        default=1e-3,
+        help="Verification tolerance",
+    )
+    args = parser.parse_args()
+    os.makedirs(args.output_dir, exist_ok=True)
+    # Load model
+    model = load_peaframe_model(args.config, args.device)
+    # Export
+    output_path = os.path.join(args.output_dir, "peaframe.onnx")
+    export_peaframe(model, output_path, args.opset, args.device)
+    # Verify
+    if args.verify:
+        verify_peaframe(model, output_path, args.device, args.tolerance)
+    print(f"\n✓ Export complete! Model saved to {output_path}")
+if __name__ == "__main__":
+    main()

onnx_export/export_t5.py ADDED Viewed

	@@ -0,0 +1,315 @@

+#!/usr/bin/env python3
+"""
+Export T5 Text Encoder to ONNX format.
+The T5 encoder takes tokenized input_ids and attention_mask, and produces
+hidden states. For SAM Audio inference, the output hidden states and attention
+mask are used as conditioning for the DiT transformer.
+Usage:
+    python -m onnx_export.export_t5 --output-dir onnx_models --verify
+"""
+import os
+import argparse
+import torch
+import torch.nn as nn
+class T5EncoderWrapper(nn.Module):
+    """
+    Wrapper for T5EncoderModel that provides a clean interface for ONNX export.
+    The wrapper takes tokenized inputs (input_ids, attention_mask) and returns
+    the last hidden state. This matches how SAMAudio's T5TextEncoder uses the model.
+    """
+    def __init__(self, t5_model, max_length: int = 77):
+        super().__init__()
+        self.model = t5_model
+        self.max_length = max_length
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        attention_mask: torch.Tensor,
+    ) -> torch.Tensor:
+        """
+        Args:
+            input_ids: Tokenized input IDs, shape (batch, seq_len)
+            attention_mask: Attention mask, shape (batch, seq_len)
+        Returns:
+            hidden_states: T5 encoder output, shape (batch, seq_len, hidden_dim)
+        """
+        outputs = self.model(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            output_hidden_states=True,
+        )
+        return outputs.last_hidden_state
+def load_t5_encoder(model_name: str = "google-t5/t5-base", device: str = "cpu"):
+    """
+    Load T5 encoder model and tokenizer.
+    SAM Audio's DiT was trained with T5-base (768-dim) text features.
+    """
+    from transformers import T5EncoderModel, AutoTokenizer
+    print(f"Loading T5 encoder: {model_name}...")
+    model = T5EncoderModel.from_pretrained(model_name)
+    tokenizer = AutoTokenizer.from_pretrained(model_name)
+    model = model.eval().to(device)
+    return model, tokenizer
+def export_t5_encoder(
+    t5_model,
+    tokenizer,
+    output_path: str,
+    opset_version: int = 18,
+    max_length: int = 77,
+    device: str = "cpu",
+):
+    """Export T5 encoder to ONNX format."""
+    import onnx
+    print(f"Exporting T5 encoder to {output_path}...")
+    wrapper = T5EncoderWrapper(t5_model, max_length=max_length).eval().to(device)
+    # Create sample input
+    sample_text = ["A dog barking loudly in the background"]
+    encoded = tokenizer(
+        sample_text,
+        truncation=True,
+        max_length=max_length,
+        padding="max_length",  # Pad to max_length for consistent shape
+        return_tensors="pt",
+    )
+    sample_input_ids = encoded["input_ids"].to(device)
+    sample_attention_mask = encoded["attention_mask"].to(device)
+    # Export using torch.onnx.export
+    torch.onnx.export(
+        wrapper,
+        (sample_input_ids, sample_attention_mask),
+        output_path,
+        input_names=["input_ids", "attention_mask"],
+        output_names=["hidden_states"],
+        dynamic_axes={
+            "input_ids": {0: "batch_size", 1: "sequence_length"},
+            "attention_mask": {0: "batch_size", 1: "sequence_length"},
+            "hidden_states": {0: "batch_size", 1: "sequence_length"},
+        },
+        opset_version=opset_version,
+        do_constant_folding=True,
+        dynamo=True,
+        external_data=True,  # T5-large is ~1GB
+    )
+    print("  ✓ T5 encoder exported successfully")
+    # Validate the model
+    model = onnx.load(output_path)
+    onnx.checker.check_model(model)
+    print("  ✓ ONNX model validation passed")
+    return True
+def verify_t5_encoder(
+    t5_model,
+    tokenizer,
+    onnx_path: str,
+    max_length: int = 77,
+    device: str = "cpu",
+    tolerance: float = 1e-4,
+) -> bool:
+    """Verify ONNX T5 encoder output matches PyTorch."""
+    import onnxruntime as ort
+    import numpy as np
+    print("Verifying T5 encoder output...")
+    wrapper = T5EncoderWrapper(t5_model, max_length=max_length).eval().to(device)
+    # Test with multiple texts
+    test_texts = [
+        "A dog barking in the distance",
+        "Piano music playing softly",
+        "Rain falling on a rooftop",
+    ]
+    for text in test_texts:
+        # Tokenize
+        encoded = tokenizer(
+            [text],
+            truncation=True,
+            max_length=max_length,
+            padding="max_length",
+            return_tensors="pt",
+        )
+        input_ids = encoded["input_ids"].to(device)
+        attention_mask = encoded["attention_mask"].to(device)
+        # PyTorch output
+        with torch.no_grad():
+            pytorch_output = wrapper(input_ids, attention_mask).cpu().numpy()
+        # ONNX Runtime output
+        sess = ort.InferenceSession(onnx_path, providers=["CPUExecutionProvider"])
+        onnx_output = sess.run(
+            ["hidden_states"],
+            {
+                "input_ids": input_ids.cpu().numpy().astype(np.int64),
+                "attention_mask": attention_mask.cpu().numpy().astype(np.int64),
+            }
+        )[0]
+        # Compare
+        max_diff = np.abs(pytorch_output - onnx_output).max()
+        mean_diff = np.abs(pytorch_output - onnx_output).mean()
+        print(f"  Text: '{text[:30]}...'")
+        print(f"    Max diff: {max_diff:.2e}, Mean diff: {mean_diff:.2e}")
+        if max_diff > tolerance:
+            print(f"  ✗ Verification failed for text: {text}")
+            return False
+    print(f"  ✓ Verification passed (tolerance: {tolerance})")
+    return True
+def save_tokenizer_config(tokenizer, output_dir: str):
+    """
+    Save tokenizer vocabulary and configuration for runtime use.
+    This allows the ONNX runtime to perform tokenization without
+    needing the full transformers library.
+    """
+    import json
+    tokenizer_dir = os.path.join(output_dir, "tokenizer")
+    tokenizer.save_pretrained(tokenizer_dir)
+    # Also save a simple config for ONNX.js usage
+    config = {
+        "model_name": tokenizer.name_or_path,
+        "max_length": 77,
+        "vocab_size": tokenizer.vocab_size,
+        "pad_token_id": tokenizer.pad_token_id,
+        "eos_token_id": tokenizer.eos_token_id,
+    }
+    config_path = os.path.join(output_dir, "tokenizer_config.json")
+    with open(config_path, "w") as f:
+        json.dump(config, f, indent=2)
+    print(f"  ✓ Tokenizer saved to {tokenizer_dir}")
+    return tokenizer_dir
+def main():
+    parser = argparse.ArgumentParser(description="Export T5 Text Encoder to ONNX")
+    parser.add_argument(
+        "--model-name",
+        type=str,
+        default="google-t5/t5-base",
+        help="T5 model name from HuggingFace (default: google-t5/t5-base)",
+    )
+    parser.add_argument(
+        "--output-dir",
+        type=str,
+        default="onnx_models",
+        help="Output directory for ONNX models",
+    )
+    parser.add_argument(
+        "--max-length",
+        type=int,
+        default=77,
+        help="Maximum token sequence length (default: 77)",
+    )
+    parser.add_argument(
+        "--opset",
+        type=int,
+        default=18,
+        help="ONNX opset version (default: 18)",
+    )
+    parser.add_argument(
+        "--device",
+        type=str,
+        default="cpu",
+        help="Device to use for export (default: cpu)",
+    )
+    parser.add_argument(
+        "--verify",
+        action="store_true",
+        help="Verify ONNX output matches PyTorch",
+    )
+    parser.add_argument(
+        "--tolerance",
+        type=float,
+        default=1e-4,
+        help="Tolerance for verification (default: 1e-4)",
+    )
+    parser.add_argument(
+        "--save-tokenizer",
+        action="store_true",
+        default=True,
+        help="Save tokenizer for runtime use (default: True)",
+    )
+    args = parser.parse_args()
+    # Create output directory
+    os.makedirs(args.output_dir, exist_ok=True)
+    # Load T5
+    t5_model, tokenizer = load_t5_encoder(args.model_name, args.device)
+    print(f"\nT5 Configuration:")
+    print(f"  Model: {args.model_name}")
+    print(f"  Hidden size: {t5_model.config.d_model}")
+    print(f"  Max length: {args.max_length}")
+    print(f"  Vocab size: {tokenizer.vocab_size}")
+    # Export
+    encoder_path = os.path.join(args.output_dir, "t5_encoder.onnx")
+    export_t5_encoder(
+        t5_model,
+        tokenizer,
+        encoder_path,
+        opset_version=args.opset,
+        max_length=args.max_length,
+        device=args.device,
+    )
+    # Save tokenizer
+    if args.save_tokenizer:
+        save_tokenizer_config(tokenizer, args.output_dir)
+    # Verify
+    if args.verify:
+        verify_t5_encoder(
+            t5_model,
+            tokenizer,
+            encoder_path,
+            max_length=args.max_length,
+            device=args.device,
+            tolerance=args.tolerance,
+        )
+    print(f"\n✓ Export complete! Model saved to {encoder_path}")
+if __name__ == "__main__":
+    main()

onnx_export/standalone_config.py ADDED Viewed

	@@ -0,0 +1,116 @@

+"""
+Standalone configuration classes for ONNX export.
+These are copied from sam_audio/model/config.py but without the problematic
+imports that require the 'perception_models' library.
+"""
+from typing import Optional
+import numpy as np
+class DACVAEConfig:
+    def __init__(
+        self,
+        encoder_dim: int = 64,
+        encoder_rates: list[int] = [2, 8, 10, 12],
+        latent_dim: int = 1024,
+        decoder_dim: int = 1536,
+        decoder_rates: list[int] = [12, 10, 8, 2],
+        n_codebooks: int = 16,
+        codebook_size: int = 1024,
+        codebook_dim: int = 128,
+        quantizer_dropout: bool = False,
+        sample_rate: int = 48_000,
+        mean: float = 0.0,
+        std: float = 1.0,
+    ):
+        self.encoder_dim = encoder_dim
+        self.encoder_rates = encoder_rates
+        self.latent_dim = latent_dim
+        self.decoder_dim = decoder_dim
+        self.decoder_rates = decoder_rates
+        self.n_codebooks = n_codebooks
+        self.codebook_size = codebook_size
+        self.codebook_dim = codebook_dim
+        self.quantizer_dropout = quantizer_dropout
+        self.sample_rate = sample_rate
+        self.mean = mean
+        self.std = std
+    @property
+    def hop_length(self):
+        return int(np.prod(self.encoder_rates))
+class T5EncoderConfig:
+    def __init__(
+        self,
+        name: str = "t5-base",
+        max_length: Optional[int] = 512,
+        pad_mode: str = "longest",
+        dim: int = 768,
+    ):
+        self.dim = dim
+        self.name = name
+        self.max_length = max_length
+        self.pad_mode = pad_mode
+class TransformerConfig:
+    """
+    Configuration for the DiT transformer.
+    A plain value holder: every constructor argument is stored unchanged as
+    an attribute of the same name, and ``d_model`` is exposed as a read-only
+    alias of ``dim``. No validation is performed here.
+    NOTE(review): the meaning of the individual fields is defined by the
+    transformer code that consumes this config, which is not part of this
+    file; confirm there before relying on a field's semantics.
+    """
+    def __init__(
+        self,
+        dim: int = 2048,
+        n_heads: int = 16,
+        n_layers: int = 16,
+        dropout: float = 0.1,
+        norm_eps: float = 1.0e-05,
+        qk_norm: bool = True,
+        fc_bias: bool = False,
+        ffn_exp: int = 4,
+        ffn_dim_multiplier: int = 1,
+        multiple_of: int = 64,
+        non_linearity: str = "swiglu",
+        use_rope: bool = True,
+        max_positions: int = 10000,
+        frequency_embedding_dim: int = 256,
+        timestep_non_linearity: str = "swiglu",
+        t_block_non_linearity: str = "silu",
+        t_block_bias: bool = True,
+        context_dim: int = 2048,
+        context_non_linearity: str = "swiglu",
+        context_embedder_dropout: float = 0.0,
+        context_norm: bool = False,
+        out_channels: int = 256,
+        in_channels: Optional[int] = None,
+    ):
+        # Store every argument verbatim under its own name
+        self.dim = dim
+        self.n_heads = n_heads
+        self.n_layers = n_layers
+        self.dropout = dropout
+        self.norm_eps = norm_eps
+        self.qk_norm = qk_norm
+        self.fc_bias = fc_bias
+        self.ffn_exp = ffn_exp
+        self.ffn_dim_multiplier = ffn_dim_multiplier
+        self.multiple_of = multiple_of
+        self.non_linearity = non_linearity
+        self.use_rope = use_rope
+        self.max_positions = max_positions
+        self.frequency_embedding_dim = frequency_embedding_dim
+        self.timestep_non_linearity = timestep_non_linearity
+        self.t_block_non_linearity = t_block_non_linearity
+        self.t_block_bias = t_block_bias
+        self.context_dim = context_dim
+        self.context_non_linearity = context_non_linearity
+        self.context_embedder_dropout = context_embedder_dropout
+        self.context_norm = context_norm
+        self.out_channels = out_channels
+        # in_channels defaults to None and is stored as-is; this class does
+        # not derive a value for it
+        self.in_channels = in_channels
+    @property
+    def d_model(self):
+        """Alias for dim, used in transformer code."""
+        return self.dim