alexwengg committed on
Commit
2d00bb3
·
verified ·
1 Parent(s): c1b4251

Upload 8 files

Browse files
Files changed (1) hide show
  1. export_gradient_descent.py +342 -0
export_gradient_descent.py ADDED
@@ -0,0 +1,342 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """Export Sortformer models with Gradient Descent configuration.
3
+
4
+ This creates models compatible with the Swift SortformerDiarizer interface.
5
+ Outputs both .mlpackage and .mlmodelc (compiled) versions.
6
+
7
+ Gradient Descent Config:
8
+ - chunk_len: 6
9
+ - chunk_right_context: 7 (higher quality, more context)
10
+ - chunk_left_context: 1
11
+ - fifo_len: 40
12
+ - spkcache_len: 188
13
+ - spkcache_update_period: 31
14
+ """
15
+ import os
16
+ import subprocess
17
+ os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE"
18
+
19
+ import torch
20
+ import torch.nn as nn
21
+ import numpy as np
22
+ import coremltools as ct
23
+ from nemo.collections.asr.models import SortformerEncLabelModel
24
+ from coreml_wrappers import PreEncoderWrapper, PreprocessorWrapper
25
+
26
# Gradient Descent configuration (matching Swift --gradient-descent).
# These values are copied onto model.sortformer_modules below and fix the
# tensor shapes baked into the exported CoreML models.
GRADIENT_DESCENT_CONFIG = dict(
    chunk_len=6,
    chunk_right_context=7,  # Higher quality
    chunk_left_context=1,
    fifo_len=40,
    spkcache_len=188,
    spkcache_update_period=31,
)
35
+
36
+ print("=" * 70)
37
+ print("Exporting Sortformer Models - Gradient Descent Config")
38
+ print("=" * 70)
39
+ print(f"Config: {GRADIENT_DESCENT_CONFIG}")
40
+
41
+ # Load model
42
+ print("\nLoading NeMo model...")
43
+ model = SortformerEncLabelModel.from_pretrained(
44
+ "nvidia/diar_streaming_sortformer_4spk-v2.1", map_location="cpu"
45
+ )
46
+ model.eval()
47
+
48
+ # Apply Gradient Descent config
49
+ modules = model.sortformer_modules
50
+ modules.chunk_len = GRADIENT_DESCENT_CONFIG['chunk_len']
51
+ modules.chunk_right_context = GRADIENT_DESCENT_CONFIG['chunk_right_context']
52
+ modules.chunk_left_context = GRADIENT_DESCENT_CONFIG['chunk_left_context']
53
+ modules.fifo_len = GRADIENT_DESCENT_CONFIG['fifo_len']
54
+ modules.spkcache_len = GRADIENT_DESCENT_CONFIG['spkcache_len']
55
+ modules.spkcache_update_period = GRADIENT_DESCENT_CONFIG['spkcache_update_period']
56
+
57
# Calculate dimensions from the (just-updated) module configuration.
chunk_len = modules.chunk_len
# Mel frames fed to the pre-encoder: (chunk + left + right context) scaled
# by the encoder subsampling factor.
input_chunk_time = (chunk_len + modules.chunk_left_context + modules.chunk_right_context) * modules.subsampling_factor
fc_d_model = modules.fc_d_model # 512
spkcache_len = modules.spkcache_len
fifo_len = modules.fifo_len

feat_dim = 128  # mel feature dimension used for the PreEncoder chunk input
# Subsampling reverses the expansion above, so this equals chunk+left+right.
pre_encode_out_len = input_chunk_time // modules.subsampling_factor
# Head input is spkcache + FIFO + new chunk embeddings, concatenated in time.
total_concat_len = spkcache_len + fifo_len + pre_encode_out_len

print(f"\nDimensions:")
print(f" Input chunk frames: {input_chunk_time} (= ({chunk_len}+{modules.chunk_left_context}+{modules.chunk_right_context})*{modules.subsampling_factor})")
print(f" Pre-encode output: {pre_encode_out_len}")
print(f" Total concat len: {total_concat_len}")
print(f" FC d_model: {fc_d_model}")
print(f" FIFO len: {fifo_len}")
print(f" Spkcache len: {spkcache_len}")

# Calculate audio samples needed for preprocessor
# For gradient descent: (6+1+7)*8 = 112 mel frames
# Audio samples = (112-1)*160 + 400 = 18160, but NeMo adds padding
# Empirically: 112 frames needs specific sample count
mel_stride = 160  # hop size in samples (10 ms @ 16 kHz)
mel_window = 400  # window size in samples (25 ms @ 16 kHz)
# For 112 mel frames with NeMo padding
preprocessor_audio_samples = (input_chunk_time - 1) * mel_stride + mel_window
print(f" Preprocessor audio samples: {preprocessor_audio_samples}")

# Create output directory for both .mlpackage and compiled .mlmodelc files.
output_dir = "coreml_models_gradient_descent"
os.makedirs(output_dir, exist_ok=True)
89
+
90
# =========================================================
# 0. Export Preprocessor (audio -> mel features)
# =========================================================
print("\n[0/3] Exporting Preprocessor...")

preprocessor_wrapper = PreprocessorWrapper(model.preprocessor)
preprocessor_wrapper.eval()

# Trace with correct audio sample count; the CoreML model is exported with
# these fixed input shapes, so callers must supply exactly this many samples.
audio_input = torch.randn(1, preprocessor_audio_samples)
audio_length = torch.tensor([preprocessor_audio_samples], dtype=torch.long)

traced_preprocessor = torch.jit.trace(preprocessor_wrapper, (audio_input, audio_length))

# Convert the traced graph to CoreML with explicitly named inputs/outputs
# (names are what the Swift side looks up).
preprocessor_ml = ct.convert(
    traced_preprocessor,
    inputs=[
        ct.TensorType(name="audio_signal", shape=audio_input.shape, dtype=np.float32),
        ct.TensorType(name="length", shape=audio_length.shape, dtype=np.int32),
    ],
    outputs=[
        ct.TensorType(name="features", dtype=np.float32),
        ct.TensorType(name="feature_lengths", dtype=np.int32),
    ],
    minimum_deployment_target=ct.target.iOS16,
    compute_precision=ct.precision.FLOAT32,
    compute_units=ct.ComputeUnit.CPU_ONLY # CPU for FP32 precision
)

preprocessor_path = os.path.join(output_dir, "Pipeline_Preprocessor.mlpackage")
preprocessor_ml.save(preprocessor_path)
print(f" Saved {preprocessor_path}")
122
+
123
# =========================================================
# 1. Export PreEncoder
# =========================================================
print("\n[1/3] Exporting PreEncoder...")

# Dummy tensors at the exact shapes the deployed model will see; tracing
# fixes these shapes into the exported CoreML model.
input_chunk = torch.randn(1, input_chunk_time, feat_dim)
input_chunk_len = torch.tensor([input_chunk_time], dtype=torch.long)
input_spkcache = torch.randn(1, spkcache_len, fc_d_model)
input_spkcache_len = torch.tensor([spkcache_len], dtype=torch.long)
input_fifo = torch.randn(1, fifo_len, fc_d_model)
input_fifo_len = torch.tensor([fifo_len], dtype=torch.long)

pre_encoder = PreEncoderWrapper(model)
pre_encoder.eval()

traced_pre_encoder = torch.jit.trace(pre_encoder, (
    input_chunk, input_chunk_len,
    input_spkcache, input_spkcache_len,
    input_fifo, input_fifo_len
))

# Use names that match Swift expectations
pre_encoder_ml = ct.convert(
    traced_pre_encoder,
    inputs=[
        ct.TensorType(name="chunk", shape=input_chunk.shape, dtype=np.float32),
        ct.TensorType(name="chunk_lengths", shape=input_chunk_len.shape, dtype=np.int32),
        ct.TensorType(name="spkcache", shape=input_spkcache.shape, dtype=np.float32),
        ct.TensorType(name="spkcache_lengths", shape=input_spkcache_len.shape, dtype=np.int32),
        ct.TensorType(name="fifo", shape=input_fifo.shape, dtype=np.float32),
        ct.TensorType(name="fifo_lengths", shape=input_fifo_len.shape, dtype=np.int32),
    ],
    outputs=[
        ct.TensorType(name="pre_encoder_embs", dtype=np.float32),
        ct.TensorType(name="pre_encoder_lengths", dtype=np.int32),
        ct.TensorType(name="chunk_embs_in", dtype=np.float32),
        ct.TensorType(name="chunk_lens_in", dtype=np.int32),
    ],
    minimum_deployment_target=ct.target.iOS16,
    compute_precision=ct.precision.FLOAT32,
    compute_units=ct.ComputeUnit.ALL
)

pre_encoder_path = os.path.join(output_dir, "Pipeline_PreEncoder.mlpackage")
pre_encoder_ml.save(pre_encoder_path)
print(f" Saved {pre_encoder_path}")
169
+
170
# =========================================================
# 2. Export Fixed Head (with identity ops to preserve embeddings)
# =========================================================
print("\n[2/3] Exporting Fixed Head...")
174
+
175
+
176
class FixedSortformerHead(nn.Module):
    """Head wrapper that forces chunk_pre_encoder_embs to be computed.

    Runs the frontend encoder (bypassing its pre-encode stage) and the
    inference head, then routes the incoming chunk embeddings through
    identity operations (multiply by a constant 1.0 parameter, add 0) so
    the converter cannot prune them from the exported graph's outputs.
    """

    def __init__(self, model):
        super().__init__()
        self.model = model
        # Constant 1.0; exists only to anchor an identity multiply in the graph.
        self.identity_scale = nn.Parameter(torch.ones(1), requires_grad=False)

    def forward(self, pre_encoder_embs, pre_encoder_lengths, chunk_embs_in, chunk_lens_in):
        # Encode the concatenated spkcache/FIFO/chunk embeddings, skipping
        # the pre-encode stage (inputs are already pre-encoded).
        enc_embs, enc_lens = self.model.frontend_encoder(
            processed_signal=pre_encoder_embs,
            processed_signal_length=pre_encoder_lengths,
            bypass_pre_encode=True,
        )

        # Speaker predictions from the encoded sequence.
        speaker_preds = self.model.forward_infer(enc_embs, enc_lens)

        # Mathematical no-ops that keep the chunk embeddings/lengths alive
        # as graph outputs (prevents optimization from dropping them).
        preserved_embs = chunk_embs_in * self.identity_scale
        preserved_lens = chunk_lens_in + 0

        return speaker_preds, preserved_embs, preserved_lens
202
+
203
+
204
head = FixedSortformerHead(model)
head.eval()

# Input shapes for head - must match PreEncoder output
pre_encoder_embs = torch.randn(1, total_concat_len, fc_d_model)
pre_encoder_lengths = torch.tensor([total_concat_len], dtype=torch.long)
chunk_embs_in = torch.randn(1, pre_encode_out_len, fc_d_model)
chunk_lens_in = torch.tensor([pre_encode_out_len], dtype=torch.long)

traced_head = torch.jit.trace(head, (
    pre_encoder_embs, pre_encoder_lengths,
    chunk_embs_in, chunk_lens_in
))

# Convert with FP16 precision (unlike the FP32 preprocessor/pre-encoder);
# output names are what the Swift side looks up.
head_ml = ct.convert(
    traced_head,
    inputs=[
        ct.TensorType(name="pre_encoder_embs", shape=pre_encoder_embs.shape, dtype=np.float32),
        ct.TensorType(name="pre_encoder_lengths", shape=pre_encoder_lengths.shape, dtype=np.int32),
        ct.TensorType(name="chunk_embs_in", shape=chunk_embs_in.shape, dtype=np.float32),
        ct.TensorType(name="chunk_lens_in", shape=chunk_lens_in.shape, dtype=np.int32),
    ],
    outputs=[
        ct.TensorType(name="speaker_preds", dtype=np.float32),
        ct.TensorType(name="chunk_pre_encoder_embs", dtype=np.float32),
        ct.TensorType(name="chunk_pre_encoder_lengths", dtype=np.int32),
    ],
    minimum_deployment_target=ct.target.iOS16,
    compute_precision=ct.precision.FLOAT16,
    compute_units=ct.ComputeUnit.ALL
)

head_path = os.path.join(output_dir, "Pipeline_Head_Fixed.mlpackage")
head_ml.save(head_path)
print(f" Saved {head_path}")
239
+
240
# =========================================================
# 3. Compile to .mlmodelc
# =========================================================
print("\n[3/3] Compiling to .mlmodelc...")
244
+
245
def compile_model(mlpackage_path):
    """Compile .mlpackage to .mlmodelc using xcrun coremlcompiler.

    The compiled .mlmodelc is written into the same directory that holds
    the .mlpackage.

    Args:
        mlpackage_path: Path to the .mlpackage to compile.

    Returns:
        True if the compiled .mlmodelc exists afterwards; False on a
        compiler failure or when the Xcode tools are unavailable.
    """
    output_dir_path = os.path.dirname(mlpackage_path)
    # splitext strips only the trailing extension (str.replace would also
    # mangle a ".mlpackage" substring elsewhere in the name).
    model_name = os.path.splitext(os.path.basename(mlpackage_path))[0]

    try:
        # check=True raises CalledProcessError on a non-zero exit code; the
        # CompletedProcess result itself is not needed.
        subprocess.run(
            ['xcrun', 'coremlcompiler', 'compile', mlpackage_path, output_dir_path],
            capture_output=True,
            text=True,
            check=True
        )
    except subprocess.CalledProcessError as e:
        print(f" Error compiling {mlpackage_path}: {e.stderr}")
        return False
    except FileNotFoundError:
        print(" Error: xcrun not found. Make sure Xcode Command Line Tools are installed.")
        return False

    # coremlcompiler reports success via exit code; double-check the artifact.
    mlmodelc_path = os.path.join(output_dir_path, f"{model_name}.mlmodelc")
    if os.path.exists(mlmodelc_path):
        print(f" Compiled {mlmodelc_path}")
        return True
    print(f" Warning: {mlmodelc_path} not found after compilation")
    return False
270
+
271
# Compile all three exported packages to .mlmodelc (best-effort: each call
# reports its own failure and the script continues).
compile_model(preprocessor_path)
compile_model(pre_encoder_path)
compile_model(head_path)

# =========================================================
# Verification
# =========================================================
print("\n" + "=" * 70)
print("Verification")
print("=" * 70)

# Test PreEncoder with random chunk features and empty (zero-length)
# speaker cache / FIFO buffers, i.e. the first-chunk streaming state.
test_chunk = np.random.randn(1, input_chunk_time, feat_dim).astype(np.float32)
test_chunk_len = np.array([input_chunk_time], dtype=np.int32)
test_spkcache = np.zeros((1, spkcache_len, fc_d_model), dtype=np.float32)
test_spkcache_len = np.array([0], dtype=np.int32)
test_fifo = np.zeros((1, fifo_len, fc_d_model), dtype=np.float32)
test_fifo_len = np.array([0], dtype=np.int32)

pre_out = pre_encoder_ml.predict({
    'chunk': test_chunk,
    'chunk_lengths': test_chunk_len,
    'spkcache': test_spkcache,
    'spkcache_lengths': test_spkcache_len,
    'fifo': test_fifo,
    'fifo_lengths': test_fifo_len
})

print(f"PreEncoder output shapes:")
print(f" pre_encoder_embs: {pre_out['pre_encoder_embs'].shape}")
print(f" chunk_embs_in: {pre_out['chunk_embs_in'].shape}")

# Test Head by feeding it the PreEncoder's outputs directly (mirrors how
# the Swift pipeline chains the two models).
head_out = head_ml.predict({
    'pre_encoder_embs': pre_out['pre_encoder_embs'],
    'pre_encoder_lengths': pre_out['pre_encoder_lengths'],
    'chunk_embs_in': pre_out['chunk_embs_in'],
    'chunk_lens_in': pre_out['chunk_lens_in']
})

print(f"\nHead output shapes:")
print(f" speaker_preds: {head_out['speaker_preds'].shape}")
print(f" chunk_pre_encoder_embs: {head_out['chunk_pre_encoder_embs'].shape}")

# Verify the identity ops in FixedSortformerHead preserved the embedding
# value (loose tolerance because the head runs in FP16).
if np.isclose(pre_out['chunk_embs_in'][0,0,0], head_out['chunk_pre_encoder_embs'][0,0,0], atol=0.01):
    print("\n✓ Embedding [0,0,0] preserved correctly!")
else:
    print(f"\n✗ WARNING: Embedding [0,0,0] corrupted!")
320
+
321
+ print("\n" + "=" * 70)
322
+ print("Export Complete!")
323
+ print("=" * 70)
324
+ print(f"\nModels saved to: {output_dir}/")
325
+ print(f" .mlpackage files:")
326
+ print(f" - Pipeline_Preprocessor.mlpackage")
327
+ print(f" - Pipeline_PreEncoder.mlpackage")
328
+ print(f" - Pipeline_Head_Fixed.mlpackage")
329
+ print(f" .mlmodelc files (compiled):")
330
+ print(f" - Pipeline_Preprocessor.mlmodelc")
331
+ print(f" - Pipeline_PreEncoder.mlmodelc")
332
+ print(f" - Pipeline_Head_Fixed.mlmodelc")
333
+ print(f"\nConfiguration (Gradient Descent):")
334
+ for k, v in GRADIENT_DESCENT_CONFIG.items():
335
+ print(f" {k}: {v}")
336
+ print(f"\nInput shapes:")
337
+ print(f" Preprocessor audio: [1, {preprocessor_audio_samples}]")
338
+ print(f" PreEncoder chunk: [1, {input_chunk_time}, {feat_dim}]")
339
+ print(f" PreEncoder spkcache: [1, {spkcache_len}, {fc_d_model}]")
340
+ print(f" PreEncoder fifo: [1, {fifo_len}, {fc_d_model}]")
341
+ print(f" Head pre_encoder_embs: [1, {total_concat_len}, {fc_d_model}]")
342
+ print(f" Head chunk_embs_in: [1, {pre_encode_out_len}, {fc_d_model}]")