Upload 33 files
Browse files- convert.py +82 -0
- convert_dynamic.py +302 -0
- coreml_wrappers.py +215 -0
- mic_inference.py +519 -0
- nemo_streaming_reference.py +153 -0
- streaming_inference.py +262 -0
- streaming_preproc_inference.py +411 -0
convert.py
ADDED
|
@@ -0,0 +1,82 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import torch
|
| 2 |
+
import matplotlib.pyplot as plt
|
| 3 |
+
import matplotlib.patches as patches
|
| 4 |
+
import matplotlib
|
| 5 |
+
import seaborn as sns
|
| 6 |
+
import numpy as np
|
| 7 |
+
import threading
|
| 8 |
+
import onnx2torch
|
| 9 |
+
import onnxscript
|
| 10 |
+
from nemo.collections.asr.models import SortformerEncLabelModel
|
| 11 |
+
from pydub import AudioSegment
|
| 12 |
+
import coremltools as ct
|
| 13 |
+
from pydub.playback import play as play_audio
|
| 14 |
+
|
| 15 |
+
# --- 1. Setup & Config ---
# Prefer the Apple-silicon GPU (MPS) when available, otherwise fall back to CPU.
device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
audio_file = "audio.wav"

# Load Audio for Playback (pydub uses milliseconds)
print("Loading audio file for playback...")
full_audio = AudioSegment.from_wav(audio_file)

# --- 2. Load Model ---
# Download (or reuse the cached) streaming Sortformer diarization checkpoint
# from the HuggingFace hub and move it to the selected device.
model = SortformerEncLabelModel.from_pretrained(
    "nvidia/diar_streaming_sortformer_4spk-v2.1",
    map_location=device
)
model.eval()
model.to(device)

# Sanity print: the ONNX export below relies on these output names.
print(model.output_names)
|
| 32 |
+
|
| 33 |
+
def streaming_input_examples(self, batch_size=4):
    """Build random input tensor examples for exporting the streaming model.

    Args:
        self: the Sortformer model (reads ``cfg`` for the feature dim and
            ``device`` for tensor placement).
        batch_size: number of example sequences. Defaults to 4, matching the
            previously hard-coded value, so existing callers are unchanged.

    Returns:
        Tuple of (chunk, chunk_lengths, spkcache, spkcache_lengths,
        fifo, fifo_lengths), all on ``self.device``.
    """
    # Feature dim comes from the preprocessor config; 128 is the model default.
    feat_in = self.cfg.get("preprocessor", {}).get("features", 128)
    # Example state lengths covering full (188), empty (0) and partial caches;
    # cycled so any batch_size works while batch_size=4 reproduces the
    # original values exactly.
    spkcache_len_examples = [40, 188, 0, 68]
    fifo_len_examples = [50, 88, 0, 90]
    chunk = torch.rand([batch_size, 120, feat_in]).to(self.device)
    chunk_lengths = torch.tensor([120] * batch_size).to(self.device)
    spkcache = torch.randn([batch_size, 188, 512]).to(self.device)
    spkcache_lengths = torch.tensor(
        [spkcache_len_examples[i % 4] for i in range(batch_size)]
    ).to(self.device)
    fifo = torch.randn([batch_size, 188, 512]).to(self.device)
    fifo_lengths = torch.tensor(
        [fifo_len_examples[i % 4] for i in range(batch_size)]
    ).to(self.device)
    return chunk, chunk_lengths, spkcache, spkcache_lengths, fifo, fifo_lengths
|
| 44 |
+
|
| 45 |
+
|
| 46 |
+
# Build random example tensors and export the model to ONNX, then re-import
# the ONNX graph as a torch module so coremltools can convert it.
inputs = streaming_input_examples(model)

export_out = model.export("streaming-sortformer.onnx", input_example=inputs)
scripted_model = onnx2torch.convert('streaming-sortformer.onnx')

# Fixed tensor dimensions for the CoreML conversion — these must match the
# example tensors produced by streaming_input_examples above.
BATCH_SIZE = 4
CHUNK_LEN = 120
FEAT_DIM = 128
CACHE_LEN = 188
EMBED_DIM = 512

ct_inputs = [
    ct.TensorType(name="chunk", shape=(BATCH_SIZE, CHUNK_LEN, FEAT_DIM)),
    ct.TensorType(name="chunk_lens", shape=(BATCH_SIZE,)),
    ct.TensorType(name="spkcache", shape=(BATCH_SIZE, CACHE_LEN, EMBED_DIM)),
    ct.TensorType(name="spkcache_lens", shape=(BATCH_SIZE,)),
    ct.TensorType(name="fifo", shape=(BATCH_SIZE, CACHE_LEN, EMBED_DIM)),
    ct.TensorType(name="fifo_lens", shape=(BATCH_SIZE,)),
]

# Output shapes are left unspecified so coremltools infers them from the graph.
ct_outputs = [
    ct.TensorType(name="preds"),
    ct.TensorType(name="new_spkcache"),
    ct.TensorType(name="new_spkcache_lens"),
    ct.TensorType(name="new_fifo"),
    ct.TensorType(name="new_fifo_lens"),
]

# Convert to an ML Program targeting iOS 17 with FP16 compute precision.
# NOTE(review): the returned MLModel is not saved here — confirm whether a
# .save(...) call was intended.
ct.convert(
    scripted_model,
    inputs=ct_inputs,
    outputs=ct_outputs,
    convert_to="mlprogram",
    minimum_deployment_target=ct.target.iOS17,
    compute_precision=ct.precision.FLOAT16,
)
|
convert_dynamic.py
ADDED
|
@@ -0,0 +1,302 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
Convert Sortformer to CoreML with proper dynamic length handling.
|
| 4 |
+
|
| 5 |
+
The key issue: Original conversion traced with fixed lengths (spkcache=120, fifo=40),
|
| 6 |
+
but at runtime we need to handle empty state (spkcache=0, fifo=0) for first chunk.
|
| 7 |
+
|
| 8 |
+
Solution: Use scripting instead of tracing, or trace with multiple example lengths.
|
| 9 |
+
"""
|
| 10 |
+
|
| 11 |
+
import torch
|
| 12 |
+
import torch.nn as nn
|
| 13 |
+
import coremltools as ct
|
| 14 |
+
import numpy as np
|
| 15 |
+
import os
|
| 16 |
+
import sys
|
| 17 |
+
|
| 18 |
+
SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
|
| 19 |
+
sys.path.insert(0, os.path.join(SCRIPT_DIR, 'NeMo'))
|
| 20 |
+
|
| 21 |
+
from nemo.collections.asr.models import SortformerEncLabelModel
|
| 22 |
+
|
| 23 |
+
print("=" * 70)
print("CONVERTING SORTFORMER WITH DYNAMIC LENGTH SUPPORT")
print("=" * 70)

# Load model
# Restored on CPU; strict=False tolerates checkpoint/config key mismatches.
model_path = os.path.join(SCRIPT_DIR, 'diar_streaming_sortformer_4spk-v2.nemo')
print(f"Loading model: {model_path}")
model = SortformerEncLabelModel.restore_from(model_path, map_location='cpu', strict=False)
model.eval()

# Configure for low-latency streaming
# These overrides shrink the per-step chunk and bound the streaming state.
modules = model.sortformer_modules
modules.chunk_len = 6
modules.chunk_left_context = 1
modules.chunk_right_context = 1
modules.fifo_len = 40
modules.spkcache_len = 120
modules.spkcache_update_period = 30

print(f"Config: chunk_len={modules.chunk_len}, left={modules.chunk_left_context}, right={modules.chunk_right_context}")
print(f" fifo_len={modules.fifo_len}, spkcache_len={modules.spkcache_len}")

# Dimensions
# Raw feature frames fed per step: (left + chunk + right) windows, each
# subsampling_factor frames wide.
chunk_frames = (modules.chunk_len + modules.chunk_left_context + modules.chunk_right_context) * modules.subsampling_factor
fc_d_model = modules.fc_d_model # 512
feat_dim = 128

print(f"Chunk frames: {chunk_frames}")
|
| 51 |
+
|
| 52 |
+
class DynamicPreEncoderWrapper(nn.Module):
    """Pre-encoder that packs variable-length state into a fixed buffer.

    Runs the model's pre-encoder over the incoming chunk, then packs the
    valid frames of speaker cache, FIFO buffer and chunk embeddings
    back-to-back at the start of a fixed-size, zero-padded tensor.
    """

    def __init__(self, model, max_spkcache=120, max_fifo=40, max_chunk=8):
        super().__init__()
        self.model = model
        self.max_spkcache = max_spkcache
        self.max_fifo = max_fifo
        self.max_chunk = max_chunk
        # Fixed output capacity: worst case, all three segments are full.
        self.max_total = max_spkcache + max_fifo + max_chunk

    def forward(self, chunk, chunk_lengths, spkcache, spkcache_lengths, fifo, fifo_lengths):
        # Pre-encode the incoming chunk (conformer subsampling front-end).
        chunk_embs, chunk_emb_lengths = self.model.encoder.pre_encode(x=chunk, lengths=chunk_lengths)

        # Resolve per-segment valid lengths as python ints (0 for empty state).
        n_spk = spkcache_lengths[0].item() if spkcache_lengths.numel() > 0 else 0
        n_fifo = fifo_lengths[0].item() if fifo_lengths.numel() > 0 else 0
        n_chunk = chunk_emb_lengths[0].item()

        batch, _, dim = spkcache.shape
        packed = torch.zeros(batch, self.max_total, dim, device=chunk.device, dtype=chunk.dtype)

        # Pack segments contiguously: [spkcache | fifo | chunk | zeros].
        cursor = 0
        for source, n_valid in ((spkcache, n_spk), (fifo, n_fifo), (chunk_embs, n_chunk)):
            if n_valid > 0:
                packed[:, cursor:cursor + n_valid, :] = source[:, :n_valid, :]
                cursor += n_valid

        # NOTE(review): created on the default device (CPU), matching the
        # original behavior — confirm this is intended for MPS runs.
        total_length = torch.tensor([cursor], dtype=torch.long)

        return packed, total_length, chunk_embs, chunk_emb_lengths
|
| 87 |
+
|
| 88 |
+
|
| 89 |
+
class DynamicHeadWrapper(nn.Module):
    """Conformer encoder + diarization head with length-aware output masking."""

    def __init__(self, model):
        super().__init__()
        self.model = model

    def forward(self, pre_encoder_embs, pre_encoder_lengths, chunk_embs, chunk_emb_lengths):
        # Encode the packed pre-encoded sequence; the model's own pre-encode
        # stage is bypassed because inputs are already embeddings.
        enc_embs, enc_lengths = self.model.frontend_encoder(
            processed_signal=pre_encoder_embs,
            processed_signal_length=pre_encoder_lengths,
            bypass_pre_encode=True,
        )

        # Per-frame speaker activity predictions: [B, T, num_speakers].
        preds = self.model.forward_infer(enc_embs, enc_lengths)

        # Zero out frames past the valid length so padding never leaks
        # into downstream post-processing.
        valid = pre_encoder_lengths[0]
        frame_idx = torch.arange(preds.shape[1], device=preds.device)
        keep = (frame_idx < valid).float().unsqueeze(0).unsqueeze(-1)
        preds = preds * keep

        return preds, chunk_embs, chunk_emb_lengths
|
| 115 |
+
|
| 116 |
+
|
| 117 |
+
# Test with both empty and full state
print("\n" + "=" * 70)
print("TESTING DYNAMIC WRAPPERS")
print("=" * 70)

pre_encoder = DynamicPreEncoderWrapper(model)
head = DynamicHeadWrapper(model)
pre_encoder.eval()
head.eval()

# Test 1: Empty state (like chunk 0)
# Exercises the zero-length spkcache/fifo path that tracing mishandles.
print("\nTest 1: Empty state (chunk 0)")
chunk = torch.randn(1, 56, 128) # First chunk has fewer frames
chunk_len = torch.tensor([56], dtype=torch.long)
spkcache = torch.zeros(1, 120, 512)
spkcache_len = torch.tensor([0], dtype=torch.long)
fifo = torch.zeros(1, 40, 512)
fifo_len = torch.tensor([0], dtype=torch.long)

with torch.no_grad():
    pre_out, pre_len, chunk_embs, chunk_emb_len = pre_encoder(
        chunk, chunk_len, spkcache, spkcache_len, fifo, fifo_len
    )
    preds, _, _ = head(pre_out, pre_len, chunk_embs, chunk_emb_len)

print(f" Pre-encoder output: {pre_out.shape}, length={pre_len.item()}")
print(f" Chunk embeddings: {chunk_embs.shape}, length={chunk_emb_len.item()}")
print(f" Predictions: {preds.shape}")
sums = [f"{preds[0, i, :].sum().item():.4f}" for i in range(min(8, preds.shape[1]))]
print(f" First 8 pred frames sum: {sums}")

# Test 2: Full state
# Same pipeline with the caches at maximum occupancy.
print("\nTest 2: Full state")
chunk = torch.randn(1, 64, 128)
chunk_len = torch.tensor([64], dtype=torch.long)
spkcache = torch.randn(1, 120, 512)
spkcache_len = torch.tensor([120], dtype=torch.long)
fifo = torch.randn(1, 40, 512)
fifo_len = torch.tensor([40], dtype=torch.long)

with torch.no_grad():
    pre_out, pre_len, chunk_embs, chunk_emb_len = pre_encoder(
        chunk, chunk_len, spkcache, spkcache_len, fifo, fifo_len
    )
    preds, _, _ = head(pre_out, pre_len, chunk_embs, chunk_emb_len)

print(f" Pre-encoder output: {pre_out.shape}, length={pre_len.item()}")
print(f" Chunk embeddings: {chunk_embs.shape}, length={chunk_emb_len.item()}")
print(f" Predictions: {preds.shape}")

print("\n" + "=" * 70)
print("ISSUE IDENTIFIED")
print("=" * 70)
print("""
The problem is that the current CoreML model was traced with FIXED lengths.
When lengths change at runtime, the traced operations don't adapt.

The fix requires re-tracing with proper dynamic handling OR using coremltools
flexible shapes feature.

For now, let's try a simpler approach: always pad inputs to max size and
use the length parameters only for extracting the correct output slice.
""")

# The issue is that torch.jit.trace captures specific tensor values
# We need to use torch.jit.script for truly dynamic behavior
# But many NeMo operations don't work with script

print("\nATTEMPTING CONVERSION WITH FLEXIBLE SHAPES...")
|
| 186 |
+
|
| 187 |
+
# Try using coremltools range shapes
# Conversion failures are caught below and reported with a traceback.
try:
    # Create wrapper that handles the length masking internally
|
| 190 |
+
class SimplePipelineWrapper(nn.Module):
    """Full streaming step: pre-encode -> pack state -> encode -> predict.

    The packed buffer always has fixed size 168 (= spkcache 120 + fifo 40 +
    chunk 8, matching NeMo's internal layout); the length tensors mark the
    valid region.
    """

    def __init__(self, model):
        super().__init__()
        self.model = model

    def forward(self, chunk, chunk_lengths, spkcache, spkcache_lengths, fifo, fifo_lengths):
        # Pre-encode chunk (conformer subsampling front-end).
        chunk_embs, chunk_emb_lens = self.model.encoder.pre_encode(x=chunk, lengths=chunk_lengths)

        # Keep valid lengths as 0-d tensors so the masking ops below stay
        # inside the graph under torch.jit.trace.
        spk_len = spkcache_lengths[0]
        fifo_len = fifo_lengths[0]
        chunk_len = chunk_emb_lens[0]

        B = chunk.shape[0]
        max_out = 168  # 120 + 40 + 8
        D = 512

        concat_embs = torch.zeros(B, max_out, D, device=chunk.device, dtype=chunk.dtype)

        # Vectorized masked copies instead of per-frame python loops.
        # The original `for i in range(...): if i < length` pattern was both
        # O(T) python overhead and trace-unsafe: torch.jit.trace evaluates
        # each `if` eagerly and freezes the branch taken for the example
        # input's lengths, baking those lengths into the exported graph.
        # A boolean mask keeps the length genuinely dynamic.
        spk_mask = (torch.arange(120, device=chunk.device) < spk_len).to(chunk.dtype)
        concat_embs[:, :120, :] = spkcache[:, :120, :] * spk_mask.view(1, 120, 1)

        fifo_mask = (torch.arange(40, device=chunk.device) < fifo_len).to(chunk.dtype)
        concat_embs[:, 120:160, :] = fifo[:, :40, :] * fifo_mask.view(1, 40, 1)

        # Trace-time constant: pre_encode of a full chunk yields up to 8 frames.
        n_chunk = min(chunk_embs.shape[1], 8)
        chunk_mask = (torch.arange(n_chunk, device=chunk.device) < chunk_len).to(chunk.dtype)
        concat_embs[:, 160:160 + n_chunk, :] = chunk_embs[:, :n_chunk, :] * chunk_mask.view(1, n_chunk, 1)

        total_len = spk_len + fifo_len + chunk_len
        total_lens = total_len.unsqueeze(0)

        # Run through encoder (pre-encode bypassed: inputs are embeddings).
        fc_embs, fc_lens = self.model.frontend_encoder(
            processed_signal=concat_embs,
            processed_signal_length=total_lens,
            bypass_pre_encode=True,
        )

        # Get per-frame speaker predictions.
        preds = self.model.forward_infer(fc_embs, fc_lens)

        return preds, chunk_embs, chunk_emb_lens
|
| 241 |
+
|
| 242 |
+
wrapper = SimplePipelineWrapper(model)
wrapper.eval()

# Trace with empty state example
# (chunk 0 scenario: both caches declared empty via zero lengths).
print("Tracing with empty state example...")
chunk = torch.randn(1, 64, 128)
chunk_len = torch.tensor([56], dtype=torch.long) # Actual length
spkcache = torch.zeros(1, 120, 512)
spkcache_len = torch.tensor([0], dtype=torch.long)
fifo = torch.zeros(1, 40, 512)
fifo_len = torch.tensor([0], dtype=torch.long)

with torch.no_grad():
    traced = torch.jit.trace(wrapper, (chunk, chunk_len, spkcache, spkcache_len, fifo, fifo_len))

# Convert the traced module; shapes below must match the example tensors.
print("Converting to CoreML...")
mlmodel = ct.convert(
    traced,
    inputs=[
        ct.TensorType(name="chunk", shape=(1, 64, 128), dtype=np.float32),
        ct.TensorType(name="chunk_lengths", shape=(1,), dtype=np.int32),
        ct.TensorType(name="spkcache", shape=(1, 120, 512), dtype=np.float32),
        ct.TensorType(name="spkcache_lengths", shape=(1,), dtype=np.int32),
        ct.TensorType(name="fifo", shape=(1, 40, 512), dtype=np.float32),
        ct.TensorType(name="fifo_lengths", shape=(1,), dtype=np.int32),
    ],
    outputs=[
        ct.TensorType(name="speaker_preds", dtype=np.float32),
        ct.TensorType(name="chunk_pre_encoder_embs", dtype=np.float32),
        ct.TensorType(name="chunk_pre_encoder_lengths", dtype=np.int32),
    ],
    minimum_deployment_target=ct.target.iOS16,
    compute_precision=ct.precision.FLOAT32,
    compute_units=ct.ComputeUnit.CPU_ONLY, # Start with CPU for debugging
)

output_path = os.path.join(SCRIPT_DIR, 'coreml_models', 'SortformerPipeline_Dynamic.mlpackage')
mlmodel.save(output_path)
print(f"Saved to: {output_path}")

# Test the new model
# Round-trip the same example tensors through the converted model.
print("\nTesting new CoreML model...")
test_output = mlmodel.predict({
    'chunk': chunk.numpy(),
    'chunk_lengths': chunk_len.numpy().astype(np.int32),
    'spkcache': spkcache.numpy(),
    'spkcache_lengths': spkcache_len.numpy().astype(np.int32),
    'fifo': fifo.numpy(),
    'fifo_lengths': fifo_len.numpy().astype(np.int32),
})

coreml_preds = np.array(test_output['speaker_preds'])
print(f"CoreML predictions shape: {coreml_preds.shape}")
print(f"CoreML first 8 frames:")
for i in range(min(8, coreml_preds.shape[1])):
    print(f" Frame {i}: {coreml_preds[0, i, :]}")

except Exception as e:
    # Conversion/tracing can fail for many NeMo ops; report and continue.
    print(f"Error during conversion: {e}")
    import traceback
    traceback.print_exc()
|
coreml_wrappers.py
ADDED
|
@@ -0,0 +1,215 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import torch
|
| 2 |
+
from torch import nn
|
| 3 |
+
from safe_concat import *
|
| 4 |
+
from nemo.collections.asr.models import SortformerEncLabelModel
|
| 5 |
+
|
| 6 |
+
|
| 7 |
+
def fixed_concat_and_pad(embs, lengths, max_total_len=188 + 188 + 6):
    """Pack the valid frames of three variable-length segments into one
    fixed-size, zero-padded tensor without any zero-length slicing.

    A single ``gather`` with arithmetically computed indices relocates the
    valid prefix of each segment, which keeps the graph ANE/CoreML friendly.

    Args:
        embs: [spkcache, fifo, chunk] tensors, each shaped (B, seq_len, D).
        lengths: matching per-segment valid lengths ((1,) or scalar tensors);
            the first two may be 0, the chunk length is always positive.
        max_total_len: fixed output sequence length (zero padded).

    Returns:
        (output, total_length): output is (B, max_total_len, D) with all
        valid frames packed at the front; total_length is the sum of the
        three valid lengths.
    """
    batch, _, dim = embs[0].shape
    dev = embs[0].device

    # Static per-segment capacities — known at trace time, so they become
    # constants in the exported graph.
    cap0, cap1, cap2 = embs[0].shape[1], embs[1].shape[1], embs[2].shape[1]
    capacity = cap0 + cap1 + cap2

    # One full-size concat: never slices by a (possibly zero) runtime length.
    stacked = torch.cat(embs, dim=1)  # (B, capacity, D)

    # Runtime valid lengths as 0-d tensors for cheap broadcasting.
    n0 = lengths[0].reshape(())
    n1 = lengths[1].reshape(())
    n2 = lengths[2].reshape(())
    total_length = n0 + n1 + n2

    positions = torch.arange(max_total_len, device=dev, dtype=torch.long)

    # Output position p maps to source index p + shift, where the shift
    # skips the invalid tail of every segment already passed:
    #   p in segment 0:  shift = 0
    #   p in segment 1:  shift = cap0 - n0
    #   p in segment 2:  shift = (cap0 - n0) + (cap1 - n1)
    past_seg0 = (positions >= n0).long()
    past_seg1 = (positions >= n0 + n1).long()
    shift = past_seg0 * (cap0 - n0) + past_seg1 * (cap1 - n1)
    src_idx = (positions + shift).clamp(0, capacity - 1)

    # Broadcast the index map over batch and feature dims for gather.
    src_idx = src_idx.unsqueeze(0).unsqueeze(-1).expand(batch, max_total_len, dim)
    output = torch.gather(stacked, dim=1, index=src_idx)

    # Zero everything past the packed valid region (clamped tail frames).
    output = output * (positions < total_length).float().unsqueeze(0).unsqueeze(-1)

    return output, total_length
|
| 77 |
+
|
| 78 |
+
|
| 79 |
+
class PreprocessorWrapper(nn.Module):
    """Thin export shim around the NeMo preprocessor (FilterbankFeaturesTA).

    Gives CoreML export a plain (audio, length) -> (features, length)
    calling convention; features come back shaped [B, D, T].
    """

    def __init__(self, preprocessor):
        super().__init__()
        self.preprocessor = preprocessor

    def forward(self, audio_signal, length):
        # Delegate directly — NeMo already returns (features, length).
        return self.preprocessor(input_signal=audio_signal, length=length)
|
| 93 |
+
|
| 94 |
+
|
| 95 |
+
class SortformerHeadWrapper(nn.Module):
    """Conformer encoder + diarization head; passes chunk embeddings through."""

    def __init__(self, model):
        super().__init__()
        self.model = model

    def forward(self, pre_encoder_embs, pre_encoder_lengths, chunk_pre_encoder_embs, chunk_pre_encoder_lengths):
        # Encode the packed [spkcache|fifo|chunk] sequence; the pre-encode
        # stage is bypassed because the inputs are already embeddings.
        enc_embs, enc_lengths = self.model.frontend_encoder(
            processed_signal=pre_encoder_embs,
            processed_signal_length=pre_encoder_lengths,
            bypass_pre_encode=True,
        )

        # Per-frame speaker activity predictions.
        preds = self.model.forward_infer(enc_embs, enc_lengths)

        # Chunk embeddings are returned untouched so the caller can update
        # its streaming state outside the model.
        return preds, chunk_pre_encoder_embs, chunk_pre_encoder_lengths
|
| 112 |
+
|
| 113 |
+
|
| 114 |
+
class SortformerCoreMLWrapper(nn.Module):
    """End-to-end export wrapper: pre-encode + pack, encode, predict.

    Composes PreEncoderWrapper with the model's frontend encoder and
    inference head so one streaming step traces as a single graph.
    """

    def __init__(self, model):
        super().__init__()
        self.model = model
        self.pre_encoder = PreEncoderWrapper(model)

    def forward(self, chunk, chunk_lengths, spkcache, spkcache_lengths, fifo, fifo_lengths):
        # Pre-encode the chunk and pack it together with the cached state.
        (packed_embs, packed_lengths,
         chunk_pre_encode_embs, chunk_pre_encode_lengths) = self.pre_encoder(
            chunk, chunk_lengths, spkcache, spkcache_lengths, fifo, fifo_lengths
        )

        # Encode the packed sequence (pre-encode stage bypassed: inputs are
        # already embeddings).
        enc_embs, enc_lengths = self.model.frontend_encoder(
            processed_signal=packed_embs,
            processed_signal_length=packed_lengths,
            bypass_pre_encode=True,
        )

        # Per-frame speaker activity predictions.
        preds = self.model.forward_infer(enc_embs, enc_lengths)

        return preds, chunk_pre_encode_embs, chunk_pre_encode_lengths
|
| 143 |
+
|
| 144 |
+
|
| 145 |
+
class PreEncoderWrapper(nn.Module):
    """Chunk pre-encoder with optional state packing.

    Called with 6 arguments it pre-encodes the chunk and packs it behind
    the speaker-cache and FIFO state; called with 2 it only pre-encodes.
    """

    def __init__(self, model):
        super().__init__()
        self.model = model
        mods = model.sortformer_modules
        window = mods.chunk_left_context + mods.chunk_len + mods.chunk_right_context
        # Fixed packed-sequence capacity: cache + fifo + (context-padded) chunk.
        self.pre_encoder_length = mods.spkcache_len + mods.fifo_len + window

    def forward(self, *args):
        # Dispatch on arity so one exported entry point serves both modes.
        return self.forward_concat(*args) if len(args) == 6 else self.forward_pre_encode(*args)

    def forward_concat(self, chunk, chunk_lengths, spkcache, spkcache_lengths, fifo, fifo_lengths):
        # Reuse the single-mode path, then pack behind the cached state.
        chunk_pre_encode_embs, chunk_pre_encode_lengths = self.forward_pre_encode(chunk, chunk_lengths)
        packed_embs, packed_lengths = fixed_concat_and_pad(
            [spkcache, fifo, chunk_pre_encode_embs],
            [spkcache_lengths, fifo_lengths, chunk_pre_encode_lengths],
            self.pre_encoder_length
        )
        return (packed_embs, packed_lengths,
                chunk_pre_encode_embs, chunk_pre_encode_lengths)

    def forward_pre_encode(self, chunk, chunk_lengths):
        embs, emb_lengths = self.model.encoder.pre_encode(x=chunk, lengths=chunk_lengths)
        # Ensure int64 lengths — pre_encode may return a narrower dtype.
        return embs, emb_lengths.to(torch.int64)
|
| 180 |
+
|
| 181 |
+
|
| 182 |
+
class ConformerEncoderWrapper(nn.Module):
    """Runs the conformer frontend over already pre-encoded embeddings."""

    def __init__(self, model):
        super().__init__()
        self.model = model

    def forward(self, pre_encode_embs, pre_encode_lengths):
        # Inputs are embeddings, so the model's own pre-encode stage is skipped.
        enc_embs, enc_lengths = self.model.frontend_encoder(
            processed_signal=pre_encode_embs,
            processed_signal_length=pre_encode_lengths,
            bypass_pre_encode=True,
        )
        return enc_embs, enc_lengths
|
| 199 |
+
|
| 200 |
+
|
| 201 |
+
class SortformerEncoderWrapper(nn.Module):
    """Exposes only the diarization head (``forward_infer``) for export."""

    def __init__(self, model):
        super().__init__()
        self.model = model

    def forward(self, encoder_embs, encoder_lengths):
        # Map encoder embeddings to per-frame speaker activity predictions.
        return self.model.forward_infer(encoder_embs, encoder_lengths)
|
mic_inference.py
ADDED
|
@@ -0,0 +1,519 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Real-Time Microphone Diarization with CoreML
|
| 3 |
+
|
| 4 |
+
This script captures audio from the microphone in real-time,
|
| 5 |
+
processes it through CoreML models, and displays a live updating
|
| 6 |
+
diarization heatmap.
|
| 7 |
+
|
| 8 |
+
Pipeline: Microphone → Audio Buffer → CoreML Preproc → CoreML Main → Live Plot
|
| 9 |
+
|
| 10 |
+
Requirements:
|
| 11 |
+
pip install pyaudio matplotlib seaborn numpy coremltools
|
| 12 |
+
|
| 13 |
+
Usage:
|
| 14 |
+
python mic_inference.py
|
| 15 |
+
"""
|
| 16 |
+
import os
|
| 17 |
+
os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE"
|
| 18 |
+
|
| 19 |
+
import torch
|
| 20 |
+
import numpy as np
|
| 21 |
+
import coremltools as ct
|
| 22 |
+
import matplotlib.pyplot as plt
|
| 23 |
+
import matplotlib
|
| 24 |
+
matplotlib.use('TkAgg')
|
| 25 |
+
import seaborn as sns
|
| 26 |
+
import threading
|
| 27 |
+
import queue
|
| 28 |
+
import time
|
| 29 |
+
import math
|
| 30 |
+
import argparse
|
| 31 |
+
|
| 32 |
+
# Import NeMo for state management
|
| 33 |
+
from nemo.collections.asr.models import SortformerEncLabelModel
|
| 34 |
+
|
| 35 |
+
# Optional microphone backend: degrade gracefully when sounddevice is missing
# so that non-mic code paths can still run.
try:
    import sounddevice as sd
    SOUNDDEVICE_AVAILABLE = True
except ImportError:
    # BUG FIX: this branch previously re-ran `import sounddevice`, which
    # re-raised ImportError and made the fallback unreachable.
    sd = None
    SOUNDDEVICE_AVAILABLE = False
    print("Warning: sounddevice not available. Install with: pip install sounddevice")


# ============================================================
# Configuration
# ============================================================
CONFIG = {
    # Streaming Sortformer geometry (units: post-subsampling frames).
    'chunk_len': 6,
    'chunk_right_context': 1,
    'chunk_left_context': 1,
    'fifo_len': 40,
    'spkcache_len': 120,
    'spkcache_update_period': 32,
    'subsampling_factor': 8,

    # Mel preprocessor parameters (samples @ 16 kHz: 25 ms window, 10 ms stride).
    'sample_rate': 16000,
    'mel_window': 400,
    'mel_stride': 160,

    # Audio settings
    'audio_chunk_samples': 1280,  # 80ms chunks from mic
    'channels': 1,
}

# Derived sizes used by the CoreML pipeline.
CONFIG['spkcache_input_len'] = CONFIG['spkcache_len']
CONFIG['fifo_input_len'] = CONFIG['fifo_len']
# Mel frames per diarizer input chunk (core + left/right context, pre-subsampling).
CONFIG['chunk_frames'] = (CONFIG['chunk_len'] + CONFIG['chunk_left_context'] + CONFIG['chunk_right_context']) * CONFIG['subsampling_factor']
# Raw samples needed to produce exactly chunk_frames mel frames.
CONFIG['preproc_audio_samples'] = (CONFIG['chunk_frames'] - 1) * CONFIG['mel_stride'] + CONFIG['mel_window']
|
| 68 |
+
|
| 69 |
+
class MicrophoneStream:
    """Pushes fixed-size microphone blocks onto a queue via sounddevice."""

    def __init__(self, sample_rate, chunk_size, audio_queue):
        # The stream itself is created lazily in start().
        self.sample_rate = sample_rate
        self.chunk_size = chunk_size
        self.audio_queue = audio_queue
        self.stream = None
        self.running = False

    def start(self):
        """Open and start the input stream. Returns True on success."""
        if not SOUNDDEVICE_AVAILABLE:
            print("sounddevice not available!")
            return False

        def _on_block(indata, frames, time_info, status):
            # Runs on sounddevice's audio thread; keep the work minimal.
            if status:
                print(f"Audio status: {status}")
            # indata is already float32 in range [-1, 1]
            mono = indata[:, 0].copy()  # keep only the first channel
            self.audio_queue.put(mono)

        self.stream = sd.InputStream(
            samplerate=self.sample_rate,
            channels=1,
            dtype=np.float32,
            blocksize=self.chunk_size,
            callback=_on_block,
        )
        self.stream.start()
        self.running = True
        print("Microphone started...")
        return True

    def stop(self):
        """Stop and close the stream if one was opened."""
        self.running = False
        if self.stream:
            self.stream.stop()
            self.stream.close()
        print("Microphone stopped.")
|
| 109 |
+
|
| 110 |
+
|
| 111 |
+
class StreamingDiarizer:
    """Real-time streaming diarization using CoreML.

    Owns three pieces of rolling state:
      * ``audio_buffer``   - raw float32 samples not yet featurized
      * ``feature_buffer`` - mel features not yet consumed by the diarizer
        (time is the last axis: sliced and concatenated on axis 2)
      * ``state``          - NeMo streaming state (speaker cache + FIFO)

    Push samples with ``add_audio``; pull results with ``process``, which
    runs the CoreML preprocessor and main model and returns the newest
    per-frame speaker probabilities.
    """

    def __init__(self, nemo_model, preproc_model, main_model, config):
        # NeMo helpers: provide init_streaming_state / streaming_update.
        self.modules = nemo_model.sortformer_modules
        self.preproc_model = preproc_model  # CoreML: audio -> mel features
        self.main_model = main_model        # CoreML: features + state -> speaker preds
        self.config = config

        # Audio buffer
        self.audio_buffer = np.array([], dtype=np.float32)

        # Feature buffer
        self.feature_buffer = None
        self.features_processed = 0

        # Diarization state
        self.state = self.modules.init_streaming_state(batch_size=1, device='cpu')
        self.all_probs = []  # List of [T, 4] arrays

        # Chunk tracking
        self.diar_chunk_idx = 0     # next diarizer chunk to emit
        self.preproc_chunk_idx = 0  # number of preprocessor windows consumed

        # Derived params (all in mel-frame units)
        self.subsampling = config['subsampling_factor']
        self.core_frames = config['chunk_len'] * self.subsampling
        self.left_ctx = config['chunk_left_context'] * self.subsampling
        self.right_ctx = config['chunk_right_context'] * self.subsampling

        # Audio hop for preprocessor
        self.audio_hop = config['preproc_audio_samples'] - config['mel_window']
        # Leading mel frames of each window that re-cover audio from the
        # previous window; dropped on every window after the first.
        self.overlap_frames = (config['mel_window'] - config['mel_stride']) // config['mel_stride'] + 1

    def add_audio(self, audio_chunk):
        """Add new audio samples."""
        self.audio_buffer = np.concatenate([self.audio_buffer, audio_chunk])

    def process(self):
        """
        Process available audio through preprocessor and diarizer.
        Returns new probability frames if available.
        """
        new_probs = None

        # Step 1: Run preprocessor on available audio
        while len(self.audio_buffer) >= self.config['preproc_audio_samples']:
            audio_chunk = self.audio_buffer[:self.config['preproc_audio_samples']]

            preproc_inputs = {
                "audio_signal": audio_chunk.reshape(1, -1).astype(np.float32),
                "length": np.array([self.config['preproc_audio_samples']], dtype=np.int32)
            }

            preproc_out = self.preproc_model.predict(preproc_inputs)
            # assumes features are [1, mel, T] with time on axis 2 — TODO confirm
            # against the exported preprocessor's output description
            feat_chunk = np.array(preproc_out["features"])
            feat_len = int(preproc_out["feature_lengths"][0])

            if self.preproc_chunk_idx == 0:
                valid_feats = feat_chunk[:, :, :feat_len]
            else:
                # Drop frames already produced by the previous window.
                valid_feats = feat_chunk[:, :, self.overlap_frames:feat_len]

            if self.feature_buffer is None:
                self.feature_buffer = valid_feats
            else:
                self.feature_buffer = np.concatenate([self.feature_buffer, valid_feats], axis=2)

            # Advance by the hop (window length minus one mel window).
            self.audio_buffer = self.audio_buffer[self.audio_hop:]
            self.preproc_chunk_idx += 1

        if self.feature_buffer is None:
            return None

        # Step 2: Run diarization on available features
        total_features = self.feature_buffer.shape[2]

        while True:
            # Calculate chunk boundaries
            chunk_start = self.diar_chunk_idx * self.core_frames
            chunk_end = chunk_start + self.core_frames

            # Need right context
            required_features = chunk_end + self.right_ctx

            if required_features > total_features:
                break  # Not enough features yet

            # Extract with context
            left_offset = min(self.left_ctx, chunk_start)
            right_offset = min(self.right_ctx, total_features - chunk_end)

            feat_start = chunk_start - left_offset
            feat_end = chunk_end + right_offset

            chunk_feat = self.feature_buffer[:, :, feat_start:feat_end]
            chunk_feat_tensor = torch.from_numpy(chunk_feat).float()
            actual_len = chunk_feat.shape[2]

            # Transpose to [B, T, D]
            chunk_t = chunk_feat_tensor.transpose(1, 2)

            # Pad if needed — the CoreML model takes a fixed chunk_frames length;
            # the true length travels separately in chunk_lengths.
            if actual_len < self.config['chunk_frames']:
                pad_len = self.config['chunk_frames'] - actual_len
                chunk_in = torch.nn.functional.pad(chunk_t, (0, 0, 0, pad_len))
            else:
                chunk_in = chunk_t[:, :self.config['chunk_frames'], :]

            # State preparation: pad/trim the speaker cache and FIFO to the
            # fixed sizes the exported model expects; true lengths are passed
            # alongside so padding is ignorable.
            curr_spk_len = self.state.spkcache.shape[1]
            curr_fifo_len = self.state.fifo.shape[1]

            current_spkcache = self.state.spkcache
            if curr_spk_len < self.config['spkcache_input_len']:
                current_spkcache = torch.nn.functional.pad(
                    current_spkcache, (0, 0, 0, self.config['spkcache_input_len'] - curr_spk_len)
                )
            elif curr_spk_len > self.config['spkcache_input_len']:
                current_spkcache = current_spkcache[:, :self.config['spkcache_input_len'], :]

            current_fifo = self.state.fifo
            if curr_fifo_len < self.config['fifo_input_len']:
                current_fifo = torch.nn.functional.pad(
                    current_fifo, (0, 0, 0, self.config['fifo_input_len'] - curr_fifo_len)
                )
            elif curr_fifo_len > self.config['fifo_input_len']:
                current_fifo = current_fifo[:, :self.config['fifo_input_len'], :]

            # CoreML inference
            coreml_inputs = {
                "chunk": chunk_in.numpy().astype(np.float32),
                "chunk_lengths": np.array([actual_len], dtype=np.int32),
                "spkcache": current_spkcache.numpy().astype(np.float32),
                "spkcache_lengths": np.array([curr_spk_len], dtype=np.int32),
                "fifo": current_fifo.numpy().astype(np.float32),
                "fifo_lengths": np.array([curr_fifo_len], dtype=np.int32)
            }

            st_time = time.time_ns()
            coreml_out = self.main_model.predict(coreml_inputs)
            ed_time = time.time_ns()
            print(f"duration: {1e-6 * (ed_time - st_time)}")  # per-chunk latency, ms

            pred_logits = torch.from_numpy(coreml_out["speaker_preds"])
            chunk_embs = torch.from_numpy(coreml_out["chunk_pre_encoder_embs"])
            chunk_emb_len = int(coreml_out["chunk_pre_encoder_lengths"][0])

            # Keep only valid (unpadded) embedding frames.
            chunk_embs = chunk_embs[:, :chunk_emb_len, :]

            # Convert context sizes from mel frames to post-subsampling frames.
            lc = round(left_offset / self.subsampling)
            rc = math.ceil(right_offset / self.subsampling)

            # NeMo updates the speaker cache / FIFO and returns the chunk's
            # probabilities with the context frames handled.
            self.state, chunk_probs = self.modules.streaming_update(
                streaming_state=self.state,
                chunk=chunk_embs,
                preds=pred_logits,
                lc=lc,
                rc=rc
            )

            # Store probabilities
            probs_np = chunk_probs.squeeze(0).detach().cpu().numpy()
            self.all_probs.append(probs_np)

            new_probs = probs_np
            self.diar_chunk_idx += 1

        return new_probs

    def get_all_probs(self):
        """Get all accumulated probabilities as one [total_frames, 4] array."""
        if len(self.all_probs) > 0:
            return np.concatenate(self.all_probs, axis=0)
        return None
|
| 286 |
+
|
| 287 |
+
|
| 288 |
+
def run_mic_inference(model_name, coreml_dir):
    """Run real-time microphone diarization.

    Args:
        model_name: Pretrained NeMo Sortformer identifier; the model is loaded
            only for its streaming-state helpers (``sortformer_modules``).
        coreml_dir: Directory containing the exported .mlpackage models.

    Captures mic audio into a queue, feeds it to a StreamingDiarizer, and
    redraws a live speaker-probability heatmap until Ctrl+C.
    """

    if not SOUNDDEVICE_AVAILABLE:
        print("Cannot run mic inference without sounddevice!")
        return

    print("=" * 70)
    print("Real-Time Microphone Diarization")
    print("=" * 70)

    # Load NeMo model
    print(f"\nLoading NeMo Model: {model_name}")
    nemo_model = SortformerEncLabelModel.from_pretrained(model_name, map_location="cpu")
    nemo_model.eval()

    # Configure the streaming geometry to match the exported CoreML models.
    modules = nemo_model.sortformer_modules
    modules.chunk_len = CONFIG['chunk_len']
    modules.chunk_right_context = CONFIG['chunk_right_context']
    modules.chunk_left_context = CONFIG['chunk_left_context']
    modules.fifo_len = CONFIG['fifo_len']
    modules.spkcache_len = CONFIG['spkcache_len']
    modules.spkcache_update_period = CONFIG['spkcache_update_period']

    # Disable dither/padding for deterministic features.
    if hasattr(nemo_model.preprocessor, 'featurizer'):
        nemo_model.preprocessor.featurizer.dither = 0.0
        nemo_model.preprocessor.featurizer.pad_to = 0

    # Load CoreML models
    print(f"Loading CoreML Models from {coreml_dir}...")
    preproc_model = ct.models.MLModel(
        os.path.join(coreml_dir, "Pipeline_Preprocessor.mlpackage"),
        compute_units=ct.ComputeUnit.CPU_ONLY
    )
    main_model = ct.models.MLModel(
        os.path.join(coreml_dir, "SortformerPipeline.mlpackage"),
        compute_units=ct.ComputeUnit.ALL
    )

    # Create diarizer
    diarizer = StreamingDiarizer(nemo_model, preproc_model, main_model, CONFIG)

    # Audio queue (filled by the sounddevice callback thread)
    audio_queue = queue.Queue()

    # Start microphone
    mic = MicrophoneStream(
        sample_rate=CONFIG['sample_rate'],
        chunk_size=CONFIG['audio_chunk_samples'],
        audio_queue=audio_queue
    )

    if not mic.start():
        return

    # Setup plot
    plt.ion()
    fig, ax = plt.subplots(figsize=(14, 4))

    print("\nListening... Press Ctrl+C to stop.\n")

    try:
        last_update = time.time()

        while True:
            # Get audio from queue
            while not audio_queue.empty():
                audio_chunk = audio_queue.get()
                diarizer.add_audio(audio_chunk)

            # Process
            new_probs = diarizer.process()

            # Update plot periodically
            if time.time() - last_update > 0.16:  # Update every 160ms
                all_probs = diarizer.get_all_probs()

                if all_probs is not None and len(all_probs) > 0:
                    ax.clear()

                    # Show last 200 frames (~16 seconds)
                    display_frames = min(200, len(all_probs))
                    display_probs = all_probs[-display_frames:]

                    sns.heatmap(
                        display_probs.T,
                        ax=ax,
                        cmap="viridis",
                        vmin=0, vmax=1,
                        yticklabels=[f"Spk {i}" for i in range(4)],
                        cbar=False
                    )

                    ax.set_xlabel("Time (frames, 80ms each)")
                    ax.set_ylabel("Speaker")
                    ax.set_title(f"Live Diarization - Total: {len(all_probs)} frames ({len(all_probs)*0.08:.1f}s)")

                    plt.draw()
                    plt.pause(0.01)

                last_update = time.time()

            time.sleep(0.01)

    except KeyboardInterrupt:
        print("\nStopping...")
    finally:
        mic.stop()
        plt.ioff()
        plt.close()

        # Final summary
        all_probs = diarizer.get_all_probs()
        if all_probs is not None:
            print(f"\nTotal processed: {len(all_probs)} frames ({len(all_probs)*0.08:.1f} seconds)")
|
| 404 |
+
|
| 405 |
+
|
| 406 |
+
def run_file_demo(model_name, coreml_dir, audio_path):
    """Run demo on audio file with live updating plot.

    Args:
        model_name: Pretrained NeMo Sortformer identifier (used for its
            streaming-state helpers).
        coreml_dir: Directory containing the exported .mlpackage models.
        audio_path: Audio file to stream; resampled to 16 kHz mono.

    Feeds the file to a StreamingDiarizer in mic-sized chunks and updates a
    heatmap after each step, simulating live streaming.
    """

    print("=" * 70)
    print("File Demo with Live Updating Plot")
    print("=" * 70)

    # Load NeMo model
    print(f"\nLoading NeMo Model: {model_name}")
    nemo_model = SortformerEncLabelModel.from_pretrained(model_name, map_location="cpu")
    nemo_model.eval()

    # Configure the streaming geometry to match the exported CoreML models.
    modules = nemo_model.sortformer_modules
    modules.chunk_len = CONFIG['chunk_len']
    modules.chunk_right_context = CONFIG['chunk_right_context']
    modules.chunk_left_context = CONFIG['chunk_left_context']
    modules.fifo_len = CONFIG['fifo_len']
    modules.spkcache_len = CONFIG['spkcache_len']
    modules.spkcache_update_period = CONFIG['spkcache_update_period']

    # Disable dither/padding for deterministic features.
    if hasattr(nemo_model.preprocessor, 'featurizer'):
        nemo_model.preprocessor.featurizer.dither = 0.0
        nemo_model.preprocessor.featurizer.pad_to = 0

    # Load CoreML models
    print(f"Loading CoreML Models from {coreml_dir}...")
    preproc_model = ct.models.MLModel(
        os.path.join(coreml_dir, "Pipeline_Preprocessor.mlpackage"),
        compute_units=ct.ComputeUnit.CPU_ONLY
    )
    main_model = ct.models.MLModel(
        os.path.join(coreml_dir, "SortformerPipeline.mlpackage"),
        compute_units=ct.ComputeUnit.ALL
    )

    # Load audio file
    import librosa
    audio, _ = librosa.load(audio_path, sr=CONFIG['sample_rate'], mono=True)
    print(f"Loaded audio: {len(audio)} samples ({len(audio)/CONFIG['sample_rate']:.1f}s)")

    # Create diarizer
    diarizer = StreamingDiarizer(nemo_model, preproc_model, main_model, CONFIG)

    # Setup plot
    plt.ion()
    fig, ax = plt.subplots(figsize=(14, 4))

    # Simulate streaming: feed the file in mic-sized chunks.
    chunk_size = CONFIG['audio_chunk_samples']
    offset = 0

    print("\nStreaming audio with live plot...")

    try:
        while offset < len(audio):
            # Add audio chunk
            chunk_end = min(offset + chunk_size, len(audio))
            audio_chunk = audio[offset:chunk_end]
            diarizer.add_audio(audio_chunk)
            offset = chunk_end

            # Process
            diarizer.process()

            # Update plot
            all_probs = diarizer.get_all_probs()

            if all_probs is not None and len(all_probs) > 0:
                ax.clear()

                sns.heatmap(
                    all_probs.T,
                    ax=ax,
                    cmap="viridis",
                    vmin=0, vmax=1,
                    yticklabels=[f"Spk {i}" for i in range(4)],
                    cbar=False
                )

                ax.set_xlabel("Time (frames, 80ms each)")
                ax.set_ylabel("Speaker")
                ax.set_title(f"Streaming Diarization - {len(all_probs)} frames")

                plt.draw()
                plt.pause(0.05)

            # Simulate real-time (optional - comment out for fast mode)
            # time.sleep(chunk_size / CONFIG['sample_rate'])

    except KeyboardInterrupt:
        print("\nStopped.")

    plt.ioff()

    # Final plot
    all_probs = diarizer.get_all_probs()
    if all_probs is not None:
        print(f"\nTotal: {len(all_probs)} frames ({len(all_probs)*0.08:.1f}s)")
        plt.show()
|
| 506 |
+
|
| 507 |
+
|
| 508 |
+
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--model_name", default="nvidia/diar_streaming_sortformer_4spk-v2.1")
    parser.add_argument("--coreml_dir", default="coreml_models")
    parser.add_argument("--audio_path", default="audio.wav")
    parser.add_argument("--mic", action="store_true", help="Use microphone input")
    args = parser.parse_args()

    # BUG FIX: --mic was parsed but ignored; mic inference always ran and the
    # intended dispatch was commented out. Honor the flag as documented.
    if args.mic:
        run_mic_inference(args.model_name, args.coreml_dir)
    else:
        run_file_demo(args.model_name, args.coreml_dir, args.audio_path)
|
nemo_streaming_reference.py
ADDED
|
@@ -0,0 +1,153 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""Get exact NeMo streaming inference output for comparison with Swift."""
|
| 3 |
+
|
| 4 |
+
import os
|
| 5 |
+
os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE"
|
| 6 |
+
|
| 7 |
+
import torch
|
| 8 |
+
import numpy as np
|
| 9 |
+
import librosa
|
| 10 |
+
import json
|
| 11 |
+
|
| 12 |
+
from nemo.collections.asr.models import SortformerEncLabelModel
|
| 13 |
+
|
| 14 |
+
def main():
    """Produce a NeMo streaming-inference reference run and dump it to JSON.

    Loads the local .nemo checkpoint, replays the audio file chunk-by-chunk
    through ``forward_streaming_step`` with the same streaming configuration
    the Swift port uses, prints a frame-level timeline, and writes the raw
    probabilities to /tmp/nemo_streaming_reference.json for comparison.
    """
    print("Loading NeMo model...")
    model = SortformerEncLabelModel.restore_from(
        'diar_streaming_sortformer_4spk-v2.nemo', map_location='cpu'
    )
    model.eval()

    # Disable dither for deterministic output
    if hasattr(model.preprocessor, 'featurizer'):
        if hasattr(model.preprocessor.featurizer, 'dither'):
            model.preprocessor.featurizer.dither = 0.0

    # Configure for Gradient Descent's streaming config (same as Swift)
    modules = model.sortformer_modules
    modules.chunk_len = 6
    modules.chunk_left_context = 1
    modules.chunk_right_context = 7
    modules.fifo_len = 40
    modules.spkcache_len = 188
    modules.spkcache_update_period = 31

    print(f"Config: chunk_len={modules.chunk_len}, left_ctx={modules.chunk_left_context}, right_ctx={modules.chunk_right_context}")
    print(f"  fifo_len={modules.fifo_len}, spkcache_len={modules.spkcache_len}")

    # Load audio
    audio_path = "../audio.wav"
    audio, sr = librosa.load(audio_path, sr=16000, mono=True)
    print(f"Loaded audio: {len(audio)} samples ({len(audio)/16000:.2f}s)")

    waveform = torch.from_numpy(audio).unsqueeze(0).float()

    # Get mel features using model's preprocessor
    with torch.no_grad():
        audio_len = torch.tensor([waveform.shape[1]])
        features, feat_len = model.process_signal(
            audio_signal=waveform, audio_signal_length=audio_len
        )

    # features is [batch, mel, time], need [batch, time, mel] for streaming
    features = features[:, :, :feat_len.max()]
    print(f"Features: {features.shape} (batch, mel, time)")

    # Streaming inference using forward_streaming_step
    subsampling = modules.subsampling_factor  # 8
    chunk_len = modules.chunk_len  # 6
    left_context = modules.chunk_left_context  # 1
    right_context = modules.chunk_right_context  # 7
    core_frames = chunk_len * subsampling  # 48 mel frames

    total_mel_frames = features.shape[2]
    print(f"Total mel frames: {total_mel_frames}")
    print(f"Core frames per chunk: {core_frames}")

    # Initialize streaming state
    streaming_state = modules.init_streaming_state(device=features.device)

    # Initialize total_preds tensor (grows as chunks are processed)
    total_preds = torch.zeros((1, 0, 4), device=features.device)

    all_preds = []
    chunk_idx = 0

    # Process chunks like streaming_feat_loader
    stt_feat = 0
    while stt_feat < total_mel_frames:
        end_feat = min(stt_feat + core_frames, total_mel_frames)

        # Calculate context (in mel frames); clamped at the signal edges
        left_offset = min(left_context * subsampling, stt_feat)
        right_offset = min(right_context * subsampling, total_mel_frames - end_feat)

        chunk_start = stt_feat - left_offset
        chunk_end = end_feat + right_offset

        # Extract chunk - [batch, mel, time] -> [batch, time, mel]
        chunk = features[:, :, chunk_start:chunk_end]  # [1, 128, T]
        chunk_t = chunk.transpose(1, 2)  # [1, T, 128]
        chunk_len_tensor = torch.tensor([chunk_t.shape[1]], dtype=torch.long)

        with torch.no_grad():
            # Use forward_streaming_step
            streaming_state, total_preds = model.forward_streaming_step(
                processed_signal=chunk_t,
                processed_signal_length=chunk_len_tensor,
                streaming_state=streaming_state,
                total_preds=total_preds,
                left_offset=left_offset,
                right_offset=right_offset,
            )

        chunk_idx += 1
        stt_feat = end_feat

    # total_preds now contains all predictions
    all_preds = total_preds[0].numpy()  # [total_frames, 4]
    print(f"\nTotal output frames: {all_preds.shape[0]}")
    print(f"Predictions shape: {all_preds.shape}")

    # Print timeline
    print("\n=== NeMo Streaming Timeline (80ms per frame, threshold=0.55) ===")
    print("Frame  Time   Spk0  Spk1  Spk2  Spk3  | Visual")
    print("-" * 60)

    for frame in range(all_preds.shape[0]):
        time_sec = frame * 0.08
        probs = all_preds[frame]
        visual = ['■' if p > 0.55 else '·' for p in probs]
        print(f"{frame:5d}  {time_sec:5.2f}s  {probs[0]:.3f} {probs[1]:.3f} {probs[2]:.3f} {probs[3]:.3f} | [{visual[0]}{visual[1]}{visual[2]}{visual[3]}]")

    print("-" * 60)

    # Speaker activity summary
    print("\n=== Speaker Activity Summary ===")
    threshold = 0.55
    for spk in range(4):
        active_frames = np.sum(all_preds[:, spk] > threshold)
        active_time = active_frames * 0.08
        percent = active_time / (all_preds.shape[0] * 0.08) * 100
        print(f"Speaker_{spk}: {active_time:.1f}s active ({percent:.1f}%)")

    # Save to JSON for comparison
    output = {
        "total_frames": int(all_preds.shape[0]),
        "frame_duration_seconds": 0.08,
        "probabilities": all_preds.flatten().tolist(),
        "config": {
            "chunk_len": chunk_len,
            "chunk_left_context": left_context,
            "chunk_right_context": right_context,
            "fifo_len": modules.fifo_len,
            "spkcache_len": modules.spkcache_len,
        }
    }

    with open("/tmp/nemo_streaming_reference.json", "w") as f:
        json.dump(output, f, indent=2)
    print("\nSaved to /tmp/nemo_streaming_reference.json")
|
| 151 |
+
|
| 152 |
+
# Script entry point: generate the reference output.
if __name__ == "__main__":
    main()
|
streaming_inference.py
ADDED
|
@@ -0,0 +1,262 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import torch
|
| 2 |
+
import numpy as np
|
| 3 |
+
import coremltools as ct
|
| 4 |
+
import librosa
|
| 5 |
+
import argparse
|
| 6 |
+
import os
|
| 7 |
+
import sys
|
| 8 |
+
import math
|
| 9 |
+
|
| 10 |
+
# Import NeMo components for State Logic
|
| 11 |
+
try:
|
| 12 |
+
from nemo.collections.asr.models import SortformerEncLabelModel
|
| 13 |
+
# Try importing SortformerModules directly for type hints if needed, but we can access via model instance
|
| 14 |
+
from nemo.collections.asr.modules.sortformer_modules import SortformerModules
|
| 15 |
+
except ImportError as e:
|
| 16 |
+
print(f"Error importing NeMo: {e}")
|
| 17 |
+
sys.exit(1)
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
def streaming_feat_loader(modules, feat_seq, feat_seq_length, feat_seq_offset):
    """Yield successive feature chunks (plus context) for streaming inference.

    Adapted from NeMo's ``SortformerModules.streaming_feat_loader``: the
    feature sequence is cut into windows of ``chunk_len * subsampling_factor``
    frames, each extended by up to ``chunk_left_context``/``chunk_right_context``
    chunks of surrounding frames.

    Args:
        modules: Object exposing ``chunk_len`` and ``subsampling_factor``
            (and optionally ``chunk_left_context`` / ``chunk_right_context``).
        feat_seq (torch.Tensor): Features, shape (batch, feat_dim, frames).
        feat_seq_length (torch.Tensor): Valid lengths, shape (batch,).
        feat_seq_offset (torch.Tensor): Start offsets, shape (batch,).

    Yields:
        (chunk_idx, chunk_feat_seq_t, feat_lengths, left_offset, right_offset)
        where ``chunk_feat_seq_t`` is transposed to (batch, frames, feat_dim)
        and the offsets are expressed in feature frames.
    """
    feat_len = feat_seq.shape[2]
    chunk_len = modules.chunk_len
    subsampling_factor = modules.subsampling_factor
    chunk_left_context = getattr(modules, 'chunk_left_context', 0)
    chunk_right_context = getattr(modules, 'chunk_right_context', 0)

    # Core step size in feature frames; context is added on top of it.
    step = chunk_len * subsampling_factor
    num_chunks = math.ceil(feat_len / step)
    print(f"streaming_feat_loader: feat_len={feat_len}, num_chunks={num_chunks}, "
          f"chunk_len={chunk_len}, subsampling_factor={subsampling_factor}")

    for chunk_idx, stt_feat in enumerate(range(0, feat_len, step)):
        end_feat = min(stt_feat + step, feat_len)
        # Context never reaches past the sequence boundaries.
        left_offset = min(chunk_left_context * subsampling_factor, stt_feat)
        right_offset = min(chunk_right_context * subsampling_factor, feat_len - end_feat)

        chunk_feat_seq = feat_seq[:, :, stt_feat - left_offset : end_feat + right_offset]
        # Valid frames inside this window, clamped to the window size;
        # zeroed out for batch items that start after this window ends.
        feat_lengths = (feat_seq_length + feat_seq_offset - stt_feat + left_offset).clamp(
            0, chunk_feat_seq.shape[2]
        )
        feat_lengths = feat_lengths * (feat_seq_offset < end_feat)

        # (batch, feat_dim, frames) -> (batch, frames, feat_dim)
        chunk_feat_seq_t = torch.transpose(chunk_feat_seq, 1, 2)

        print(f"  chunk_idx: {chunk_idx}, chunk_feat_seq_t shape: {chunk_feat_seq_t.shape}, "
              f"feat_lengths: {feat_lengths}, left_offset: {left_offset}, right_offset: {right_offset}")

        yield chunk_idx, chunk_feat_seq_t, feat_lengths, left_offset, right_offset
|
| 75 |
+
|
| 76 |
+
|
| 77 |
+
def run_streaming_inference(model_name, coreml_dir, audio_path):
    """Run chunked Sortformer diarization with the CoreML main model.

    Features for the whole file are extracted once with NeMo's own
    preprocessor (process_signal); the per-chunk forward pass runs in the
    exported CoreML model, while NeMo's SortformerModules keeps the
    speaker-cache / FIFO state logic (init_streaming_state / streaming_update).

    Args:
        model_name: Local .nemo checkpoint path or a pretrained model id.
        coreml_dir: Directory holding the exported .mlpackage files.
        audio_path: Path to the audio file (loaded mono at 16 kHz).

    Returns:
        torch.Tensor of concatenated per-chunk speaker probabilities
        ([1, total_frames, num_speakers] per the shapes printed below),
        or None if no chunks were produced.
    """
    print(f"Loading NeMo Model (for Python Streaming Logic): {model_name}")
    # Local file -> restore_from; otherwise treat the name as a hub model id.
    if os.path.exists(model_name):
        nemo_model = SortformerEncLabelModel.restore_from(model_name, map_location="cpu")
    else:
        nemo_model = SortformerEncLabelModel.from_pretrained(model_name, map_location="cpu")
    nemo_model.eval()
    modules = nemo_model.sortformer_modules

    # --- Override Config to match CoreML Export (Low Latency) ---
    # These must mirror the settings used when the CoreML model was exported,
    # otherwise chunk/state shapes won't line up.
    print("Overriding Config (Inference) to match CoreML...")
    modules.chunk_len = 4
    modules.chunk_right_context = 1  # 1 chunk of right context
    modules.chunk_left_context = 2   # 2 chunks of left context
    # Match CoreML export sizes (from model spec)
    modules.fifo_len = 63
    modules.spkcache_len = 63
    modules.spkcache_update_period = 50  # Match CoreML export

    # CoreML fixed input sizes (must match export settings)
    # With left_context=2, right_context=1: (4+2+1)*8 = 56 feature frames.
    COREML_CHUNK_FRAMES = 56
    COREML_SPKCACHE_LEN = 63
    COREML_FIFO_LEN = 63

    # Disable dither and pad_to (as diarize does) so feature extraction is
    # deterministic and not padded to a multiple.
    if hasattr(nemo_model.preprocessor, 'featurizer'):
        if hasattr(nemo_model.preprocessor.featurizer, 'dither'):
            nemo_model.preprocessor.featurizer.dither = 0.0
        if hasattr(nemo_model.preprocessor.featurizer, 'pad_to'):
            nemo_model.preprocessor.featurizer.pad_to = 0

    # CoreML Models - use CPU_ONLY for compatibility
    # NOTE(review): preproc_model is loaded here but never used in this
    # function — features come from NeMo's process_signal below.
    print(f"Loading CoreML Models from {coreml_dir}...")
    preproc_model = ct.models.MLModel(
        os.path.join(coreml_dir, "SortformerPreprocessor.mlpackage"),
        compute_units=ct.ComputeUnit.CPU_ONLY
    )
    main_model = ct.models.MLModel(
        os.path.join(coreml_dir, "Sortformer.mlpackage"),
        compute_units=ct.ComputeUnit.ALL
    )

    # Config
    chunk_len = modules.chunk_len              # Output frames (e.g., 4 for low latency)
    subsampling_factor = modules.subsampling_factor  # 8
    sample_rate = 16000

    print(f"Chunk Config: {chunk_len} output frames (diar), subsampling_factor={subsampling_factor}")

    # Load Audio
    print(f"Loading Audio: {audio_path}")
    full_audio, _ = librosa.load(audio_path, sr=sample_rate, mono=True)
    total_samples = len(full_audio)
    print(f"Total Samples: {total_samples} ({total_samples/sample_rate:.2f}s)")

    # === Step 1: Extract features for the ENTIRE audio using preprocessor ===
    # This matches NeMo's approach: process_signal -> forward_streaming
    print("Extracting features for entire audio...")
    audio_tensor = torch.from_numpy(full_audio).unsqueeze(0).float()  # [1, samples]
    audio_length = torch.tensor([total_samples], dtype=torch.long)

    with torch.no_grad():
        # Use process_signal for proper normalization (same as forward())
        processed_signal, processed_signal_length = nemo_model.process_signal(
            audio_signal=audio_tensor, audio_signal_length=audio_length
        )

    print(f"Processed signal shape: {processed_signal.shape}")  # [1, 128, T]
    print(f"Processed signal length: {processed_signal_length}")

    # Trim to actual length
    processed_signal = processed_signal[:, :, :processed_signal_length.max()]

    # === Step 2: Initialize streaming state ===
    print("Initializing Streaming State...")
    state = modules.init_streaming_state(batch_size=1, device='cpu')

    # === Step 3: Use streaming_feat_loader to chunk features (matches NeMo exactly) ===
    batch_size = processed_signal.shape[0]
    processed_signal_offset = torch.zeros((batch_size,), dtype=torch.long)

    all_preds = []

    feat_loader = streaming_feat_loader(
        modules=modules,
        feat_seq=processed_signal,
        feat_seq_length=processed_signal_length,
        feat_seq_offset=processed_signal_offset,
    )

    for chunk_idx, chunk_feat_seq_t, feat_lengths, left_offset, right_offset in feat_loader:
        # Prepare inputs for CoreML model.
        # Pad chunk to fixed size for CoreML (fixed-shape inputs required).
        chunk_actual_len = chunk_feat_seq_t.shape[1]
        if chunk_actual_len < COREML_CHUNK_FRAMES:
            pad_len = COREML_CHUNK_FRAMES - chunk_actual_len
            chunk_in = torch.nn.functional.pad(chunk_feat_seq_t, (0, 0, 0, pad_len))
        else:
            chunk_in = chunk_feat_seq_t[:, :COREML_CHUNK_FRAMES, :]
        chunk_len_in = feat_lengths.long()  # actual (unpadded) length

        # Get actual lengths from state (pad tensors but track real lengths)
        curr_spk_len = state.spkcache.shape[1]
        curr_fifo_len = state.fifo.shape[1]
        # Prepare SpkCache - Pad to CoreML fixed size
        current_spkcache = state.spkcache

        if curr_spk_len < COREML_SPKCACHE_LEN:
            pad_len = COREML_SPKCACHE_LEN - curr_spk_len
            current_spkcache = torch.nn.functional.pad(current_spkcache, (0, 0, 0, pad_len))
        elif curr_spk_len > COREML_SPKCACHE_LEN:
            current_spkcache = current_spkcache[:, :COREML_SPKCACHE_LEN, :]

        spkcache_in = current_spkcache
        # Use actual length, not padded length
        spkcache_len_in = torch.tensor([curr_spk_len], dtype=torch.long)

        # Prepare FIFO - Pad to CoreML fixed size
        current_fifo = state.fifo

        if curr_fifo_len < COREML_FIFO_LEN:
            pad_len = COREML_FIFO_LEN - curr_fifo_len
            current_fifo = torch.nn.functional.pad(current_fifo, (0, 0, 0, pad_len))
        elif curr_fifo_len > COREML_FIFO_LEN:
            current_fifo = current_fifo[:, :COREML_FIFO_LEN, :]

        fifo_in = current_fifo
        fifo_len_in = torch.tensor([curr_fifo_len], dtype=torch.long)

        # === Run CoreML Model ===
        coreml_inputs = {
            "chunk": chunk_in.numpy().astype(np.float32),
            "chunk_lengths": chunk_len_in.numpy().astype(np.int32),
            "spkcache": spkcache_in.numpy().astype(np.float32),
            "spkcache_lengths": spkcache_len_in.numpy().astype(np.int32),
            "fifo": fifo_in.numpy().astype(np.float32),
            "fifo_lengths": fifo_len_in.numpy().astype(np.int32)
        }

        coreml_out = main_model.predict(coreml_inputs)

        # Convert outputs back to torch tensors
        pred_logits = torch.from_numpy(coreml_out["speaker_preds"])
        chunk_embs = torch.from_numpy(coreml_out["chunk_pre_encoder_embs"])
        chunk_emb_len = int(coreml_out["chunk_pre_encoder_lengths"][0])

        # Trim chunk_embs to actual length (drop padded frames)
        chunk_embs = chunk_embs[:, :chunk_emb_len, :]

        # Compute lc and rc for streaming_update (in embedding/diar frames, not feature frames)
        # NeMo does: lc = round(left_offset / encoder.subsampling_factor)
        #            rc = math.ceil(right_offset / encoder.subsampling_factor)
        lc = round(left_offset / subsampling_factor)
        rc = math.ceil(right_offset / subsampling_factor)

        # Update state using streaming_update with proper lc/rc
        state, chunk_probs = modules.streaming_update(
            streaming_state=state,
            chunk=chunk_embs,
            preds=pred_logits,
            lc=lc,
            rc=rc
        )

        # chunk_probs is the prediction for the current chunk
        all_preds.append(chunk_probs)

        print(f"Processed chunk {chunk_idx + 1}, chunk_probs shape: {chunk_probs.shape}", end='\r')

    print(f"\nFinished. Total Chunks: {len(all_preds)}")
    if len(all_preds) > 0:
        final_probs = torch.cat(all_preds, dim=1)  # [1, TotalFrames, Spks]
        print(f"Final Predictions Shape: {final_probs.shape}")
        return final_probs
    return None
|
| 253 |
+
|
| 254 |
+
|
| 255 |
+
if __name__ == "__main__":
    # CLI entry point: parse options and hand them to the streaming driver.
    cli = argparse.ArgumentParser()
    for flag, default in (
        ("--model_name", "nvidia/diar_streaming_sortformer_4spk-v2.1"),
        ("--coreml_dir", "coreml_models"),
        ("--audio_path", "test2.wav"),
    ):
        cli.add_argument(flag, default=default)
    opts = cli.parse_args()

    run_streaming_inference(opts.model_name, opts.coreml_dir, opts.audio_path)
|
streaming_preproc_inference.py
ADDED
|
@@ -0,0 +1,411 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
True Streaming CoreML Diarization
|
| 3 |
+
|
| 4 |
+
This script implements true streaming inference:
|
| 5 |
+
Audio chunks → CoreML Preprocessor → Feature Buffer → CoreML Main Model → Predictions
|
| 6 |
+
|
| 7 |
+
Audio is processed incrementally, features are accumulated with proper context handling.
|
| 8 |
+
"""
|
| 9 |
+
import os
|
| 10 |
+
os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE"
|
| 11 |
+
|
| 12 |
+
import torch
|
| 13 |
+
import numpy as np
|
| 14 |
+
import coremltools as ct
|
| 15 |
+
import librosa
|
| 16 |
+
import argparse
|
| 17 |
+
import math
|
| 18 |
+
|
| 19 |
+
# Import NeMo for state management (streaming_update) only
|
| 20 |
+
from nemo.collections.asr.models import SortformerEncLabelModel
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
# ============================================================
# Configuration for Sortformer16.mlpackage
# All values must match the settings used at CoreML export time;
# shape mismatches with the .mlpackage will fail at predict().
# ============================================================
CONFIG = {
    'chunk_len': 4,              # Diarization chunk length (output frames per step)
    'chunk_right_context': 1,    # Right context, in chunks
    'chunk_left_context': 2,     # Left context, in chunks
    'fifo_len': 63,              # FIFO state length (frames)
    'spkcache_len': 63,          # Speaker-cache state length (frames)
    'spkcache_update_period': 50,
    'subsampling_factor': 8,     # Feature frames per diarization frame
    'sample_rate': 16000,        # Hz

    # Derived values
    'chunk_frames': 56,          # (4+2+1)*8 = 56 feature frames for CoreML input
    'spkcache_input_len': 63,    # Fixed spkcache input size expected by CoreML
    'fifo_input_len': 63,        # Fixed fifo input size expected by CoreML

    # Preprocessor settings
    'preproc_audio_samples': 9200,  # CoreML preprocessor fixed input size (samples)
    'mel_window': 400,              # 25ms @ 16kHz
    'mel_stride': 160,              # 10ms @ 16kHz
}
|
| 46 |
+
|
| 47 |
+
|
| 48 |
+
def run_true_streaming(nemo_model, preproc_model, main_model, audio_path, config):
    """
    True streaming inference: audio chunks → CoreML preproc → CoreML main model.

    Strategy:
      1. Process audio in fixed-size windows through the CoreML preprocessor.
      2. Accumulate the per-window features into one sequence (overlap trimmed).
      3. Run the standard chunked diarization loop over the stitched features,
         with NeMo's streaming_update maintaining spkcache/FIFO state.

    Args:
        nemo_model: NeMo SortformerEncLabelModel (state logic only).
        preproc_model: CoreML preprocessor MLModel (audio -> mel features).
        main_model: CoreML main MLModel (chunk + state -> speaker preds).
        audio_path: Path to the audio file.
        config: Dict of export-time constants (see CONFIG).

    Returns:
        torch.Tensor of concatenated chunk probabilities, or None if empty.
    """
    modules = nemo_model.sortformer_modules
    subsampling_factor = config['subsampling_factor']

    # Load full audio (simulating microphone input)
    full_audio, sr = librosa.load(audio_path, sr=config['sample_rate'], mono=True)
    total_samples = len(full_audio)

    print(f"Total audio samples: {total_samples}")

    # Preprocessing parameters
    mel_window = config['mel_window']
    mel_stride = config['mel_stride']
    preproc_len = config['preproc_audio_samples']

    # Audio hop for preprocessor (to avoid overlap in features):
    # consecutive windows share exactly one analysis window of audio.
    audio_hop = preproc_len - mel_window  # 8800 samples

    # Feature accumulator
    all_features = []
    audio_offset = 0
    preproc_chunk_idx = 0

    # Step 1: Process all audio through preprocessor to get features
    print("Step 1: Extracting features via CoreML preprocessor...")
    while audio_offset < total_samples:
        # Get audio chunk
        chunk_end = min(audio_offset + preproc_len, total_samples)
        audio_chunk = full_audio[audio_offset:chunk_end]
        actual_samples = len(audio_chunk)

        # Pad the tail window with zeros; the real length is passed separately.
        if actual_samples < preproc_len:
            audio_chunk = np.pad(audio_chunk, (0, preproc_len - actual_samples))

        # Run preprocessor
        preproc_inputs = {
            "audio_signal": audio_chunk.reshape(1, -1).astype(np.float32),
            "length": np.array([actual_samples], dtype=np.int32)
        }

        preproc_out = preproc_model.predict(preproc_inputs)
        feat_chunk = np.array(preproc_out["features"])  # [1, 128, frames]
        feat_len = int(preproc_out["feature_lengths"][0])

        # Extract valid features and handle overlap between windows.
        if preproc_chunk_idx == 0:
            # First chunk: keep all
            valid_feats = feat_chunk[:, :, :feat_len]
        else:
            # Subsequent: skip frames duplicated by the window overlap.
            # NOTE(review): (400-160)//160 + 1 = 2 frames here. Stitching
            # windowed features is an approximation of one-shot preprocessing
            # (edge frames see zero-padded context) — validate against
            # run_reference before trusting the output.
            overlap_frames = (mel_window - mel_stride) // mel_stride + 1  # ~2-3 frames
            valid_feats = feat_chunk[:, :, overlap_frames:feat_len]

        all_features.append(valid_feats)

        audio_offset += audio_hop
        preproc_chunk_idx += 1

        print(f"\r  Processed audio chunk {preproc_chunk_idx}, features so far: {sum(f.shape[2] for f in all_features)}", end='')

    print()

    # Concatenate all features
    full_features = np.concatenate(all_features, axis=2)  # [1, 128, total_frames]
    processed_signal = torch.from_numpy(full_features).float()
    processed_signal_length = torch.tensor([full_features.shape[2]], dtype=torch.long)

    print(f"Total features extracted: {processed_signal.shape}")

    # Step 2: Run diarization streaming loop (same as NeMo reference)
    print("Step 2: Running diarization streaming...")

    state = modules.init_streaming_state(batch_size=1, device='cpu')
    all_preds = []

    feat_len = processed_signal.shape[2]
    chunk_len = modules.chunk_len
    left_ctx = modules.chunk_left_context
    right_ctx = modules.chunk_right_context

    stt_feat, end_feat, chunk_idx = 0, 0, 0

    while end_feat < feat_len:
        # Context offsets clamped at the sequence boundaries.
        left_offset = min(left_ctx * subsampling_factor, stt_feat)
        end_feat = min(stt_feat + chunk_len * subsampling_factor, feat_len)
        right_offset = min(right_ctx * subsampling_factor, feat_len - end_feat)

        # Extract chunk with context
        chunk_feat = processed_signal[:, :, stt_feat - left_offset : end_feat + right_offset]
        actual_len = chunk_feat.shape[2]

        # Transpose to [B, T, D]
        chunk_t = chunk_feat.transpose(1, 2)

        # Pad to fixed size expected by the CoreML model.
        if actual_len < config['chunk_frames']:
            pad_len = config['chunk_frames'] - actual_len
            chunk_in = torch.nn.functional.pad(chunk_t, (0, 0, 0, pad_len))
        else:
            chunk_in = chunk_t[:, :config['chunk_frames'], :]

        # State preparation: pad/trim tensors to fixed shapes, pass real lengths.
        curr_spk_len = state.spkcache.shape[1]
        curr_fifo_len = state.fifo.shape[1]

        current_spkcache = state.spkcache
        if curr_spk_len < config['spkcache_input_len']:
            current_spkcache = torch.nn.functional.pad(
                current_spkcache, (0, 0, 0, config['spkcache_input_len'] - curr_spk_len)
            )
        elif curr_spk_len > config['spkcache_input_len']:
            current_spkcache = current_spkcache[:, :config['spkcache_input_len'], :]

        current_fifo = state.fifo
        if curr_fifo_len < config['fifo_input_len']:
            current_fifo = torch.nn.functional.pad(
                current_fifo, (0, 0, 0, config['fifo_input_len'] - curr_fifo_len)
            )
        elif curr_fifo_len > config['fifo_input_len']:
            current_fifo = current_fifo[:, :config['fifo_input_len'], :]

        # CoreML inference
        coreml_inputs = {
            "chunk": chunk_in.numpy().astype(np.float32),
            "chunk_lengths": np.array([actual_len], dtype=np.int32),
            "spkcache": current_spkcache.numpy().astype(np.float32),
            "spkcache_lengths": np.array([curr_spk_len], dtype=np.int32),
            "fifo": current_fifo.numpy().astype(np.float32),
            "fifo_lengths": np.array([curr_fifo_len], dtype=np.int32)
        }

        coreml_out = main_model.predict(coreml_inputs)

        pred_logits = torch.from_numpy(coreml_out["speaker_preds"])
        chunk_embs = torch.from_numpy(coreml_out["chunk_pre_encoder_embs"])
        chunk_emb_len = int(coreml_out["chunk_pre_encoder_lengths"][0])

        # Drop padded embedding frames before the state update.
        chunk_embs = chunk_embs[:, :chunk_emb_len, :]

        # Context sizes in diarization frames (matches NeMo's rounding).
        lc = round(left_offset / subsampling_factor)
        rc = math.ceil(right_offset / subsampling_factor)

        state, chunk_probs = modules.streaming_update(
            streaming_state=state,
            chunk=chunk_embs,
            preds=pred_logits,
            lc=lc,
            rc=rc
        )

        all_preds.append(chunk_probs)
        stt_feat = end_feat
        chunk_idx += 1

        print(f"\r  Diarization chunk {chunk_idx}", end='')

    print()

    if len(all_preds) > 0:
        return torch.cat(all_preds, dim=1)
    return None
|
| 218 |
+
|
| 219 |
+
|
| 220 |
+
def run_reference(nemo_model, main_model, audio_path, config):
    """
    Reference implementation using NeMo preprocessing.

    Same chunked diarization loop as run_true_streaming, but features come
    from NeMo's own process_signal over the whole file instead of the CoreML
    preprocessor — used as ground truth when validating the streaming path.

    Args:
        nemo_model: NeMo SortformerEncLabelModel (preprocessing + state logic).
        main_model: CoreML main MLModel (chunk + state -> speaker preds).
        audio_path: Path to the audio file.
        config: Dict of export-time constants (see CONFIG).

    Returns:
        torch.Tensor of concatenated chunk probabilities, or None if empty.
    """
    modules = nemo_model.sortformer_modules
    # Note: taken from the modules here (run_true_streaming reads it from config).
    subsampling_factor = modules.subsampling_factor

    # Load full audio
    full_audio, _ = librosa.load(audio_path, sr=config['sample_rate'], mono=True)
    audio_tensor = torch.from_numpy(full_audio).unsqueeze(0).float()
    audio_length = torch.tensor([len(full_audio)], dtype=torch.long)

    # Extract features using NeMo preprocessor
    with torch.no_grad():
        processed_signal, processed_signal_length = nemo_model.process_signal(
            audio_signal=audio_tensor, audio_signal_length=audio_length
        )
    # Trim padding to the true feature length.
    processed_signal = processed_signal[:, :, :processed_signal_length.max()]

    print(f"NeMo Preproc: features shape = {processed_signal.shape}")

    # Streaming loop
    state = modules.init_streaming_state(batch_size=1, device='cpu')
    all_preds = []

    feat_len = processed_signal.shape[2]
    chunk_len = modules.chunk_len
    left_ctx = modules.chunk_left_context
    right_ctx = modules.chunk_right_context

    stt_feat, end_feat, chunk_idx = 0, 0, 0

    while end_feat < feat_len:
        # Context offsets clamped at the sequence boundaries.
        left_offset = min(left_ctx * subsampling_factor, stt_feat)
        end_feat = min(stt_feat + chunk_len * subsampling_factor, feat_len)
        right_offset = min(right_ctx * subsampling_factor, feat_len - end_feat)

        chunk_feat = processed_signal[:, :, stt_feat - left_offset : end_feat + right_offset]
        actual_len = chunk_feat.shape[2]

        # [B, D, T] -> [B, T, D]
        chunk_t = chunk_feat.transpose(1, 2)

        # Pad to the fixed chunk size expected by the CoreML model.
        if actual_len < config['chunk_frames']:
            pad_len = config['chunk_frames'] - actual_len
            chunk_in = torch.nn.functional.pad(chunk_t, (0, 0, 0, pad_len))
        else:
            chunk_in = chunk_t[:, :config['chunk_frames'], :]

        # Pad/trim state tensors to fixed shapes; real lengths passed separately.
        curr_spk_len = state.spkcache.shape[1]
        curr_fifo_len = state.fifo.shape[1]

        current_spkcache = state.spkcache
        if curr_spk_len < config['spkcache_input_len']:
            current_spkcache = torch.nn.functional.pad(
                current_spkcache, (0, 0, 0, config['spkcache_input_len'] - curr_spk_len)
            )
        elif curr_spk_len > config['spkcache_input_len']:
            current_spkcache = current_spkcache[:, :config['spkcache_input_len'], :]

        current_fifo = state.fifo
        if curr_fifo_len < config['fifo_input_len']:
            current_fifo = torch.nn.functional.pad(
                current_fifo, (0, 0, 0, config['fifo_input_len'] - curr_fifo_len)
            )
        elif curr_fifo_len > config['fifo_input_len']:
            current_fifo = current_fifo[:, :config['fifo_input_len'], :]

        coreml_inputs = {
            "chunk": chunk_in.numpy().astype(np.float32),
            "chunk_lengths": np.array([actual_len], dtype=np.int32),
            "spkcache": current_spkcache.numpy().astype(np.float32),
            "spkcache_lengths": np.array([curr_spk_len], dtype=np.int32),
            "fifo": current_fifo.numpy().astype(np.float32),
            "fifo_lengths": np.array([curr_fifo_len], dtype=np.int32)
        }

        coreml_out = main_model.predict(coreml_inputs)

        pred_logits = torch.from_numpy(coreml_out["speaker_preds"])
        chunk_embs = torch.from_numpy(coreml_out["chunk_pre_encoder_embs"])
        chunk_emb_len = int(coreml_out["chunk_pre_encoder_lengths"][0])

        # Drop padded embedding frames before the state update.
        chunk_embs = chunk_embs[:, :chunk_emb_len, :]

        # Context sizes in diarization frames (matches NeMo's rounding).
        lc = round(left_offset / subsampling_factor)
        rc = math.ceil(right_offset / subsampling_factor)

        state, chunk_probs = modules.streaming_update(
            streaming_state=state,
            chunk=chunk_embs,
            preds=pred_logits,
            lc=lc,
            rc=rc
        )

        all_preds.append(chunk_probs)
        stt_feat = end_feat
        chunk_idx += 1

    if len(all_preds) > 0:
        return torch.cat(all_preds, dim=1)
    return None
|
| 322 |
+
|
| 323 |
+
|
| 324 |
+
def validate(model_name, coreml_dir, audio_path):
    """
    Validate true streaming against NeMo preprocessing.

    Runs the same audio through two pipelines and compares speaker
    probabilities frame by frame:
      1. Reference: NeMo preprocessing + CoreML main model.
      2. True streaming: CoreML preprocessor + CoreML main model.

    Args:
        model_name: HuggingFace/NGC identifier of the pretrained Sortformer model.
        coreml_dir: Directory holding the exported .mlpackage bundles.
        audio_path: Path to the input WAV file.
    """
    banner = "=" * 70
    print(banner)
    print("VALIDATION: True Streaming vs NeMo Preprocessing")
    print(banner)

    # Pull down the NeMo checkpoint on CPU; it supplies preprocessing and
    # the streaming-state update logic, while inference runs in CoreML.
    print(f"\nLoading NeMo Model: {model_name}")
    nemo_model = SortformerEncLabelModel.from_pretrained(model_name, map_location="cpu")
    nemo_model.eval()

    # Push the streaming geometry from CONFIG onto the Sortformer modules.
    modules = nemo_model.sortformer_modules
    for attr in ('chunk_len', 'chunk_right_context', 'chunk_left_context',
                 'fifo_len', 'spkcache_len', 'spkcache_update_period'):
        setattr(modules, attr, CONFIG[attr])

    # Dither injects random noise and pad_to alters frame counts — both
    # would break an exact numerical comparison, so switch them off.
    if hasattr(nemo_model.preprocessor, 'featurizer'):
        nemo_model.preprocessor.featurizer.dither = 0.0
        nemo_model.preprocessor.featurizer.pad_to = 0

    print(f"Config: chunk_len={modules.chunk_len}, left_ctx={modules.chunk_left_context}, "
          f"right_ctx={modules.chunk_right_context}")

    # CPU_ONLY keeps CoreML off the ANE/GPU so results are deterministic
    # and directly comparable between the two runs.
    print(f"Loading CoreML Models from {coreml_dir}...")

    def _load_package(package_name):
        # One-line purpose: open an .mlpackage from coreml_dir pinned to CPU.
        return ct.models.MLModel(
            os.path.join(coreml_dir, package_name),
            compute_units=ct.ComputeUnit.CPU_ONLY
        )

    preproc_model = _load_package("SortformerPreprocessor.mlpackage")
    main_model = _load_package("Sortformer16.mlpackage")

    # --- Reference pass: NeMo preprocessing feeding the CoreML main model ---
    print("\n" + banner)
    print("TEST 1: NeMo Preprocessing + CoreML Inference (Reference)")
    print(banner)

    reference = run_reference(nemo_model, main_model, audio_path, CONFIG)
    if reference is None:
        print("Reference inference failed!")
        return
    reference_np = reference.squeeze(0).detach().cpu().numpy()
    print(f"Reference Probs Shape: {reference_np.shape}")

    # --- Streaming pass: audio goes through the CoreML preprocessor too ---
    print("\n" + banner)
    print("TEST 2: True Streaming (Audio → CoreML Preproc → CoreML Main)")
    print(banner)

    streaming = run_true_streaming(nemo_model, preproc_model, main_model, audio_path, CONFIG)
    if streaming is None:
        print("True streaming inference produced no output!")
        return
    streaming_np = streaming.squeeze(0).detach().cpu().numpy()
    print(f"Streaming Probs Shape: {streaming_np.shape}")

    # Compare over the shared prefix — chunking can leave the two runs with
    # slightly different frame counts at the tail.
    overlap = min(reference_np.shape[0], streaming_np.shape[0])
    abs_err = np.abs(reference_np[:overlap] - streaming_np[:overlap])
    worst = np.max(abs_err)
    print(f"\nLength: ref={reference_np.shape[0]}, streaming={streaming_np.shape[0]}")
    print(f"Mean Absolute Error: {np.mean(abs_err):.8f}")
    print(f"Max Absolute Error: {worst:.8f}")

    if worst < 0.01:
        print("\n✅ SUCCESS: True streaming matches reference!")
    else:
        print("\n⚠️ Errors exceed tolerance")


if __name__ == "__main__":
    # CLI entry point: wire up the three validation knobs and run.
    cli = argparse.ArgumentParser()
    for flag, default in (
        ("--model_name", "nvidia/diar_streaming_sortformer_4spk-v2.1"),
        ("--coreml_dir", "coreml_models"),
        ("--audio_path", "audio.wav"),
    ):
        cli.add_argument(flag, default=default)
    opts = cli.parse_args()

    validate(opts.model_name, opts.coreml_dir, opts.audio_path)