Upload 26 files
Browse files- Pipeline_Head_Fixed.mlmodelc/analytics/coremldata.bin +3 -0
- Pipeline_Head_Fixed.mlmodelc/coremldata.bin +3 -0
- Pipeline_Head_Fixed.mlmodelc/model.mil +0 -0
- Pipeline_Head_Fixed.mlmodelc/weights/weight.bin +3 -0
- Pipeline_Head_Fixed.mlpackage/Data/com.apple.CoreML/model.mlmodel +3 -0
- Pipeline_Head_Fixed.mlpackage/Data/com.apple.CoreML/weights/weight.bin +3 -0
- Pipeline_Head_Fixed.mlpackage/Manifest.json +18 -0
- Pipeline_PreEncoder.mlmodelc/analytics/coremldata.bin +3 -0
- Pipeline_PreEncoder.mlmodelc/coremldata.bin +3 -0
- Pipeline_PreEncoder.mlmodelc/model.mil +201 -0
- Pipeline_PreEncoder.mlmodelc/weights/weight.bin +3 -0
- Pipeline_PreEncoder.mlpackage/Data/com.apple.CoreML/model.mlmodel +3 -0
- Pipeline_PreEncoder.mlpackage/Data/com.apple.CoreML/weights/weight.bin +3 -0
- Pipeline_PreEncoder.mlpackage/Manifest.json +18 -0
- Pipeline_Preprocessor.mlmodelc/analytics/coremldata.bin +3 -0
- Pipeline_Preprocessor.mlmodelc/coremldata.bin +3 -0
- Pipeline_Preprocessor.mlmodelc/model.mil +0 -0
- Pipeline_Preprocessor.mlmodelc/weights/weight.bin +3 -0
- Pipeline_Preprocessor.mlpackage/Data/com.apple.CoreML/model.mlmodel +3 -0
- Pipeline_Preprocessor.mlpackage/Data/com.apple.CoreML/weights/weight.bin +3 -0
- Pipeline_Preprocessor.mlpackage/Manifest.json +18 -0
- README.md +137 -0
- convert_to_coreml.py +357 -0
- export_nvidia_pipeline.py +308 -0
- inference.py +192 -0
- pyproject.toml +22 -0
Pipeline_Head_Fixed.mlmodelc/analytics/coremldata.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:5980b0b5b6afd629201028acd9d30ef139405a4ff8e3197551b5749757e19808
|
| 3 |
+
size 243
|
Pipeline_Head_Fixed.mlmodelc/coremldata.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:4d3a809042f1aafc6410902c356ad226e4104b9f92f21a266b85a89d501c8e3c
|
| 3 |
+
size 505
|
Pipeline_Head_Fixed.mlmodelc/model.mil
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
Pipeline_Head_Fixed.mlmodelc/weights/weight.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:be56312ef2dbf57706aab7395fc2d5601ac6fbe6c553fd2b25069eba8da9b3b2
|
| 3 |
+
size 235580992
|
Pipeline_Head_Fixed.mlpackage/Data/com.apple.CoreML/model.mlmodel
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:93251763d5376dad5dc1f78cb0440397abb5a52e346575f1b6b750e958da13eb
|
| 3 |
+
size 827022
|
Pipeline_Head_Fixed.mlpackage/Data/com.apple.CoreML/weights/weight.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:be56312ef2dbf57706aab7395fc2d5601ac6fbe6c553fd2b25069eba8da9b3b2
|
| 3 |
+
size 235580992
|
Pipeline_Head_Fixed.mlpackage/Manifest.json
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"fileFormatVersion": "1.0.0",
|
| 3 |
+
"itemInfoEntries": {
|
| 4 |
+
"7D7B39C7-0AD2-4CD7-B6B0-A7E76DCDE6CA": {
|
| 5 |
+
"author": "com.apple.CoreML",
|
| 6 |
+
"description": "CoreML Model Weights",
|
| 7 |
+
"name": "weights",
|
| 8 |
+
"path": "com.apple.CoreML/weights"
|
| 9 |
+
},
|
| 10 |
+
"B58CA828-CA78-46FC-BD8D-5ABFB5AAEADD": {
|
| 11 |
+
"author": "com.apple.CoreML",
|
| 12 |
+
"description": "CoreML Model Specification",
|
| 13 |
+
"name": "model.mlmodel",
|
| 14 |
+
"path": "com.apple.CoreML/model.mlmodel"
|
| 15 |
+
}
|
| 16 |
+
},
|
| 17 |
+
"rootModelIdentifier": "B58CA828-CA78-46FC-BD8D-5ABFB5AAEADD"
|
| 18 |
+
}
|
Pipeline_PreEncoder.mlmodelc/analytics/coremldata.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:f0946b687ccf4274e0228e3bc539e6733c33bf0f4419e28d02220b19a35d884b
|
| 3 |
+
size 243
|
Pipeline_PreEncoder.mlmodelc/coremldata.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:beb4fe86e80615cf79e64e55cb229cc931998b67b51a661bfbdc66204da0ea7b
|
| 3 |
+
size 553
|
Pipeline_PreEncoder.mlmodelc/model.mil
ADDED
|
@@ -0,0 +1,201 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
program(1.0)
|
| 2 |
+
[buildInfo = dict<tensor<string, []>, tensor<string, []>>({{"coremlc-component-MIL", "3505.3.2"}, {"coremlc-version", "3505.4.1"}, {"coremltools-component-torch", "2.4.0"}, {"coremltools-source-dialect", "TorchScript"}, {"coremltools-version", "8.3.0"}})]
|
| 3 |
+
{
|
| 4 |
+
func main<ios16>(tensor<fp32, [1, 112, 128]> chunk, tensor<int32, [1]> chunk_lengths, tensor<fp32, [1, 188, 512]> fifo, tensor<int32, [1]> fifo_lengths, tensor<fp32, [1, 188, 512]> spkcache, tensor<int32, [1]> spkcache_lengths) {
|
| 5 |
+
tensor<fp32, [256]> model_encoder_pre_encode_conv_0_bias = const()[name = tensor<string, []>("model_encoder_pre_encode_conv_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(64)))];
|
| 6 |
+
tensor<fp32, [256, 1, 3, 3]> model_encoder_pre_encode_conv_0_weight = const()[name = tensor<string, []>("model_encoder_pre_encode_conv_0_weight"), val = tensor<fp32, [256, 1, 3, 3]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(1152)))];
|
| 7 |
+
tensor<fp32, [256]> model_encoder_pre_encode_conv_2_bias = const()[name = tensor<string, []>("model_encoder_pre_encode_conv_2_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(10432)))];
|
| 8 |
+
tensor<fp32, [256, 1, 3, 3]> model_encoder_pre_encode_conv_2_weight = const()[name = tensor<string, []>("model_encoder_pre_encode_conv_2_weight"), val = tensor<fp32, [256, 1, 3, 3]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(11520)))];
|
| 9 |
+
tensor<fp32, [256]> model_encoder_pre_encode_conv_3_bias = const()[name = tensor<string, []>("model_encoder_pre_encode_conv_3_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20800)))];
|
| 10 |
+
tensor<fp32, [256, 256, 1, 1]> model_encoder_pre_encode_conv_3_weight = const()[name = tensor<string, []>("model_encoder_pre_encode_conv_3_weight"), val = tensor<fp32, [256, 256, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21888)))];
|
| 11 |
+
tensor<fp32, [256]> model_encoder_pre_encode_conv_5_bias = const()[name = tensor<string, []>("model_encoder_pre_encode_conv_5_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(284096)))];
|
| 12 |
+
tensor<fp32, [256, 1, 3, 3]> model_encoder_pre_encode_conv_5_weight = const()[name = tensor<string, []>("model_encoder_pre_encode_conv_5_weight"), val = tensor<fp32, [256, 1, 3, 3]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(285184)))];
|
| 13 |
+
tensor<fp32, [256]> model_encoder_pre_encode_conv_6_bias = const()[name = tensor<string, []>("model_encoder_pre_encode_conv_6_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(294464)))];
|
| 14 |
+
tensor<fp32, [256, 256, 1, 1]> model_encoder_pre_encode_conv_6_weight = const()[name = tensor<string, []>("model_encoder_pre_encode_conv_6_weight"), val = tensor<fp32, [256, 256, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(295552)))];
|
| 15 |
+
tensor<fp32, [512]> model_encoder_pre_encode_out_bias = const()[name = tensor<string, []>("model_encoder_pre_encode_out_bias"), val = tensor<fp32, [512]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(557760)))];
|
| 16 |
+
tensor<fp32, [512, 4096]> model_encoder_pre_encode_out_weight = const()[name = tensor<string, []>("model_encoder_pre_encode_out_weight"), val = tensor<fp32, [512, 4096]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(559872)))];
|
| 17 |
+
tensor<int32, [1]> tensor_1_axes_0 = const()[name = tensor<string, []>("tensor_1_axes_0"), val = tensor<int32, [1]>([1])];
|
| 18 |
+
tensor<fp32, [1, 1, 112, 128]> tensor_1 = expand_dims(axes = tensor_1_axes_0, x = chunk)[name = tensor<string, []>("tensor_1")];
|
| 19 |
+
tensor<string, []> cast_0_dtype_0 = const()[name = tensor<string, []>("cast_0_dtype_0"), val = tensor<string, []>("fp32")];
|
| 20 |
+
tensor<int32, [1, 112]> expand_dims_0 = const()[name = tensor<string, []>("expand_dims_0"), val = tensor<int32, [1, 112]>([[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111]])];
|
| 21 |
+
tensor<int32, [1]> var_40_axes_0 = const()[name = tensor<string, []>("op_40_axes_0"), val = tensor<int32, [1]>([1])];
|
| 22 |
+
tensor<int32, [1, 1]> var_40 = expand_dims(axes = var_40_axes_0, x = chunk_lengths)[name = tensor<string, []>("op_40")];
|
| 23 |
+
tensor<bool, [1, 112]> time_mask_1 = less(x = expand_dims_0, y = var_40)[name = tensor<string, []>("time_mask_1")];
|
| 24 |
+
tensor<int32, [1]> var_42_axes_0 = const()[name = tensor<string, []>("op_42_axes_0"), val = tensor<int32, [1]>([-1])];
|
| 25 |
+
tensor<bool, [1, 112, 1]> var_42 = expand_dims(axes = var_42_axes_0, x = time_mask_1)[name = tensor<string, []>("op_42")];
|
| 26 |
+
tensor<int32, [3]> var_44_reps_0 = const()[name = tensor<string, []>("op_44_reps_0"), val = tensor<int32, [3]>([1, 1, 128])];
|
| 27 |
+
tensor<bool, [1, 112, 128]> var_44 = tile(reps = var_44_reps_0, x = var_42)[name = tensor<string, []>("op_44")];
|
| 28 |
+
tensor<string, []> cast_2_dtype_0 = const()[name = tensor<string, []>("cast_2_dtype_0"), val = tensor<string, []>("fp32")];
|
| 29 |
+
tensor<int32, [1]> var_50_axes_0 = const()[name = tensor<string, []>("op_50_axes_0"), val = tensor<int32, [1]>([1])];
|
| 30 |
+
tensor<fp32, [1, 112, 128]> cast_2 = cast(dtype = cast_2_dtype_0, x = var_44)[name = tensor<string, []>("cast_25")];
|
| 31 |
+
tensor<fp32, [1, 1, 112, 128]> var_50 = expand_dims(axes = var_50_axes_0, x = cast_2)[name = tensor<string, []>("op_50")];
|
| 32 |
+
tensor<fp32, [1, 1, 112, 128]> input_1 = mul(x = tensor_1, y = var_50)[name = tensor<string, []>("input_1")];
|
| 33 |
+
tensor<string, []> tensor_3_pad_type_0 = const()[name = tensor<string, []>("tensor_3_pad_type_0"), val = tensor<string, []>("custom")];
|
| 34 |
+
tensor<int32, [4]> tensor_3_pad_0 = const()[name = tensor<string, []>("tensor_3_pad_0"), val = tensor<int32, [4]>([1, 1, 1, 1])];
|
| 35 |
+
tensor<int32, [2]> tensor_3_strides_0 = const()[name = tensor<string, []>("tensor_3_strides_0"), val = tensor<int32, [2]>([2, 2])];
|
| 36 |
+
tensor<int32, [2]> tensor_3_dilations_0 = const()[name = tensor<string, []>("tensor_3_dilations_0"), val = tensor<int32, [2]>([1, 1])];
|
| 37 |
+
tensor<int32, []> tensor_3_groups_0 = const()[name = tensor<string, []>("tensor_3_groups_0"), val = tensor<int32, []>(1)];
|
| 38 |
+
tensor<fp32, [1, 256, 56, 64]> tensor_3 = conv(bias = model_encoder_pre_encode_conv_0_bias, dilations = tensor_3_dilations_0, groups = tensor_3_groups_0, pad = tensor_3_pad_0, pad_type = tensor_3_pad_type_0, strides = tensor_3_strides_0, weight = model_encoder_pre_encode_conv_0_weight, x = input_1)[name = tensor<string, []>("tensor_3")];
|
| 39 |
+
tensor<fp32, []> var_61_promoted = const()[name = tensor<string, []>("op_61_promoted"), val = tensor<fp32, []>(0x1p+0)];
|
| 40 |
+
tensor<fp32, [1]> cast_0 = cast(dtype = cast_0_dtype_0, x = chunk_lengths)[name = tensor<string, []>("cast_26")];
|
| 41 |
+
tensor<fp32, [1]> var_62 = add(x = cast_0, y = var_61_promoted)[name = tensor<string, []>("op_62")];
|
| 42 |
+
tensor<fp32, []> var_63_promoted = const()[name = tensor<string, []>("op_63_promoted"), val = tensor<fp32, []>(0x1p+0)];
|
| 43 |
+
tensor<fp32, [1]> var_64 = add(x = var_62, y = var_63_promoted)[name = tensor<string, []>("op_64")];
|
| 44 |
+
tensor<fp32, []> var_65_promoted = const()[name = tensor<string, []>("op_65_promoted"), val = tensor<fp32, []>(0x1.8p+1)];
|
| 45 |
+
tensor<fp32, [1]> var_66 = sub(x = var_64, y = var_65_promoted)[name = tensor<string, []>("op_66")];
|
| 46 |
+
tensor<fp32, []> var_21_promoted = const()[name = tensor<string, []>("op_21_promoted"), val = tensor<fp32, []>(0x1p+1)];
|
| 47 |
+
tensor<fp32, [1]> floor_div_0 = floor_div(x = var_66, y = var_21_promoted)[name = tensor<string, []>("floor_div_0")];
|
| 48 |
+
tensor<fp32, []> var_68_promoted = const()[name = tensor<string, []>("op_68_promoted"), val = tensor<fp32, []>(0x1p+0)];
|
| 49 |
+
tensor<fp32, [1]> current_lengths_3 = add(x = floor_div_0, y = var_68_promoted)[name = tensor<string, []>("current_lengths_3")];
|
| 50 |
+
tensor<string, []> cast_3_dtype_0 = const()[name = tensor<string, []>("cast_3_dtype_0"), val = tensor<string, []>("int32")];
|
| 51 |
+
tensor<int32, [1, 56]> expand_dims_1 = const()[name = tensor<string, []>("expand_dims_1"), val = tensor<int32, [1, 56]>([[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55]])];
|
| 52 |
+
tensor<int32, [1]> var_77_axes_0 = const()[name = tensor<string, []>("op_77_axes_0"), val = tensor<int32, [1]>([1])];
|
| 53 |
+
tensor<int32, [1]> cast_3 = cast(dtype = cast_3_dtype_0, x = current_lengths_3)[name = tensor<string, []>("cast_24")];
|
| 54 |
+
tensor<int32, [1, 1]> var_77 = expand_dims(axes = var_77_axes_0, x = cast_3)[name = tensor<string, []>("op_77")];
|
| 55 |
+
tensor<bool, [1, 56]> time_mask_3 = less(x = expand_dims_1, y = var_77)[name = tensor<string, []>("time_mask_3")];
|
| 56 |
+
tensor<int32, [1]> var_79_axes_0 = const()[name = tensor<string, []>("op_79_axes_0"), val = tensor<int32, [1]>([-1])];
|
| 57 |
+
tensor<bool, [1, 56, 1]> var_79 = expand_dims(axes = var_79_axes_0, x = time_mask_3)[name = tensor<string, []>("op_79")];
|
| 58 |
+
tensor<int32, [3]> var_81_reps_0 = const()[name = tensor<string, []>("op_81_reps_0"), val = tensor<int32, [3]>([1, 1, 64])];
|
| 59 |
+
tensor<bool, [1, 56, 64]> var_81 = tile(reps = var_81_reps_0, x = var_79)[name = tensor<string, []>("op_81")];
|
| 60 |
+
tensor<string, []> cast_4_dtype_0 = const()[name = tensor<string, []>("cast_4_dtype_0"), val = tensor<string, []>("fp32")];
|
| 61 |
+
tensor<int32, [1]> var_87_axes_0 = const()[name = tensor<string, []>("op_87_axes_0"), val = tensor<int32, [1]>([1])];
|
| 62 |
+
tensor<fp32, [1, 56, 64]> cast_4 = cast(dtype = cast_4_dtype_0, x = var_81)[name = tensor<string, []>("cast_23")];
|
| 63 |
+
tensor<fp32, [1, 1, 56, 64]> var_87 = expand_dims(axes = var_87_axes_0, x = cast_4)[name = tensor<string, []>("op_87")];
|
| 64 |
+
tensor<int32, [4]> expanded_mask_3_reps_0 = const()[name = tensor<string, []>("expanded_mask_3_reps_0"), val = tensor<int32, [4]>([1, 256, 1, 1])];
|
| 65 |
+
tensor<fp32, [1, 256, 56, 64]> expanded_mask_3 = tile(reps = expanded_mask_3_reps_0, x = var_87)[name = tensor<string, []>("expanded_mask_3")];
|
| 66 |
+
tensor<fp32, [1, 256, 56, 64]> input_3 = mul(x = tensor_3, y = expanded_mask_3)[name = tensor<string, []>("input_3")];
|
| 67 |
+
tensor<fp32, [1, 256, 56, 64]> tensor_5 = relu(x = input_3)[name = tensor<string, []>("tensor_5")];
|
| 68 |
+
tensor<fp32, [1, 256, 56, 64]> input_5 = mul(x = tensor_5, y = expanded_mask_3)[name = tensor<string, []>("input_5")];
|
| 69 |
+
tensor<string, []> tensor_7_pad_type_0 = const()[name = tensor<string, []>("tensor_7_pad_type_0"), val = tensor<string, []>("custom")];
|
| 70 |
+
tensor<int32, [4]> tensor_7_pad_0 = const()[name = tensor<string, []>("tensor_7_pad_0"), val = tensor<int32, [4]>([1, 1, 1, 1])];
|
| 71 |
+
tensor<int32, [2]> tensor_7_strides_0 = const()[name = tensor<string, []>("tensor_7_strides_0"), val = tensor<int32, [2]>([2, 2])];
|
| 72 |
+
tensor<int32, []> tensor_7_groups_0 = const()[name = tensor<string, []>("tensor_7_groups_0"), val = tensor<int32, []>(256)];
|
| 73 |
+
tensor<int32, [2]> tensor_7_dilations_0 = const()[name = tensor<string, []>("tensor_7_dilations_0"), val = tensor<int32, [2]>([1, 1])];
|
| 74 |
+
tensor<fp32, [1, 256, 28, 32]> tensor_7 = conv(bias = model_encoder_pre_encode_conv_2_bias, dilations = tensor_7_dilations_0, groups = tensor_7_groups_0, pad = tensor_7_pad_0, pad_type = tensor_7_pad_type_0, strides = tensor_7_strides_0, weight = model_encoder_pre_encode_conv_2_weight, x = input_5)[name = tensor<string, []>("tensor_7")];
|
| 75 |
+
tensor<fp32, []> var_107_promoted = const()[name = tensor<string, []>("op_107_promoted"), val = tensor<fp32, []>(0x1p+0)];
|
| 76 |
+
tensor<fp32, [1]> var_108 = add(x = current_lengths_3, y = var_107_promoted)[name = tensor<string, []>("op_108")];
|
| 77 |
+
tensor<fp32, []> var_109_promoted = const()[name = tensor<string, []>("op_109_promoted"), val = tensor<fp32, []>(0x1p+0)];
|
| 78 |
+
tensor<fp32, [1]> var_110 = add(x = var_108, y = var_109_promoted)[name = tensor<string, []>("op_110")];
|
| 79 |
+
tensor<fp32, []> var_111_promoted = const()[name = tensor<string, []>("op_111_promoted"), val = tensor<fp32, []>(0x1.8p+1)];
|
| 80 |
+
tensor<fp32, [1]> var_112 = sub(x = var_110, y = var_111_promoted)[name = tensor<string, []>("op_112")];
|
| 81 |
+
tensor<fp32, []> var_21_promoted_1 = const()[name = tensor<string, []>("op_21_promoted_1"), val = tensor<fp32, []>(0x1p+1)];
|
| 82 |
+
tensor<fp32, [1]> floor_div_1 = floor_div(x = var_112, y = var_21_promoted_1)[name = tensor<string, []>("floor_div_1")];
|
| 83 |
+
tensor<fp32, []> var_114_promoted = const()[name = tensor<string, []>("op_114_promoted"), val = tensor<fp32, []>(0x1p+0)];
|
| 84 |
+
tensor<fp32, [1]> current_lengths_5 = add(x = floor_div_1, y = var_114_promoted)[name = tensor<string, []>("current_lengths_5")];
|
| 85 |
+
tensor<string, []> cast_5_dtype_0 = const()[name = tensor<string, []>("cast_5_dtype_0"), val = tensor<string, []>("int32")];
|
| 86 |
+
tensor<int32, [1, 28]> expand_dims_2 = const()[name = tensor<string, []>("expand_dims_2"), val = tensor<int32, [1, 28]>([[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27]])];
|
| 87 |
+
tensor<int32, [1]> var_123_axes_0 = const()[name = tensor<string, []>("op_123_axes_0"), val = tensor<int32, [1]>([1])];
|
| 88 |
+
tensor<int32, [1]> cast_5 = cast(dtype = cast_5_dtype_0, x = current_lengths_5)[name = tensor<string, []>("cast_22")];
|
| 89 |
+
tensor<int32, [1, 1]> var_123 = expand_dims(axes = var_123_axes_0, x = cast_5)[name = tensor<string, []>("op_123")];
|
| 90 |
+
tensor<bool, [1, 28]> time_mask_5 = less(x = expand_dims_2, y = var_123)[name = tensor<string, []>("time_mask_5")];
|
| 91 |
+
tensor<int32, [1]> var_125_axes_0 = const()[name = tensor<string, []>("op_125_axes_0"), val = tensor<int32, [1]>([-1])];
|
| 92 |
+
tensor<bool, [1, 28, 1]> var_125 = expand_dims(axes = var_125_axes_0, x = time_mask_5)[name = tensor<string, []>("op_125")];
|
| 93 |
+
tensor<int32, [3]> var_127_reps_0 = const()[name = tensor<string, []>("op_127_reps_0"), val = tensor<int32, [3]>([1, 1, 32])];
|
| 94 |
+
tensor<bool, [1, 28, 32]> var_127 = tile(reps = var_127_reps_0, x = var_125)[name = tensor<string, []>("op_127")];
|
| 95 |
+
tensor<string, []> cast_6_dtype_0 = const()[name = tensor<string, []>("cast_6_dtype_0"), val = tensor<string, []>("fp32")];
|
| 96 |
+
tensor<int32, [1]> var_133_axes_0 = const()[name = tensor<string, []>("op_133_axes_0"), val = tensor<int32, [1]>([1])];
|
| 97 |
+
tensor<fp32, [1, 28, 32]> cast_6 = cast(dtype = cast_6_dtype_0, x = var_127)[name = tensor<string, []>("cast_21")];
|
| 98 |
+
tensor<fp32, [1, 1, 28, 32]> var_133 = expand_dims(axes = var_133_axes_0, x = cast_6)[name = tensor<string, []>("op_133")];
|
| 99 |
+
tensor<int32, [4]> expanded_mask_7_reps_0 = const()[name = tensor<string, []>("expanded_mask_7_reps_0"), val = tensor<int32, [4]>([1, 256, 1, 1])];
|
| 100 |
+
tensor<fp32, [1, 256, 28, 32]> expanded_mask_7 = tile(reps = expanded_mask_7_reps_0, x = var_133)[name = tensor<string, []>("expanded_mask_7")];
|
| 101 |
+
tensor<fp32, [1, 256, 28, 32]> input_7 = mul(x = tensor_7, y = expanded_mask_7)[name = tensor<string, []>("input_7")];
|
| 102 |
+
tensor<string, []> tensor_9_pad_type_0 = const()[name = tensor<string, []>("tensor_9_pad_type_0"), val = tensor<string, []>("valid")];
|
| 103 |
+
tensor<int32, [2]> tensor_9_strides_0 = const()[name = tensor<string, []>("tensor_9_strides_0"), val = tensor<int32, [2]>([1, 1])];
|
| 104 |
+
tensor<int32, [4]> tensor_9_pad_0 = const()[name = tensor<string, []>("tensor_9_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
|
| 105 |
+
tensor<int32, [2]> tensor_9_dilations_0 = const()[name = tensor<string, []>("tensor_9_dilations_0"), val = tensor<int32, [2]>([1, 1])];
|
| 106 |
+
tensor<int32, []> tensor_9_groups_0 = const()[name = tensor<string, []>("tensor_9_groups_0"), val = tensor<int32, []>(1)];
|
| 107 |
+
tensor<fp32, [1, 256, 28, 32]> tensor_9 = conv(bias = model_encoder_pre_encode_conv_3_bias, dilations = tensor_9_dilations_0, groups = tensor_9_groups_0, pad = tensor_9_pad_0, pad_type = tensor_9_pad_type_0, strides = tensor_9_strides_0, weight = model_encoder_pre_encode_conv_3_weight, x = input_7)[name = tensor<string, []>("tensor_9")];
|
| 108 |
+
tensor<fp32, [1, 256, 28, 32]> input_9 = mul(x = tensor_9, y = expanded_mask_7)[name = tensor<string, []>("input_9")];
|
| 109 |
+
tensor<fp32, [1, 256, 28, 32]> tensor_11 = relu(x = input_9)[name = tensor<string, []>("tensor_11")];
|
| 110 |
+
tensor<fp32, [1, 256, 28, 32]> input_11 = mul(x = tensor_11, y = expanded_mask_7)[name = tensor<string, []>("input_11")];
|
| 111 |
+
tensor<string, []> tensor_13_pad_type_0 = const()[name = tensor<string, []>("tensor_13_pad_type_0"), val = tensor<string, []>("custom")];
|
| 112 |
+
tensor<int32, [4]> tensor_13_pad_0 = const()[name = tensor<string, []>("tensor_13_pad_0"), val = tensor<int32, [4]>([1, 1, 1, 1])];
|
| 113 |
+
tensor<int32, [2]> tensor_13_strides_0 = const()[name = tensor<string, []>("tensor_13_strides_0"), val = tensor<int32, [2]>([2, 2])];
|
| 114 |
+
tensor<int32, []> tensor_13_groups_0 = const()[name = tensor<string, []>("tensor_13_groups_0"), val = tensor<int32, []>(256)];
|
| 115 |
+
tensor<int32, [2]> tensor_13_dilations_0 = const()[name = tensor<string, []>("tensor_13_dilations_0"), val = tensor<int32, [2]>([1, 1])];
|
| 116 |
+
tensor<fp32, [1, 256, 14, 16]> tensor_13 = conv(bias = model_encoder_pre_encode_conv_5_bias, dilations = tensor_13_dilations_0, groups = tensor_13_groups_0, pad = tensor_13_pad_0, pad_type = tensor_13_pad_type_0, strides = tensor_13_strides_0, weight = model_encoder_pre_encode_conv_5_weight, x = input_11)[name = tensor<string, []>("tensor_13")];
|
| 117 |
+
tensor<fp32, []> var_168_promoted = const()[name = tensor<string, []>("op_168_promoted"), val = tensor<fp32, []>(0x1p+0)];
|
| 118 |
+
tensor<fp32, [1]> var_169 = add(x = current_lengths_5, y = var_168_promoted)[name = tensor<string, []>("op_169")];
|
| 119 |
+
tensor<fp32, []> var_170_promoted = const()[name = tensor<string, []>("op_170_promoted"), val = tensor<fp32, []>(0x1p+0)];
|
| 120 |
+
tensor<fp32, [1]> var_171 = add(x = var_169, y = var_170_promoted)[name = tensor<string, []>("op_171")];
|
| 121 |
+
tensor<fp32, []> var_172_promoted = const()[name = tensor<string, []>("op_172_promoted"), val = tensor<fp32, []>(0x1.8p+1)];
|
| 122 |
+
tensor<fp32, [1]> var_173 = sub(x = var_171, y = var_172_promoted)[name = tensor<string, []>("op_173")];
|
| 123 |
+
tensor<fp32, []> var_21_promoted_2 = const()[name = tensor<string, []>("op_21_promoted_2"), val = tensor<fp32, []>(0x1p+1)];
|
| 124 |
+
tensor<fp32, [1]> floor_div_2 = floor_div(x = var_173, y = var_21_promoted_2)[name = tensor<string, []>("floor_div_2")];
|
| 125 |
+
tensor<fp32, []> var_175_promoted = const()[name = tensor<string, []>("op_175_promoted"), val = tensor<fp32, []>(0x1p+0)];
|
| 126 |
+
tensor<fp32, [1]> current_lengths = add(x = floor_div_2, y = var_175_promoted)[name = tensor<string, []>("current_lengths")];
|
| 127 |
+
tensor<string, []> cast_7_dtype_0 = const()[name = tensor<string, []>("cast_7_dtype_0"), val = tensor<string, []>("int32")];
|
| 128 |
+
tensor<int32, [1, 14]> expand_dims_3 = const()[name = tensor<string, []>("expand_dims_3"), val = tensor<int32, [1, 14]>([[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13]])];
|
| 129 |
+
tensor<int32, [1]> var_184_axes_0 = const()[name = tensor<string, []>("op_184_axes_0"), val = tensor<int32, [1]>([1])];
|
| 130 |
+
tensor<int32, [1]> cast_7 = cast(dtype = cast_7_dtype_0, x = current_lengths)[name = tensor<string, []>("cast_20")];
|
| 131 |
+
tensor<int32, [1, 1]> var_184 = expand_dims(axes = var_184_axes_0, x = cast_7)[name = tensor<string, []>("op_184")];
|
| 132 |
+
tensor<bool, [1, 14]> time_mask = less(x = expand_dims_3, y = var_184)[name = tensor<string, []>("time_mask")];
|
| 133 |
+
tensor<int32, [1]> var_186_axes_0 = const()[name = tensor<string, []>("op_186_axes_0"), val = tensor<int32, [1]>([-1])];
|
| 134 |
+
tensor<bool, [1, 14, 1]> var_186 = expand_dims(axes = var_186_axes_0, x = time_mask)[name = tensor<string, []>("op_186")];
|
| 135 |
+
tensor<int32, [3]> var_188_reps_0 = const()[name = tensor<string, []>("op_188_reps_0"), val = tensor<int32, [3]>([1, 1, 16])];
|
| 136 |
+
tensor<bool, [1, 14, 16]> var_188 = tile(reps = var_188_reps_0, x = var_186)[name = tensor<string, []>("op_188")];
|
| 137 |
+
tensor<string, []> cast_8_dtype_0 = const()[name = tensor<string, []>("cast_8_dtype_0"), val = tensor<string, []>("fp32")];
|
| 138 |
+
tensor<int32, [1]> var_194_axes_0 = const()[name = tensor<string, []>("op_194_axes_0"), val = tensor<int32, [1]>([1])];
|
| 139 |
+
tensor<fp32, [1, 14, 16]> cast_8 = cast(dtype = cast_8_dtype_0, x = var_188)[name = tensor<string, []>("cast_19")];
|
| 140 |
+
tensor<fp32, [1, 1, 14, 16]> var_194 = expand_dims(axes = var_194_axes_0, x = cast_8)[name = tensor<string, []>("op_194")];
|
| 141 |
+
tensor<int32, [4]> expanded_mask_13_reps_0 = const()[name = tensor<string, []>("expanded_mask_13_reps_0"), val = tensor<int32, [4]>([1, 256, 1, 1])];
|
| 142 |
+
tensor<fp32, [1, 256, 14, 16]> expanded_mask_13 = tile(reps = expanded_mask_13_reps_0, x = var_194)[name = tensor<string, []>("expanded_mask_13")];
|
| 143 |
+
tensor<fp32, [1, 256, 14, 16]> input_13 = mul(x = tensor_13, y = expanded_mask_13)[name = tensor<string, []>("input_13")];
|
| 144 |
+
tensor<string, []> tensor_15_pad_type_0 = const()[name = tensor<string, []>("tensor_15_pad_type_0"), val = tensor<string, []>("valid")];
|
| 145 |
+
tensor<int32, [2]> tensor_15_strides_0 = const()[name = tensor<string, []>("tensor_15_strides_0"), val = tensor<int32, [2]>([1, 1])];
|
| 146 |
+
tensor<int32, [4]> tensor_15_pad_0 = const()[name = tensor<string, []>("tensor_15_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
|
| 147 |
+
tensor<int32, [2]> tensor_15_dilations_0 = const()[name = tensor<string, []>("tensor_15_dilations_0"), val = tensor<int32, [2]>([1, 1])];
|
| 148 |
+
tensor<int32, []> tensor_15_groups_0 = const()[name = tensor<string, []>("tensor_15_groups_0"), val = tensor<int32, []>(1)];
|
| 149 |
+
tensor<fp32, [1, 256, 14, 16]> tensor_15 = conv(bias = model_encoder_pre_encode_conv_6_bias, dilations = tensor_15_dilations_0, groups = tensor_15_groups_0, pad = tensor_15_pad_0, pad_type = tensor_15_pad_type_0, strides = tensor_15_strides_0, weight = model_encoder_pre_encode_conv_6_weight, x = input_13)[name = tensor<string, []>("tensor_15")];
|
| 150 |
+
tensor<fp32, [1, 256, 14, 16]> input_15 = mul(x = tensor_15, y = expanded_mask_13)[name = tensor<string, []>("input_15")];
|
| 151 |
+
tensor<fp32, [1, 256, 14, 16]> tensor_workaround = relu(x = input_15)[name = tensor<string, []>("tensor_workaround")];
|
| 152 |
+
tensor<fp32, [1, 256, 14, 16]> x = mul(x = tensor_workaround, y = expanded_mask_13)[name = tensor<string, []>("x")];
|
| 153 |
+
tensor<int32, [4]> var_228_perm_0 = const()[name = tensor<string, []>("op_228_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
|
| 154 |
+
tensor<int32, [3]> var_229 = const()[name = tensor<string, []>("op_229"), val = tensor<int32, [3]>([1, 14, -1])];
|
| 155 |
+
tensor<fp32, [1, 14, 256, 16]> var_228 = transpose(perm = var_228_perm_0, x = x)[name = tensor<string, []>("transpose_0")];
|
| 156 |
+
tensor<fp32, [1, 14, 4096]> input = reshape(shape = var_229, x = var_228)[name = tensor<string, []>("input")];
|
| 157 |
+
tensor<fp32, [1, 14, 512]> chunk_embs_in = linear(bias = model_encoder_pre_encode_out_bias, weight = model_encoder_pre_encode_out_weight, x = input)[name = tensor<string, []>("linear_0")];
|
| 158 |
+
tensor<string, []> cast_11_dtype_0 = const()[name = tensor<string, []>("cast_11_dtype_0"), val = tensor<string, []>("int32")];
|
| 159 |
+
tensor<int32, [1]> size0 = const()[name = tensor<string, []>("size0"), val = tensor<int32, [1]>([188])];
|
| 160 |
+
tensor<int32, [1]> size1 = const()[name = tensor<string, []>("size1"), val = tensor<int32, [1]>([188])];
|
| 161 |
+
tensor<int32, []> var_264 = const()[name = tensor<string, []>("op_264"), val = tensor<int32, []>(1)];
|
| 162 |
+
tensor<bool, []> full_concat_interleave_0 = const()[name = tensor<string, []>("full_concat_interleave_0"), val = tensor<bool, []>(false)];
|
| 163 |
+
tensor<fp32, [1, 390, 512]> full_concat = concat(axis = var_264, interleave = full_concat_interleave_0, values = (spkcache, fifo, chunk_embs_in))[name = tensor<string, []>("full_concat")];
|
| 164 |
+
tensor<int32, [1]> var_273 = add(x = spkcache_lengths, y = fifo_lengths)[name = tensor<string, []>("op_273")];
|
| 165 |
+
tensor<int32, [1]> chunk_lens_in = cast(dtype = cast_11_dtype_0, x = current_lengths)[name = tensor<string, []>("cast_18")];
|
| 166 |
+
tensor<int32, [1]> pre_encoder_lengths = add(x = var_273, y = chunk_lens_in)[name = tensor<string, []>("total_length")];
|
| 167 |
+
tensor<int32, [390]> out_pos = const()[name = tensor<string, []>("out_pos"), val = tensor<int32, [390]>([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255, 256, 257, 258, 259, 260, 261, 262, 263, 264, 265, 266, 267, 268, 269, 270, 271, 272, 273, 274, 275, 276, 277, 278, 279, 280, 281, 282, 283, 284, 285, 286, 287, 288, 289, 290, 291, 292, 293, 294, 295, 296, 297, 298, 299, 300, 301, 302, 303, 304, 305, 306, 307, 308, 309, 310, 311, 312, 313, 314, 315, 316, 317, 318, 319, 320, 321, 322, 323, 324, 325, 326, 327, 328, 329, 330, 331, 332, 333, 334, 335, 336, 337, 338, 339, 340, 341, 342, 343, 344, 345, 346, 347, 348, 349, 350, 351, 352, 353, 354, 355, 356, 357, 358, 359, 360, 361, 362, 363, 364, 365, 366, 367, 368, 369, 370, 371, 372, 373, 374, 375, 376, 377, 378, 379, 380, 381, 382, 383, 384, 385, 386, 387, 388, 389])];
|
| 168 |
+
tensor<bool, [390]> var_284 = greater_equal(x = out_pos, y = spkcache_lengths)[name = tensor<string, []>("op_284")];
|
| 169 |
+
tensor<string, []> cast_12_dtype_0 = const()[name = tensor<string, []>("cast_12_dtype_0"), val = tensor<string, []>("int32")];
|
| 170 |
+
tensor<bool, [390]> var_290 = greater_equal(x = out_pos, y = var_273)[name = tensor<string, []>("op_290")];
|
| 171 |
+
tensor<string, []> cast_13_dtype_0 = const()[name = tensor<string, []>("cast_13_dtype_0"), val = tensor<string, []>("int32")];
|
| 172 |
+
tensor<int32, [1]> var_297 = sub(x = size0, y = spkcache_lengths)[name = tensor<string, []>("op_297")];
|
| 173 |
+
tensor<int32, [390]> cast_12 = cast(dtype = cast_12_dtype_0, x = var_284)[name = tensor<string, []>("cast_17")];
|
| 174 |
+
tensor<int32, [390]> var_298 = mul(x = cast_12, y = var_297)[name = tensor<string, []>("op_298")];
|
| 175 |
+
tensor<int32, [1]> var_300 = sub(x = size1, y = fifo_lengths)[name = tensor<string, []>("op_300")];
|
| 176 |
+
tensor<int32, [390]> cast_13 = cast(dtype = cast_13_dtype_0, x = var_290)[name = tensor<string, []>("cast_16")];
|
| 177 |
+
tensor<int32, [390]> var_301 = mul(x = cast_13, y = var_300)[name = tensor<string, []>("op_301")];
|
| 178 |
+
tensor<int32, [390]> offset = add(x = var_298, y = var_301)[name = tensor<string, []>("offset")];
|
| 179 |
+
tensor<int32, [390]> var_305 = add(x = out_pos, y = offset)[name = tensor<string, []>("op_305")];
|
| 180 |
+
tensor<int32, []> var_309 = const()[name = tensor<string, []>("op_309"), val = tensor<int32, []>(389)];
|
| 181 |
+
tensor<int32, []> var_310 = const()[name = tensor<string, []>("op_310"), val = tensor<int32, []>(0)];
|
| 182 |
+
tensor<int32, [390]> minimum_0 = minimum(x = var_305, y = var_309)[name = tensor<string, []>("minimum_0")];
|
| 183 |
+
tensor<int32, [390]> maximum_0 = maximum(x = minimum_0, y = var_310)[name = tensor<string, []>("maximum_0")];
|
| 184 |
+
tensor<int32, [1]> var_313_axes_0 = const()[name = tensor<string, []>("op_313_axes_0"), val = tensor<int32, [1]>([0])];
|
| 185 |
+
tensor<int32, [1, 390]> var_313 = expand_dims(axes = var_313_axes_0, x = maximum_0)[name = tensor<string, []>("op_313")];
|
| 186 |
+
tensor<int32, [1]> var_315_axes_0 = const()[name = tensor<string, []>("op_315_axes_0"), val = tensor<int32, [1]>([-1])];
|
| 187 |
+
tensor<int32, [1, 390, 1]> var_315 = expand_dims(axes = var_315_axes_0, x = var_313)[name = tensor<string, []>("op_315")];
|
| 188 |
+
tensor<int32, [3]> gather_idx_reps_0 = const()[name = tensor<string, []>("gather_idx_reps_0"), val = tensor<int32, [3]>([1, 1, 512])];
|
| 189 |
+
tensor<int32, [1, 390, 512]> gather_idx = tile(reps = gather_idx_reps_0, x = var_315)[name = tensor<string, []>("gather_idx")];
|
| 190 |
+
tensor<int32, []> var_320 = const()[name = tensor<string, []>("op_320"), val = tensor<int32, []>(1)];
|
| 191 |
+
tensor<fp32, [1, 390, 512]> output = gather_along_axis(axis = var_320, indices = gather_idx, x = full_concat)[name = tensor<string, []>("output")];
|
| 192 |
+
tensor<bool, [390]> var_323 = less(x = out_pos, y = pre_encoder_lengths)[name = tensor<string, []>("op_323")];
|
| 193 |
+
tensor<string, []> cast_14_dtype_0 = const()[name = tensor<string, []>("cast_14_dtype_0"), val = tensor<string, []>("fp32")];
|
| 194 |
+
tensor<int32, [1]> var_330_axes_0 = const()[name = tensor<string, []>("op_330_axes_0"), val = tensor<int32, [1]>([0])];
|
| 195 |
+
tensor<fp32, [390]> cast_14 = cast(dtype = cast_14_dtype_0, x = var_323)[name = tensor<string, []>("cast_15")];
|
| 196 |
+
tensor<fp32, [1, 390]> var_330 = expand_dims(axes = var_330_axes_0, x = cast_14)[name = tensor<string, []>("op_330")];
|
| 197 |
+
tensor<int32, [1]> var_332_axes_0 = const()[name = tensor<string, []>("op_332_axes_0"), val = tensor<int32, [1]>([-1])];
|
| 198 |
+
tensor<fp32, [1, 390, 1]> var_332 = expand_dims(axes = var_332_axes_0, x = var_330)[name = tensor<string, []>("op_332")];
|
| 199 |
+
tensor<fp32, [1, 390, 512]> pre_encoder_embs = mul(x = output, y = var_332)[name = tensor<string, []>("op_333")];
|
| 200 |
+
} -> (pre_encoder_embs, pre_encoder_lengths, chunk_embs_in, chunk_lens_in);
|
| 201 |
+
}
|
Pipeline_PreEncoder.mlmodelc/weights/weight.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:88a98803e35186b1dfb41d7f748f7cee5093bb6efeb117f56953c17549792fa4
|
| 3 |
+
size 8948544
|
Pipeline_PreEncoder.mlpackage/Data/com.apple.CoreML/model.mlmodel
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:fb3f36b3b9d3f63e7a4f89a8848c6e3bc1a4a983786a832ea2c60cc395525cc2
|
| 3 |
+
size 26802
|
Pipeline_PreEncoder.mlpackage/Data/com.apple.CoreML/weights/weight.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:88a98803e35186b1dfb41d7f748f7cee5093bb6efeb117f56953c17549792fa4
|
| 3 |
+
size 8948544
|
Pipeline_PreEncoder.mlpackage/Manifest.json
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"fileFormatVersion": "1.0.0",
|
| 3 |
+
"itemInfoEntries": {
|
| 4 |
+
"6894C507-E04A-4096-A90F-9AB0F58870E0": {
|
| 5 |
+
"author": "com.apple.CoreML",
|
| 6 |
+
"description": "CoreML Model Specification",
|
| 7 |
+
"name": "model.mlmodel",
|
| 8 |
+
"path": "com.apple.CoreML/model.mlmodel"
|
| 9 |
+
},
|
| 10 |
+
"E6A26F48-2E38-4E3A-AF2B-89704AAA4B4C": {
|
| 11 |
+
"author": "com.apple.CoreML",
|
| 12 |
+
"description": "CoreML Model Weights",
|
| 13 |
+
"name": "weights",
|
| 14 |
+
"path": "com.apple.CoreML/weights"
|
| 15 |
+
}
|
| 16 |
+
},
|
| 17 |
+
"rootModelIdentifier": "6894C507-E04A-4096-A90F-9AB0F58870E0"
|
| 18 |
+
}
|
Pipeline_Preprocessor.mlmodelc/analytics/coremldata.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:eee5506c26dd1453734200ef08e1a263599e25fbdd433ecc425e7d8fd3c39641
|
| 3 |
+
size 243
|
Pipeline_Preprocessor.mlmodelc/coremldata.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:15271d7ebb0d1f8f82ae60468e6a554b71d386f6f5f717b00332b9ace990be16
|
| 3 |
+
size 374
|
Pipeline_Preprocessor.mlmodelc/model.mil
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
Pipeline_Preprocessor.mlmodelc/weights/weight.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:e6d96fe6aa2f786e9ce18f53c2c6058807fbd9733dc48813d30554ee9b1caf80
|
| 3 |
+
size 1184512
|
Pipeline_Preprocessor.mlpackage/Data/com.apple.CoreML/model.mlmodel
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:4ba45e5189ff1a12d01f2ebc6fc5db8ab7ad63cedcbb9847515ad0e7881daa0e
|
| 3 |
+
size 48673
|
Pipeline_Preprocessor.mlpackage/Data/com.apple.CoreML/weights/weight.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:e6d96fe6aa2f786e9ce18f53c2c6058807fbd9733dc48813d30554ee9b1caf80
|
| 3 |
+
size 1184512
|
Pipeline_Preprocessor.mlpackage/Manifest.json
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"fileFormatVersion": "1.0.0",
|
| 3 |
+
"itemInfoEntries": {
|
| 4 |
+
"43321C07-C241-4F12-89F5-DF8385087F7C": {
|
| 5 |
+
"author": "com.apple.CoreML",
|
| 6 |
+
"description": "CoreML Model Weights",
|
| 7 |
+
"name": "weights",
|
| 8 |
+
"path": "com.apple.CoreML/weights"
|
| 9 |
+
},
|
| 10 |
+
"505C3394-5B98-45F7-8F05-9C513B04AFCB": {
|
| 11 |
+
"author": "com.apple.CoreML",
|
| 12 |
+
"description": "CoreML Model Specification",
|
| 13 |
+
"name": "model.mlmodel",
|
| 14 |
+
"path": "com.apple.CoreML/model.mlmodel"
|
| 15 |
+
}
|
| 16 |
+
},
|
| 17 |
+
"rootModelIdentifier": "505C3394-5B98-45F7-8F05-9C513B04AFCB"
|
| 18 |
+
}
|
README.md
ADDED
|
@@ -0,0 +1,137 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Streaming Sortformer CoreML
|
| 2 |
+
|
| 3 |
+
CoreML conversion of NVIDIA's Streaming Sortformer 4-Speaker Diarization model for Apple Silicon.
|
| 4 |
+
|
| 5 |
+
## Original Model
|
| 6 |
+
|
| 7 |
+
- **Source**: [nvidia/diar_streaming_sortformer_4spk-v2.1](https://huggingface.co/nvidia/diar_streaming_sortformer_4spk-v2.1)
|
| 8 |
+
- **Paper**: [Sortformer: Seamless Integration of Speaker Diarization and ASR](https://arxiv.org/abs/2409.06656)
|
| 9 |
+
- **Benchmark**: 20.57% DER on AMI SDM (NVIDIA reported)
|
| 10 |
+
|
| 11 |
+
## Models
|
| 12 |
+
|
| 13 |
+
| Model | Description | Input | Output |
|
| 14 |
+
|-------|-------------|-------|--------|
|
| 15 |
+
| `Pipeline_Preprocessor.mlpackage` | Mel spectrogram extraction | Audio waveform | 128-dim mel features |
|
| 16 |
+
| `Pipeline_PreEncoder.mlpackage` | FastConformer encoder + Transformer | Mel features + state | Encoded embeddings |
|
| 17 |
+
| `Pipeline_Head_Fixed.mlpackage` | Speaker prediction head | Embeddings | 4-speaker probabilities |
|
| 18 |
+
|
| 19 |
+
## Configuration
|
| 20 |
+
|
| 21 |
+
```python
|
| 22 |
+
CONFIG = {
|
| 23 |
+
"chunk_len": 6, # Core chunk length (encoder frames)
|
| 24 |
+
"chunk_left_context": 1, # Left context frames
|
| 25 |
+
"chunk_right_context": 7, # Right context frames
|
| 26 |
+
"fifo_len": 188, # FIFO buffer length
|
| 27 |
+
"spkcache_len": 188, # Speaker cache length
|
| 28 |
+
"subsampling_factor": 8, # 8x subsampling (80ms per encoder frame)
|
| 29 |
+
"sample_rate": 16000,
|
| 30 |
+
"mel_features": 128,
|
| 31 |
+
"n_speakers": 4,
|
| 32 |
+
}
|
| 33 |
+
```
|
| 34 |
+
|
| 35 |
+
## Usage
|
| 36 |
+
|
| 37 |
+
### Python (coremltools)
|
| 38 |
+
|
| 39 |
+
```python
|
| 40 |
+
import coremltools as ct
|
| 41 |
+
import numpy as np
|
| 42 |
+
|
| 43 |
+
# Load models
|
| 44 |
+
pre_encoder = ct.models.MLModel("Pipeline_PreEncoder.mlpackage",
|
| 45 |
+
compute_units=ct.ComputeUnit.CPU_ONLY)
|
| 46 |
+
head = ct.models.MLModel("Pipeline_Head_Fixed.mlpackage",
|
| 47 |
+
compute_units=ct.ComputeUnit.CPU_ONLY)
|
| 48 |
+
|
| 49 |
+
# Initialize state
|
| 50 |
+
spkcache = np.zeros((1, 188, 512), dtype=np.float32)
|
| 51 |
+
fifo = np.zeros((1, 188, 512), dtype=np.float32)
|
| 52 |
+
|
| 53 |
+
# Process chunk (mel_features: [1, 112, 128])
|
| 54 |
+
pre_out = pre_encoder.predict({
|
| 55 |
+
"chunk": mel_features,
|
| 56 |
+
"chunk_lengths": np.array([actual_length], dtype=np.int32),
|
| 57 |
+
"spkcache": spkcache,
|
| 58 |
+
"spkcache_lengths": np.array([0], dtype=np.int32),
|
| 59 |
+
"fifo": fifo,
|
| 60 |
+
"fifo_lengths": np.array([0], dtype=np.int32)
|
| 61 |
+
})
|
| 62 |
+
|
| 63 |
+
head_out = head.predict({
|
| 64 |
+
"pre_encoder_embs": pre_out["pre_encoder_embs"],
|
| 65 |
+
"pre_encoder_lengths": pre_out["pre_encoder_lengths"],
|
| 66 |
+
"chunk_embs_in": pre_out["chunk_embs_in"],
|
| 67 |
+
"chunk_lens_in": pre_out["chunk_lens_in"]
|
| 68 |
+
})
|
| 69 |
+
|
| 70 |
+
predictions = head_out["speaker_preds"] # [1, T, 4]
|
| 71 |
+
```
|
| 72 |
+
|
| 73 |
+
### Swift (Core ML)
|
| 74 |
+
|
| 75 |
+
```swift
|
| 76 |
+
import CoreML
|
| 77 |
+
|
| 78 |
+
let preEncoder = try MLModel(contentsOf: preEncoderURL)
|
| 79 |
+
let head = try MLModel(contentsOf: headURL)
|
| 80 |
+
|
| 81 |
+
// Create input with MLMultiArray for chunk, spkcache, fifo
|
| 82 |
+
let preEncoderInput = try preEncoder.prediction(from: inputProvider)
|
| 83 |
+
let headInput = try head.prediction(from: preEncoderInput)
|
| 84 |
+
|
| 85 |
+
let predictions = headInput.featureValue(for: "speaker_preds")
|
| 86 |
+
```
|
| 87 |
+
|
| 88 |
+
## Mel Spectrogram Settings
|
| 89 |
+
|
| 90 |
+
For compatibility with the original NeMo model:
|
| 91 |
+
|
| 92 |
+
```python
|
| 93 |
+
mel_config = {
|
| 94 |
+
"sample_rate": 16000,
|
| 95 |
+
"n_fft": 512,
|
| 96 |
+
"win_length": 400, # 25ms
|
| 97 |
+
"hop_length": 160, # 10ms
|
| 98 |
+
"n_mels": 128,
|
| 99 |
+
"preemph": 0.97,
|
| 100 |
+
"log_zero_guard_value": 2**-24,
|
| 101 |
+
"normalize": "per_feature",
|
| 102 |
+
}
|
| 103 |
+
```
|
| 104 |
+
|
| 105 |
+
## Streaming Pipeline
|
| 106 |
+
|
| 107 |
+
1. **Chunk audio** into ~480ms windows (48 mel frames core + context)
|
| 108 |
+
2. **Compute mel spectrogram** for each chunk
|
| 109 |
+
3. **Run PreEncoder** with current state (spkcache + fifo)
|
| 110 |
+
4. **Run Head** to get 4-speaker probabilities
|
| 111 |
+
5. **Update state** (spkcache/fifo buffers)
|
| 112 |
+
6. **Threshold predictions** (default: 0.5) for binary speaker activity
|
| 113 |
+
|
| 114 |
+
## Accuracy
|
| 115 |
+
|
| 116 |
+
Verified within 0.12% of original NeMo PyTorch model on chunk-level predictions.
|
| 117 |
+
|
| 118 |
+
## Requirements
|
| 119 |
+
|
| 120 |
+
- macOS 12+ or iOS 15+
|
| 121 |
+
- Apple Silicon (M1/M2/M3) recommended
|
| 122 |
+
- Python: `coremltools`, `numpy`, `torch`, `torchaudio`
|
| 123 |
+
|
| 124 |
+
## License
|
| 125 |
+
|
| 126 |
+
Apache 2.0 (following NVIDIA NeMo licensing)
|
| 127 |
+
|
| 128 |
+
## Citation
|
| 129 |
+
|
| 130 |
+
```bibtex
|
| 131 |
+
@article{park2024sortformer,
|
| 132 |
+
title={Sortformer: Seamless Integration of Speaker Diarization and ASR by Bridging Timestamps and Tokens},
|
| 133 |
+
author={Park, Taejin and Huang, He and Koluguri, Nithin and Georgiou, Panagiotis and Watanabe, Shinji and Ginsburg, Boris},
|
| 134 |
+
journal={arXiv preprint arXiv:2409.06656},
|
| 135 |
+
year={2024}
|
| 136 |
+
}
|
| 137 |
+
```
|
convert_to_coreml.py
ADDED
|
@@ -0,0 +1,357 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import torch
|
| 2 |
+
import torch.nn as nn
|
| 3 |
+
import coremltools as ct
|
| 4 |
+
import argparse
|
| 5 |
+
import os
|
| 6 |
+
import sys
|
| 7 |
+
import numpy as np
|
| 8 |
+
import types
|
| 9 |
+
|
| 10 |
+
# Ensure we use the right environment for imports
|
| 11 |
+
# (User's environment has 'nemo' installed)
|
| 12 |
+
from nemo.collections.asr.models import SortformerEncLabelModel
|
| 13 |
+
from nemo.collections.asr.parts.preprocessing.features import FilterbankFeaturesTA
|
| 14 |
+
from coreml_wrappers import *
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
def convert_pre_encoder(
    model: SortformerEncLabelModel,
    precision,
    name: str,
    input_chunk, input_chunk_len,
    input_spkcache, input_spkcache_len,
    input_fifo, input_fifo_len
):
    """Trace the Sortformer pre-encoder wrapper and convert it to a CoreML model.

    :param model: Loaded SortformerEncLabelModel to wrap with PreEncoderWrapper.
    :param precision: coremltools compute precision (ct.precision.FLOAT16 or FLOAT32).
    :param name: Output path for the saved .mlpackage.
    :param input_chunk: Example mel-feature chunk tensor used for tracing.
    :param input_chunk_len: Example chunk length tensor.
    :param input_spkcache: Example speaker-cache embedding tensor.
    :param input_spkcache_len: Example speaker-cache length tensor.
    :param input_fifo: Example FIFO embedding tensor.
    :param input_fifo_len: Example FIFO length tensor.
    :return: Tuple of (converted CoreML model, traced TorchScript module).
    """
    wrapper = PreEncoderWrapper(model)
    wrapper.eval()

    traced_model = torch.jit.trace(wrapper, (
        input_chunk, input_chunk_len,
        input_spkcache, input_spkcache_len,
        input_fifo, input_fifo_len
    ))

    mlmodel = ct.convert(
        traced_model,
        inputs=[
            ct.TensorType(name="chunk", shape=input_chunk.shape, dtype=np.float32),
            ct.TensorType(name="chunk_lengths", shape=input_chunk_len.shape, dtype=np.int32),
            # dtype=np.float32 made explicit for consistency with "chunk";
            # fp32 is also coremltools' default for TensorType, so the
            # converted model is unchanged.
            ct.TensorType(name="spkcache", shape=input_spkcache.shape, dtype=np.float32),
            ct.TensorType(name="spkcache_lengths", shape=input_spkcache_len.shape, dtype=np.int32),
            ct.TensorType(name="fifo", shape=input_fifo.shape, dtype=np.float32),
            ct.TensorType(name="fifo_lengths", shape=input_fifo_len.shape, dtype=np.int32),
        ],
        outputs=[
            ct.TensorType(name="pre_encoder_embs", dtype=np.float32),
            ct.TensorType(name="pre_encoder_lengths", dtype=np.int32),
            # NOTE(review): the shipped Pipeline_PreEncoder MIL artifact exposes
            # these two outputs as "chunk_embs_in"/"chunk_lens_in" -- confirm the
            # intended naming before re-exporting.
            ct.TensorType(name="chunk_pre_encoder_embs", dtype=np.float32),
            ct.TensorType(name="chunk_pre_encoder_lengths", dtype=np.int32),
        ],
        minimum_deployment_target=ct.target.iOS16,
        compute_precision=precision,
        compute_units=ct.ComputeUnit.ALL
    )

    mlmodel.save(name)
    return mlmodel, traced_model
|
| 58 |
+
|
| 59 |
+
def convert_head(
    model: SortformerEncLabelModel,
    precision,
    name: str,
    pre_encoder_embs, pre_encoder_lengths,
    chunk_pre_encoder_embs, chunk_pre_encoder_lengths,
):
    """Trace the Sortformer head wrapper and convert it to a CoreML model.

    :param model: loaded SortformerEncLabelModel whose head is wrapped
    :param precision: coremltools compute precision enum value
    :param name: destination path of the saved .mlpackage
    :param pre_encoder_embs: example pre-encoded embedding tensor for tracing
    :param pre_encoder_lengths: example length tensor for those embeddings
    :param chunk_pre_encoder_embs: example chunk-only embedding tensor
    :param chunk_pre_encoder_lengths: example chunk-only length tensor
    :return: tuple of (converted CoreML model, traced TorchScript module)
    """
    head = SortformerHeadWrapper(model)
    head.eval()

    example_inputs = (
        pre_encoder_embs, pre_encoder_lengths,
        chunk_pre_encoder_embs, chunk_pre_encoder_lengths,
    )
    traced = torch.jit.trace(head, example_inputs)

    # Input/output specs mirror the wrapper's signature; the chunk tensors are
    # passed through as outputs so the head can be chained in a pipeline.
    input_specs = [
        ct.TensorType(name="pre_encoder_embs", shape=pre_encoder_embs.shape, dtype=np.float32),
        ct.TensorType(name="pre_encoder_lengths", shape=pre_encoder_lengths.shape, dtype=np.int32),
        ct.TensorType(name="chunk_pre_encoder_embs", shape=chunk_pre_encoder_embs.shape, dtype=np.float32),
        ct.TensorType(name="chunk_pre_encoder_lengths", shape=chunk_pre_encoder_lengths.shape, dtype=np.int32),
    ]
    output_specs = [
        ct.TensorType(name="speaker_preds", dtype=np.float32),
        ct.TensorType(name="chunk_pre_encoder_embs"),
        ct.TensorType(name="chunk_pre_encoder_lengths"),
    ]

    converted = ct.convert(
        traced,
        inputs=input_specs,
        outputs=output_specs,
        minimum_deployment_target=ct.target.iOS16,
        compute_precision=precision,
        compute_units=ct.ComputeUnit.ALL,
    )

    converted.save(name)
    return converted, traced
| 95 |
+
|
| 96 |
+
def export_pipeline(
    model_name: str,
    output_dir: str,
    preproc_precision: str = "fp32",
    pre_encoder_precision: str = "fp32",
    head_precision: str = "fp16",
    skip_modules: bool = False,
    verify: bool = False
):
    """
    Export the Sortformer model as a pipeline of separate CoreML models.
    Each component can have its own compute precision.

    Exported components:
        1. Preprocessor (audio -> mel features)
        2. Pre-encoder (features -> pre-encoded embeddings, concatenated
           with spkcache/fifo state)
        3. Head (Conformer encoder + Transformer -> speaker predictions)
        4. Combined pipeline (Pre-encoder + Head)

    :param model_name: NeMo model name or path to a local .nemo file
    :param output_dir: Output directory for the .mlpackage files
    :param preproc_precision: Precision for the preprocessor ("fp16" or "fp32")
    :param pre_encoder_precision: Precision for the pre-encoder ("fp16" or "fp32")
    :param head_precision: Precision for the head module (conformer + transformer)
        ("fp16" or "fp32")
    :param skip_modules: Skip re-exporting the individual modules (reuse the
        previously saved .mlpackage files when building the combined pipeline)
    :param verify: Only load and print the existing combined pipeline spec
    """
    os.makedirs(output_dir, exist_ok=True)

    def get_precision(s):
        # Map a "fp16"/"fp32" string to the coremltools precision enum.
        return ct.precision.FLOAT16 if s.lower() == "fp16" else ct.precision.FLOAT32

    print("=" * 70)
    print("Exporting Sortformer Pipeline")
    print("=" * 70)
    print(f"Preprocessor: {preproc_precision}")
    print(f"Pre-encoder: {pre_encoder_precision}")
    print(f"Head: {head_precision}")
    print("=" * 70)

    # Load model (local .nemo path takes precedence over the HF/NGC name).
    print(f"\nLoading model: {model_name}")
    if os.path.exists(model_name):
        model = SortformerEncLabelModel.restore_from(model_name, map_location=torch.device("cpu"))
    else:
        model = SortformerEncLabelModel.from_pretrained(model_name, map_location=torch.device("cpu"))
    model.eval()

    # Configure for streaming: fixed chunk/context/state sizes baked into the
    # exported (static-shape) CoreML graphs.
    print("Configuring for streaming...")
    model.sortformer_modules.chunk_len = 6
    model.sortformer_modules.chunk_right_context = 1
    model.sortformer_modules.chunk_left_context = 1
    model.sortformer_modules.fifo_len = 40
    model.sortformer_modules.spkcache_len = 120
    model.sortformer_modules.spkcache_update_period = 32

    modules = model.sortformer_modules
    preprocessor = model.preprocessor
    pre_encoder_mlmodel = None
    head_mlmodel = None

    # Disable padding so the preprocessor output length is deterministic.
    if hasattr(preprocessor, 'pad_to'):
        preprocessor.pad_to = 0

    # Calculate dimensions. The input chunk covers core + left/right context
    # frames, expressed in mel frames (pre-subsampling).
    chunk_len = modules.chunk_len
    input_chunk_time = (
        chunk_len + modules.chunk_left_context + modules.chunk_right_context) * modules.subsampling_factor
    fc_d_model = modules.fc_d_model  # 512 - Conformer output
    tf_d_model = modules.tf_d_model  # 192 - Transformer input (after projection)
    spkcache_len = modules.spkcache_len
    fifo_len = modules.fifo_len

    # Get feature dim (fall back to 128 mel bins if the encoder does not expose it).
    feat_dim = 128
    if hasattr(model, 'encoder') and hasattr(model.encoder, '_feat_in'):
        feat_dim = model.encoder._feat_in

    # Pre-encode output size (after subsampling) and the concatenated length
    # of spkcache + fifo + chunk seen by the head.
    pre_encode_out_len = input_chunk_time // modules.subsampling_factor
    total_concat_len = spkcache_len + fifo_len + pre_encode_out_len

    print(f"Input chunk frames: {input_chunk_time}")
    print(f"Pre-encode output: {pre_encode_out_len}")
    print(f"Total concat len: {total_concat_len}")
    print(f"Feature dim: {feat_dim}, FC d_model: {fc_d_model}, TF d_model: {tf_d_model}")

    # Audio samples for preprocessor (STFT frame geometry: 10ms hop, 25ms window).
    stride = 160
    window = 400
    audio_samples = (input_chunk_time - 1) * stride + window
    print(audio_samples)

    # =========================================================
    # 1. Export Preprocessor
    # =========================================================

    if not skip_modules:
        print("\n[1/4] Exporting Preprocessor...")

        preproc_wrapper = PreprocessorWrapper(preprocessor)
        preproc_wrapper.eval()

        dummy_wav = torch.randn(1, audio_samples)
        dummy_len = torch.tensor([audio_samples], dtype=torch.long)

        traced_preproc = torch.jit.trace(preproc_wrapper, (dummy_wav, dummy_len))

        preproc_mlmodel = ct.convert(
            traced_preproc,
            inputs=[
                ct.TensorType(name="audio_signal", shape=dummy_wav.shape),
                ct.TensorType(name="length", shape=dummy_len.shape, dtype=np.int32)
            ],
            outputs=[
                ct.TensorType(name="features", dtype=np.float32),
                ct.TensorType(name="feature_lengths", dtype=np.int32)
            ],
            minimum_deployment_target=ct.target.iOS16,
            compute_precision=get_precision(preproc_precision),
            compute_units=ct.ComputeUnit.ALL
        )
        preproc_mlmodel.save(os.path.join(output_dir, "Pipeline_Preprocessor.mlpackage"))
        print(" Saved Pipeline_Preprocessor.mlpackage")

    # =========================================================
    # 2. Export Pre-Encoder
    # =========================================================

    # Example tensors are built unconditionally; the head export below reuses
    # the same dimension bookkeeping.
    input_chunk = torch.randn(1, input_chunk_time, feat_dim)
    input_chunk_len = torch.tensor([input_chunk_time], dtype=torch.long)
    input_spkcache = torch.randn(1, spkcache_len, fc_d_model)
    input_spkcache_len = torch.tensor([spkcache_len], dtype=torch.long)
    input_fifo = torch.randn(1, fifo_len, fc_d_model)
    input_fifo_len = torch.tensor([fifo_len], dtype=torch.long)

    if not skip_modules:
        print("\n[2/4] Exporting Pre-Encoder...")
        pre_encoder_mlmodel, _ = convert_pre_encoder(
            model,
            get_precision(pre_encoder_precision),
            os.path.join(output_dir, "Pipeline_PreEncoder.mlpackage"),
            input_chunk, input_chunk_len,
            input_spkcache, input_spkcache_len,
            input_fifo, input_fifo_len
        )
        print(" Saved Pipeline_PreEncoder.mlpackage")

    # =========================================================
    # 3. Export Head (Conformer Encoder + Transformer)
    # =========================================================

    pre_encoder_embs = torch.randn(1, total_concat_len, fc_d_model)
    pre_encoder_lengths = torch.tensor([total_concat_len], dtype=torch.long)
    chunk_pre_encoder_embs = torch.randn(1, pre_encode_out_len, fc_d_model)
    chunk_pre_encoder_lengths = torch.tensor([pre_encode_out_len], dtype=torch.long)

    if not skip_modules:
        print("\n[3/4] Exporting Head Module...")
        head_mlmodel, _ = convert_head(
            model,
            get_precision(head_precision),
            os.path.join(output_dir, "Pipeline_Head.mlpackage"),
            pre_encoder_embs, pre_encoder_lengths,
            chunk_pre_encoder_embs, chunk_pre_encoder_lengths
        )
        print(" Saved Pipeline_Head.mlpackage")

    # =========================================================
    # 4. Create Combined Pipelines
    # =========================================================
    print("\n[4/4] Creating Combined ML Pipelines...")

    # Load the exported models when the per-module export was skipped.
    if skip_modules and not verify:
        print('Loading Pipeline CoreML Modules...')
        pre_encoder_mlmodel = ct.models.MLModel(
            os.path.join(output_dir, "Pipeline_PreEncoder.mlpackage")
        )
        head_mlmodel = ct.models.MLModel(
            os.path.join(output_dir, "Pipeline_Head.mlpackage")
        )

    assert pre_encoder_mlmodel is not None and head_mlmodel is not None

    # Create Full Pipeline: PreEncoder -> Conformer -> Transformer
    # Inputs: chunk, chunk_lengths, spkcache, spkcache_lengths, fifo, fifo_lengths
    # Output: preds

    if verify:
        pipeline_model = ct.models.MLModel('coreml_models/SortformerPipeline.mlpackage')
        spec = pipeline_model.get_spec()
        print(pipeline_model.input_description)
        print(pipeline_model.output_description)
        print(spec)
    else:
        try:
            # Both models now use compute_units=ALL.
            # The pre_encoder uses ANE-safe gather operations in fixed_concat_and_pad
            # to avoid zero-length slices that would crash on ANE.

            pipeline_model = ct.utils.make_pipeline(
                pre_encoder_mlmodel,
                head_mlmodel,
                compute_units=ct.ComputeUnit.ALL
            )

            # Save the pipeline
            pipeline_model.save(os.path.join(output_dir, "SortformerPipeline.mlpackage"))
            print(" Saved SortformerPipeline.mlpackage (PreEncoder + Conformer + Transformer)")
        except Exception as e:
            # Best-effort: the individual modules are already saved, so a
            # pipeline failure is reported but does not abort the export.
            print(f" Warning: Could not create full pipeline: {e}")
            import traceback
            traceback.print_exc()

    # =========================================================
    # Summary
    # =========================================================
    print("\n" + "=" * 70)
    print("Pipeline Export Complete!")
    print("=" * 70)
    print(f"Output directory: {output_dir}")
    print("\nExported models:")
    print(f" 1. Pipeline_Preprocessor.mlpackage ({preproc_precision})")
    print(f" 2. Pipeline_PreEncoder.mlpackage ({pre_encoder_precision})")
    print(f" 3. Pipeline_Head.mlpackage ({head_precision})")
    # Fixed: this entry was previously mislabeled " 5." after " 3.".
    print(f" 4. SortformerPipeline.mlpackage (combined: PreEncoder+Head)")
    print("\nUsage in inference:")
    print(" audio -> Preprocessor -> features")
    print(" features + spkcache + fifo -> SortformerPipeline -> predictions")
+
|
| 329 |
+
if __name__ == "__main__":
    # CLI entry point: parse export options and run the pipeline export.
    parser = argparse.ArgumentParser()
    parser.add_argument("--model_name", default="nvidia/diar_streaming_sortformer_4spk-v2.1",
                        help="NeMo model name or path")
    parser.add_argument("--output_dir", default="coreml_models", help="Output directory")
    # NOTE(review): --fp16 is parsed but never read below -- presumably a
    # leftover from an earlier single-model export path; confirm before removing.
    parser.add_argument("--fp16", action="store_true", help="Use FP16 for single model export")

    # Pipeline options: per-component precision plus export-flow switches.
    parser.add_argument("--preproc_precision", default="fp32", choices=["fp16", "fp32"], help="Preprocessor precision")
    parser.add_argument("--pre_encoder_precision", default="fp32", choices=["fp16", "fp32"],
                        help="Pre-encoder precision")
    parser.add_argument("--head_precision", default="fp16", choices=["fp16", "fp32"],
                        help="Conformer encoder precision")
    parser.add_argument("--skip_modules", action="store_true", help="Skip modules in pipeline export")
    parser.add_argument("--verify", action="store_true", help="Skip pipeline in pipeline export")

    args = parser.parse_args()

    print(f"CoreMLTools Version: {ct.__version__}")

    export_pipeline(
        args.model_name,
        args.output_dir,
        preproc_precision=args.preproc_precision,
        pre_encoder_precision=args.pre_encoder_precision,
        head_precision=args.head_precision,
        skip_modules=args.skip_modules,
        verify=args.verify,
    )
|
export_nvidia_pipeline.py
ADDED
|
@@ -0,0 +1,308 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Export combined SortformerPipeline with NVIDIA's 1.04s latency configuration.
|
| 2 |
+
|
| 3 |
+
This creates models compatible with the Swift SortformerDiarizer interface.
|
| 4 |
+
"""
|
| 5 |
+
import os
|
| 6 |
+
os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE"
|
| 7 |
+
|
| 8 |
+
import torch
|
| 9 |
+
import torch.nn as nn
|
| 10 |
+
import numpy as np
|
| 11 |
+
import coremltools as ct
|
| 12 |
+
from nemo.collections.asr.models import SortformerEncLabelModel
|
| 13 |
+
from coreml_wrappers import PreEncoderWrapper
|
| 14 |
+
|
| 15 |
+
# NVIDIA's 1.04s latency configuration
|
| 16 |
+
NVIDIA_CONFIG = {
|
| 17 |
+
'chunk_len': 6,
|
| 18 |
+
'chunk_right_context': 7, # Was 1
|
| 19 |
+
'chunk_left_context': 1,
|
| 20 |
+
'fifo_len': 188, # Was 40
|
| 21 |
+
'spkcache_len': 188, # Was 120
|
| 22 |
+
'spkcache_update_period': 144, # Was 30
|
| 23 |
+
}
|
| 24 |
+
|
| 25 |
+
print("=" * 70)
|
| 26 |
+
print("Exporting Combined SortformerPipeline with NVIDIA Config")
|
| 27 |
+
print("=" * 70)
|
| 28 |
+
print(f"Config: {NVIDIA_CONFIG}")
|
| 29 |
+
|
| 30 |
+
# Load model
|
| 31 |
+
print("\nLoading NeMo model...")
|
| 32 |
+
model = SortformerEncLabelModel.from_pretrained(
|
| 33 |
+
"nvidia/diar_streaming_sortformer_4spk-v2.1", map_location="cpu"
|
| 34 |
+
)
|
| 35 |
+
model.eval()
|
| 36 |
+
|
| 37 |
+
# Apply NVIDIA config
|
| 38 |
+
modules = model.sortformer_modules
|
| 39 |
+
modules.chunk_len = NVIDIA_CONFIG['chunk_len']
|
| 40 |
+
modules.chunk_right_context = NVIDIA_CONFIG['chunk_right_context']
|
| 41 |
+
modules.chunk_left_context = NVIDIA_CONFIG['chunk_left_context']
|
| 42 |
+
modules.fifo_len = NVIDIA_CONFIG['fifo_len']
|
| 43 |
+
modules.spkcache_len = NVIDIA_CONFIG['spkcache_len']
|
| 44 |
+
modules.spkcache_update_period = NVIDIA_CONFIG['spkcache_update_period']
|
| 45 |
+
|
| 46 |
+
# Calculate dimensions
|
| 47 |
+
chunk_len = modules.chunk_len
|
| 48 |
+
input_chunk_time = (chunk_len + modules.chunk_left_context + modules.chunk_right_context) * modules.subsampling_factor
|
| 49 |
+
fc_d_model = modules.fc_d_model # 512
|
| 50 |
+
spkcache_len = modules.spkcache_len
|
| 51 |
+
fifo_len = modules.fifo_len
|
| 52 |
+
|
| 53 |
+
feat_dim = 128
|
| 54 |
+
pre_encode_out_len = input_chunk_time // modules.subsampling_factor
|
| 55 |
+
total_concat_len = spkcache_len + fifo_len + pre_encode_out_len
|
| 56 |
+
|
| 57 |
+
print(f"\nDimensions:")
|
| 58 |
+
print(f" Input chunk frames: {input_chunk_time}")
|
| 59 |
+
print(f" Pre-encode output: {pre_encode_out_len}")
|
| 60 |
+
print(f" Total concat len: {total_concat_len}")
|
| 61 |
+
print(f" FC d_model: {fc_d_model}")
|
| 62 |
+
print(f" FIFO len: {fifo_len}")
|
| 63 |
+
print(f" Spkcache len: {spkcache_len}")
|
| 64 |
+
|
| 65 |
+
# Calculate audio samples needed for preprocessor
|
| 66 |
+
# NeMo adds internal padding (16 samples each side), so the formula is different
|
| 67 |
+
# Empirically tested: 17920 samples → 112 mel frames, 18160 → 113 frames
|
| 68 |
+
# For 112 frames, we need 17920 samples (not the naive 18160 from stride formula)
|
| 69 |
+
mel_stride = 160
|
| 70 |
+
mel_window = 400
|
| 71 |
+
# Correct formula accounting for NeMo padding
|
| 72 |
+
preprocessor_audio_samples = 17920 # Empirically determined for 112 frames
|
| 73 |
+
print(f" Preprocessor audio samples: {preprocessor_audio_samples}")
|
| 74 |
+
|
| 75 |
+
# Create output directory
|
| 76 |
+
output_dir = "coreml_models_nvidia"
|
| 77 |
+
os.makedirs(output_dir, exist_ok=True)
|
| 78 |
+
|
| 79 |
+
# =========================================================
# 0. Export Preprocessor (audio → mel features)
# =========================================================
print("\n[0/3] Exporting Preprocessor...")

from coreml_wrappers import PreprocessorWrapper

preprocessor_wrapper = PreprocessorWrapper(model.preprocessor)
preprocessor_wrapper.eval()

# Trace with correct audio sample count (17920 samples → 112 mel frames, see above).
audio_input = torch.randn(1, preprocessor_audio_samples)
audio_length = torch.tensor([preprocessor_audio_samples], dtype=torch.long)

traced_preprocessor = torch.jit.trace(preprocessor_wrapper, (audio_input, audio_length))

preprocessor_ml = ct.convert(
    traced_preprocessor,
    inputs=[
        ct.TensorType(name="audio_signal", shape=audio_input.shape, dtype=np.float32),
        ct.TensorType(name="length", shape=audio_length.shape, dtype=np.int32),
    ],
    outputs=[
        ct.TensorType(name="features", dtype=np.float32),
        ct.TensorType(name="feature_lengths", dtype=np.int32),
    ],
    minimum_deployment_target=ct.target.iOS16,
    compute_precision=ct.precision.FLOAT32,
    compute_units=ct.ComputeUnit.CPU_ONLY  # CPU for FP32 precision
)

preprocessor_ml.save(os.path.join(output_dir, "Pipeline_Preprocessor.mlpackage"))
print(f" Saved {output_dir}/Pipeline_Preprocessor.mlpackage")

# =========================================================
# 1. Export PreEncoder
# =========================================================
print("\n[1/3] Exporting PreEncoder...")

# Example tensors at the fixed streaming shapes used for tracing.
input_chunk = torch.randn(1, input_chunk_time, feat_dim)
input_chunk_len = torch.tensor([input_chunk_time], dtype=torch.long)
input_spkcache = torch.randn(1, spkcache_len, fc_d_model)
input_spkcache_len = torch.tensor([spkcache_len], dtype=torch.long)
input_fifo = torch.randn(1, fifo_len, fc_d_model)
input_fifo_len = torch.tensor([fifo_len], dtype=torch.long)

pre_encoder = PreEncoderWrapper(model)
pre_encoder.eval()

traced_pre_encoder = torch.jit.trace(pre_encoder, (
    input_chunk, input_chunk_len,
    input_spkcache, input_spkcache_len,
    input_fifo, input_fifo_len
))

# Use names that match for pipeline connection
# (output names here must equal the Head's input names so make_pipeline can wire them).
pre_encoder_ml = ct.convert(
    traced_pre_encoder,
    inputs=[
        ct.TensorType(name="chunk", shape=input_chunk.shape, dtype=np.float32),
        ct.TensorType(name="chunk_lengths", shape=input_chunk_len.shape, dtype=np.int32),
        ct.TensorType(name="spkcache", shape=input_spkcache.shape, dtype=np.float32),
        ct.TensorType(name="spkcache_lengths", shape=input_spkcache_len.shape, dtype=np.int32),
        ct.TensorType(name="fifo", shape=input_fifo.shape, dtype=np.float32),
        ct.TensorType(name="fifo_lengths", shape=input_fifo_len.shape, dtype=np.int32),
    ],
    outputs=[
        ct.TensorType(name="pre_encoder_embs", dtype=np.float32),
        ct.TensorType(name="pre_encoder_lengths", dtype=np.int32),
        ct.TensorType(name="chunk_embs_in", dtype=np.float32),
        ct.TensorType(name="chunk_lens_in", dtype=np.int32),
    ],
    minimum_deployment_target=ct.target.iOS16,
    compute_precision=ct.precision.FLOAT32,
    compute_units=ct.ComputeUnit.ALL
)

pre_encoder_ml.save(os.path.join(output_dir, "Pipeline_PreEncoder.mlpackage"))
print(f" Saved {output_dir}/Pipeline_PreEncoder.mlpackage")
|
| 158 |
+
|
| 159 |
+
# =========================================================
|
| 160 |
+
# 2. Export Fixed Head (with identity ops to preserve embeddings)
|
| 161 |
+
# =========================================================
|
| 162 |
+
print("\n[2/3] Exporting Fixed Head...")
|
| 163 |
+
|
| 164 |
+
|
| 165 |
+
class FixedSortformerHead(nn.Module):
    """Head wrapper that forces chunk_pre_encoder_embs to be computed."""

    def __init__(self, model):
        super().__init__()
        self.model = model
        # A frozen scale of 1.0: multiplying by it is numerically a no-op, but it keeps
        # the pass-through outputs alive in the traced/converted graph.
        self.identity_scale = nn.Parameter(torch.ones(1), requires_grad=False)

    def forward(self, pre_encoder_embs, pre_encoder_lengths, chunk_embs_in, chunk_lens_in):
        # Identity pass-through of the chunk embeddings; the dummy multiply/add prevents
        # the converter from pruning these outputs as unused.
        chunk_pre_encoder_embs = chunk_embs_in * self.identity_scale
        chunk_pre_encoder_lengths = chunk_lens_in + 0

        # Run the frontend encoder directly on pre-encoded embeddings (subsampling bypassed).
        enc_embs, enc_lens = self.model.frontend_encoder(
            processed_signal=pre_encoder_embs,
            processed_signal_length=pre_encoder_lengths,
            bypass_pre_encode=True,
        )

        # Per-frame speaker activity predictions.
        speaker_preds = self.model.forward_infer(enc_embs, enc_lens)

        return speaker_preds, chunk_pre_encoder_embs, chunk_pre_encoder_lengths
|
| 191 |
+
|
| 192 |
+
|
| 193 |
+
head = FixedSortformerHead(model)
head.eval()

# Input shapes for head - must match PreEncoder output names
pre_encoder_embs = torch.randn(1, total_concat_len, fc_d_model)
pre_encoder_lengths = torch.tensor([total_concat_len], dtype=torch.long)
chunk_embs_in = torch.randn(1, pre_encode_out_len, fc_d_model)
chunk_lens_in = torch.tensor([pre_encode_out_len], dtype=torch.long)

traced_head = torch.jit.trace(head, (
    pre_encoder_embs, pre_encoder_lengths,
    chunk_embs_in, chunk_lens_in
))

# FP16 here (unlike the FP32 preprocessor/pre-encoder) — the Conformer head tolerates it.
head_ml = ct.convert(
    traced_head,
    inputs=[
        ct.TensorType(name="pre_encoder_embs", shape=pre_encoder_embs.shape, dtype=np.float32),
        ct.TensorType(name="pre_encoder_lengths", shape=pre_encoder_lengths.shape, dtype=np.int32),
        ct.TensorType(name="chunk_embs_in", shape=chunk_embs_in.shape, dtype=np.float32),
        ct.TensorType(name="chunk_lens_in", shape=chunk_lens_in.shape, dtype=np.int32),
    ],
    outputs=[
        ct.TensorType(name="speaker_preds", dtype=np.float32),
        ct.TensorType(name="chunk_pre_encoder_embs", dtype=np.float32),
        ct.TensorType(name="chunk_pre_encoder_lengths", dtype=np.int32),
    ],
    minimum_deployment_target=ct.target.iOS16,
    compute_precision=ct.precision.FLOAT16,
    compute_units=ct.ComputeUnit.ALL
)

head_ml.save(os.path.join(output_dir, "Pipeline_Head_Fixed.mlpackage"))
print(f" Saved {output_dir}/Pipeline_Head_Fixed.mlpackage")

# =========================================================
# 3. Create Combined Pipeline
# =========================================================
print("\n[3/3] Creating combined pipeline...")

try:
    # Stages are wired by matching output/input names (pre_encoder_embs, chunk_embs_in, ...).
    pipeline = ct.utils.make_pipeline(pre_encoder_ml, head_ml, compute_units=ct.ComputeUnit.ALL)
    pipeline.save(os.path.join(output_dir, "SortformerPipeline.mlpackage"))
    print(f" Saved {output_dir}/SortformerPipeline.mlpackage")
except Exception as e:
    # Best-effort: the two standalone models were already saved above.
    print(f" Pipeline creation failed: {e}")
    print(" Note: Call PreEncoder and Head separately to avoid embedding bug")

# =========================================================
# Verification
# =========================================================
print("\n" + "=" * 70)
print("Verification")
print("=" * 70)

# Test PreEncoder with random mel input and empty (zero-length) caches.
test_chunk = np.random.randn(1, input_chunk_time, feat_dim).astype(np.float32)
test_chunk_len = np.array([input_chunk_time], dtype=np.int32)
test_spkcache = np.zeros((1, spkcache_len, fc_d_model), dtype=np.float32)
test_spkcache_len = np.array([0], dtype=np.int32)
test_fifo = np.zeros((1, fifo_len, fc_d_model), dtype=np.float32)
test_fifo_len = np.array([0], dtype=np.int32)

pre_out = pre_encoder_ml.predict({
    'chunk': test_chunk,
    'chunk_lengths': test_chunk_len,
    'spkcache': test_spkcache,
    'spkcache_lengths': test_spkcache_len,
    'fifo': test_fifo,
    'fifo_lengths': test_fifo_len
})

print(f"PreEncoder output shapes:")
print(f" pre_encoder_embs: {pre_out['pre_encoder_embs'].shape}")
print(f" chunk_embs_in: {pre_out['chunk_embs_in'].shape}")
print(f" chunk_embs_in[0,0,0]: {pre_out['chunk_embs_in'][0,0,0]:.6f}")

# Test Head by feeding the PreEncoder outputs straight through.
head_out = head_ml.predict({
    'pre_encoder_embs': pre_out['pre_encoder_embs'],
    'pre_encoder_lengths': pre_out['pre_encoder_lengths'],
    'chunk_embs_in': pre_out['chunk_embs_in'],
    'chunk_lens_in': pre_out['chunk_lens_in']
})

print(f"\nHead output shapes:")
print(f" speaker_preds: {head_out['speaker_preds'].shape}")
print(f" chunk_pre_encoder_embs: {head_out['chunk_pre_encoder_embs'].shape}")
print(f" chunk_pre_encoder_embs[0,0,0]: {head_out['chunk_pre_encoder_embs'][0,0,0]:.6f}")

# Verify embedding preservation
# (loose atol because the head runs in FP16; spot-checks a single element only).
if np.isclose(pre_out['chunk_embs_in'][0,0,0], head_out['chunk_pre_encoder_embs'][0,0,0], atol=0.01):
    print("\n✓ Embedding [0,0,0] preserved correctly!")
else:
    print(f"\n✗ WARNING: Embedding [0,0,0] corrupted!")
    print(f" PreEncoder: {pre_out['chunk_embs_in'][0,0,0]:.6f}")
    print(f" Head: {head_out['chunk_pre_encoder_embs'][0,0,0]:.6f}")

print("\n" + "=" * 70)
print("Export Complete!")
print("=" * 70)
print(f"Models saved to: {output_dir}/")
print(f" - Pipeline_PreEncoder.mlpackage")
print(f" - Pipeline_Head_Fixed.mlpackage")
print(f" - SortformerPipeline.mlpackage (if pipeline creation succeeded)")
print(f"\nConfiguration (NVIDIA 1.04s latency):")
for k, v in NVIDIA_CONFIG.items():
    print(f" {k}: {v}")

print(f"\nSwift SortformerConfig should use:")
print(f" chunkLen = 6")
print(f" chunkLeftContext = 1")
print(f" chunkRightContext = 7")
print(f" fifoLen = 188")
print(f" spkcacheLen = 188")
print(f" spkcacheUpdatePeriod = 144")
|
inference.py
ADDED
|
@@ -0,0 +1,192 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
"""
Streaming Sortformer CoreML Inference

This script demonstrates how to use the CoreML-converted NVIDIA Streaming Sortformer
model for real-time speaker diarization on Apple Silicon.

Original model: nvidia/diar_streaming_sortformer_4spk-v2.1
"""

import os
import numpy as np
import coremltools as ct

# Configuration matching NVIDIA's streaming settings
# (must agree with the values baked into the exported .mlpackage shapes).
CONFIG = {
    "chunk_len": 6,  # Core chunk length in encoder frames
    "chunk_left_context": 1,  # Left context frames
    "chunk_right_context": 7,  # Right context frames
    "fifo_len": 188,  # FIFO buffer length
    "spkcache_len": 188,  # Speaker cache length
    "spkcache_update_period": 144,
    "subsampling_factor": 8,  # Mel frames per encoder frame
    "n_speakers": 4,  # Max speakers
    "sample_rate": 16000,
    "mel_features": 128,
}
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
class SortformerCoreML:
    """CoreML Streaming Sortformer Diarizer"""

    def __init__(self, model_dir: str = ".", compute_units: str = "CPU_ONLY"):
        """
        Initialize the CoreML Sortformer pipeline.

        Args:
            model_dir: Directory containing the .mlpackage files
            compute_units: "CPU_ONLY", "CPU_AND_GPU", or "ALL"
        """
        # Silently falls back to CPU_ONLY for unrecognized compute-unit names.
        cu = getattr(ct.ComputeUnit, compute_units, ct.ComputeUnit.CPU_ONLY)

        # Load models
        self.preprocessor = ct.models.MLModel(
            os.path.join(model_dir, "Pipeline_Preprocessor.mlpackage"),
            compute_units=cu
        )
        self.pre_encoder = ct.models.MLModel(
            os.path.join(model_dir, "Pipeline_PreEncoder.mlpackage"),
            compute_units=cu
        )
        self.head = ct.models.MLModel(
            os.path.join(model_dir, "Pipeline_Head_Fixed.mlpackage"),
            compute_units=cu
        )

        # Initialize state buffers
        self.reset_state()

    def reset_state(self) -> None:
        """Reset streaming state for new audio session."""
        # 512 is presumably the pre-encoder embedding dim (fc_d_model) — TODO confirm
        # against the export configuration.
        self.spkcache = np.zeros((1, CONFIG["spkcache_len"], 512), dtype=np.float32)
        self.fifo = np.zeros((1, CONFIG["fifo_len"], 512), dtype=np.float32)
        self.spkcache_len = 0  # valid frames currently in spkcache
        self.fifo_len = 0  # valid frames currently in fifo
        self.chunk_idx = 0  # index of the next chunk to process

    def process_chunk(self, mel_features: np.ndarray, chunk_length: int) -> np.ndarray:
        """
        Process a single chunk of mel features.

        Args:
            mel_features: Mel spectrogram chunk [1, T, 128] where T <= 112
            chunk_length: Actual valid length (before padding)

        Returns:
            Speaker predictions [num_frames, 4] with probabilities for each speaker
        """
        # Pad to 112 if needed (the exported models have a fixed input shape).
        if mel_features.shape[1] < 112:
            pad_len = 112 - mel_features.shape[1]
            mel_features = np.pad(mel_features, ((0, 0), (0, pad_len), (0, 0)))

        # Run PreEncoder: subsamples the mel chunk and prepends spkcache + fifo state.
        pre_out = self.pre_encoder.predict({
            "chunk": mel_features.astype(np.float32),
            "chunk_lengths": np.array([chunk_length], dtype=np.int32),
            "spkcache": self.spkcache,
            "spkcache_lengths": np.array([self.spkcache_len], dtype=np.int32),
            "fifo": self.fifo,
            "fifo_lengths": np.array([self.fifo_len], dtype=np.int32)
        })

        # Run Head: Conformer encoder + speaker prediction over the concatenated sequence.
        head_out = self.head.predict({
            "pre_encoder_embs": pre_out["pre_encoder_embs"],
            "pre_encoder_lengths": pre_out["pre_encoder_lengths"],
            "chunk_embs_in": pre_out["chunk_embs_in"],
            "chunk_lens_in": pre_out["chunk_lens_in"]
        })

        # Extract predictions for this chunk: drop the left/right context frames and
        # skip past the spkcache+fifo prefix in the output sequence.
        emb_len = int(head_out["chunk_pre_encoder_lengths"][0])
        lc = 0 if self.chunk_idx == 0 else 1  # Left context
        rc = CONFIG["chunk_right_context"]
        chunk_pred_len = emb_len - lc - rc

        pred_offset = self.spkcache_len + self.fifo_len + lc
        predictions = head_out["speaker_preds"][0, pred_offset:pred_offset + chunk_pred_len, :]

        # Update state (simplified - full implementation needs NeMo's streaming_update logic)
        self._update_state(pre_out, emb_len)

        self.chunk_idx += 1
        return predictions

    def _update_state(self, pre_out, emb_len) -> None:
        """Update spkcache and fifo state buffers."""
        # Get new chunk embeddings (first emb_len frames of the pre-encoded chunk).
        new_embs = pre_out["chunk_embs_in"][0, :emb_len, :]

        # Add to fifo when there is room.
        if self.fifo_len + emb_len <= CONFIG["fifo_len"]:
            self.fifo[0, self.fifo_len:self.fifo_len + emb_len, :] = new_embs
            self.fifo_len += emb_len
        else:
            # FIFO overflow - move to spkcache
            overflow = self.fifo_len + emb_len - CONFIG["fifo_len"]

            # Move overflow from fifo to spkcache
            # NOTE(review): if spkcache is already full, the overflow frames are silently
            # dropped here — differs from NeMo's streaming_update compression logic.
            if self.spkcache_len + overflow <= CONFIG["spkcache_len"]:
                self.spkcache[0, self.spkcache_len:self.spkcache_len + overflow, :] = \
                    self.fifo[0, :overflow, :]
                self.spkcache_len += overflow

            # Shift fifo and add new
            self.fifo[0, :self.fifo_len - overflow, :] = self.fifo[0, overflow:self.fifo_len, :]
            self.fifo_len -= overflow
            self.fifo[0, self.fifo_len:self.fifo_len + emb_len, :] = new_embs
            self.fifo_len += emb_len
|
| 141 |
+
|
| 142 |
+
|
| 143 |
+
def process_audio(audio_path: str, model_dir: str = ".") -> list:
    """
    Process an audio file and return diarization results.

    Args:
        audio_path: Path to audio file (16kHz mono WAV)
        model_dir: Directory containing CoreML models

    Returns:
        List of (start_time, end_time, speaker_id) tuples
    """
    import torch
    import torchaudio

    # Load the file, then normalize to 16 kHz mono.
    waveform, sr = torchaudio.load(audio_path)
    if sr != 16000:
        waveform = torchaudio.functional.resample(waveform, sr, 16000)
    if waveform.size(0) > 1:
        waveform = waveform.mean(dim=0, keepdim=True)

    # Initialize the CoreML diarizer (loads all three .mlpackage models).
    model = SortformerCoreML(model_dir)

    # Compute mel spectrogram using NeMo-compatible settings
    # (You may need to use the Pipeline_Preprocessor or native mel computation)

    # Process in chunks and collect predictions
    # ... (implementation depends on your mel spectrogram computation)

    # NOTE(review): sr is the source file's rate here, even after resampling.
    print(f"Loaded audio: {waveform.shape}, {sr}Hz")
    print("Processing... (implement chunking logic)")

    return []
|
| 177 |
+
|
| 178 |
+
|
| 179 |
+
if __name__ == "__main__":
    import sys

    # Guard clause: require the audio file path as the first positional argument.
    if len(sys.argv) < 2:
        print("Usage: python inference.py <audio_file.wav>")
        print("\nThis script requires:")
        print(" - Pipeline_Preprocessor.mlpackage")
        print(" - Pipeline_PreEncoder.mlpackage")
        print(" - Pipeline_Head_Fixed.mlpackage")
        sys.exit(1)

    # Run diarization and print one line per detected speaker segment.
    segments = process_audio(sys.argv[1])
    for start, end, speaker in segments:
        print(f"[{start:.2f}s - {end:.2f}s] Speaker {speaker}")
|
pyproject.toml
ADDED
|
@@ -0,0 +1,22 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[project]
|
| 2 |
+
name = "streaming-sortformer-coreml"
|
| 3 |
+
version = "0.1.0"
|
| 4 |
+
description = "CoreML conversion of NVIDIA Streaming Sortformer for Apple Silicon"
|
| 5 |
+
readme = "README.md"
|
| 6 |
+
requires-python = ">=3.10"
|
| 7 |
+
license = "Apache-2.0"
|
| 8 |
+
dependencies = [
|
| 9 |
+
"coremltools>=7.0",
|
| 10 |
+
"torch>=2.0",
|
| 11 |
+
"torchaudio>=2.0",
|
| 12 |
+
"numpy>=1.24",
|
| 13 |
+
]
|
| 14 |
+
|
| 15 |
+
[project.optional-dependencies]
|
| 16 |
+
convert = [
|
| 17 |
+
"nemo_toolkit[asr]>=2.0",
|
| 18 |
+
]
|
| 19 |
+
|
| 20 |
+
[build-system]
|
| 21 |
+
requires = ["hatchling"]
|
| 22 |
+
build-backend = "hatchling.build"
|