Upload 26 files
Browse files- Pipeline_Head_Fixed.mlmodelc/analytics/coremldata.bin +3 -0
- Pipeline_Head_Fixed.mlmodelc/coremldata.bin +3 -0
- Pipeline_Head_Fixed.mlmodelc/model.mil +0 -0
- Pipeline_Head_Fixed.mlmodelc/weights/weight.bin +3 -0
- Pipeline_Head_Fixed.mlpackage/Data/com.apple.CoreML/model.mlmodel +3 -0
- Pipeline_Head_Fixed.mlpackage/Data/com.apple.CoreML/weights/weight.bin +3 -0
- Pipeline_Head_Fixed.mlpackage/Manifest.json +18 -0
- Pipeline_PreEncoder.mlmodelc/analytics/coremldata.bin +3 -0
- Pipeline_PreEncoder.mlmodelc/coremldata.bin +3 -0
- Pipeline_PreEncoder.mlmodelc/model.mil +201 -0
- Pipeline_PreEncoder.mlmodelc/weights/weight.bin +3 -0
- Pipeline_PreEncoder.mlpackage/Data/com.apple.CoreML/model.mlmodel +3 -0
- Pipeline_PreEncoder.mlpackage/Data/com.apple.CoreML/weights/weight.bin +3 -0
- Pipeline_PreEncoder.mlpackage/Manifest.json +18 -0
- Pipeline_Preprocessor.mlmodelc/analytics/coremldata.bin +3 -0
- Pipeline_Preprocessor.mlmodelc/coremldata.bin +3 -0
- Pipeline_Preprocessor.mlmodelc/model.mil +0 -0
- Pipeline_Preprocessor.mlmodelc/weights/weight.bin +3 -0
- Pipeline_Preprocessor.mlpackage/Data/com.apple.CoreML/model.mlmodel +3 -0
- Pipeline_Preprocessor.mlpackage/Data/com.apple.CoreML/weights/weight.bin +3 -0
- Pipeline_Preprocessor.mlpackage/Manifest.json +18 -0
- README.md +137 -0
- convert_to_coreml.py +357 -0
- export_nvidia_pipeline.py +308 -0
- inference.py +192 -0
- pyproject.toml +22 -0
Pipeline_Head_Fixed.mlmodelc/analytics/coremldata.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:5980b0b5b6afd629201028acd9d30ef139405a4ff8e3197551b5749757e19808
|
| 3 |
+
size 243
|
Pipeline_Head_Fixed.mlmodelc/coremldata.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:4d3a809042f1aafc6410902c356ad226e4104b9f92f21a266b85a89d501c8e3c
|
| 3 |
+
size 505
|
Pipeline_Head_Fixed.mlmodelc/model.mil
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
Pipeline_Head_Fixed.mlmodelc/weights/weight.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:be56312ef2dbf57706aab7395fc2d5601ac6fbe6c553fd2b25069eba8da9b3b2
|
| 3 |
+
size 235580992
|
Pipeline_Head_Fixed.mlpackage/Data/com.apple.CoreML/model.mlmodel
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:93251763d5376dad5dc1f78cb0440397abb5a52e346575f1b6b750e958da13eb
|
| 3 |
+
size 827022
|
Pipeline_Head_Fixed.mlpackage/Data/com.apple.CoreML/weights/weight.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:be56312ef2dbf57706aab7395fc2d5601ac6fbe6c553fd2b25069eba8da9b3b2
|
| 3 |
+
size 235580992
|
Pipeline_Head_Fixed.mlpackage/Manifest.json
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"fileFormatVersion": "1.0.0",
|
| 3 |
+
"itemInfoEntries": {
|
| 4 |
+
"7D7B39C7-0AD2-4CD7-B6B0-A7E76DCDE6CA": {
|
| 5 |
+
"author": "com.apple.CoreML",
|
| 6 |
+
"description": "CoreML Model Weights",
|
| 7 |
+
"name": "weights",
|
| 8 |
+
"path": "com.apple.CoreML/weights"
|
| 9 |
+
},
|
| 10 |
+
"B58CA828-CA78-46FC-BD8D-5ABFB5AAEADD": {
|
| 11 |
+
"author": "com.apple.CoreML",
|
| 12 |
+
"description": "CoreML Model Specification",
|
| 13 |
+
"name": "model.mlmodel",
|
| 14 |
+
"path": "com.apple.CoreML/model.mlmodel"
|
| 15 |
+
}
|
| 16 |
+
},
|
| 17 |
+
"rootModelIdentifier": "B58CA828-CA78-46FC-BD8D-5ABFB5AAEADD"
|
| 18 |
+
}
|
Pipeline_PreEncoder.mlmodelc/analytics/coremldata.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:f0946b687ccf4274e0228e3bc539e6733c33bf0f4419e28d02220b19a35d884b
|
| 3 |
+
size 243
|
Pipeline_PreEncoder.mlmodelc/coremldata.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:beb4fe86e80615cf79e64e55cb229cc931998b67b51a661bfbdc66204da0ea7b
|
| 3 |
+
size 553
|
Pipeline_PreEncoder.mlmodelc/model.mil
ADDED
|
@@ -0,0 +1,201 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
program(1.0)
|
| 2 |
+
[buildInfo = dict<tensor<string, []>, tensor<string, []>>({{"coremlc-component-MIL", "3505.3.2"}, {"coremlc-version", "3505.4.1"}, {"coremltools-component-torch", "2.4.0"}, {"coremltools-source-dialect", "TorchScript"}, {"coremltools-version", "8.3.0"}})]
|
| 3 |
+
{
|
| 4 |
+
func main<ios16>(tensor<fp32, [1, 112, 128]> chunk, tensor<int32, [1]> chunk_lengths, tensor<fp32, [1, 188, 512]> fifo, tensor<int32, [1]> fifo_lengths, tensor<fp32, [1, 188, 512]> spkcache, tensor<int32, [1]> spkcache_lengths) {
|
| 5 |
+
tensor<fp32, [256]> model_encoder_pre_encode_conv_0_bias = const()[name = tensor<string, []>("model_encoder_pre_encode_conv_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(64)))];
|
| 6 |
+
tensor<fp32, [256, 1, 3, 3]> model_encoder_pre_encode_conv_0_weight = const()[name = tensor<string, []>("model_encoder_pre_encode_conv_0_weight"), val = tensor<fp32, [256, 1, 3, 3]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(1152)))];
|
| 7 |
+
tensor<fp32, [256]> model_encoder_pre_encode_conv_2_bias = const()[name = tensor<string, []>("model_encoder_pre_encode_conv_2_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(10432)))];
|
| 8 |
+
tensor<fp32, [256, 1, 3, 3]> model_encoder_pre_encode_conv_2_weight = const()[name = tensor<string, []>("model_encoder_pre_encode_conv_2_weight"), val = tensor<fp32, [256, 1, 3, 3]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(11520)))];
|
| 9 |
+
tensor<fp32, [256]> model_encoder_pre_encode_conv_3_bias = const()[name = tensor<string, []>("model_encoder_pre_encode_conv_3_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20800)))];
|
| 10 |
+
tensor<fp32, [256, 256, 1, 1]> model_encoder_pre_encode_conv_3_weight = const()[name = tensor<string, []>("model_encoder_pre_encode_conv_3_weight"), val = tensor<fp32, [256, 256, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21888)))];
|
| 11 |
+
tensor<fp32, [256]> model_encoder_pre_encode_conv_5_bias = const()[name = tensor<string, []>("model_encoder_pre_encode_conv_5_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(284096)))];
|
| 12 |
+
tensor<fp32, [256, 1, 3, 3]> model_encoder_pre_encode_conv_5_weight = const()[name = tensor<string, []>("model_encoder_pre_encode_conv_5_weight"), val = tensor<fp32, [256, 1, 3, 3]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(285184)))];
|
| 13 |
+
tensor<fp32, [256]> model_encoder_pre_encode_conv_6_bias = const()[name = tensor<string, []>("model_encoder_pre_encode_conv_6_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(294464)))];
|
| 14 |
+
tensor<fp32, [256, 256, 1, 1]> model_encoder_pre_encode_conv_6_weight = const()[name = tensor<string, []>("model_encoder_pre_encode_conv_6_weight"), val = tensor<fp32, [256, 256, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(295552)))];
|
| 15 |
+
tensor<fp32, [512]> model_encoder_pre_encode_out_bias = const()[name = tensor<string, []>("model_encoder_pre_encode_out_bias"), val = tensor<fp32, [512]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(557760)))];
|
| 16 |
+
tensor<fp32, [512, 4096]> model_encoder_pre_encode_out_weight = const()[name = tensor<string, []>("model_encoder_pre_encode_out_weight"), val = tensor<fp32, [512, 4096]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(559872)))];
|
| 17 |
+
tensor<int32, [1]> tensor_1_axes_0 = const()[name = tensor<string, []>("tensor_1_axes_0"), val = tensor<int32, [1]>([1])];
|
| 18 |
+
tensor<fp32, [1, 1, 112, 128]> tensor_1 = expand_dims(axes = tensor_1_axes_0, x = chunk)[name = tensor<string, []>("tensor_1")];
|
| 19 |
+
tensor<string, []> cast_0_dtype_0 = const()[name = tensor<string, []>("cast_0_dtype_0"), val = tensor<string, []>("fp32")];
|
| 20 |
+
tensor<int32, [1, 112]> expand_dims_0 = const()[name = tensor<string, []>("expand_dims_0"), val = tensor<int32, [1, 112]>([[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111]])];
|
| 21 |
+
tensor<int32, [1]> var_40_axes_0 = const()[name = tensor<string, []>("op_40_axes_0"), val = tensor<int32, [1]>([1])];
|
| 22 |
+
tensor<int32, [1, 1]> var_40 = expand_dims(axes = var_40_axes_0, x = chunk_lengths)[name = tensor<string, []>("op_40")];
|
| 23 |
+
tensor<bool, [1, 112]> time_mask_1 = less(x = expand_dims_0, y = var_40)[name = tensor<string, []>("time_mask_1")];
|
| 24 |
+
tensor<int32, [1]> var_42_axes_0 = const()[name = tensor<string, []>("op_42_axes_0"), val = tensor<int32, [1]>([-1])];
|
| 25 |
+
tensor<bool, [1, 112, 1]> var_42 = expand_dims(axes = var_42_axes_0, x = time_mask_1)[name = tensor<string, []>("op_42")];
|
| 26 |
+
tensor<int32, [3]> var_44_reps_0 = const()[name = tensor<string, []>("op_44_reps_0"), val = tensor<int32, [3]>([1, 1, 128])];
|
| 27 |
+
tensor<bool, [1, 112, 128]> var_44 = tile(reps = var_44_reps_0, x = var_42)[name = tensor<string, []>("op_44")];
|
| 28 |
+
tensor<string, []> cast_2_dtype_0 = const()[name = tensor<string, []>("cast_2_dtype_0"), val = tensor<string, []>("fp32")];
|
| 29 |
+
tensor<int32, [1]> var_50_axes_0 = const()[name = tensor<string, []>("op_50_axes_0"), val = tensor<int32, [1]>([1])];
|
| 30 |
+
tensor<fp32, [1, 112, 128]> cast_2 = cast(dtype = cast_2_dtype_0, x = var_44)[name = tensor<string, []>("cast_25")];
|
| 31 |
+
tensor<fp32, [1, 1, 112, 128]> var_50 = expand_dims(axes = var_50_axes_0, x = cast_2)[name = tensor<string, []>("op_50")];
|
| 32 |
+
tensor<fp32, [1, 1, 112, 128]> input_1 = mul(x = tensor_1, y = var_50)[name = tensor<string, []>("input_1")];
|
| 33 |
+
tensor<string, []> tensor_3_pad_type_0 = const()[name = tensor<string, []>("tensor_3_pad_type_0"), val = tensor<string, []>("custom")];
|
| 34 |
+
tensor<int32, [4]> tensor_3_pad_0 = const()[name = tensor<string, []>("tensor_3_pad_0"), val = tensor<int32, [4]>([1, 1, 1, 1])];
|
| 35 |
+
tensor<int32, [2]> tensor_3_strides_0 = const()[name = tensor<string, []>("tensor_3_strides_0"), val = tensor<int32, [2]>([2, 2])];
|
| 36 |
+
tensor<int32, [2]> tensor_3_dilations_0 = const()[name = tensor<string, []>("tensor_3_dilations_0"), val = tensor<int32, [2]>([1, 1])];
|
| 37 |
+
tensor<int32, []> tensor_3_groups_0 = const()[name = tensor<string, []>("tensor_3_groups_0"), val = tensor<int32, []>(1)];
|
| 38 |
+
tensor<fp32, [1, 256, 56, 64]> tensor_3 = conv(bias = model_encoder_pre_encode_conv_0_bias, dilations = tensor_3_dilations_0, groups = tensor_3_groups_0, pad = tensor_3_pad_0, pad_type = tensor_3_pad_type_0, strides = tensor_3_strides_0, weight = model_encoder_pre_encode_conv_0_weight, x = input_1)[name = tensor<string, []>("tensor_3")];
|
| 39 |
+
tensor<fp32, []> var_61_promoted = const()[name = tensor<string, []>("op_61_promoted"), val = tensor<fp32, []>(0x1p+0)];
|
| 40 |
+
tensor<fp32, [1]> cast_0 = cast(dtype = cast_0_dtype_0, x = chunk_lengths)[name = tensor<string, []>("cast_26")];
|
| 41 |
+
tensor<fp32, [1]> var_62 = add(x = cast_0, y = var_61_promoted)[name = tensor<string, []>("op_62")];
|
| 42 |
+
tensor<fp32, []> var_63_promoted = const()[name = tensor<string, []>("op_63_promoted"), val = tensor<fp32, []>(0x1p+0)];
|
| 43 |
+
tensor<fp32, [1]> var_64 = add(x = var_62, y = var_63_promoted)[name = tensor<string, []>("op_64")];
|
| 44 |
+
tensor<fp32, []> var_65_promoted = const()[name = tensor<string, []>("op_65_promoted"), val = tensor<fp32, []>(0x1.8p+1)];
|
| 45 |
+
tensor<fp32, [1]> var_66 = sub(x = var_64, y = var_65_promoted)[name = tensor<string, []>("op_66")];
|
| 46 |
+
tensor<fp32, []> var_21_promoted = const()[name = tensor<string, []>("op_21_promoted"), val = tensor<fp32, []>(0x1p+1)];
|
| 47 |
+
tensor<fp32, [1]> floor_div_0 = floor_div(x = var_66, y = var_21_promoted)[name = tensor<string, []>("floor_div_0")];
|
| 48 |
+
tensor<fp32, []> var_68_promoted = const()[name = tensor<string, []>("op_68_promoted"), val = tensor<fp32, []>(0x1p+0)];
|
| 49 |
+
tensor<fp32, [1]> current_lengths_3 = add(x = floor_div_0, y = var_68_promoted)[name = tensor<string, []>("current_lengths_3")];
|
| 50 |
+
tensor<string, []> cast_3_dtype_0 = const()[name = tensor<string, []>("cast_3_dtype_0"), val = tensor<string, []>("int32")];
|
| 51 |
+
tensor<int32, [1, 56]> expand_dims_1 = const()[name = tensor<string, []>("expand_dims_1"), val = tensor<int32, [1, 56]>([[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55]])];
|
| 52 |
+
tensor<int32, [1]> var_77_axes_0 = const()[name = tensor<string, []>("op_77_axes_0"), val = tensor<int32, [1]>([1])];
|
| 53 |
+
tensor<int32, [1]> cast_3 = cast(dtype = cast_3_dtype_0, x = current_lengths_3)[name = tensor<string, []>("cast_24")];
|
| 54 |
+
tensor<int32, [1, 1]> var_77 = expand_dims(axes = var_77_axes_0, x = cast_3)[name = tensor<string, []>("op_77")];
|
| 55 |
+
tensor<bool, [1, 56]> time_mask_3 = less(x = expand_dims_1, y = var_77)[name = tensor<string, []>("time_mask_3")];
|
| 56 |
+
tensor<int32, [1]> var_79_axes_0 = const()[name = tensor<string, []>("op_79_axes_0"), val = tensor<int32, [1]>([-1])];
|
| 57 |
+
tensor<bool, [1, 56, 1]> var_79 = expand_dims(axes = var_79_axes_0, x = time_mask_3)[name = tensor<string, []>("op_79")];
|
| 58 |
+
tensor<int32, [3]> var_81_reps_0 = const()[name = tensor<string, []>("op_81_reps_0"), val = tensor<int32, [3]>([1, 1, 64])];
|
| 59 |
+
tensor<bool, [1, 56, 64]> var_81 = tile(reps = var_81_reps_0, x = var_79)[name = tensor<string, []>("op_81")];
|
| 60 |
+
tensor<string, []> cast_4_dtype_0 = const()[name = tensor<string, []>("cast_4_dtype_0"), val = tensor<string, []>("fp32")];
|
| 61 |
+
tensor<int32, [1]> var_87_axes_0 = const()[name = tensor<string, []>("op_87_axes_0"), val = tensor<int32, [1]>([1])];
|
| 62 |
+
tensor<fp32, [1, 56, 64]> cast_4 = cast(dtype = cast_4_dtype_0, x = var_81)[name = tensor<string, []>("cast_23")];
|
| 63 |
+
tensor<fp32, [1, 1, 56, 64]> var_87 = expand_dims(axes = var_87_axes_0, x = cast_4)[name = tensor<string, []>("op_87")];
|
| 64 |
+
tensor<int32, [4]> expanded_mask_3_reps_0 = const()[name = tensor<string, []>("expanded_mask_3_reps_0"), val = tensor<int32, [4]>([1, 256, 1, 1])];
|
| 65 |
+
tensor<fp32, [1, 256, 56, 64]> expanded_mask_3 = tile(reps = expanded_mask_3_reps_0, x = var_87)[name = tensor<string, []>("expanded_mask_3")];
|
| 66 |
+
tensor<fp32, [1, 256, 56, 64]> input_3 = mul(x = tensor_3, y = expanded_mask_3)[name = tensor<string, []>("input_3")];
|
| 67 |
+
tensor<fp32, [1, 256, 56, 64]> tensor_5 = relu(x = input_3)[name = tensor<string, []>("tensor_5")];
|
| 68 |
+
tensor<fp32, [1, 256, 56, 64]> input_5 = mul(x = tensor_5, y = expanded_mask_3)[name = tensor<string, []>("input_5")];
|
| 69 |
+
tensor<string, []> tensor_7_pad_type_0 = const()[name = tensor<string, []>("tensor_7_pad_type_0"), val = tensor<string, []>("custom")];
|
| 70 |
+
tensor<int32, [4]> tensor_7_pad_0 = const()[name = tensor<string, []>("tensor_7_pad_0"), val = tensor<int32, [4]>([1, 1, 1, 1])];
|
| 71 |
+
tensor<int32, [2]> tensor_7_strides_0 = const()[name = tensor<string, []>("tensor_7_strides_0"), val = tensor<int32, [2]>([2, 2])];
|
| 72 |
+
tensor<int32, []> tensor_7_groups_0 = const()[name = tensor<string, []>("tensor_7_groups_0"), val = tensor<int32, []>(256)];
|
| 73 |
+
tensor<int32, [2]> tensor_7_dilations_0 = const()[name = tensor<string, []>("tensor_7_dilations_0"), val = tensor<int32, [2]>([1, 1])];
|
| 74 |
+
tensor<fp32, [1, 256, 28, 32]> tensor_7 = conv(bias = model_encoder_pre_encode_conv_2_bias, dilations = tensor_7_dilations_0, groups = tensor_7_groups_0, pad = tensor_7_pad_0, pad_type = tensor_7_pad_type_0, strides = tensor_7_strides_0, weight = model_encoder_pre_encode_conv_2_weight, x = input_5)[name = tensor<string, []>("tensor_7")];
|
| 75 |
+
tensor<fp32, []> var_107_promoted = const()[name = tensor<string, []>("op_107_promoted"), val = tensor<fp32, []>(0x1p+0)];
|
| 76 |
+
tensor<fp32, [1]> var_108 = add(x = current_lengths_3, y = var_107_promoted)[name = tensor<string, []>("op_108")];
|
| 77 |
+
tensor<fp32, []> var_109_promoted = const()[name = tensor<string, []>("op_109_promoted"), val = tensor<fp32, []>(0x1p+0)];
|
| 78 |
+
tensor<fp32, [1]> var_110 = add(x = var_108, y = var_109_promoted)[name = tensor<string, []>("op_110")];
|
| 79 |
+
tensor<fp32, []> var_111_promoted = const()[name = tensor<string, []>("op_111_promoted"), val = tensor<fp32, []>(0x1.8p+1)];
|
| 80 |
+
tensor<fp32, [1]> var_112 = sub(x = var_110, y = var_111_promoted)[name = tensor<string, []>("op_112")];
|
| 81 |
+
tensor<fp32, []> var_21_promoted_1 = const()[name = tensor<string, []>("op_21_promoted_1"), val = tensor<fp32, []>(0x1p+1)];
|
| 82 |
+
tensor<fp32, [1]> floor_div_1 = floor_div(x = var_112, y = var_21_promoted_1)[name = tensor<string, []>("floor_div_1")];
|
| 83 |
+
tensor<fp32, []> var_114_promoted = const()[name = tensor<string, []>("op_114_promoted"), val = tensor<fp32, []>(0x1p+0)];
|
| 84 |
+
tensor<fp32, [1]> current_lengths_5 = add(x = floor_div_1, y = var_114_promoted)[name = tensor<string, []>("current_lengths_5")];
|
| 85 |
+
tensor<string, []> cast_5_dtype_0 = const()[name = tensor<string, []>("cast_5_dtype_0"), val = tensor<string, []>("int32")];
|
| 86 |
+
tensor<int32, [1, 28]> expand_dims_2 = const()[name = tensor<string, []>("expand_dims_2"), val = tensor<int32, [1, 28]>([[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27]])];
|
| 87 |
+
tensor<int32, [1]> var_123_axes_0 = const()[name = tensor<string, []>("op_123_axes_0"), val = tensor<int32, [1]>([1])];
|
| 88 |
+
tensor<int32, [1]> cast_5 = cast(dtype = cast_5_dtype_0, x = current_lengths_5)[name = tensor<string, []>("cast_22")];
|
| 89 |
+
tensor<int32, [1, 1]> var_123 = expand_dims(axes = var_123_axes_0, x = cast_5)[name = tensor<string, []>("op_123")];
|
| 90 |
+
tensor<bool, [1, 28]> time_mask_5 = less(x = expand_dims_2, y = var_123)[name = tensor<string, []>("time_mask_5")];
|
| 91 |
+
tensor<int32, [1]> var_125_axes_0 = const()[name = tensor<string, []>("op_125_axes_0"), val = tensor<int32, [1]>([-1])];
|
| 92 |
+
tensor<bool, [1, 28, 1]> var_125 = expand_dims(axes = var_125_axes_0, x = time_mask_5)[name = tensor<string, []>("op_125")];
|
| 93 |
+
tensor<int32, [3]> var_127_reps_0 = const()[name = tensor<string, []>("op_127_reps_0"), val = tensor<int32, [3]>([1, 1, 32])];
|
| 94 |
+
tensor<bool, [1, 28, 32]> var_127 = tile(reps = var_127_reps_0, x = var_125)[name = tensor<string, []>("op_127")];
|
| 95 |
+
tensor<string, []> cast_6_dtype_0 = const()[name = tensor<string, []>("cast_6_dtype_0"), val = tensor<string, []>("fp32")];
|
| 96 |
+
tensor<int32, [1]> var_133_axes_0 = const()[name = tensor<string, []>("op_133_axes_0"), val = tensor<int32, [1]>([1])];
|
| 97 |
+
tensor<fp32, [1, 28, 32]> cast_6 = cast(dtype = cast_6_dtype_0, x = var_127)[name = tensor<string, []>("cast_21")];
|
| 98 |
+
tensor<fp32, [1, 1, 28, 32]> var_133 = expand_dims(axes = var_133_axes_0, x = cast_6)[name = tensor<string, []>("op_133")];
|
| 99 |
+
tensor<int32, [4]> expanded_mask_7_reps_0 = const()[name = tensor<string, []>("expanded_mask_7_reps_0"), val = tensor<int32, [4]>([1, 256, 1, 1])];
|
| 100 |
+
tensor<fp32, [1, 256, 28, 32]> expanded_mask_7 = tile(reps = expanded_mask_7_reps_0, x = var_133)[name = tensor<string, []>("expanded_mask_7")];
|
| 101 |
+
tensor<fp32, [1, 256, 28, 32]> input_7 = mul(x = tensor_7, y = expanded_mask_7)[name = tensor<string, []>("input_7")];
|
| 102 |
+
tensor<string, []> tensor_9_pad_type_0 = const()[name = tensor<string, []>("tensor_9_pad_type_0"), val = tensor<string, []>("valid")];
|
| 103 |
+
tensor<int32, [2]> tensor_9_strides_0 = const()[name = tensor<string, []>("tensor_9_strides_0"), val = tensor<int32, [2]>([1, 1])];
|
| 104 |
+
tensor<int32, [4]> tensor_9_pad_0 = const()[name = tensor<string, []>("tensor_9_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
|
| 105 |
+
tensor<int32, [2]> tensor_9_dilations_0 = const()[name = tensor<string, []>("tensor_9_dilations_0"), val = tensor<int32, [2]>([1, 1])];
|
| 106 |
+
tensor<int32, []> tensor_9_groups_0 = const()[name = tensor<string, []>("tensor_9_groups_0"), val = tensor<int32, []>(1)];
|
| 107 |
+
tensor<fp32, [1, 256, 28, 32]> tensor_9 = conv(bias = model_encoder_pre_encode_conv_3_bias, dilations = tensor_9_dilations_0, groups = tensor_9_groups_0, pad = tensor_9_pad_0, pad_type = tensor_9_pad_type_0, strides = tensor_9_strides_0, weight = model_encoder_pre_encode_conv_3_weight, x = input_7)[name = tensor<string, []>("tensor_9")];
|
| 108 |
+
tensor<fp32, [1, 256, 28, 32]> input_9 = mul(x = tensor_9, y = expanded_mask_7)[name = tensor<string, []>("input_9")];
|
| 109 |
+
tensor<fp32, [1, 256, 28, 32]> tensor_11 = relu(x = input_9)[name = tensor<string, []>("tensor_11")];
|
| 110 |
+
tensor<fp32, [1, 256, 28, 32]> input_11 = mul(x = tensor_11, y = expanded_mask_7)[name = tensor<string, []>("input_11")];
|
| 111 |
+
tensor<string, []> tensor_13_pad_type_0 = const()[name = tensor<string, []>("tensor_13_pad_type_0"), val = tensor<string, []>("custom")];
|
| 112 |
+
tensor<int32, [4]> tensor_13_pad_0 = const()[name = tensor<string, []>("tensor_13_pad_0"), val = tensor<int32, [4]>([1, 1, 1, 1])];
|
| 113 |
+
tensor<int32, [2]> tensor_13_strides_0 = const()[name = tensor<string, []>("tensor_13_strides_0"), val = tensor<int32, [2]>([2, 2])];
|
| 114 |
+
tensor<int32, []> tensor_13_groups_0 = const()[name = tensor<string, []>("tensor_13_groups_0"), val = tensor<int32, []>(256)];
|
| 115 |
+
tensor<int32, [2]> tensor_13_dilations_0 = const()[name = tensor<string, []>("tensor_13_dilations_0"), val = tensor<int32, [2]>([1, 1])];
|
| 116 |
+
tensor<fp32, [1, 256, 14, 16]> tensor_13 = conv(bias = model_encoder_pre_encode_conv_5_bias, dilations = tensor_13_dilations_0, groups = tensor_13_groups_0, pad = tensor_13_pad_0, pad_type = tensor_13_pad_type_0, strides = tensor_13_strides_0, weight = model_encoder_pre_encode_conv_5_weight, x = input_11)[name = tensor<string, []>("tensor_13")];
|
| 117 |
+
tensor<fp32, []> var_168_promoted = const()[name = tensor<string, []>("op_168_promoted"), val = tensor<fp32, []>(0x1p+0)];
|
| 118 |
+
tensor<fp32, [1]> var_169 = add(x = current_lengths_5, y = var_168_promoted)[name = tensor<string, []>("op_169")];
|
| 119 |
+
tensor<fp32, []> var_170_promoted = const()[name = tensor<string, []>("op_170_promoted"), val = tensor<fp32, []>(0x1p+0)];
|
| 120 |
+
tensor<fp32, [1]> var_171 = add(x = var_169, y = var_170_promoted)[name = tensor<string, []>("op_171")];
|
| 121 |
+
tensor<fp32, []> var_172_promoted = const()[name = tensor<string, []>("op_172_promoted"), val = tensor<fp32, []>(0x1.8p+1)];
|
| 122 |
+
tensor<fp32, [1]> var_173 = sub(x = var_171, y = var_172_promoted)[name = tensor<string, []>("op_173")];
|
| 123 |
+
tensor<fp32, []> var_21_promoted_2 = const()[name = tensor<string, []>("op_21_promoted_2"), val = tensor<fp32, []>(0x1p+1)];
|
| 124 |
+
tensor<fp32, [1]> floor_div_2 = floor_div(x = var_173, y = var_21_promoted_2)[name = tensor<string, []>("floor_div_2")];
|
| 125 |
+
tensor<fp32, []> var_175_promoted = const()[name = tensor<string, []>("op_175_promoted"), val = tensor<fp32, []>(0x1p+0)];
|
| 126 |
+
tensor<fp32, [1]> current_lengths = add(x = floor_div_2, y = var_175_promoted)[name = tensor<string, []>("current_lengths")];
|
| 127 |
+
tensor<string, []> cast_7_dtype_0 = const()[name = tensor<string, []>("cast_7_dtype_0"), val = tensor<string, []>("int32")];
|
| 128 |
+
tensor<int32, [1, 14]> expand_dims_3 = const()[name = tensor<string, []>("expand_dims_3"), val = tensor<int32, [1, 14]>([[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13]])];
|
| 129 |
+
tensor<int32, [1]> var_184_axes_0 = const()[name = tensor<string, []>("op_184_axes_0"), val = tensor<int32, [1]>([1])];
|
| 130 |
+
tensor<int32, [1]> cast_7 = cast(dtype = cast_7_dtype_0, x = current_lengths)[name = tensor<string, []>("cast_20")];
|
| 131 |
+
tensor<int32, [1, 1]> var_184 = expand_dims(axes = var_184_axes_0, x = cast_7)[name = tensor<string, []>("op_184")];
|
| 132 |
+
tensor<bool, [1, 14]> time_mask = less(x = expand_dims_3, y = var_184)[name = tensor<string, []>("time_mask")];
|
| 133 |
+
tensor<int32, [1]> var_186_axes_0 = const()[name = tensor<string, []>("op_186_axes_0"), val = tensor<int32, [1]>([-1])];
|
| 134 |
+
tensor<bool, [1, 14, 1]> var_186 = expand_dims(axes = var_186_axes_0, x = time_mask)[name = tensor<string, []>("op_186")];
|
| 135 |
+
tensor<int32, [3]> var_188_reps_0 = const()[name = tensor<string, []>("op_188_reps_0"), val = tensor<int32, [3]>([1, 1, 16])];
|
| 136 |
+
tensor<bool, [1, 14, 16]> var_188 = tile(reps = var_188_reps_0, x = var_186)[name = tensor<string, []>("op_188")];
|
| 137 |
+
tensor<string, []> cast_8_dtype_0 = const()[name = tensor<string, []>("cast_8_dtype_0"), val = tensor<string, []>("fp32")];
|
| 138 |
+
tensor<int32, [1]> var_194_axes_0 = const()[name = tensor<string, []>("op_194_axes_0"), val = tensor<int32, [1]>([1])];
|
| 139 |
+
tensor<fp32, [1, 14, 16]> cast_8 = cast(dtype = cast_8_dtype_0, x = var_188)[name = tensor<string, []>("cast_19")];
|
| 140 |
+
tensor<fp32, [1, 1, 14, 16]> var_194 = expand_dims(axes = var_194_axes_0, x = cast_8)[name = tensor<string, []>("op_194")];
|
| 141 |
+
tensor<int32, [4]> expanded_mask_13_reps_0 = const()[name = tensor<string, []>("expanded_mask_13_reps_0"), val = tensor<int32, [4]>([1, 256, 1, 1])];
|
| 142 |
+
tensor<fp32, [1, 256, 14, 16]> expanded_mask_13 = tile(reps = expanded_mask_13_reps_0, x = var_194)[name = tensor<string, []>("expanded_mask_13")];
|
| 143 |
+
tensor<fp32, [1, 256, 14, 16]> input_13 = mul(x = tensor_13, y = expanded_mask_13)[name = tensor<string, []>("input_13")];
|
| 144 |
+
tensor<string, []> tensor_15_pad_type_0 = const()[name = tensor<string, []>("tensor_15_pad_type_0"), val = tensor<string, []>("valid")];
|
| 145 |
+
tensor<int32, [2]> tensor_15_strides_0 = const()[name = tensor<string, []>("tensor_15_strides_0"), val = tensor<int32, [2]>([1, 1])];
|
| 146 |
+
tensor<int32, [4]> tensor_15_pad_0 = const()[name = tensor<string, []>("tensor_15_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
|
| 147 |
+
tensor<int32, [2]> tensor_15_dilations_0 = const()[name = tensor<string, []>("tensor_15_dilations_0"), val = tensor<int32, [2]>([1, 1])];
|
| 148 |
+
tensor<int32, []> tensor_15_groups_0 = const()[name = tensor<string, []>("tensor_15_groups_0"), val = tensor<int32, []>(1)];
|
| 149 |
+
tensor<fp32, [1, 256, 14, 16]> tensor_15 = conv(bias = model_encoder_pre_encode_conv_6_bias, dilations = tensor_15_dilations_0, groups = tensor_15_groups_0, pad = tensor_15_pad_0, pad_type = tensor_15_pad_type_0, strides = tensor_15_strides_0, weight = model_encoder_pre_encode_conv_6_weight, x = input_13)[name = tensor<string, []>("tensor_15")];
|
| 150 |
+
tensor<fp32, [1, 256, 14, 16]> input_15 = mul(x = tensor_15, y = expanded_mask_13)[name = tensor<string, []>("input_15")];
|
| 151 |
+
tensor<fp32, [1, 256, 14, 16]> tensor_workaround = relu(x = input_15)[name = tensor<string, []>("tensor_workaround")];
|
| 152 |
+
tensor<fp32, [1, 256, 14, 16]> x = mul(x = tensor_workaround, y = expanded_mask_13)[name = tensor<string, []>("x")];
|
| 153 |
+
tensor<int32, [4]> var_228_perm_0 = const()[name = tensor<string, []>("op_228_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
|
| 154 |
+
tensor<int32, [3]> var_229 = const()[name = tensor<string, []>("op_229"), val = tensor<int32, [3]>([1, 14, -1])];
|
| 155 |
+
tensor<fp32, [1, 14, 256, 16]> var_228 = transpose(perm = var_228_perm_0, x = x)[name = tensor<string, []>("transpose_0")];
|
| 156 |
+
tensor<fp32, [1, 14, 4096]> input = reshape(shape = var_229, x = var_228)[name = tensor<string, []>("input")];
|
| 157 |
+
tensor<fp32, [1, 14, 512]> chunk_embs_in = linear(bias = model_encoder_pre_encode_out_bias, weight = model_encoder_pre_encode_out_weight, x = input)[name = tensor<string, []>("linear_0")];
|
| 158 |
+
tensor<string, []> cast_11_dtype_0 = const()[name = tensor<string, []>("cast_11_dtype_0"), val = tensor<string, []>("int32")];
|
| 159 |
+
tensor<int32, [1]> size0 = const()[name = tensor<string, []>("size0"), val = tensor<int32, [1]>([188])];
|
| 160 |
+
tensor<int32, [1]> size1 = const()[name = tensor<string, []>("size1"), val = tensor<int32, [1]>([188])];
|
| 161 |
+
tensor<int32, []> var_264 = const()[name = tensor<string, []>("op_264"), val = tensor<int32, []>(1)];
|
| 162 |
+
tensor<bool, []> full_concat_interleave_0 = const()[name = tensor<string, []>("full_concat_interleave_0"), val = tensor<bool, []>(false)];
|
| 163 |
+
tensor<fp32, [1, 390, 512]> full_concat = concat(axis = var_264, interleave = full_concat_interleave_0, values = (spkcache, fifo, chunk_embs_in))[name = tensor<string, []>("full_concat")];
|
| 164 |
+
tensor<int32, [1]> var_273 = add(x = spkcache_lengths, y = fifo_lengths)[name = tensor<string, []>("op_273")];
|
| 165 |
+
tensor<int32, [1]> chunk_lens_in = cast(dtype = cast_11_dtype_0, x = current_lengths)[name = tensor<string, []>("cast_18")];
|
| 166 |
+
tensor<int32, [1]> pre_encoder_lengths = add(x = var_273, y = chunk_lens_in)[name = tensor<string, []>("total_length")];
|
| 167 |
+
tensor<int32, [390]> out_pos = const()[name = tensor<string, []>("out_pos"), val = tensor<int32, [390]>([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255, 256, 257, 258, 259, 260, 261, 262, 263, 264, 265, 266, 267, 268, 269, 270, 271, 272, 273, 274, 275, 276, 277, 278, 279, 280, 281, 282, 283, 284, 285, 286, 287, 288, 289, 290, 291, 292, 293, 294, 295, 296, 297, 298, 299, 300, 301, 302, 303, 304, 305, 306, 307, 308, 309, 310, 311, 312, 313, 314, 315, 316, 317, 318, 319, 320, 321, 322, 323, 324, 325, 326, 327, 328, 329, 330, 331, 332, 333, 334, 335, 336, 337, 338, 339, 340, 341, 342, 343, 344, 345, 346, 347, 348, 349, 350, 351, 352, 353, 354, 355, 356, 357, 358, 359, 360, 361, 362, 363, 364, 365, 366, 367, 368, 369, 370, 371, 372, 373, 374, 375, 376, 377, 378, 379, 380, 381, 382, 383, 384, 385, 386, 387, 388, 389])];
|
| 168 |
+
tensor<bool, [390]> var_284 = greater_equal(x = out_pos, y = spkcache_lengths)[name = tensor<string, []>("op_284")];
|
| 169 |
+
tensor<string, []> cast_12_dtype_0 = const()[name = tensor<string, []>("cast_12_dtype_0"), val = tensor<string, []>("int32")];
|
| 170 |
+
tensor<bool, [390]> var_290 = greater_equal(x = out_pos, y = var_273)[name = tensor<string, []>("op_290")];
|
| 171 |
+
tensor<string, []> cast_13_dtype_0 = const()[name = tensor<string, []>("cast_13_dtype_0"), val = tensor<string, []>("int32")];
|
| 172 |
+
tensor<int32, [1]> var_297 = sub(x = size0, y = spkcache_lengths)[name = tensor<string, []>("op_297")];
|
| 173 |
+
tensor<int32, [390]> cast_12 = cast(dtype = cast_12_dtype_0, x = var_284)[name = tensor<string, []>("cast_17")];
|
| 174 |
+
tensor<int32, [390]> var_298 = mul(x = cast_12, y = var_297)[name = tensor<string, []>("op_298")];
|
| 175 |
+
tensor<int32, [1]> var_300 = sub(x = size1, y = fifo_lengths)[name = tensor<string, []>("op_300")];
|
| 176 |
+
tensor<int32, [390]> cast_13 = cast(dtype = cast_13_dtype_0, x = var_290)[name = tensor<string, []>("cast_16")];
|
| 177 |
+
tensor<int32, [390]> var_301 = mul(x = cast_13, y = var_300)[name = tensor<string, []>("op_301")];
|
| 178 |
+
tensor<int32, [390]> offset = add(x = var_298, y = var_301)[name = tensor<string, []>("offset")];
|
| 179 |
+
tensor<int32, [390]> var_305 = add(x = out_pos, y = offset)[name = tensor<string, []>("op_305")];
|
| 180 |
+
tensor<int32, []> var_309 = const()[name = tensor<string, []>("op_309"), val = tensor<int32, []>(389)];
|
| 181 |
+
tensor<int32, []> var_310 = const()[name = tensor<string, []>("op_310"), val = tensor<int32, []>(0)];
|
| 182 |
+
tensor<int32, [390]> minimum_0 = minimum(x = var_305, y = var_309)[name = tensor<string, []>("minimum_0")];
|
| 183 |
+
tensor<int32, [390]> maximum_0 = maximum(x = minimum_0, y = var_310)[name = tensor<string, []>("maximum_0")];
|
| 184 |
+
tensor<int32, [1]> var_313_axes_0 = const()[name = tensor<string, []>("op_313_axes_0"), val = tensor<int32, [1]>([0])];
|
| 185 |
+
tensor<int32, [1, 390]> var_313 = expand_dims(axes = var_313_axes_0, x = maximum_0)[name = tensor<string, []>("op_313")];
|
| 186 |
+
tensor<int32, [1]> var_315_axes_0 = const()[name = tensor<string, []>("op_315_axes_0"), val = tensor<int32, [1]>([-1])];
|
| 187 |
+
tensor<int32, [1, 390, 1]> var_315 = expand_dims(axes = var_315_axes_0, x = var_313)[name = tensor<string, []>("op_315")];
|
| 188 |
+
tensor<int32, [3]> gather_idx_reps_0 = const()[name = tensor<string, []>("gather_idx_reps_0"), val = tensor<int32, [3]>([1, 1, 512])];
|
| 189 |
+
tensor<int32, [1, 390, 512]> gather_idx = tile(reps = gather_idx_reps_0, x = var_315)[name = tensor<string, []>("gather_idx")];
|
| 190 |
+
tensor<int32, []> var_320 = const()[name = tensor<string, []>("op_320"), val = tensor<int32, []>(1)];
|
| 191 |
+
tensor<fp32, [1, 390, 512]> output = gather_along_axis(axis = var_320, indices = gather_idx, x = full_concat)[name = tensor<string, []>("output")];
|
| 192 |
+
tensor<bool, [390]> var_323 = less(x = out_pos, y = pre_encoder_lengths)[name = tensor<string, []>("op_323")];
|
| 193 |
+
tensor<string, []> cast_14_dtype_0 = const()[name = tensor<string, []>("cast_14_dtype_0"), val = tensor<string, []>("fp32")];
|
| 194 |
+
tensor<int32, [1]> var_330_axes_0 = const()[name = tensor<string, []>("op_330_axes_0"), val = tensor<int32, [1]>([0])];
|
| 195 |
+
tensor<fp32, [390]> cast_14 = cast(dtype = cast_14_dtype_0, x = var_323)[name = tensor<string, []>("cast_15")];
|
| 196 |
+
tensor<fp32, [1, 390]> var_330 = expand_dims(axes = var_330_axes_0, x = cast_14)[name = tensor<string, []>("op_330")];
|
| 197 |
+
tensor<int32, [1]> var_332_axes_0 = const()[name = tensor<string, []>("op_332_axes_0"), val = tensor<int32, [1]>([-1])];
|
| 198 |
+
tensor<fp32, [1, 390, 1]> var_332 = expand_dims(axes = var_332_axes_0, x = var_330)[name = tensor<string, []>("op_332")];
|
| 199 |
+
tensor<fp32, [1, 390, 512]> pre_encoder_embs = mul(x = output, y = var_332)[name = tensor<string, []>("op_333")];
|
| 200 |
+
} -> (pre_encoder_embs, pre_encoder_lengths, chunk_embs_in, chunk_lens_in);
|
| 201 |
+
}
|
Pipeline_PreEncoder.mlmodelc/weights/weight.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:88a98803e35186b1dfb41d7f748f7cee5093bb6efeb117f56953c17549792fa4
|
| 3 |
+
size 8948544
|
Pipeline_PreEncoder.mlpackage/Data/com.apple.CoreML/model.mlmodel
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:fb3f36b3b9d3f63e7a4f89a8848c6e3bc1a4a983786a832ea2c60cc395525cc2
|
| 3 |
+
size 26802
|
Pipeline_PreEncoder.mlpackage/Data/com.apple.CoreML/weights/weight.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:88a98803e35186b1dfb41d7f748f7cee5093bb6efeb117f56953c17549792fa4
|
| 3 |
+
size 8948544
|
Pipeline_PreEncoder.mlpackage/Manifest.json
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"fileFormatVersion": "1.0.0",
|
| 3 |
+
"itemInfoEntries": {
|
| 4 |
+
"6894C507-E04A-4096-A90F-9AB0F58870E0": {
|
| 5 |
+
"author": "com.apple.CoreML",
|
| 6 |
+
"description": "CoreML Model Specification",
|
| 7 |
+
"name": "model.mlmodel",
|
| 8 |
+
"path": "com.apple.CoreML/model.mlmodel"
|
| 9 |
+
},
|
| 10 |
+
"E6A26F48-2E38-4E3A-AF2B-89704AAA4B4C": {
|
| 11 |
+
"author": "com.apple.CoreML",
|
| 12 |
+
"description": "CoreML Model Weights",
|
| 13 |
+
"name": "weights",
|
| 14 |
+
"path": "com.apple.CoreML/weights"
|
| 15 |
+
}
|
| 16 |
+
},
|
| 17 |
+
"rootModelIdentifier": "6894C507-E04A-4096-A90F-9AB0F58870E0"
|
| 18 |
+
}
|
Pipeline_Preprocessor.mlmodelc/analytics/coremldata.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:eee5506c26dd1453734200ef08e1a263599e25fbdd433ecc425e7d8fd3c39641
|
| 3 |
+
size 243
|
Pipeline_Preprocessor.mlmodelc/coremldata.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:15271d7ebb0d1f8f82ae60468e6a554b71d386f6f5f717b00332b9ace990be16
|
| 3 |
+
size 374
|
Pipeline_Preprocessor.mlmodelc/model.mil
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
Pipeline_Preprocessor.mlmodelc/weights/weight.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:e6d96fe6aa2f786e9ce18f53c2c6058807fbd9733dc48813d30554ee9b1caf80
|
| 3 |
+
size 1184512
|
Pipeline_Preprocessor.mlpackage/Data/com.apple.CoreML/model.mlmodel
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:4ba45e5189ff1a12d01f2ebc6fc5db8ab7ad63cedcbb9847515ad0e7881daa0e
|
| 3 |
+
size 48673
|
Pipeline_Preprocessor.mlpackage/Data/com.apple.CoreML/weights/weight.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:e6d96fe6aa2f786e9ce18f53c2c6058807fbd9733dc48813d30554ee9b1caf80
|
| 3 |
+
size 1184512
|
Pipeline_Preprocessor.mlpackage/Manifest.json
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"fileFormatVersion": "1.0.0",
|
| 3 |
+
"itemInfoEntries": {
|
| 4 |
+
"43321C07-C241-4F12-89F5-DF8385087F7C": {
|
| 5 |
+
"author": "com.apple.CoreML",
|
| 6 |
+
"description": "CoreML Model Weights",
|
| 7 |
+
"name": "weights",
|
| 8 |
+
"path": "com.apple.CoreML/weights"
|
| 9 |
+
},
|
| 10 |
+
"505C3394-5B98-45F7-8F05-9C513B04AFCB": {
|
| 11 |
+
"author": "com.apple.CoreML",
|
| 12 |
+
"description": "CoreML Model Specification",
|
| 13 |
+
"name": "model.mlmodel",
|
| 14 |
+
"path": "com.apple.CoreML/model.mlmodel"
|
| 15 |
+
}
|
| 16 |
+
},
|
| 17 |
+
"rootModelIdentifier": "505C3394-5B98-45F7-8F05-9C513B04AFCB"
|
| 18 |
+
}
|
README.md
ADDED
|
@@ -0,0 +1,137 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Streaming Sortformer CoreML
|
| 2 |
+
|
| 3 |
+
CoreML conversion of NVIDIA's Streaming Sortformer 4-Speaker Diarization model for Apple Silicon.
|
| 4 |
+
|
| 5 |
+
## Original Model
|
| 6 |
+
|
| 7 |
+
- **Source**: [nvidia/diar_streaming_sortformer_4spk-v2.1](https://huggingface.co/nvidia/diar_streaming_sortformer_4spk-v2.1)
|
| 8 |
+
- **Paper**: [Sortformer: Seamless Integration of Speaker Diarization and ASR](https://arxiv.org/abs/2409.06656)
|
| 9 |
+
- **Benchmark**: 20.57% DER on AMI SDM (NVIDIA reported)
|
| 10 |
+
|
| 11 |
+
## Models
|
| 12 |
+
|
| 13 |
+
| Model | Description | Input | Output |
|
| 14 |
+
|-------|-------------|-------|--------|
|
| 15 |
+
| `Pipeline_Preprocessor.mlpackage` | Mel spectrogram extraction | Audio waveform | 128-dim mel features |
|
| 16 |
+
| `Pipeline_PreEncoder.mlpackage` | FastConformer encoder + Transformer | Mel features + state | Encoded embeddings |
|
| 17 |
+
| `Pipeline_Head_Fixed.mlpackage` | Speaker prediction head | Embeddings | 4-speaker probabilities |
|
| 18 |
+
|
| 19 |
+
## Configuration
|
| 20 |
+
|
| 21 |
+
```python
|
| 22 |
+
CONFIG = {
|
| 23 |
+
"chunk_len": 6, # Core chunk length (encoder frames)
|
| 24 |
+
"chunk_left_context": 1, # Left context frames
|
| 25 |
+
"chunk_right_context": 7, # Right context frames
|
| 26 |
+
"fifo_len": 188, # FIFO buffer length
|
| 27 |
+
"spkcache_len": 188, # Speaker cache length
|
| 28 |
+
"subsampling_factor": 8, # 8x subsampling (80ms per encoder frame)
|
| 29 |
+
"sample_rate": 16000,
|
| 30 |
+
"mel_features": 128,
|
| 31 |
+
"n_speakers": 4,
|
| 32 |
+
}
|
| 33 |
+
```
|
| 34 |
+
|
| 35 |
+
## Usage
|
| 36 |
+
|
| 37 |
+
### Python (coremltools)
|
| 38 |
+
|
| 39 |
+
```python
|
| 40 |
+
import coremltools as ct
|
| 41 |
+
import numpy as np
|
| 42 |
+
|
| 43 |
+
# Load models
|
| 44 |
+
pre_encoder = ct.models.MLModel("Pipeline_PreEncoder.mlpackage",
|
| 45 |
+
compute_units=ct.ComputeUnit.CPU_ONLY)
|
| 46 |
+
head = ct.models.MLModel("Pipeline_Head_Fixed.mlpackage",
|
| 47 |
+
compute_units=ct.ComputeUnit.CPU_ONLY)
|
| 48 |
+
|
| 49 |
+
# Initialize state
|
| 50 |
+
spkcache = np.zeros((1, 188, 512), dtype=np.float32)
|
| 51 |
+
fifo = np.zeros((1, 188, 512), dtype=np.float32)
|
| 52 |
+
|
| 53 |
+
# Process chunk (mel_features: [1, 112, 128])
|
| 54 |
+
pre_out = pre_encoder.predict({
|
| 55 |
+
"chunk": mel_features,
|
| 56 |
+
"chunk_lengths": np.array([actual_length], dtype=np.int32),
|
| 57 |
+
"spkcache": spkcache,
|
| 58 |
+
"spkcache_lengths": np.array([0], dtype=np.int32),
|
| 59 |
+
"fifo": fifo,
|
| 60 |
+
"fifo_lengths": np.array([0], dtype=np.int32)
|
| 61 |
+
})
|
| 62 |
+
|
| 63 |
+
head_out = head.predict({
|
| 64 |
+
"pre_encoder_embs": pre_out["pre_encoder_embs"],
|
| 65 |
+
"pre_encoder_lengths": pre_out["pre_encoder_lengths"],
|
| 66 |
+
"chunk_embs_in": pre_out["chunk_embs_in"],
|
| 67 |
+
"chunk_lens_in": pre_out["chunk_lens_in"]
|
| 68 |
+
})
|
| 69 |
+
|
| 70 |
+
predictions = head_out["speaker_preds"] # [1, T, 4]
|
| 71 |
+
```
|
| 72 |
+
|
| 73 |
+
### Swift (Core ML)
|
| 74 |
+
|
| 75 |
+
```swift
|
| 76 |
+
import CoreML
|
| 77 |
+
|
| 78 |
+
let preEncoder = try MLModel(contentsOf: preEncoderURL)
|
| 79 |
+
let head = try MLModel(contentsOf: headURL)
|
| 80 |
+
|
| 81 |
+
// Create input with MLMultiArray for chunk, spkcache, fifo
|
| 82 |
+
let preEncoderInput = try preEncoder.prediction(from: inputProvider)
|
| 83 |
+
let headInput = try head.prediction(from: preEncoderInput)
|
| 84 |
+
|
| 85 |
+
let predictions = headInput.featureValue(for: "speaker_preds")
|
| 86 |
+
```
|
| 87 |
+
|
| 88 |
+
## Mel Spectrogram Settings
|
| 89 |
+
|
| 90 |
+
For compatibility with the original NeMo model:
|
| 91 |
+
|
| 92 |
+
```python
|
| 93 |
+
mel_config = {
|
| 94 |
+
"sample_rate": 16000,
|
| 95 |
+
"n_fft": 512,
|
| 96 |
+
"win_length": 400, # 25ms
|
| 97 |
+
"hop_length": 160, # 10ms
|
| 98 |
+
"n_mels": 128,
|
| 99 |
+
"preemph": 0.97,
|
| 100 |
+
"log_zero_guard_value": 2**-24,
|
| 101 |
+
"normalize": "per_feature",
|
| 102 |
+
}
|
| 103 |
+
```
|
| 104 |
+
|
| 105 |
+
## Streaming Pipeline
|
| 106 |
+
|
| 107 |
+
1. **Chunk audio** into ~480ms windows (48 mel frames core + context)
|
| 108 |
+
2. **Compute mel spectrogram** for each chunk
|
| 109 |
+
3. **Run PreEncoder** with current state (spkcache + fifo)
|
| 110 |
+
4. **Run Head** to get 4-speaker probabilities
|
| 111 |
+
5. **Update state** (spkcache/fifo buffers)
|
| 112 |
+
6. **Threshold predictions** (default: 0.5) for binary speaker activity
|
| 113 |
+
|
| 114 |
+
## Accuracy
|
| 115 |
+
|
| 116 |
+
Verified within 0.12% of original NeMo PyTorch model on chunk-level predictions.
|
| 117 |
+
|
| 118 |
+
## Requirements
|
| 119 |
+
|
| 120 |
+
- macOS 12+ or iOS 15+
|
| 121 |
+
- Apple Silicon (M1/M2/M3) recommended
|
| 122 |
+
- Python: `coremltools`, `numpy`, `torch`, `torchaudio`
|
| 123 |
+
|
| 124 |
+
## License
|
| 125 |
+
|
| 126 |
+
Apache 2.0 (following NVIDIA NeMo licensing)
|
| 127 |
+
|
| 128 |
+
## Citation
|
| 129 |
+
|
| 130 |
+
```bibtex
|
| 131 |
+
@article{park2024sortformer,
|
| 132 |
+
title={Sortformer: Seamless Integration of Speaker Diarization and ASR by Bridging Timestamps and Tokens},
|
| 133 |
+
author={Park, Taejin and Huang, He and Koluguri, Nithin and Georgiou, Panagiotis and Watanabe, Shinji and Ginsburg, Boris},
|
| 134 |
+
journal={arXiv preprint arXiv:2409.06656},
|
| 135 |
+
year={2024}
|
| 136 |
+
}
|
| 137 |
+
```
|
convert_to_coreml.py
ADDED
|
@@ -0,0 +1,357 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import torch
|
| 2 |
+
import torch.nn as nn
|
| 3 |
+
import coremltools as ct
|
| 4 |
+
import argparse
|
| 5 |
+
import os
|
| 6 |
+
import sys
|
| 7 |
+
import numpy as np
|
| 8 |
+
import types
|
| 9 |
+
|
| 10 |
+
# Ensure we use the right environment for imports
|
| 11 |
+
# (User's environment has 'nemo' installed)
|
| 12 |
+
from nemo.collections.asr.models import SortformerEncLabelModel
|
| 13 |
+
from nemo.collections.asr.parts.preprocessing.features import FilterbankFeaturesTA
|
| 14 |
+
from coreml_wrappers import *
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
def convert_pre_encoder(
    model: SortformerEncLabelModel,
    precision,
    name: str,
    input_chunk, input_chunk_len,
    input_spkcache, input_spkcache_len,
    input_fifo, input_fifo_len
):
    """Trace the Sortformer pre-encoder wrapper and convert it to a CoreML model.

    :param model: Loaded SortformerEncLabelModel to wrap with PreEncoderWrapper.
    :param precision: coremltools compute precision (ct.precision.FLOAT16 or FLOAT32).
    :param name: Output path for the saved .mlpackage.
    :param input_chunk: Example mel-feature chunk tensor used for tracing.
    :param input_chunk_len: Example chunk length tensor.
    :param input_spkcache: Example speaker-cache embedding tensor.
    :param input_spkcache_len: Example speaker-cache length tensor.
    :param input_fifo: Example FIFO embedding tensor.
    :param input_fifo_len: Example FIFO length tensor.
    :return: Tuple of (converted CoreML model, traced TorchScript module).
    """
    wrapper = PreEncoderWrapper(model)
    wrapper.eval()

    traced_model = torch.jit.trace(wrapper, (
        input_chunk, input_chunk_len,
        input_spkcache, input_spkcache_len,
        input_fifo, input_fifo_len
    ))

    mlmodel = ct.convert(
        traced_model,
        inputs=[
            ct.TensorType(name="chunk", shape=input_chunk.shape, dtype=np.float32),
            ct.TensorType(name="chunk_lengths", shape=input_chunk_len.shape, dtype=np.int32),
            # dtype=np.float32 made explicit for consistency with "chunk";
            # fp32 is also coremltools' default for TensorType, so the
            # converted model is unchanged.
            ct.TensorType(name="spkcache", shape=input_spkcache.shape, dtype=np.float32),
            ct.TensorType(name="spkcache_lengths", shape=input_spkcache_len.shape, dtype=np.int32),
            ct.TensorType(name="fifo", shape=input_fifo.shape, dtype=np.float32),
            ct.TensorType(name="fifo_lengths", shape=input_fifo_len.shape, dtype=np.int32),
        ],
        outputs=[
            ct.TensorType(name="pre_encoder_embs", dtype=np.float32),
            ct.TensorType(name="pre_encoder_lengths", dtype=np.int32),
            # NOTE(review): the shipped Pipeline_PreEncoder MIL artifact exposes
            # these two outputs as "chunk_embs_in"/"chunk_lens_in" -- confirm the
            # intended naming before re-exporting.
            ct.TensorType(name="chunk_pre_encoder_embs", dtype=np.float32),
            ct.TensorType(name="chunk_pre_encoder_lengths", dtype=np.int32),
        ],
        minimum_deployment_target=ct.target.iOS16,
        compute_precision=precision,
        compute_units=ct.ComputeUnit.ALL
    )

    mlmodel.save(name)
    return mlmodel, traced_model
|
| 58 |
+
|
| 59 |
+
def convert_head(
    model: SortformerEncLabelModel,
    precision,
    name: str,
    pre_encoder_embs, pre_encoder_lengths,
    chunk_pre_encoder_embs, chunk_pre_encoder_lengths,
):
    """Trace the Sortformer head wrapper and convert it to a CoreML model.

    :param model: loaded SortformerEncLabelModel whose head is wrapped
    :param precision: coremltools compute precision enum value
    :param name: destination path of the saved .mlpackage
    :param pre_encoder_embs: example pre-encoded embedding tensor for tracing
    :param pre_encoder_lengths: example length tensor for those embeddings
    :param chunk_pre_encoder_embs: example chunk-only embedding tensor
    :param chunk_pre_encoder_lengths: example chunk-only length tensor
    :return: tuple of (converted CoreML model, traced TorchScript module)
    """
    head = SortformerHeadWrapper(model)
    head.eval()

    example_inputs = (
        pre_encoder_embs, pre_encoder_lengths,
        chunk_pre_encoder_embs, chunk_pre_encoder_lengths,
    )
    traced = torch.jit.trace(head, example_inputs)

    # Input/output specs mirror the wrapper's signature; the chunk tensors are
    # passed through as outputs so the head can be chained in a pipeline.
    input_specs = [
        ct.TensorType(name="pre_encoder_embs", shape=pre_encoder_embs.shape, dtype=np.float32),
        ct.TensorType(name="pre_encoder_lengths", shape=pre_encoder_lengths.shape, dtype=np.int32),
        ct.TensorType(name="chunk_pre_encoder_embs", shape=chunk_pre_encoder_embs.shape, dtype=np.float32),
        ct.TensorType(name="chunk_pre_encoder_lengths", shape=chunk_pre_encoder_lengths.shape, dtype=np.int32),
    ]
    output_specs = [
        ct.TensorType(name="speaker_preds", dtype=np.float32),
        ct.TensorType(name="chunk_pre_encoder_embs"),
        ct.TensorType(name="chunk_pre_encoder_lengths"),
    ]

    converted = ct.convert(
        traced,
        inputs=input_specs,
        outputs=output_specs,
        minimum_deployment_target=ct.target.iOS16,
        compute_precision=precision,
        compute_units=ct.ComputeUnit.ALL,
    )

    converted.save(name)
    return converted, traced
| 95 |
+
|
| 96 |
+
def export_pipeline(
    model_name: str,
    output_dir: str,
    preproc_precision: str = "fp32",
    pre_encoder_precision: str = "fp32",
    head_precision: str = "fp16",
    skip_modules: bool = False,
    verify: bool = False
):
    """
    Export the Sortformer model as a pipeline of separate CoreML models.
    Each component can have its own compute precision.

    Exported components:
        1. Preprocessor (audio -> mel features)
        2. Pre-encoder (features -> pre-encoded embeddings, concatenated
           with spkcache/fifo state)
        3. Head (Conformer encoder + Transformer -> speaker predictions)
        4. Combined pipeline (Pre-encoder + Head)

    :param model_name: NeMo model name or path to a local .nemo file
    :param output_dir: Output directory for the .mlpackage files
    :param preproc_precision: Precision for the preprocessor ("fp16" or "fp32")
    :param pre_encoder_precision: Precision for the pre-encoder ("fp16" or "fp32")
    :param head_precision: Precision for the head module (conformer + transformer)
        ("fp16" or "fp32")
    :param skip_modules: Skip re-exporting the individual modules (reuse the
        previously saved .mlpackage files when building the combined pipeline)
    :param verify: Only load and print the existing combined pipeline spec
    """
    os.makedirs(output_dir, exist_ok=True)

    def get_precision(s):
        # Map a "fp16"/"fp32" string to the coremltools precision enum.
        return ct.precision.FLOAT16 if s.lower() == "fp16" else ct.precision.FLOAT32

    print("=" * 70)
    print("Exporting Sortformer Pipeline")
    print("=" * 70)
    print(f"Preprocessor: {preproc_precision}")
    print(f"Pre-encoder: {pre_encoder_precision}")
    print(f"Head: {head_precision}")
    print("=" * 70)

    # Load model (local .nemo path takes precedence over the HF/NGC name).
    print(f"\nLoading model: {model_name}")
    if os.path.exists(model_name):
        model = SortformerEncLabelModel.restore_from(model_name, map_location=torch.device("cpu"))
    else:
        model = SortformerEncLabelModel.from_pretrained(model_name, map_location=torch.device("cpu"))
    model.eval()

    # Configure for streaming: fixed chunk/context/state sizes baked into the
    # exported (static-shape) CoreML graphs.
    print("Configuring for streaming...")
    model.sortformer_modules.chunk_len = 6
    model.sortformer_modules.chunk_right_context = 1
    model.sortformer_modules.chunk_left_context = 1
    model.sortformer_modules.fifo_len = 40
    model.sortformer_modules.spkcache_len = 120
    model.sortformer_modules.spkcache_update_period = 32

    modules = model.sortformer_modules
    preprocessor = model.preprocessor
    pre_encoder_mlmodel = None
    head_mlmodel = None

    # Disable padding so the preprocessor output length is deterministic.
    if hasattr(preprocessor, 'pad_to'):
        preprocessor.pad_to = 0

    # Calculate dimensions. The input chunk covers core + left/right context
    # frames, expressed in mel frames (pre-subsampling).
    chunk_len = modules.chunk_len
    input_chunk_time = (
        chunk_len + modules.chunk_left_context + modules.chunk_right_context) * modules.subsampling_factor
    fc_d_model = modules.fc_d_model  # 512 - Conformer output
    tf_d_model = modules.tf_d_model  # 192 - Transformer input (after projection)
    spkcache_len = modules.spkcache_len
    fifo_len = modules.fifo_len

    # Get feature dim (fall back to 128 mel bins if the encoder does not expose it).
    feat_dim = 128
    if hasattr(model, 'encoder') and hasattr(model.encoder, '_feat_in'):
        feat_dim = model.encoder._feat_in

    # Pre-encode output size (after subsampling) and the concatenated length
    # of spkcache + fifo + chunk seen by the head.
    pre_encode_out_len = input_chunk_time // modules.subsampling_factor
    total_concat_len = spkcache_len + fifo_len + pre_encode_out_len

    print(f"Input chunk frames: {input_chunk_time}")
    print(f"Pre-encode output: {pre_encode_out_len}")
    print(f"Total concat len: {total_concat_len}")
    print(f"Feature dim: {feat_dim}, FC d_model: {fc_d_model}, TF d_model: {tf_d_model}")

    # Audio samples for preprocessor (STFT frame geometry: 10ms hop, 25ms window).
    stride = 160
    window = 400
    audio_samples = (input_chunk_time - 1) * stride + window
    print(audio_samples)

    # =========================================================
    # 1. Export Preprocessor
    # =========================================================

    if not skip_modules:
        print("\n[1/4] Exporting Preprocessor...")

        preproc_wrapper = PreprocessorWrapper(preprocessor)
        preproc_wrapper.eval()

        dummy_wav = torch.randn(1, audio_samples)
        dummy_len = torch.tensor([audio_samples], dtype=torch.long)

        traced_preproc = torch.jit.trace(preproc_wrapper, (dummy_wav, dummy_len))

        preproc_mlmodel = ct.convert(
            traced_preproc,
            inputs=[
                ct.TensorType(name="audio_signal", shape=dummy_wav.shape),
                ct.TensorType(name="length", shape=dummy_len.shape, dtype=np.int32)
            ],
            outputs=[
                ct.TensorType(name="features", dtype=np.float32),
                ct.TensorType(name="feature_lengths", dtype=np.int32)
            ],
            minimum_deployment_target=ct.target.iOS16,
            compute_precision=get_precision(preproc_precision),
            compute_units=ct.ComputeUnit.ALL
        )
        preproc_mlmodel.save(os.path.join(output_dir, "Pipeline_Preprocessor.mlpackage"))
        print(" Saved Pipeline_Preprocessor.mlpackage")

    # =========================================================
    # 2. Export Pre-Encoder
    # =========================================================

    # Example tensors are built unconditionally; the head export below reuses
    # the same dimension bookkeeping.
    input_chunk = torch.randn(1, input_chunk_time, feat_dim)
    input_chunk_len = torch.tensor([input_chunk_time], dtype=torch.long)
    input_spkcache = torch.randn(1, spkcache_len, fc_d_model)
    input_spkcache_len = torch.tensor([spkcache_len], dtype=torch.long)
    input_fifo = torch.randn(1, fifo_len, fc_d_model)
    input_fifo_len = torch.tensor([fifo_len], dtype=torch.long)

    if not skip_modules:
        print("\n[2/4] Exporting Pre-Encoder...")
        pre_encoder_mlmodel, _ = convert_pre_encoder(
            model,
            get_precision(pre_encoder_precision),
            os.path.join(output_dir, "Pipeline_PreEncoder.mlpackage"),
            input_chunk, input_chunk_len,
            input_spkcache, input_spkcache_len,
            input_fifo, input_fifo_len
        )
        print(" Saved Pipeline_PreEncoder.mlpackage")

    # =========================================================
    # 3. Export Head (Conformer Encoder + Transformer)
    # =========================================================

    pre_encoder_embs = torch.randn(1, total_concat_len, fc_d_model)
    pre_encoder_lengths = torch.tensor([total_concat_len], dtype=torch.long)
    chunk_pre_encoder_embs = torch.randn(1, pre_encode_out_len, fc_d_model)
    chunk_pre_encoder_lengths = torch.tensor([pre_encode_out_len], dtype=torch.long)

    if not skip_modules:
        print("\n[3/4] Exporting Head Module...")
        head_mlmodel, _ = convert_head(
            model,
            get_precision(head_precision),
            os.path.join(output_dir, "Pipeline_Head.mlpackage"),
            pre_encoder_embs, pre_encoder_lengths,
            chunk_pre_encoder_embs, chunk_pre_encoder_lengths
        )
        print(" Saved Pipeline_Head.mlpackage")

    # =========================================================
    # 4. Create Combined Pipelines
    # =========================================================
    print("\n[4/4] Creating Combined ML Pipelines...")

    # Load the exported models when the per-module export was skipped.
    if skip_modules and not verify:
        print('Loading Pipeline CoreML Modules...')
        pre_encoder_mlmodel = ct.models.MLModel(
            os.path.join(output_dir, "Pipeline_PreEncoder.mlpackage")
        )
        head_mlmodel = ct.models.MLModel(
            os.path.join(output_dir, "Pipeline_Head.mlpackage")
        )

    assert pre_encoder_mlmodel is not None and head_mlmodel is not None

    # Create Full Pipeline: PreEncoder -> Conformer -> Transformer
    # Inputs: chunk, chunk_lengths, spkcache, spkcache_lengths, fifo, fifo_lengths
    # Output: preds

    if verify:
        pipeline_model = ct.models.MLModel('coreml_models/SortformerPipeline.mlpackage')
        spec = pipeline_model.get_spec()
        print(pipeline_model.input_description)
        print(pipeline_model.output_description)
        print(spec)
    else:
        try:
            # Both models now use compute_units=ALL.
            # The pre_encoder uses ANE-safe gather operations in fixed_concat_and_pad
            # to avoid zero-length slices that would crash on ANE.

            pipeline_model = ct.utils.make_pipeline(
                pre_encoder_mlmodel,
                head_mlmodel,
                compute_units=ct.ComputeUnit.ALL
            )

            # Save the pipeline
            pipeline_model.save(os.path.join(output_dir, "SortformerPipeline.mlpackage"))
            print(" Saved SortformerPipeline.mlpackage (PreEncoder + Conformer + Transformer)")
        except Exception as e:
            # Best-effort: the individual modules are already saved, so a
            # pipeline failure is reported but does not abort the export.
            print(f" Warning: Could not create full pipeline: {e}")
            import traceback
            traceback.print_exc()

    # =========================================================
    # Summary
    # =========================================================
    print("\n" + "=" * 70)
    print("Pipeline Export Complete!")
    print("=" * 70)
    print(f"Output directory: {output_dir}")
    print("\nExported models:")
    print(f" 1. Pipeline_Preprocessor.mlpackage ({preproc_precision})")
    print(f" 2. Pipeline_PreEncoder.mlpackage ({pre_encoder_precision})")
    print(f" 3. Pipeline_Head.mlpackage ({head_precision})")
    # Fixed: this entry was previously mislabeled " 5." after " 3.".
    print(f" 4. SortformerPipeline.mlpackage (combined: PreEncoder+Head)")
    print("\nUsage in inference:")
    print(" audio -> Preprocessor -> features")
    print(" features + spkcache + fifo -> SortformerPipeline -> predictions")
+
|
| 329 |
+
if __name__ == "__main__":
    # CLI entry point: parse export options and run the pipeline export.
    parser = argparse.ArgumentParser()
    parser.add_argument("--model_name", default="nvidia/diar_streaming_sortformer_4spk-v2.1",
                        help="NeMo model name or path")
    parser.add_argument("--output_dir", default="coreml_models", help="Output directory")
    # NOTE(review): --fp16 is parsed but never read below -- presumably a
    # leftover from an earlier single-model export path; confirm before removing.
    parser.add_argument("--fp16", action="store_true", help="Use FP16 for single model export")

    # Pipeline options: per-component precision plus export-flow switches.
    parser.add_argument("--preproc_precision", default="fp32", choices=["fp16", "fp32"], help="Preprocessor precision")
    parser.add_argument("--pre_encoder_precision", default="fp32", choices=["fp16", "fp32"],
                        help="Pre-encoder precision")
    parser.add_argument("--head_precision", default="fp16", choices=["fp16", "fp32"],
                        help="Conformer encoder precision")
    parser.add_argument("--skip_modules", action="store_true", help="Skip modules in pipeline export")
    parser.add_argument("--verify", action="store_true", help="Skip pipeline in pipeline export")

    args = parser.parse_args()

    print(f"CoreMLTools Version: {ct.__version__}")

    export_pipeline(
        args.model_name,
        args.output_dir,
        preproc_precision=args.preproc_precision,
        pre_encoder_precision=args.pre_encoder_precision,
        head_precision=args.head_precision,
        skip_modules=args.skip_modules,
        verify=args.verify,
    )
|
export_nvidia_pipeline.py
ADDED
|
@@ -0,0 +1,308 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Export combined SortformerPipeline with NVIDIA's 1.04s latency configuration.
|
| 2 |
+
|
| 3 |
+
This creates models compatible with the Swift SortformerDiarizer interface.
|
| 4 |
+
"""
|
| 5 |
+
import os
|
| 6 |
+
os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE"
|
| 7 |
+
|
| 8 |
+
import torch
|
| 9 |
+
import torch.nn as nn
|
| 10 |
+
import numpy as np
|
| 11 |
+
import coremltools as ct
|
| 12 |
+
from nemo.collections.asr.models import SortformerEncLabelModel
|
| 13 |
+
from coreml_wrappers import PreEncoderWrapper
|
| 14 |
+
|
| 15 |
+
# NVIDIA's 1.04s latency configuration
|
| 16 |
+
NVIDIA_CONFIG = {
|
| 17 |
+
'chunk_len': 6,
|
| 18 |
+
'chunk_right_context': 7, # Was 1
|
| 19 |
+
'chunk_left_context': 1,
|
| 20 |
+
'fifo_len': 188, # Was 40
|
| 21 |
+
'spkcache_len': 188, # Was 120
|
| 22 |
+
'spkcache_update_period': 144, # Was 30
|
| 23 |
+
}
|
| 24 |
+
|
| 25 |
+
print("=" * 70)
|
| 26 |
+
print("Exporting Combined SortformerPipeline with NVIDIA Config")
|
| 27 |
+
print("=" * 70)
|
| 28 |
+
print(f"Config: {NVIDIA_CONFIG}")
|
| 29 |
+
|
| 30 |
+
# Load model
|
| 31 |
+
print("\nLoading NeMo model...")
|
| 32 |
+
model = SortformerEncLabelModel.from_pretrained(
|
| 33 |
+
"nvidia/diar_streaming_sortformer_4spk-v2.1", map_location="cpu"
|
| 34 |
+
)
|
| 35 |
+
model.eval()
|
| 36 |
+
|
| 37 |
+
# Apply NVIDIA config
|
| 38 |
+
modules = model.sortformer_modules
|
| 39 |
+
modules.chunk_len = NVIDIA_CONFIG['chunk_len']
|
| 40 |
+
modules.chunk_right_context = NVIDIA_CONFIG['chunk_right_context']
|
| 41 |
+
modules.chunk_left_context = NVIDIA_CONFIG['chunk_left_context']
|
| 42 |
+
modules.fifo_len = NVIDIA_CONFIG['fifo_len']
|
| 43 |
+
modules.spkcache_len = NVIDIA_CONFIG['spkcache_len']
|
| 44 |
+
modules.spkcache_update_period = NVIDIA_CONFIG['spkcache_update_period']
|
| 45 |
+
|
| 46 |
+
# Calculate dimensions
|
| 47 |
+
chunk_len = modules.chunk_len
|
| 48 |
+
input_chunk_time = (chunk_len + modules.chunk_left_context + modules.chunk_right_context) * modules.subsampling_factor
|
| 49 |
+
fc_d_model = modules.fc_d_model # 512
|
| 50 |
+
spkcache_len = modules.spkcache_len
|
| 51 |
+
fifo_len = modules.fifo_len
|
| 52 |
+
|
| 53 |
+
feat_dim = 128
|
| 54 |
+
pre_encode_out_len = input_chunk_time // modules.subsampling_factor
|
| 55 |
+
total_concat_len = spkcache_len + fifo_len + pre_encode_out_len
|
| 56 |
+
|
| 57 |
+
print(f"\nDimensions:")
|
| 58 |
+
print(f" Input chunk frames: {input_chunk_time}")
|
| 59 |
+
print(f" Pre-encode output: {pre_encode_out_len}")
|
| 60 |
+
print(f" Total concat len: {total_concat_len}")
|
| 61 |
+
print(f" FC d_model: {fc_d_model}")
|
| 62 |
+
print(f" FIFO len: {fifo_len}")
|
| 63 |
+
print(f" Spkcache len: {spkcache_len}")
|
| 64 |
+
|
| 65 |
+
# Calculate audio samples needed for preprocessor
|
| 66 |
+
# NeMo adds internal padding (16 samples each side), so the formula is different
|
| 67 |
+
# Empirically tested: 17920 samples → 112 mel frames, 18160 → 113 frames
|
| 68 |
+
# For 112 frames, we need 17920 samples (not the naive 18160 from stride formula)
|
| 69 |
+
mel_stride = 160
|
| 70 |
+
mel_window = 400
|
| 71 |
+
# Correct formula accounting for NeMo padding
|
| 72 |
+
preprocessor_audio_samples = 17920 # Empirically determined for 112 frames
|
| 73 |
+
print(f" Preprocessor audio samples: {preprocessor_audio_samples}")
|
| 74 |
+
|
| 75 |
+
# Create output directory
|
| 76 |
+
output_dir = "coreml_models_nvidia"
|
| 77 |
+
os.makedirs(output_dir, exist_ok=True)
|
| 78 |
+
|
| 79 |
+
# =========================================================
# 0. Export Preprocessor (audio → mel features)
# =========================================================
print("\n[0/3] Exporting Preprocessor...")

from coreml_wrappers import PreprocessorWrapper

preprocessor_wrapper = PreprocessorWrapper(model.preprocessor)
preprocessor_wrapper.eval()

# Trace with correct audio sample count (17920 samples → 112 mel frames, see above).
audio_input = torch.randn(1, preprocessor_audio_samples)
audio_length = torch.tensor([preprocessor_audio_samples], dtype=torch.long)

traced_preprocessor = torch.jit.trace(preprocessor_wrapper, (audio_input, audio_length))

preprocessor_ml = ct.convert(
    traced_preprocessor,
    inputs=[
        ct.TensorType(name="audio_signal", shape=audio_input.shape, dtype=np.float32),
        ct.TensorType(name="length", shape=audio_length.shape, dtype=np.int32),
    ],
    outputs=[
        ct.TensorType(name="features", dtype=np.float32),
        ct.TensorType(name="feature_lengths", dtype=np.int32),
    ],
    minimum_deployment_target=ct.target.iOS16,
    compute_precision=ct.precision.FLOAT32,
    compute_units=ct.ComputeUnit.CPU_ONLY  # CPU for FP32 precision
)

preprocessor_ml.save(os.path.join(output_dir, "Pipeline_Preprocessor.mlpackage"))
print(f" Saved {output_dir}/Pipeline_Preprocessor.mlpackage")

# =========================================================
# 1. Export PreEncoder
# =========================================================
print("\n[1/3] Exporting PreEncoder...")

# Example tensors at the fixed streaming shapes used for tracing.
input_chunk = torch.randn(1, input_chunk_time, feat_dim)
input_chunk_len = torch.tensor([input_chunk_time], dtype=torch.long)
input_spkcache = torch.randn(1, spkcache_len, fc_d_model)
input_spkcache_len = torch.tensor([spkcache_len], dtype=torch.long)
input_fifo = torch.randn(1, fifo_len, fc_d_model)
input_fifo_len = torch.tensor([fifo_len], dtype=torch.long)

pre_encoder = PreEncoderWrapper(model)
pre_encoder.eval()

traced_pre_encoder = torch.jit.trace(pre_encoder, (
    input_chunk, input_chunk_len,
    input_spkcache, input_spkcache_len,
    input_fifo, input_fifo_len
))

# Use names that match for pipeline connection
# (output names here must equal the Head's input names so make_pipeline can wire them).
pre_encoder_ml = ct.convert(
    traced_pre_encoder,
    inputs=[
        ct.TensorType(name="chunk", shape=input_chunk.shape, dtype=np.float32),
        ct.TensorType(name="chunk_lengths", shape=input_chunk_len.shape, dtype=np.int32),
        ct.TensorType(name="spkcache", shape=input_spkcache.shape, dtype=np.float32),
        ct.TensorType(name="spkcache_lengths", shape=input_spkcache_len.shape, dtype=np.int32),
        ct.TensorType(name="fifo", shape=input_fifo.shape, dtype=np.float32),
        ct.TensorType(name="fifo_lengths", shape=input_fifo_len.shape, dtype=np.int32),
    ],
    outputs=[
        ct.TensorType(name="pre_encoder_embs", dtype=np.float32),
        ct.TensorType(name="pre_encoder_lengths", dtype=np.int32),
        ct.TensorType(name="chunk_embs_in", dtype=np.float32),
        ct.TensorType(name="chunk_lens_in", dtype=np.int32),
    ],
    minimum_deployment_target=ct.target.iOS16,
    compute_precision=ct.precision.FLOAT32,
    compute_units=ct.ComputeUnit.ALL
)

pre_encoder_ml.save(os.path.join(output_dir, "Pipeline_PreEncoder.mlpackage"))
print(f" Saved {output_dir}/Pipeline_PreEncoder.mlpackage")
|
| 158 |
+
|
| 159 |
+
# =========================================================
|
| 160 |
+
# 2. Export Fixed Head (with identity ops to preserve embeddings)
|
| 161 |
+
# =========================================================
|
| 162 |
+
print("\n[2/3] Exporting Fixed Head...")
|
| 163 |
+
|
| 164 |
+
|
| 165 |
+
class FixedSortformerHead(nn.Module):
    """Head wrapper that forces chunk_pre_encoder_embs to be computed."""

    def __init__(self, model):
        super().__init__()
        self.model = model
        # A frozen scale of 1.0: multiplying by it is numerically a no-op, but it keeps
        # the pass-through outputs alive in the traced/converted graph.
        self.identity_scale = nn.Parameter(torch.ones(1), requires_grad=False)

    def forward(self, pre_encoder_embs, pre_encoder_lengths, chunk_embs_in, chunk_lens_in):
        # Identity pass-through of the chunk embeddings; the dummy multiply/add prevents
        # the converter from pruning these outputs as unused.
        chunk_pre_encoder_embs = chunk_embs_in * self.identity_scale
        chunk_pre_encoder_lengths = chunk_lens_in + 0

        # Run the frontend encoder directly on pre-encoded embeddings (subsampling bypassed).
        enc_embs, enc_lens = self.model.frontend_encoder(
            processed_signal=pre_encoder_embs,
            processed_signal_length=pre_encoder_lengths,
            bypass_pre_encode=True,
        )

        # Per-frame speaker activity predictions.
        speaker_preds = self.model.forward_infer(enc_embs, enc_lens)

        return speaker_preds, chunk_pre_encoder_embs, chunk_pre_encoder_lengths
|
| 191 |
+
|
| 192 |
+
|
| 193 |
+
head = FixedSortformerHead(model)
head.eval()

# Input shapes for head - must match PreEncoder output names
pre_encoder_embs = torch.randn(1, total_concat_len, fc_d_model)
pre_encoder_lengths = torch.tensor([total_concat_len], dtype=torch.long)
chunk_embs_in = torch.randn(1, pre_encode_out_len, fc_d_model)
chunk_lens_in = torch.tensor([pre_encode_out_len], dtype=torch.long)

traced_head = torch.jit.trace(head, (
    pre_encoder_embs, pre_encoder_lengths,
    chunk_embs_in, chunk_lens_in
))

# FP16 here (unlike the FP32 preprocessor/pre-encoder) — the Conformer head tolerates it.
head_ml = ct.convert(
    traced_head,
    inputs=[
        ct.TensorType(name="pre_encoder_embs", shape=pre_encoder_embs.shape, dtype=np.float32),
        ct.TensorType(name="pre_encoder_lengths", shape=pre_encoder_lengths.shape, dtype=np.int32),
        ct.TensorType(name="chunk_embs_in", shape=chunk_embs_in.shape, dtype=np.float32),
        ct.TensorType(name="chunk_lens_in", shape=chunk_lens_in.shape, dtype=np.int32),
    ],
    outputs=[
        ct.TensorType(name="speaker_preds", dtype=np.float32),
        ct.TensorType(name="chunk_pre_encoder_embs", dtype=np.float32),
        ct.TensorType(name="chunk_pre_encoder_lengths", dtype=np.int32),
    ],
    minimum_deployment_target=ct.target.iOS16,
    compute_precision=ct.precision.FLOAT16,
    compute_units=ct.ComputeUnit.ALL
)

head_ml.save(os.path.join(output_dir, "Pipeline_Head_Fixed.mlpackage"))
print(f" Saved {output_dir}/Pipeline_Head_Fixed.mlpackage")

# =========================================================
# 3. Create Combined Pipeline
# =========================================================
print("\n[3/3] Creating combined pipeline...")

try:
    # Stages are wired by matching output/input names (pre_encoder_embs, chunk_embs_in, ...).
    pipeline = ct.utils.make_pipeline(pre_encoder_ml, head_ml, compute_units=ct.ComputeUnit.ALL)
    pipeline.save(os.path.join(output_dir, "SortformerPipeline.mlpackage"))
    print(f" Saved {output_dir}/SortformerPipeline.mlpackage")
except Exception as e:
    # Best-effort: the two standalone models were already saved above.
    print(f" Pipeline creation failed: {e}")
    print(" Note: Call PreEncoder and Head separately to avoid embedding bug")

# =========================================================
# Verification
# =========================================================
print("\n" + "=" * 70)
print("Verification")
print("=" * 70)

# Test PreEncoder with random mel input and empty (zero-length) caches.
test_chunk = np.random.randn(1, input_chunk_time, feat_dim).astype(np.float32)
test_chunk_len = np.array([input_chunk_time], dtype=np.int32)
test_spkcache = np.zeros((1, spkcache_len, fc_d_model), dtype=np.float32)
test_spkcache_len = np.array([0], dtype=np.int32)
test_fifo = np.zeros((1, fifo_len, fc_d_model), dtype=np.float32)
test_fifo_len = np.array([0], dtype=np.int32)

pre_out = pre_encoder_ml.predict({
    'chunk': test_chunk,
    'chunk_lengths': test_chunk_len,
    'spkcache': test_spkcache,
    'spkcache_lengths': test_spkcache_len,
    'fifo': test_fifo,
    'fifo_lengths': test_fifo_len
})

print(f"PreEncoder output shapes:")
print(f" pre_encoder_embs: {pre_out['pre_encoder_embs'].shape}")
print(f" chunk_embs_in: {pre_out['chunk_embs_in'].shape}")
print(f" chunk_embs_in[0,0,0]: {pre_out['chunk_embs_in'][0,0,0]:.6f}")

# Test Head by feeding the PreEncoder outputs straight through.
head_out = head_ml.predict({
    'pre_encoder_embs': pre_out['pre_encoder_embs'],
    'pre_encoder_lengths': pre_out['pre_encoder_lengths'],
    'chunk_embs_in': pre_out['chunk_embs_in'],
    'chunk_lens_in': pre_out['chunk_lens_in']
})

print(f"\nHead output shapes:")
print(f" speaker_preds: {head_out['speaker_preds'].shape}")
print(f" chunk_pre_encoder_embs: {head_out['chunk_pre_encoder_embs'].shape}")
print(f" chunk_pre_encoder_embs[0,0,0]: {head_out['chunk_pre_encoder_embs'][0,0,0]:.6f}")

# Verify embedding preservation
# (loose atol because the head runs in FP16; spot-checks a single element only).
if np.isclose(pre_out['chunk_embs_in'][0,0,0], head_out['chunk_pre_encoder_embs'][0,0,0], atol=0.01):
    print("\n✓ Embedding [0,0,0] preserved correctly!")
else:
    print(f"\n✗ WARNING: Embedding [0,0,0] corrupted!")
    print(f" PreEncoder: {pre_out['chunk_embs_in'][0,0,0]:.6f}")
    print(f" Head: {head_out['chunk_pre_encoder_embs'][0,0,0]:.6f}")

print("\n" + "=" * 70)
print("Export Complete!")
print("=" * 70)
print(f"Models saved to: {output_dir}/")
print(f" - Pipeline_PreEncoder.mlpackage")
print(f" - Pipeline_Head_Fixed.mlpackage")
print(f" - SortformerPipeline.mlpackage (if pipeline creation succeeded)")
print(f"\nConfiguration (NVIDIA 1.04s latency):")
for k, v in NVIDIA_CONFIG.items():
    print(f" {k}: {v}")

print(f"\nSwift SortformerConfig should use:")
print(f" chunkLen = 6")
print(f" chunkLeftContext = 1")
print(f" chunkRightContext = 7")
print(f" fifoLen = 188")
print(f" spkcacheLen = 188")
print(f" spkcacheUpdatePeriod = 144")
|
inference.py
ADDED
|
@@ -0,0 +1,192 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
"""
Streaming Sortformer CoreML Inference

This script demonstrates how to use the CoreML-converted NVIDIA Streaming Sortformer
model for real-time speaker diarization on Apple Silicon.

Original model: nvidia/diar_streaming_sortformer_4spk-v2.1
"""

import os
import numpy as np
import coremltools as ct

# Configuration matching NVIDIA's streaming settings
# (must agree with the values baked into the exported .mlpackage shapes).
CONFIG = {
    "chunk_len": 6,  # Core chunk length in encoder frames
    "chunk_left_context": 1,  # Left context frames
    "chunk_right_context": 7,  # Right context frames
    "fifo_len": 188,  # FIFO buffer length
    "spkcache_len": 188,  # Speaker cache length
    "spkcache_update_period": 144,
    "subsampling_factor": 8,  # Mel frames per encoder frame
    "n_speakers": 4,  # Max speakers
    "sample_rate": 16000,
    "mel_features": 128,
}
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
class SortformerCoreML:
    """CoreML Streaming Sortformer Diarizer"""

    def __init__(self, model_dir: str = ".", compute_units: str = "CPU_ONLY"):
        """
        Initialize the CoreML Sortformer pipeline.

        Args:
            model_dir: Directory containing the .mlpackage files
            compute_units: "CPU_ONLY", "CPU_AND_GPU", or "ALL"
        """
        # Silently falls back to CPU_ONLY for unrecognized compute-unit names.
        cu = getattr(ct.ComputeUnit, compute_units, ct.ComputeUnit.CPU_ONLY)

        # Load models
        self.preprocessor = ct.models.MLModel(
            os.path.join(model_dir, "Pipeline_Preprocessor.mlpackage"),
            compute_units=cu
        )
        self.pre_encoder = ct.models.MLModel(
            os.path.join(model_dir, "Pipeline_PreEncoder.mlpackage"),
            compute_units=cu
        )
        self.head = ct.models.MLModel(
            os.path.join(model_dir, "Pipeline_Head_Fixed.mlpackage"),
            compute_units=cu
        )

        # Initialize state buffers
        self.reset_state()

    def reset_state(self) -> None:
        """Reset streaming state for new audio session."""
        # 512 is presumably the pre-encoder embedding dim (fc_d_model) — TODO confirm
        # against the export configuration.
        self.spkcache = np.zeros((1, CONFIG["spkcache_len"], 512), dtype=np.float32)
        self.fifo = np.zeros((1, CONFIG["fifo_len"], 512), dtype=np.float32)
        self.spkcache_len = 0  # valid frames currently in spkcache
        self.fifo_len = 0  # valid frames currently in fifo
        self.chunk_idx = 0  # index of the next chunk to process

    def process_chunk(self, mel_features: np.ndarray, chunk_length: int) -> np.ndarray:
        """
        Process a single chunk of mel features.

        Args:
            mel_features: Mel spectrogram chunk [1, T, 128] where T <= 112
            chunk_length: Actual valid length (before padding)

        Returns:
            Speaker predictions [num_frames, 4] with probabilities for each speaker
        """
        # Pad to 112 if needed (the exported models have a fixed input shape).
        if mel_features.shape[1] < 112:
            pad_len = 112 - mel_features.shape[1]
            mel_features = np.pad(mel_features, ((0, 0), (0, pad_len), (0, 0)))

        # Run PreEncoder: subsamples the mel chunk and prepends spkcache + fifo state.
        pre_out = self.pre_encoder.predict({
            "chunk": mel_features.astype(np.float32),
            "chunk_lengths": np.array([chunk_length], dtype=np.int32),
            "spkcache": self.spkcache,
            "spkcache_lengths": np.array([self.spkcache_len], dtype=np.int32),
            "fifo": self.fifo,
            "fifo_lengths": np.array([self.fifo_len], dtype=np.int32)
        })

        # Run Head: Conformer encoder + speaker prediction over the concatenated sequence.
        head_out = self.head.predict({
            "pre_encoder_embs": pre_out["pre_encoder_embs"],
            "pre_encoder_lengths": pre_out["pre_encoder_lengths"],
            "chunk_embs_in": pre_out["chunk_embs_in"],
            "chunk_lens_in": pre_out["chunk_lens_in"]
        })

        # Extract predictions for this chunk: drop the left/right context frames and
        # skip past the spkcache+fifo prefix in the output sequence.
        emb_len = int(head_out["chunk_pre_encoder_lengths"][0])
        lc = 0 if self.chunk_idx == 0 else 1  # Left context
        rc = CONFIG["chunk_right_context"]
        chunk_pred_len = emb_len - lc - rc

        pred_offset = self.spkcache_len + self.fifo_len + lc
        predictions = head_out["speaker_preds"][0, pred_offset:pred_offset + chunk_pred_len, :]

        # Update state (simplified - full implementation needs NeMo's streaming_update logic)
        self._update_state(pre_out, emb_len)

        self.chunk_idx += 1
        return predictions

    def _update_state(self, pre_out, emb_len) -> None:
        """Update spkcache and fifo state buffers."""
        # Get new chunk embeddings (first emb_len frames of the pre-encoded chunk).
        new_embs = pre_out["chunk_embs_in"][0, :emb_len, :]

        # Add to fifo when there is room.
        if self.fifo_len + emb_len <= CONFIG["fifo_len"]:
            self.fifo[0, self.fifo_len:self.fifo_len + emb_len, :] = new_embs
            self.fifo_len += emb_len
        else:
            # FIFO overflow - move to spkcache
            overflow = self.fifo_len + emb_len - CONFIG["fifo_len"]

            # Move overflow from fifo to spkcache
            # NOTE(review): if spkcache is already full, the overflow frames are silently
            # dropped here — differs from NeMo's streaming_update compression logic.
            if self.spkcache_len + overflow <= CONFIG["spkcache_len"]:
                self.spkcache[0, self.spkcache_len:self.spkcache_len + overflow, :] = \
                    self.fifo[0, :overflow, :]
                self.spkcache_len += overflow

            # Shift fifo and add new
            self.fifo[0, :self.fifo_len - overflow, :] = self.fifo[0, overflow:self.fifo_len, :]
            self.fifo_len -= overflow
            self.fifo[0, self.fifo_len:self.fifo_len + emb_len, :] = new_embs
            self.fifo_len += emb_len
|
| 141 |
+
|
| 142 |
+
|
| 143 |
+
def process_audio(audio_path: str, model_dir: str = ".") -> list:
    """
    Process an audio file and return diarization results.

    Args:
        audio_path: Path to audio file (16kHz mono WAV)
        model_dir: Directory containing CoreML models

    Returns:
        List of (start_time, end_time, speaker_id) tuples
    """
    import torch
    import torchaudio

    # Load the file, then normalize to 16 kHz mono.
    waveform, sr = torchaudio.load(audio_path)
    if sr != 16000:
        waveform = torchaudio.functional.resample(waveform, sr, 16000)
    if waveform.size(0) > 1:
        waveform = waveform.mean(dim=0, keepdim=True)

    # Initialize the CoreML diarizer (loads all three .mlpackage models).
    model = SortformerCoreML(model_dir)

    # Compute mel spectrogram using NeMo-compatible settings
    # (You may need to use the Pipeline_Preprocessor or native mel computation)

    # Process in chunks and collect predictions
    # ... (implementation depends on your mel spectrogram computation)

    # NOTE(review): sr is the source file's rate here, even after resampling.
    print(f"Loaded audio: {waveform.shape}, {sr}Hz")
    print("Processing... (implement chunking logic)")

    return []
|
| 177 |
+
|
| 178 |
+
|
| 179 |
+
if __name__ == "__main__":
    import sys

    # Guard clause: require the audio file path as the first positional argument.
    if len(sys.argv) < 2:
        print("Usage: python inference.py <audio_file.wav>")
        print("\nThis script requires:")
        print(" - Pipeline_Preprocessor.mlpackage")
        print(" - Pipeline_PreEncoder.mlpackage")
        print(" - Pipeline_Head_Fixed.mlpackage")
        sys.exit(1)

    # Run diarization and print one line per detected speaker segment.
    segments = process_audio(sys.argv[1])
    for start, end, speaker in segments:
        print(f"[{start:.2f}s - {end:.2f}s] Speaker {speaker}")
|
pyproject.toml
ADDED
|
@@ -0,0 +1,22 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[project]
|
| 2 |
+
name = "streaming-sortformer-coreml"
|
| 3 |
+
version = "0.1.0"
|
| 4 |
+
description = "CoreML conversion of NVIDIA Streaming Sortformer for Apple Silicon"
|
| 5 |
+
readme = "README.md"
|
| 6 |
+
requires-python = ">=3.10"
|
| 7 |
+
license = "Apache-2.0"
|
| 8 |
+
dependencies = [
|
| 9 |
+
"coremltools>=7.0",
|
| 10 |
+
"torch>=2.0",
|
| 11 |
+
"torchaudio>=2.0",
|
| 12 |
+
"numpy>=1.24",
|
| 13 |
+
]
|
| 14 |
+
|
| 15 |
+
[project.optional-dependencies]
|
| 16 |
+
convert = [
|
| 17 |
+
"nemo_toolkit[asr]>=2.0",
|
| 18 |
+
]
|
| 19 |
+
|
| 20 |
+
[build-system]
|
| 21 |
+
requires = ["hatchling"]
|
| 22 |
+
build-backend = "hatchling.build"
|