alexwengg commited on
Commit
f9a579a
·
verified ·
1 Parent(s): a7e3983

Upload 26 files

Browse files
Pipeline_Head_Fixed.mlmodelc/analytics/coremldata.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5980b0b5b6afd629201028acd9d30ef139405a4ff8e3197551b5749757e19808
3
+ size 243
Pipeline_Head_Fixed.mlmodelc/coremldata.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4d3a809042f1aafc6410902c356ad226e4104b9f92f21a266b85a89d501c8e3c
3
+ size 505
Pipeline_Head_Fixed.mlmodelc/model.mil ADDED
The diff for this file is too large to render. See raw diff
 
Pipeline_Head_Fixed.mlmodelc/weights/weight.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:be56312ef2dbf57706aab7395fc2d5601ac6fbe6c553fd2b25069eba8da9b3b2
3
+ size 235580992
Pipeline_Head_Fixed.mlpackage/Data/com.apple.CoreML/model.mlmodel ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:93251763d5376dad5dc1f78cb0440397abb5a52e346575f1b6b750e958da13eb
3
+ size 827022
Pipeline_Head_Fixed.mlpackage/Data/com.apple.CoreML/weights/weight.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:be56312ef2dbf57706aab7395fc2d5601ac6fbe6c553fd2b25069eba8da9b3b2
3
+ size 235580992
Pipeline_Head_Fixed.mlpackage/Manifest.json ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "fileFormatVersion": "1.0.0",
3
+ "itemInfoEntries": {
4
+ "7D7B39C7-0AD2-4CD7-B6B0-A7E76DCDE6CA": {
5
+ "author": "com.apple.CoreML",
6
+ "description": "CoreML Model Weights",
7
+ "name": "weights",
8
+ "path": "com.apple.CoreML/weights"
9
+ },
10
+ "B58CA828-CA78-46FC-BD8D-5ABFB5AAEADD": {
11
+ "author": "com.apple.CoreML",
12
+ "description": "CoreML Model Specification",
13
+ "name": "model.mlmodel",
14
+ "path": "com.apple.CoreML/model.mlmodel"
15
+ }
16
+ },
17
+ "rootModelIdentifier": "B58CA828-CA78-46FC-BD8D-5ABFB5AAEADD"
18
+ }
Pipeline_PreEncoder.mlmodelc/analytics/coremldata.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f0946b687ccf4274e0228e3bc539e6733c33bf0f4419e28d02220b19a35d884b
3
+ size 243
Pipeline_PreEncoder.mlmodelc/coremldata.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:beb4fe86e80615cf79e64e55cb229cc931998b67b51a661bfbdc66204da0ea7b
3
+ size 553
Pipeline_PreEncoder.mlmodelc/model.mil ADDED
@@ -0,0 +1,201 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ program(1.0)
2
+ [buildInfo = dict<tensor<string, []>, tensor<string, []>>({{"coremlc-component-MIL", "3505.3.2"}, {"coremlc-version", "3505.4.1"}, {"coremltools-component-torch", "2.4.0"}, {"coremltools-source-dialect", "TorchScript"}, {"coremltools-version", "8.3.0"}})]
3
+ {
4
+ func main<ios16>(tensor<fp32, [1, 112, 128]> chunk, tensor<int32, [1]> chunk_lengths, tensor<fp32, [1, 188, 512]> fifo, tensor<int32, [1]> fifo_lengths, tensor<fp32, [1, 188, 512]> spkcache, tensor<int32, [1]> spkcache_lengths) {
5
+ tensor<fp32, [256]> model_encoder_pre_encode_conv_0_bias = const()[name = tensor<string, []>("model_encoder_pre_encode_conv_0_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(64)))];
6
+ tensor<fp32, [256, 1, 3, 3]> model_encoder_pre_encode_conv_0_weight = const()[name = tensor<string, []>("model_encoder_pre_encode_conv_0_weight"), val = tensor<fp32, [256, 1, 3, 3]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(1152)))];
7
+ tensor<fp32, [256]> model_encoder_pre_encode_conv_2_bias = const()[name = tensor<string, []>("model_encoder_pre_encode_conv_2_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(10432)))];
8
+ tensor<fp32, [256, 1, 3, 3]> model_encoder_pre_encode_conv_2_weight = const()[name = tensor<string, []>("model_encoder_pre_encode_conv_2_weight"), val = tensor<fp32, [256, 1, 3, 3]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(11520)))];
9
+ tensor<fp32, [256]> model_encoder_pre_encode_conv_3_bias = const()[name = tensor<string, []>("model_encoder_pre_encode_conv_3_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20800)))];
10
+ tensor<fp32, [256, 256, 1, 1]> model_encoder_pre_encode_conv_3_weight = const()[name = tensor<string, []>("model_encoder_pre_encode_conv_3_weight"), val = tensor<fp32, [256, 256, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(21888)))];
11
+ tensor<fp32, [256]> model_encoder_pre_encode_conv_5_bias = const()[name = tensor<string, []>("model_encoder_pre_encode_conv_5_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(284096)))];
12
+ tensor<fp32, [256, 1, 3, 3]> model_encoder_pre_encode_conv_5_weight = const()[name = tensor<string, []>("model_encoder_pre_encode_conv_5_weight"), val = tensor<fp32, [256, 1, 3, 3]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(285184)))];
13
+ tensor<fp32, [256]> model_encoder_pre_encode_conv_6_bias = const()[name = tensor<string, []>("model_encoder_pre_encode_conv_6_bias"), val = tensor<fp32, [256]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(294464)))];
14
+ tensor<fp32, [256, 256, 1, 1]> model_encoder_pre_encode_conv_6_weight = const()[name = tensor<string, []>("model_encoder_pre_encode_conv_6_weight"), val = tensor<fp32, [256, 256, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(295552)))];
15
+ tensor<fp32, [512]> model_encoder_pre_encode_out_bias = const()[name = tensor<string, []>("model_encoder_pre_encode_out_bias"), val = tensor<fp32, [512]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(557760)))];
16
+ tensor<fp32, [512, 4096]> model_encoder_pre_encode_out_weight = const()[name = tensor<string, []>("model_encoder_pre_encode_out_weight"), val = tensor<fp32, [512, 4096]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(559872)))];
17
+ tensor<int32, [1]> tensor_1_axes_0 = const()[name = tensor<string, []>("tensor_1_axes_0"), val = tensor<int32, [1]>([1])];
18
+ tensor<fp32, [1, 1, 112, 128]> tensor_1 = expand_dims(axes = tensor_1_axes_0, x = chunk)[name = tensor<string, []>("tensor_1")];
19
+ tensor<string, []> cast_0_dtype_0 = const()[name = tensor<string, []>("cast_0_dtype_0"), val = tensor<string, []>("fp32")];
20
+ tensor<int32, [1, 112]> expand_dims_0 = const()[name = tensor<string, []>("expand_dims_0"), val = tensor<int32, [1, 112]>([[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111]])];
21
+ tensor<int32, [1]> var_40_axes_0 = const()[name = tensor<string, []>("op_40_axes_0"), val = tensor<int32, [1]>([1])];
22
+ tensor<int32, [1, 1]> var_40 = expand_dims(axes = var_40_axes_0, x = chunk_lengths)[name = tensor<string, []>("op_40")];
23
+ tensor<bool, [1, 112]> time_mask_1 = less(x = expand_dims_0, y = var_40)[name = tensor<string, []>("time_mask_1")];
24
+ tensor<int32, [1]> var_42_axes_0 = const()[name = tensor<string, []>("op_42_axes_0"), val = tensor<int32, [1]>([-1])];
25
+ tensor<bool, [1, 112, 1]> var_42 = expand_dims(axes = var_42_axes_0, x = time_mask_1)[name = tensor<string, []>("op_42")];
26
+ tensor<int32, [3]> var_44_reps_0 = const()[name = tensor<string, []>("op_44_reps_0"), val = tensor<int32, [3]>([1, 1, 128])];
27
+ tensor<bool, [1, 112, 128]> var_44 = tile(reps = var_44_reps_0, x = var_42)[name = tensor<string, []>("op_44")];
28
+ tensor<string, []> cast_2_dtype_0 = const()[name = tensor<string, []>("cast_2_dtype_0"), val = tensor<string, []>("fp32")];
29
+ tensor<int32, [1]> var_50_axes_0 = const()[name = tensor<string, []>("op_50_axes_0"), val = tensor<int32, [1]>([1])];
30
+ tensor<fp32, [1, 112, 128]> cast_2 = cast(dtype = cast_2_dtype_0, x = var_44)[name = tensor<string, []>("cast_25")];
31
+ tensor<fp32, [1, 1, 112, 128]> var_50 = expand_dims(axes = var_50_axes_0, x = cast_2)[name = tensor<string, []>("op_50")];
32
+ tensor<fp32, [1, 1, 112, 128]> input_1 = mul(x = tensor_1, y = var_50)[name = tensor<string, []>("input_1")];
33
+ tensor<string, []> tensor_3_pad_type_0 = const()[name = tensor<string, []>("tensor_3_pad_type_0"), val = tensor<string, []>("custom")];
34
+ tensor<int32, [4]> tensor_3_pad_0 = const()[name = tensor<string, []>("tensor_3_pad_0"), val = tensor<int32, [4]>([1, 1, 1, 1])];
35
+ tensor<int32, [2]> tensor_3_strides_0 = const()[name = tensor<string, []>("tensor_3_strides_0"), val = tensor<int32, [2]>([2, 2])];
36
+ tensor<int32, [2]> tensor_3_dilations_0 = const()[name = tensor<string, []>("tensor_3_dilations_0"), val = tensor<int32, [2]>([1, 1])];
37
+ tensor<int32, []> tensor_3_groups_0 = const()[name = tensor<string, []>("tensor_3_groups_0"), val = tensor<int32, []>(1)];
38
+ tensor<fp32, [1, 256, 56, 64]> tensor_3 = conv(bias = model_encoder_pre_encode_conv_0_bias, dilations = tensor_3_dilations_0, groups = tensor_3_groups_0, pad = tensor_3_pad_0, pad_type = tensor_3_pad_type_0, strides = tensor_3_strides_0, weight = model_encoder_pre_encode_conv_0_weight, x = input_1)[name = tensor<string, []>("tensor_3")];
39
+ tensor<fp32, []> var_61_promoted = const()[name = tensor<string, []>("op_61_promoted"), val = tensor<fp32, []>(0x1p+0)];
40
+ tensor<fp32, [1]> cast_0 = cast(dtype = cast_0_dtype_0, x = chunk_lengths)[name = tensor<string, []>("cast_26")];
41
+ tensor<fp32, [1]> var_62 = add(x = cast_0, y = var_61_promoted)[name = tensor<string, []>("op_62")];
42
+ tensor<fp32, []> var_63_promoted = const()[name = tensor<string, []>("op_63_promoted"), val = tensor<fp32, []>(0x1p+0)];
43
+ tensor<fp32, [1]> var_64 = add(x = var_62, y = var_63_promoted)[name = tensor<string, []>("op_64")];
44
+ tensor<fp32, []> var_65_promoted = const()[name = tensor<string, []>("op_65_promoted"), val = tensor<fp32, []>(0x1.8p+1)];
45
+ tensor<fp32, [1]> var_66 = sub(x = var_64, y = var_65_promoted)[name = tensor<string, []>("op_66")];
46
+ tensor<fp32, []> var_21_promoted = const()[name = tensor<string, []>("op_21_promoted"), val = tensor<fp32, []>(0x1p+1)];
47
+ tensor<fp32, [1]> floor_div_0 = floor_div(x = var_66, y = var_21_promoted)[name = tensor<string, []>("floor_div_0")];
48
+ tensor<fp32, []> var_68_promoted = const()[name = tensor<string, []>("op_68_promoted"), val = tensor<fp32, []>(0x1p+0)];
49
+ tensor<fp32, [1]> current_lengths_3 = add(x = floor_div_0, y = var_68_promoted)[name = tensor<string, []>("current_lengths_3")];
50
+ tensor<string, []> cast_3_dtype_0 = const()[name = tensor<string, []>("cast_3_dtype_0"), val = tensor<string, []>("int32")];
51
+ tensor<int32, [1, 56]> expand_dims_1 = const()[name = tensor<string, []>("expand_dims_1"), val = tensor<int32, [1, 56]>([[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55]])];
52
+ tensor<int32, [1]> var_77_axes_0 = const()[name = tensor<string, []>("op_77_axes_0"), val = tensor<int32, [1]>([1])];
53
+ tensor<int32, [1]> cast_3 = cast(dtype = cast_3_dtype_0, x = current_lengths_3)[name = tensor<string, []>("cast_24")];
54
+ tensor<int32, [1, 1]> var_77 = expand_dims(axes = var_77_axes_0, x = cast_3)[name = tensor<string, []>("op_77")];
55
+ tensor<bool, [1, 56]> time_mask_3 = less(x = expand_dims_1, y = var_77)[name = tensor<string, []>("time_mask_3")];
56
+ tensor<int32, [1]> var_79_axes_0 = const()[name = tensor<string, []>("op_79_axes_0"), val = tensor<int32, [1]>([-1])];
57
+ tensor<bool, [1, 56, 1]> var_79 = expand_dims(axes = var_79_axes_0, x = time_mask_3)[name = tensor<string, []>("op_79")];
58
+ tensor<int32, [3]> var_81_reps_0 = const()[name = tensor<string, []>("op_81_reps_0"), val = tensor<int32, [3]>([1, 1, 64])];
59
+ tensor<bool, [1, 56, 64]> var_81 = tile(reps = var_81_reps_0, x = var_79)[name = tensor<string, []>("op_81")];
60
+ tensor<string, []> cast_4_dtype_0 = const()[name = tensor<string, []>("cast_4_dtype_0"), val = tensor<string, []>("fp32")];
61
+ tensor<int32, [1]> var_87_axes_0 = const()[name = tensor<string, []>("op_87_axes_0"), val = tensor<int32, [1]>([1])];
62
+ tensor<fp32, [1, 56, 64]> cast_4 = cast(dtype = cast_4_dtype_0, x = var_81)[name = tensor<string, []>("cast_23")];
63
+ tensor<fp32, [1, 1, 56, 64]> var_87 = expand_dims(axes = var_87_axes_0, x = cast_4)[name = tensor<string, []>("op_87")];
64
+ tensor<int32, [4]> expanded_mask_3_reps_0 = const()[name = tensor<string, []>("expanded_mask_3_reps_0"), val = tensor<int32, [4]>([1, 256, 1, 1])];
65
+ tensor<fp32, [1, 256, 56, 64]> expanded_mask_3 = tile(reps = expanded_mask_3_reps_0, x = var_87)[name = tensor<string, []>("expanded_mask_3")];
66
+ tensor<fp32, [1, 256, 56, 64]> input_3 = mul(x = tensor_3, y = expanded_mask_3)[name = tensor<string, []>("input_3")];
67
+ tensor<fp32, [1, 256, 56, 64]> tensor_5 = relu(x = input_3)[name = tensor<string, []>("tensor_5")];
68
+ tensor<fp32, [1, 256, 56, 64]> input_5 = mul(x = tensor_5, y = expanded_mask_3)[name = tensor<string, []>("input_5")];
69
+ tensor<string, []> tensor_7_pad_type_0 = const()[name = tensor<string, []>("tensor_7_pad_type_0"), val = tensor<string, []>("custom")];
70
+ tensor<int32, [4]> tensor_7_pad_0 = const()[name = tensor<string, []>("tensor_7_pad_0"), val = tensor<int32, [4]>([1, 1, 1, 1])];
71
+ tensor<int32, [2]> tensor_7_strides_0 = const()[name = tensor<string, []>("tensor_7_strides_0"), val = tensor<int32, [2]>([2, 2])];
72
+ tensor<int32, []> tensor_7_groups_0 = const()[name = tensor<string, []>("tensor_7_groups_0"), val = tensor<int32, []>(256)];
73
+ tensor<int32, [2]> tensor_7_dilations_0 = const()[name = tensor<string, []>("tensor_7_dilations_0"), val = tensor<int32, [2]>([1, 1])];
74
+ tensor<fp32, [1, 256, 28, 32]> tensor_7 = conv(bias = model_encoder_pre_encode_conv_2_bias, dilations = tensor_7_dilations_0, groups = tensor_7_groups_0, pad = tensor_7_pad_0, pad_type = tensor_7_pad_type_0, strides = tensor_7_strides_0, weight = model_encoder_pre_encode_conv_2_weight, x = input_5)[name = tensor<string, []>("tensor_7")];
75
+ tensor<fp32, []> var_107_promoted = const()[name = tensor<string, []>("op_107_promoted"), val = tensor<fp32, []>(0x1p+0)];
76
+ tensor<fp32, [1]> var_108 = add(x = current_lengths_3, y = var_107_promoted)[name = tensor<string, []>("op_108")];
77
+ tensor<fp32, []> var_109_promoted = const()[name = tensor<string, []>("op_109_promoted"), val = tensor<fp32, []>(0x1p+0)];
78
+ tensor<fp32, [1]> var_110 = add(x = var_108, y = var_109_promoted)[name = tensor<string, []>("op_110")];
79
+ tensor<fp32, []> var_111_promoted = const()[name = tensor<string, []>("op_111_promoted"), val = tensor<fp32, []>(0x1.8p+1)];
80
+ tensor<fp32, [1]> var_112 = sub(x = var_110, y = var_111_promoted)[name = tensor<string, []>("op_112")];
81
+ tensor<fp32, []> var_21_promoted_1 = const()[name = tensor<string, []>("op_21_promoted_1"), val = tensor<fp32, []>(0x1p+1)];
82
+ tensor<fp32, [1]> floor_div_1 = floor_div(x = var_112, y = var_21_promoted_1)[name = tensor<string, []>("floor_div_1")];
83
+ tensor<fp32, []> var_114_promoted = const()[name = tensor<string, []>("op_114_promoted"), val = tensor<fp32, []>(0x1p+0)];
84
+ tensor<fp32, [1]> current_lengths_5 = add(x = floor_div_1, y = var_114_promoted)[name = tensor<string, []>("current_lengths_5")];
85
+ tensor<string, []> cast_5_dtype_0 = const()[name = tensor<string, []>("cast_5_dtype_0"), val = tensor<string, []>("int32")];
86
+ tensor<int32, [1, 28]> expand_dims_2 = const()[name = tensor<string, []>("expand_dims_2"), val = tensor<int32, [1, 28]>([[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27]])];
87
+ tensor<int32, [1]> var_123_axes_0 = const()[name = tensor<string, []>("op_123_axes_0"), val = tensor<int32, [1]>([1])];
88
+ tensor<int32, [1]> cast_5 = cast(dtype = cast_5_dtype_0, x = current_lengths_5)[name = tensor<string, []>("cast_22")];
89
+ tensor<int32, [1, 1]> var_123 = expand_dims(axes = var_123_axes_0, x = cast_5)[name = tensor<string, []>("op_123")];
90
+ tensor<bool, [1, 28]> time_mask_5 = less(x = expand_dims_2, y = var_123)[name = tensor<string, []>("time_mask_5")];
91
+ tensor<int32, [1]> var_125_axes_0 = const()[name = tensor<string, []>("op_125_axes_0"), val = tensor<int32, [1]>([-1])];
92
+ tensor<bool, [1, 28, 1]> var_125 = expand_dims(axes = var_125_axes_0, x = time_mask_5)[name = tensor<string, []>("op_125")];
93
+ tensor<int32, [3]> var_127_reps_0 = const()[name = tensor<string, []>("op_127_reps_0"), val = tensor<int32, [3]>([1, 1, 32])];
94
+ tensor<bool, [1, 28, 32]> var_127 = tile(reps = var_127_reps_0, x = var_125)[name = tensor<string, []>("op_127")];
95
+ tensor<string, []> cast_6_dtype_0 = const()[name = tensor<string, []>("cast_6_dtype_0"), val = tensor<string, []>("fp32")];
96
+ tensor<int32, [1]> var_133_axes_0 = const()[name = tensor<string, []>("op_133_axes_0"), val = tensor<int32, [1]>([1])];
97
+ tensor<fp32, [1, 28, 32]> cast_6 = cast(dtype = cast_6_dtype_0, x = var_127)[name = tensor<string, []>("cast_21")];
98
+ tensor<fp32, [1, 1, 28, 32]> var_133 = expand_dims(axes = var_133_axes_0, x = cast_6)[name = tensor<string, []>("op_133")];
99
+ tensor<int32, [4]> expanded_mask_7_reps_0 = const()[name = tensor<string, []>("expanded_mask_7_reps_0"), val = tensor<int32, [4]>([1, 256, 1, 1])];
100
+ tensor<fp32, [1, 256, 28, 32]> expanded_mask_7 = tile(reps = expanded_mask_7_reps_0, x = var_133)[name = tensor<string, []>("expanded_mask_7")];
101
+ tensor<fp32, [1, 256, 28, 32]> input_7 = mul(x = tensor_7, y = expanded_mask_7)[name = tensor<string, []>("input_7")];
102
+ tensor<string, []> tensor_9_pad_type_0 = const()[name = tensor<string, []>("tensor_9_pad_type_0"), val = tensor<string, []>("valid")];
103
+ tensor<int32, [2]> tensor_9_strides_0 = const()[name = tensor<string, []>("tensor_9_strides_0"), val = tensor<int32, [2]>([1, 1])];
104
+ tensor<int32, [4]> tensor_9_pad_0 = const()[name = tensor<string, []>("tensor_9_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
105
+ tensor<int32, [2]> tensor_9_dilations_0 = const()[name = tensor<string, []>("tensor_9_dilations_0"), val = tensor<int32, [2]>([1, 1])];
106
+ tensor<int32, []> tensor_9_groups_0 = const()[name = tensor<string, []>("tensor_9_groups_0"), val = tensor<int32, []>(1)];
107
+ tensor<fp32, [1, 256, 28, 32]> tensor_9 = conv(bias = model_encoder_pre_encode_conv_3_bias, dilations = tensor_9_dilations_0, groups = tensor_9_groups_0, pad = tensor_9_pad_0, pad_type = tensor_9_pad_type_0, strides = tensor_9_strides_0, weight = model_encoder_pre_encode_conv_3_weight, x = input_7)[name = tensor<string, []>("tensor_9")];
108
+ tensor<fp32, [1, 256, 28, 32]> input_9 = mul(x = tensor_9, y = expanded_mask_7)[name = tensor<string, []>("input_9")];
109
+ tensor<fp32, [1, 256, 28, 32]> tensor_11 = relu(x = input_9)[name = tensor<string, []>("tensor_11")];
110
+ tensor<fp32, [1, 256, 28, 32]> input_11 = mul(x = tensor_11, y = expanded_mask_7)[name = tensor<string, []>("input_11")];
111
+ tensor<string, []> tensor_13_pad_type_0 = const()[name = tensor<string, []>("tensor_13_pad_type_0"), val = tensor<string, []>("custom")];
112
+ tensor<int32, [4]> tensor_13_pad_0 = const()[name = tensor<string, []>("tensor_13_pad_0"), val = tensor<int32, [4]>([1, 1, 1, 1])];
113
+ tensor<int32, [2]> tensor_13_strides_0 = const()[name = tensor<string, []>("tensor_13_strides_0"), val = tensor<int32, [2]>([2, 2])];
114
+ tensor<int32, []> tensor_13_groups_0 = const()[name = tensor<string, []>("tensor_13_groups_0"), val = tensor<int32, []>(256)];
115
+ tensor<int32, [2]> tensor_13_dilations_0 = const()[name = tensor<string, []>("tensor_13_dilations_0"), val = tensor<int32, [2]>([1, 1])];
116
+ tensor<fp32, [1, 256, 14, 16]> tensor_13 = conv(bias = model_encoder_pre_encode_conv_5_bias, dilations = tensor_13_dilations_0, groups = tensor_13_groups_0, pad = tensor_13_pad_0, pad_type = tensor_13_pad_type_0, strides = tensor_13_strides_0, weight = model_encoder_pre_encode_conv_5_weight, x = input_11)[name = tensor<string, []>("tensor_13")];
117
+ tensor<fp32, []> var_168_promoted = const()[name = tensor<string, []>("op_168_promoted"), val = tensor<fp32, []>(0x1p+0)];
118
+ tensor<fp32, [1]> var_169 = add(x = current_lengths_5, y = var_168_promoted)[name = tensor<string, []>("op_169")];
119
+ tensor<fp32, []> var_170_promoted = const()[name = tensor<string, []>("op_170_promoted"), val = tensor<fp32, []>(0x1p+0)];
120
+ tensor<fp32, [1]> var_171 = add(x = var_169, y = var_170_promoted)[name = tensor<string, []>("op_171")];
121
+ tensor<fp32, []> var_172_promoted = const()[name = tensor<string, []>("op_172_promoted"), val = tensor<fp32, []>(0x1.8p+1)];
122
+ tensor<fp32, [1]> var_173 = sub(x = var_171, y = var_172_promoted)[name = tensor<string, []>("op_173")];
123
+ tensor<fp32, []> var_21_promoted_2 = const()[name = tensor<string, []>("op_21_promoted_2"), val = tensor<fp32, []>(0x1p+1)];
124
+ tensor<fp32, [1]> floor_div_2 = floor_div(x = var_173, y = var_21_promoted_2)[name = tensor<string, []>("floor_div_2")];
125
+ tensor<fp32, []> var_175_promoted = const()[name = tensor<string, []>("op_175_promoted"), val = tensor<fp32, []>(0x1p+0)];
126
+ tensor<fp32, [1]> current_lengths = add(x = floor_div_2, y = var_175_promoted)[name = tensor<string, []>("current_lengths")];
127
+ tensor<string, []> cast_7_dtype_0 = const()[name = tensor<string, []>("cast_7_dtype_0"), val = tensor<string, []>("int32")];
128
+ tensor<int32, [1, 14]> expand_dims_3 = const()[name = tensor<string, []>("expand_dims_3"), val = tensor<int32, [1, 14]>([[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13]])];
129
+ tensor<int32, [1]> var_184_axes_0 = const()[name = tensor<string, []>("op_184_axes_0"), val = tensor<int32, [1]>([1])];
130
+ tensor<int32, [1]> cast_7 = cast(dtype = cast_7_dtype_0, x = current_lengths)[name = tensor<string, []>("cast_20")];
131
+ tensor<int32, [1, 1]> var_184 = expand_dims(axes = var_184_axes_0, x = cast_7)[name = tensor<string, []>("op_184")];
132
+ tensor<bool, [1, 14]> time_mask = less(x = expand_dims_3, y = var_184)[name = tensor<string, []>("time_mask")];
133
+ tensor<int32, [1]> var_186_axes_0 = const()[name = tensor<string, []>("op_186_axes_0"), val = tensor<int32, [1]>([-1])];
134
+ tensor<bool, [1, 14, 1]> var_186 = expand_dims(axes = var_186_axes_0, x = time_mask)[name = tensor<string, []>("op_186")];
135
+ tensor<int32, [3]> var_188_reps_0 = const()[name = tensor<string, []>("op_188_reps_0"), val = tensor<int32, [3]>([1, 1, 16])];
136
+ tensor<bool, [1, 14, 16]> var_188 = tile(reps = var_188_reps_0, x = var_186)[name = tensor<string, []>("op_188")];
137
+ tensor<string, []> cast_8_dtype_0 = const()[name = tensor<string, []>("cast_8_dtype_0"), val = tensor<string, []>("fp32")];
138
+ tensor<int32, [1]> var_194_axes_0 = const()[name = tensor<string, []>("op_194_axes_0"), val = tensor<int32, [1]>([1])];
139
+ tensor<fp32, [1, 14, 16]> cast_8 = cast(dtype = cast_8_dtype_0, x = var_188)[name = tensor<string, []>("cast_19")];
140
+ tensor<fp32, [1, 1, 14, 16]> var_194 = expand_dims(axes = var_194_axes_0, x = cast_8)[name = tensor<string, []>("op_194")];
141
+ tensor<int32, [4]> expanded_mask_13_reps_0 = const()[name = tensor<string, []>("expanded_mask_13_reps_0"), val = tensor<int32, [4]>([1, 256, 1, 1])];
142
+ tensor<fp32, [1, 256, 14, 16]> expanded_mask_13 = tile(reps = expanded_mask_13_reps_0, x = var_194)[name = tensor<string, []>("expanded_mask_13")];
143
+ tensor<fp32, [1, 256, 14, 16]> input_13 = mul(x = tensor_13, y = expanded_mask_13)[name = tensor<string, []>("input_13")];
144
+ tensor<string, []> tensor_15_pad_type_0 = const()[name = tensor<string, []>("tensor_15_pad_type_0"), val = tensor<string, []>("valid")];
145
+ tensor<int32, [2]> tensor_15_strides_0 = const()[name = tensor<string, []>("tensor_15_strides_0"), val = tensor<int32, [2]>([1, 1])];
146
+ tensor<int32, [4]> tensor_15_pad_0 = const()[name = tensor<string, []>("tensor_15_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
147
+ tensor<int32, [2]> tensor_15_dilations_0 = const()[name = tensor<string, []>("tensor_15_dilations_0"), val = tensor<int32, [2]>([1, 1])];
148
+ tensor<int32, []> tensor_15_groups_0 = const()[name = tensor<string, []>("tensor_15_groups_0"), val = tensor<int32, []>(1)];
149
+ tensor<fp32, [1, 256, 14, 16]> tensor_15 = conv(bias = model_encoder_pre_encode_conv_6_bias, dilations = tensor_15_dilations_0, groups = tensor_15_groups_0, pad = tensor_15_pad_0, pad_type = tensor_15_pad_type_0, strides = tensor_15_strides_0, weight = model_encoder_pre_encode_conv_6_weight, x = input_13)[name = tensor<string, []>("tensor_15")];
150
+ tensor<fp32, [1, 256, 14, 16]> input_15 = mul(x = tensor_15, y = expanded_mask_13)[name = tensor<string, []>("input_15")];
151
+ tensor<fp32, [1, 256, 14, 16]> tensor_workaround = relu(x = input_15)[name = tensor<string, []>("tensor_workaround")];
152
+ tensor<fp32, [1, 256, 14, 16]> x = mul(x = tensor_workaround, y = expanded_mask_13)[name = tensor<string, []>("x")];
153
+ tensor<int32, [4]> var_228_perm_0 = const()[name = tensor<string, []>("op_228_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
154
+ tensor<int32, [3]> var_229 = const()[name = tensor<string, []>("op_229"), val = tensor<int32, [3]>([1, 14, -1])];
155
+ tensor<fp32, [1, 14, 256, 16]> var_228 = transpose(perm = var_228_perm_0, x = x)[name = tensor<string, []>("transpose_0")];
156
+ tensor<fp32, [1, 14, 4096]> input = reshape(shape = var_229, x = var_228)[name = tensor<string, []>("input")];
157
+ tensor<fp32, [1, 14, 512]> chunk_embs_in = linear(bias = model_encoder_pre_encode_out_bias, weight = model_encoder_pre_encode_out_weight, x = input)[name = tensor<string, []>("linear_0")];
158
+ tensor<string, []> cast_11_dtype_0 = const()[name = tensor<string, []>("cast_11_dtype_0"), val = tensor<string, []>("int32")];
159
+ tensor<int32, [1]> size0 = const()[name = tensor<string, []>("size0"), val = tensor<int32, [1]>([188])];
160
+ tensor<int32, [1]> size1 = const()[name = tensor<string, []>("size1"), val = tensor<int32, [1]>([188])];
161
+ tensor<int32, []> var_264 = const()[name = tensor<string, []>("op_264"), val = tensor<int32, []>(1)];
162
+ tensor<bool, []> full_concat_interleave_0 = const()[name = tensor<string, []>("full_concat_interleave_0"), val = tensor<bool, []>(false)];
163
+ tensor<fp32, [1, 390, 512]> full_concat = concat(axis = var_264, interleave = full_concat_interleave_0, values = (spkcache, fifo, chunk_embs_in))[name = tensor<string, []>("full_concat")];
164
+ tensor<int32, [1]> var_273 = add(x = spkcache_lengths, y = fifo_lengths)[name = tensor<string, []>("op_273")];
165
+ tensor<int32, [1]> chunk_lens_in = cast(dtype = cast_11_dtype_0, x = current_lengths)[name = tensor<string, []>("cast_18")];
166
+ tensor<int32, [1]> pre_encoder_lengths = add(x = var_273, y = chunk_lens_in)[name = tensor<string, []>("total_length")];
167
+ tensor<int32, [390]> out_pos = const()[name = tensor<string, []>("out_pos"), val = tensor<int32, [390]>([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255, 256, 257, 258, 259, 260, 261, 262, 263, 264, 265, 266, 267, 268, 269, 270, 271, 272, 273, 274, 275, 276, 277, 278, 279, 280, 281, 282, 283, 284, 285, 286, 287, 288, 289, 290, 291, 292, 293, 294, 295, 296, 297, 298, 299, 300, 301, 302, 303, 304, 305, 306, 307, 308, 309, 310, 311, 312, 313, 314, 315, 316, 317, 318, 319, 320, 321, 322, 323, 324, 325, 326, 327, 328, 329, 330, 331, 332, 333, 334, 335, 336, 337, 338, 339, 340, 341, 342, 343, 344, 345, 346, 347, 348, 349, 350, 351, 352, 353, 354, 355, 356, 357, 358, 359, 360, 361, 362, 363, 364, 365, 366, 367, 368, 369, 370, 371, 372, 373, 374, 375, 376, 377, 378, 379, 380, 381, 382, 383, 384, 385, 386, 387, 388, 389])];
168
+ tensor<bool, [390]> var_284 = greater_equal(x = out_pos, y = spkcache_lengths)[name = tensor<string, []>("op_284")];
169
+ tensor<string, []> cast_12_dtype_0 = const()[name = tensor<string, []>("cast_12_dtype_0"), val = tensor<string, []>("int32")];
170
+ tensor<bool, [390]> var_290 = greater_equal(x = out_pos, y = var_273)[name = tensor<string, []>("op_290")];
171
+ tensor<string, []> cast_13_dtype_0 = const()[name = tensor<string, []>("cast_13_dtype_0"), val = tensor<string, []>("int32")];
172
+ tensor<int32, [1]> var_297 = sub(x = size0, y = spkcache_lengths)[name = tensor<string, []>("op_297")];
173
+ tensor<int32, [390]> cast_12 = cast(dtype = cast_12_dtype_0, x = var_284)[name = tensor<string, []>("cast_17")];
174
+ tensor<int32, [390]> var_298 = mul(x = cast_12, y = var_297)[name = tensor<string, []>("op_298")];
175
+ tensor<int32, [1]> var_300 = sub(x = size1, y = fifo_lengths)[name = tensor<string, []>("op_300")];
176
+ tensor<int32, [390]> cast_13 = cast(dtype = cast_13_dtype_0, x = var_290)[name = tensor<string, []>("cast_16")];
177
+ tensor<int32, [390]> var_301 = mul(x = cast_13, y = var_300)[name = tensor<string, []>("op_301")];
178
+ tensor<int32, [390]> offset = add(x = var_298, y = var_301)[name = tensor<string, []>("offset")];
179
+ tensor<int32, [390]> var_305 = add(x = out_pos, y = offset)[name = tensor<string, []>("op_305")];
180
+ tensor<int32, []> var_309 = const()[name = tensor<string, []>("op_309"), val = tensor<int32, []>(389)];
181
+ tensor<int32, []> var_310 = const()[name = tensor<string, []>("op_310"), val = tensor<int32, []>(0)];
182
+ tensor<int32, [390]> minimum_0 = minimum(x = var_305, y = var_309)[name = tensor<string, []>("minimum_0")];
183
+ tensor<int32, [390]> maximum_0 = maximum(x = minimum_0, y = var_310)[name = tensor<string, []>("maximum_0")];
184
+ tensor<int32, [1]> var_313_axes_0 = const()[name = tensor<string, []>("op_313_axes_0"), val = tensor<int32, [1]>([0])];
185
+ tensor<int32, [1, 390]> var_313 = expand_dims(axes = var_313_axes_0, x = maximum_0)[name = tensor<string, []>("op_313")];
186
+ tensor<int32, [1]> var_315_axes_0 = const()[name = tensor<string, []>("op_315_axes_0"), val = tensor<int32, [1]>([-1])];
187
+ tensor<int32, [1, 390, 1]> var_315 = expand_dims(axes = var_315_axes_0, x = var_313)[name = tensor<string, []>("op_315")];
188
+ tensor<int32, [3]> gather_idx_reps_0 = const()[name = tensor<string, []>("gather_idx_reps_0"), val = tensor<int32, [3]>([1, 1, 512])];
189
+ tensor<int32, [1, 390, 512]> gather_idx = tile(reps = gather_idx_reps_0, x = var_315)[name = tensor<string, []>("gather_idx")];
190
+ tensor<int32, []> var_320 = const()[name = tensor<string, []>("op_320"), val = tensor<int32, []>(1)];
191
+ tensor<fp32, [1, 390, 512]> output = gather_along_axis(axis = var_320, indices = gather_idx, x = full_concat)[name = tensor<string, []>("output")];
192
+ tensor<bool, [390]> var_323 = less(x = out_pos, y = pre_encoder_lengths)[name = tensor<string, []>("op_323")];
193
+ tensor<string, []> cast_14_dtype_0 = const()[name = tensor<string, []>("cast_14_dtype_0"), val = tensor<string, []>("fp32")];
194
+ tensor<int32, [1]> var_330_axes_0 = const()[name = tensor<string, []>("op_330_axes_0"), val = tensor<int32, [1]>([0])];
195
+ tensor<fp32, [390]> cast_14 = cast(dtype = cast_14_dtype_0, x = var_323)[name = tensor<string, []>("cast_15")];
196
+ tensor<fp32, [1, 390]> var_330 = expand_dims(axes = var_330_axes_0, x = cast_14)[name = tensor<string, []>("op_330")];
197
+ tensor<int32, [1]> var_332_axes_0 = const()[name = tensor<string, []>("op_332_axes_0"), val = tensor<int32, [1]>([-1])];
198
+ tensor<fp32, [1, 390, 1]> var_332 = expand_dims(axes = var_332_axes_0, x = var_330)[name = tensor<string, []>("op_332")];
199
+ tensor<fp32, [1, 390, 512]> pre_encoder_embs = mul(x = output, y = var_332)[name = tensor<string, []>("op_333")];
200
+ } -> (pre_encoder_embs, pre_encoder_lengths, chunk_embs_in, chunk_lens_in);
201
+ }
Pipeline_PreEncoder.mlmodelc/weights/weight.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:88a98803e35186b1dfb41d7f748f7cee5093bb6efeb117f56953c17549792fa4
3
+ size 8948544
Pipeline_PreEncoder.mlpackage/Data/com.apple.CoreML/model.mlmodel ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fb3f36b3b9d3f63e7a4f89a8848c6e3bc1a4a983786a832ea2c60cc395525cc2
3
+ size 26802
Pipeline_PreEncoder.mlpackage/Data/com.apple.CoreML/weights/weight.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:88a98803e35186b1dfb41d7f748f7cee5093bb6efeb117f56953c17549792fa4
3
+ size 8948544
Pipeline_PreEncoder.mlpackage/Manifest.json ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "fileFormatVersion": "1.0.0",
3
+ "itemInfoEntries": {
4
+ "6894C507-E04A-4096-A90F-9AB0F58870E0": {
5
+ "author": "com.apple.CoreML",
6
+ "description": "CoreML Model Specification",
7
+ "name": "model.mlmodel",
8
+ "path": "com.apple.CoreML/model.mlmodel"
9
+ },
10
+ "E6A26F48-2E38-4E3A-AF2B-89704AAA4B4C": {
11
+ "author": "com.apple.CoreML",
12
+ "description": "CoreML Model Weights",
13
+ "name": "weights",
14
+ "path": "com.apple.CoreML/weights"
15
+ }
16
+ },
17
+ "rootModelIdentifier": "6894C507-E04A-4096-A90F-9AB0F58870E0"
18
+ }
Pipeline_Preprocessor.mlmodelc/analytics/coremldata.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:eee5506c26dd1453734200ef08e1a263599e25fbdd433ecc425e7d8fd3c39641
3
+ size 243
Pipeline_Preprocessor.mlmodelc/coremldata.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:15271d7ebb0d1f8f82ae60468e6a554b71d386f6f5f717b00332b9ace990be16
3
+ size 374
Pipeline_Preprocessor.mlmodelc/model.mil ADDED
The diff for this file is too large to render. See raw diff
 
Pipeline_Preprocessor.mlmodelc/weights/weight.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e6d96fe6aa2f786e9ce18f53c2c6058807fbd9733dc48813d30554ee9b1caf80
3
+ size 1184512
Pipeline_Preprocessor.mlpackage/Data/com.apple.CoreML/model.mlmodel ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4ba45e5189ff1a12d01f2ebc6fc5db8ab7ad63cedcbb9847515ad0e7881daa0e
3
+ size 48673
Pipeline_Preprocessor.mlpackage/Data/com.apple.CoreML/weights/weight.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e6d96fe6aa2f786e9ce18f53c2c6058807fbd9733dc48813d30554ee9b1caf80
3
+ size 1184512
Pipeline_Preprocessor.mlpackage/Manifest.json ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "fileFormatVersion": "1.0.0",
3
+ "itemInfoEntries": {
4
+ "43321C07-C241-4F12-89F5-DF8385087F7C": {
5
+ "author": "com.apple.CoreML",
6
+ "description": "CoreML Model Weights",
7
+ "name": "weights",
8
+ "path": "com.apple.CoreML/weights"
9
+ },
10
+ "505C3394-5B98-45F7-8F05-9C513B04AFCB": {
11
+ "author": "com.apple.CoreML",
12
+ "description": "CoreML Model Specification",
13
+ "name": "model.mlmodel",
14
+ "path": "com.apple.CoreML/model.mlmodel"
15
+ }
16
+ },
17
+ "rootModelIdentifier": "505C3394-5B98-45F7-8F05-9C513B04AFCB"
18
+ }
README.md ADDED
@@ -0,0 +1,137 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Streaming Sortformer CoreML
2
+
3
+ CoreML conversion of NVIDIA's Streaming Sortformer 4-Speaker Diarization model for Apple Silicon.
4
+
5
+ ## Original Model
6
+
7
+ - **Source**: [nvidia/diar_streaming_sortformer_4spk-v2.1](https://huggingface.co/nvidia/diar_streaming_sortformer_4spk-v2.1)
8
+ - **Paper**: [Sortformer: Seamless Integration of Speaker Diarization and ASR](https://arxiv.org/abs/2409.06656)
9
+ - **Benchmark**: 20.57% DER on AMI SDM (NVIDIA reported)
10
+
11
+ ## Models
12
+
13
+ | Model | Description | Input | Output |
14
+ |-------|-------------|-------|--------|
15
+ | `Pipeline_Preprocessor.mlpackage` | Mel spectrogram extraction | Audio waveform | 128-dim mel features |
16
+ | `Pipeline_PreEncoder.mlpackage` | FastConformer encoder + Transformer | Mel features + state | Encoded embeddings |
17
+ | `Pipeline_Head_Fixed.mlpackage` | Speaker prediction head | Embeddings | 4-speaker probabilities |
18
+
19
+ ## Configuration
20
+
21
+ ```python
22
+ CONFIG = {
23
+ "chunk_len": 6, # Core chunk length (encoder frames)
24
+ "chunk_left_context": 1, # Left context frames
25
+ "chunk_right_context": 7, # Right context frames
26
+ "fifo_len": 188, # FIFO buffer length
27
+ "spkcache_len": 188, # Speaker cache length
28
+ "subsampling_factor": 8, # 8x subsampling (80ms per encoder frame)
29
+ "sample_rate": 16000,
30
+ "mel_features": 128,
31
+ "n_speakers": 4,
32
+ }
33
+ ```
34
+
35
+ ## Usage
36
+
37
+ ### Python (coremltools)
38
+
39
+ ```python
40
+ import coremltools as ct
41
+ import numpy as np
42
+
43
+ # Load models
44
+ pre_encoder = ct.models.MLModel("Pipeline_PreEncoder.mlpackage",
45
+ compute_units=ct.ComputeUnit.CPU_ONLY)
46
+ head = ct.models.MLModel("Pipeline_Head_Fixed.mlpackage",
47
+ compute_units=ct.ComputeUnit.CPU_ONLY)
48
+
49
+ # Initialize state
50
+ spkcache = np.zeros((1, 188, 512), dtype=np.float32)
51
+ fifo = np.zeros((1, 188, 512), dtype=np.float32)
52
+
53
+ # Process chunk (mel_features: [1, 112, 128])
54
+ pre_out = pre_encoder.predict({
55
+ "chunk": mel_features,
56
+ "chunk_lengths": np.array([actual_length], dtype=np.int32),
57
+ "spkcache": spkcache,
58
+ "spkcache_lengths": np.array([0], dtype=np.int32),
59
+ "fifo": fifo,
60
+ "fifo_lengths": np.array([0], dtype=np.int32)
61
+ })
62
+
63
+ head_out = head.predict({
64
+ "pre_encoder_embs": pre_out["pre_encoder_embs"],
65
+ "pre_encoder_lengths": pre_out["pre_encoder_lengths"],
66
+ "chunk_embs_in": pre_out["chunk_embs_in"],
67
+ "chunk_lens_in": pre_out["chunk_lens_in"]
68
+ })
69
+
70
+ predictions = head_out["speaker_preds"] # [1, T, 4]
71
+ ```
72
+
73
+ ### Swift (Core ML)
74
+
75
+ ```swift
76
+ import CoreML
77
+
78
+ let preEncoder = try MLModel(contentsOf: preEncoderURL)
79
+ let head = try MLModel(contentsOf: headURL)
80
+
81
+ // Create input with MLMultiArray for chunk, spkcache, fifo
82
+ let preEncoderInput = try preEncoder.prediction(from: inputProvider)
83
+ let headInput = try head.prediction(from: preEncoderInput)
84
+
85
+ let predictions = headInput.featureValue(for: "speaker_preds")
86
+ ```
87
+
88
+ ## Mel Spectrogram Settings
89
+
90
+ For compatibility with the original NeMo model:
91
+
92
+ ```python
93
+ mel_config = {
94
+ "sample_rate": 16000,
95
+ "n_fft": 512,
96
+ "win_length": 400, # 25ms
97
+ "hop_length": 160, # 10ms
98
+ "n_mels": 128,
99
+ "preemph": 0.97,
100
+ "log_zero_guard_value": 2**-24,
101
+ "normalize": "per_feature",
102
+ }
103
+ ```
104
+
105
+ ## Streaming Pipeline
106
+
107
+ 1. **Chunk audio** into ~480ms windows (48 mel frames core + context)
108
+ 2. **Compute mel spectrogram** for each chunk
109
+ 3. **Run PreEncoder** with current state (spkcache + fifo)
110
+ 4. **Run Head** to get 4-speaker probabilities
111
+ 5. **Update state** (spkcache/fifo buffers)
112
+ 6. **Threshold predictions** (default: 0.5) for binary speaker activity
113
+
114
+ ## Accuracy
115
+
116
+ Verified within 0.12% of original NeMo PyTorch model on chunk-level predictions.
117
+
118
+ ## Requirements
119
+
120
+ - macOS 12+ or iOS 15+
121
+ - Apple Silicon (M1/M2/M3) recommended
122
+ - Python: `coremltools`, `numpy`, `torch`, `torchaudio`
123
+
124
+ ## License
125
+
126
+ Apache 2.0 (following NVIDIA NeMo licensing)
127
+
128
+ ## Citation
129
+
130
+ ```bibtex
131
+ @article{park2024sortformer,
132
+ title={Sortformer: Seamless Integration of Speaker Diarization and ASR by Bridging Timestamps and Tokens},
133
+ author={Park, Taejin and Huang, He and Koluguri, Nithin and Georgiou, Panagiotis and Watanabe, Shinji and Ginsburg, Boris},
134
+ journal={arXiv preprint arXiv:2409.06656},
135
+ year={2024}
136
+ }
137
+ ```
convert_to_coreml.py ADDED
@@ -0,0 +1,357 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import torch.nn as nn
3
+ import coremltools as ct
4
+ import argparse
5
+ import os
6
+ import sys
7
+ import numpy as np
8
+ import types
9
+
10
+ # Ensure we use the right environment for imports
11
+ # (User's environment has 'nemo' installed)
12
+ from nemo.collections.asr.models import SortformerEncLabelModel
13
+ from nemo.collections.asr.parts.preprocessing.features import FilterbankFeaturesTA
14
+ from coreml_wrappers import *
15
+
16
+
17
+ def convert_pre_encoder(
18
+ model: SortformerEncLabelModel,
19
+ precision,
20
+ name: str,
21
+ input_chunk, input_chunk_len,
22
+ input_spkcache, input_spkcache_len,
23
+ input_fifo, input_fifo_len
24
+ ):
25
+ wrapper = PreEncoderWrapper(model)
26
+ wrapper.eval()
27
+
28
+ traced_model = torch.jit.trace(wrapper, (
29
+ input_chunk, input_chunk_len,
30
+ input_spkcache, input_spkcache_len,
31
+ input_fifo, input_fifo_len
32
+ ))
33
+
34
+ mlmodel = ct.convert(
35
+ traced_model,
36
+ inputs=[
37
+ ct.TensorType(name="chunk", shape=input_chunk.shape, dtype=np.float32),
38
+ ct.TensorType(name="chunk_lengths", shape=input_chunk_len.shape, dtype=np.int32),
39
+ ct.TensorType(name="spkcache", shape=input_spkcache.shape),
40
+ ct.TensorType(name="spkcache_lengths", shape=input_spkcache_len.shape, dtype=np.int32),
41
+ ct.TensorType(name="fifo", shape=input_fifo.shape),
42
+ ct.TensorType(name="fifo_lengths", shape=input_fifo_len.shape, dtype=np.int32),
43
+ ],
44
+ outputs=[
45
+ ct.TensorType(name="pre_encoder_embs", dtype=np.float32),
46
+ ct.TensorType(name="pre_encoder_lengths", dtype=np.int32),
47
+ ct.TensorType(name="chunk_pre_encoder_embs", dtype=np.float32),
48
+ ct.TensorType(name="chunk_pre_encoder_lengths", dtype=np.int32),
49
+ ],
50
+ minimum_deployment_target=ct.target.iOS16,
51
+ compute_precision=precision,
52
+ compute_units=ct.ComputeUnit.ALL
53
+ )
54
+
55
+ mlmodel.save(name)
56
+ return mlmodel, traced_model
57
+
58
+
59
+ def convert_head(
60
+ model: SortformerEncLabelModel,
61
+ precision,
62
+ name: str,
63
+ pre_encoder_embs, pre_encoder_lengths,
64
+ chunk_pre_encoder_embs, chunk_pre_encoder_lengths,
65
+ ):
66
+ wrapper = SortformerHeadWrapper(model)
67
+ wrapper.eval()
68
+
69
+ traced_model = torch.jit.trace(wrapper, (
70
+ pre_encoder_embs, pre_encoder_lengths,
71
+ chunk_pre_encoder_embs, chunk_pre_encoder_lengths,
72
+ ))
73
+
74
+ mlmodel = ct.convert(
75
+ traced_model,
76
+ inputs=[
77
+ ct.TensorType(name="pre_encoder_embs", shape=pre_encoder_embs.shape, dtype=np.float32),
78
+ ct.TensorType(name="pre_encoder_lengths", shape=pre_encoder_lengths.shape, dtype=np.int32),
79
+ ct.TensorType(name="chunk_pre_encoder_embs", shape=chunk_pre_encoder_embs.shape, dtype=np.float32),
80
+ ct.TensorType(name="chunk_pre_encoder_lengths", shape=chunk_pre_encoder_lengths.shape, dtype=np.int32),
81
+ ],
82
+ outputs=[
83
+ ct.TensorType(name="speaker_preds", dtype=np.float32),
84
+ ct.TensorType(name="chunk_pre_encoder_embs"),
85
+ ct.TensorType(name="chunk_pre_encoder_lengths")
86
+ ],
87
+ minimum_deployment_target=ct.target.iOS16,
88
+ compute_precision=precision,
89
+ compute_units=ct.ComputeUnit.ALL
90
+ )
91
+
92
+ mlmodel.save(name)
93
+ return mlmodel, traced_model
94
+
95
+
96
+ def export_pipeline(
97
+ model_name: str,
98
+ output_dir: str,
99
+ preproc_precision: str = "fp32",
100
+ pre_encoder_precision: str = "fp32",
101
+ head_precision: str = "fp16",
102
+ skip_modules: bool = False,
103
+ verify: bool = False
104
+ ):
105
+ """
106
+ Export the Sortformer model as a pipeline of separate CoreML models.
107
+ Each component can have different precision.
108
+
109
+ Components:
110
+ 1. Preprocessor (audio -> mel features)
111
+ 2. Pre-encoder (features -> pre-encoded embeddings + concat with spkcache/fifo)
112
+ 3. Conformer Encoder (pre-encoded -> encoder embeddings)
113
+ 4. Transformer Encoder (encoder embeddings -> predictions)
114
+
115
+ Args:
116
+ :param model_name: NeMo model name or path
117
+ :param output_dir: Output directory for mlpackage files
118
+ :param preproc_precision: Precision for preprocessor ("fp16" or "fp32")
119
+ :param pre_encoder_precision: Precision for pre-encoder ("fp16" or "fp32")
120
+ :param head_precision: Precision for head module (conformer + transformer) ("fp16" or "fp32")
121
+ :param skip_modules: Whether to skip the individual modules
122
+ """
123
+ os.makedirs(output_dir, exist_ok=True)
124
+
125
+ def get_precision(s):
126
+ return ct.precision.FLOAT16 if s.lower() == "fp16" else ct.precision.FLOAT32
127
+
128
+ print("=" * 70)
129
+ print("Exporting Sortformer Pipeline")
130
+ print("=" * 70)
131
+ print(f"Preprocessor: {preproc_precision}")
132
+ print(f"Pre-encoder: {pre_encoder_precision}")
133
+ print(f"Head: {head_precision}")
134
+ print("=" * 70)
135
+
136
+ # Load model
137
+ print(f"\nLoading model: {model_name}")
138
+ if os.path.exists(model_name):
139
+ model = SortformerEncLabelModel.restore_from(model_name, map_location=torch.device("cpu"))
140
+ else:
141
+ model = SortformerEncLabelModel.from_pretrained(model_name, map_location=torch.device("cpu"))
142
+ model.eval()
143
+
144
+ # Configure for streaming
145
+ print("Configuring for streaming...")
146
+ model.sortformer_modules.chunk_len = 6
147
+ model.sortformer_modules.chunk_right_context = 1
148
+ model.sortformer_modules.chunk_left_context = 1
149
+ model.sortformer_modules.fifo_len = 40
150
+ model.sortformer_modules.spkcache_len = 120
151
+ model.sortformer_modules.spkcache_update_period = 32
152
+
153
+ modules = model.sortformer_modules
154
+ preprocessor = model.preprocessor
155
+ pre_encoder_mlmodel = None
156
+ head_mlmodel = None
157
+
158
+ if hasattr(preprocessor, 'pad_to'):
159
+ preprocessor.pad_to = 0
160
+
161
+ # Calculate dimensions
162
+ chunk_len = modules.chunk_len
163
+ input_chunk_time = (
164
+ chunk_len + modules.chunk_left_context + modules.chunk_right_context) * modules.subsampling_factor
165
+ fc_d_model = modules.fc_d_model # 512 - Conformer output
166
+ tf_d_model = modules.tf_d_model # 192 - Transformer input (after projection)
167
+ spkcache_len = modules.spkcache_len
168
+ fifo_len = modules.fifo_len
169
+
170
+ # Get feature dim
171
+ feat_dim = 128
172
+ if hasattr(model, 'encoder') and hasattr(model.encoder, '_feat_in'):
173
+ feat_dim = model.encoder._feat_in
174
+
175
+ # Pre-encode output size (after subsampling)
176
+ pre_encode_out_len = input_chunk_time // modules.subsampling_factor
177
+ total_concat_len = spkcache_len + fifo_len + pre_encode_out_len
178
+
179
+ print(f"Input chunk frames: {input_chunk_time}")
180
+ print(f"Pre-encode output: {pre_encode_out_len}")
181
+ print(f"Total concat len: {total_concat_len}")
182
+ print(f"Feature dim: {feat_dim}, FC d_model: {fc_d_model}, TF d_model: {tf_d_model}")
183
+
184
+ # Audio samples for preprocessor
185
+ stride = 160
186
+ window = 400
187
+ audio_samples = (input_chunk_time - 1) * stride + window
188
+ print(audio_samples)
189
+
190
+ # =========================================================
191
+ # 1. Export Preprocessor
192
+ # =========================================================
193
+
194
+ if not skip_modules:
195
+ print("\n[1/4] Exporting Preprocessor...")
196
+
197
+ preproc_wrapper = PreprocessorWrapper(preprocessor)
198
+ preproc_wrapper.eval()
199
+
200
+ dummy_wav = torch.randn(1, audio_samples)
201
+ dummy_len = torch.tensor([audio_samples], dtype=torch.long)
202
+
203
+ traced_preproc = torch.jit.trace(preproc_wrapper, (dummy_wav, dummy_len))
204
+
205
+ preproc_mlmodel = ct.convert(
206
+ traced_preproc,
207
+ inputs=[
208
+ ct.TensorType(name="audio_signal", shape=dummy_wav.shape),
209
+ ct.TensorType(name="length", shape=dummy_len.shape, dtype=np.int32)
210
+ ],
211
+ outputs=[
212
+ ct.TensorType(name="features", dtype=np.float32),
213
+ ct.TensorType(name="feature_lengths", dtype=np.int32)
214
+ ],
215
+ minimum_deployment_target=ct.target.iOS16,
216
+ compute_precision=get_precision(preproc_precision),
217
+ compute_units=ct.ComputeUnit.ALL
218
+ )
219
+ preproc_mlmodel.save(os.path.join(output_dir, "Pipeline_Preprocessor.mlpackage"))
220
+ print(" Saved Pipeline_Preprocessor.mlpackage")
221
+
222
+ # =========================================================
223
+ # 2. Export Pre-Encoder
224
+ # =========================================================
225
+
226
+ input_chunk = torch.randn(1, input_chunk_time, feat_dim)
227
+ input_chunk_len = torch.tensor([input_chunk_time], dtype=torch.long)
228
+ input_spkcache = torch.randn(1, spkcache_len, fc_d_model)
229
+ input_spkcache_len = torch.tensor([spkcache_len], dtype=torch.long)
230
+ input_fifo = torch.randn(1, fifo_len, fc_d_model)
231
+ input_fifo_len = torch.tensor([fifo_len], dtype=torch.long)
232
+
233
+ if not skip_modules:
234
+ print("\n[2/4] Exporting Pre-Encoder...")
235
+ pre_encoder_mlmodel, _ = convert_pre_encoder(
236
+ model,
237
+ get_precision(pre_encoder_precision),
238
+ os.path.join(output_dir, "Pipeline_PreEncoder.mlpackage"),
239
+ input_chunk, input_chunk_len,
240
+ input_spkcache, input_spkcache_len,
241
+ input_fifo, input_fifo_len
242
+ )
243
+ print(" Saved Pipeline_PreEncoder.mlpackage")
244
+
245
+ # =========================================================
246
+ # 3. Export Conformer Encoder
247
+ # =========================================================
248
+
249
+ pre_encoder_embs = torch.randn(1, total_concat_len, fc_d_model)
250
+ pre_encoder_lengths = torch.tensor([total_concat_len], dtype=torch.long)
251
+ chunk_pre_encoder_embs = torch.randn(1, pre_encode_out_len, fc_d_model)
252
+ chunk_pre_encoder_lengths = torch.tensor([pre_encode_out_len], dtype=torch.long)
253
+
254
+ if not skip_modules:
255
+ print("\n[3/4] Exporting Head Module...")
256
+ head_mlmodel, _ = convert_head(
257
+ model,
258
+ get_precision(head_precision),
259
+ os.path.join(output_dir, "Pipeline_Head.mlpackage"),
260
+ pre_encoder_embs, pre_encoder_lengths,
261
+ chunk_pre_encoder_embs, chunk_pre_encoder_lengths
262
+ )
263
+ print(" Saved Pipeline_Head.mlpackage")
264
+
265
+ # =========================================================
266
+ # 5. Create Combined Pipelines
267
+ # =========================================================
268
+ print("\n[4/4] Creating Combined ML Pipelines...")
269
+
270
+ # Load the exported models
271
+ if skip_modules and not verify:
272
+ print('Loading Pipeline CoreML Modules...')
273
+ pre_encoder_mlmodel = ct.models.MLModel(
274
+ os.path.join(output_dir, "Pipeline_PreEncoder.mlpackage")
275
+ )
276
+ head_mlmodel = ct.models.MLModel(
277
+ os.path.join(output_dir, "Pipeline_Head.mlpackage")
278
+ )
279
+
280
+ assert pre_encoder_mlmodel is not None and head_mlmodel is not None
281
+
282
+ # Create Full Pipeline: PreEncoder → Conformer → Transformer
283
+ # Inputs: chunk, chunk_lengths, spkcache, spkcache_lengths, fifo, fifo_lengths
284
+ # Output: preds
285
+
286
+ if verify:
287
+ pipeline_model = ct.models.MLModel('coreml_models/SortformerPipeline.mlpackage')
288
+ spec = pipeline_model.get_spec()
289
+ print(pipeline_model.input_description)
290
+ print(pipeline_model.output_description)
291
+ print(spec)
292
+ else:
293
+ try:
294
+ # Both models now use compute_units=ALL.
295
+ # The pre_encoder uses ANE-safe gather operations in fixed_concat_and_pad
296
+ # to avoid zero-length slices that would crash on ANE.
297
+
298
+ pipeline_model = ct.utils.make_pipeline(
299
+ pre_encoder_mlmodel,
300
+ head_mlmodel,
301
+ compute_units=ct.ComputeUnit.ALL
302
+ )
303
+
304
+ # Save the pipeline
305
+ pipeline_model.save(os.path.join(output_dir, "SortformerPipeline.mlpackage"))
306
+ print(" Saved SortformerPipeline.mlpackage (PreEncoder + Conformer + Transformer)")
307
+ except Exception as e:
308
+ print(f" Warning: Could not create full pipeline: {e}")
309
+ import traceback
310
+ traceback.print_exc()
311
+
312
+ # =========================================================
313
+ # Summary
314
+ # =========================================================
315
+ print("\n" + "=" * 70)
316
+ print("Pipeline Export Complete!")
317
+ print("=" * 70)
318
+ print(f"Output directory: {output_dir}")
319
+ print("\nExported models:")
320
+ print(f" 1. Pipeline_Preprocessor.mlpackage ({preproc_precision})")
321
+ print(f" 2. Pipeline_PreEncoder.mlpackage ({pre_encoder_precision})")
322
+ print(f" 3. Pipeline_Head.mlpackage ({head_precision})")
323
+ print(f" 5. SortformerPipeline.mlpackage (combined: PreEncoder+Head)")
324
+ print("\nUsage in inference:")
325
+ print(" audio -> Preprocessor -> features")
326
+ print(" features + spkcache + fifo -> SortformerPipeline -> predictions")
327
+
328
+
329
+ if __name__ == "__main__":
330
+ parser = argparse.ArgumentParser()
331
+ parser.add_argument("--model_name", default="nvidia/diar_streaming_sortformer_4spk-v2.1",
332
+ help="NeMo model name or path")
333
+ parser.add_argument("--output_dir", default="coreml_models", help="Output directory")
334
+ parser.add_argument("--fp16", action="store_true", help="Use FP16 for single model export")
335
+
336
+ # Pipeline options
337
+ parser.add_argument("--preproc_precision", default="fp32", choices=["fp16", "fp32"], help="Preprocessor precision")
338
+ parser.add_argument("--pre_encoder_precision", default="fp32", choices=["fp16", "fp32"],
339
+ help="Pre-encoder precision")
340
+ parser.add_argument("--head_precision", default="fp16", choices=["fp16", "fp32"],
341
+ help="Conformer encoder precision")
342
+ parser.add_argument("--skip_modules", action="store_true", help="Skip modules in pipeline export")
343
+ parser.add_argument("--verify", action="store_true", help="Skip pipeline in pipeline export")
344
+
345
+ args = parser.parse_args()
346
+
347
+ print(f"CoreMLTools Version: {ct.__version__}")
348
+
349
+ export_pipeline(
350
+ args.model_name,
351
+ args.output_dir,
352
+ preproc_precision=args.preproc_precision,
353
+ pre_encoder_precision=args.pre_encoder_precision,
354
+ head_precision=args.head_precision,
355
+ skip_modules=args.skip_modules,
356
+ verify=args.verify,
357
+ )
export_nvidia_pipeline.py ADDED
@@ -0,0 +1,308 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Export combined SortformerPipeline with NVIDIA's 1.04s latency configuration.
2
+
3
+ This creates models compatible with the Swift SortformerDiarizer interface.
4
+ """
5
+ import os
6
+ os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE"
7
+
8
+ import torch
9
+ import torch.nn as nn
10
+ import numpy as np
11
+ import coremltools as ct
12
+ from nemo.collections.asr.models import SortformerEncLabelModel
13
+ from coreml_wrappers import PreEncoderWrapper
14
+
15
+ # NVIDIA's 1.04s latency configuration
16
+ NVIDIA_CONFIG = {
17
+ 'chunk_len': 6,
18
+ 'chunk_right_context': 7, # Was 1
19
+ 'chunk_left_context': 1,
20
+ 'fifo_len': 188, # Was 40
21
+ 'spkcache_len': 188, # Was 120
22
+ 'spkcache_update_period': 144, # Was 30
23
+ }
24
+
25
+ print("=" * 70)
26
+ print("Exporting Combined SortformerPipeline with NVIDIA Config")
27
+ print("=" * 70)
28
+ print(f"Config: {NVIDIA_CONFIG}")
29
+
30
+ # Load model
31
+ print("\nLoading NeMo model...")
32
+ model = SortformerEncLabelModel.from_pretrained(
33
+ "nvidia/diar_streaming_sortformer_4spk-v2.1", map_location="cpu"
34
+ )
35
+ model.eval()
36
+
37
+ # Apply NVIDIA config
38
+ modules = model.sortformer_modules
39
+ modules.chunk_len = NVIDIA_CONFIG['chunk_len']
40
+ modules.chunk_right_context = NVIDIA_CONFIG['chunk_right_context']
41
+ modules.chunk_left_context = NVIDIA_CONFIG['chunk_left_context']
42
+ modules.fifo_len = NVIDIA_CONFIG['fifo_len']
43
+ modules.spkcache_len = NVIDIA_CONFIG['spkcache_len']
44
+ modules.spkcache_update_period = NVIDIA_CONFIG['spkcache_update_period']
45
+
46
+ # Calculate dimensions
47
+ chunk_len = modules.chunk_len
48
+ input_chunk_time = (chunk_len + modules.chunk_left_context + modules.chunk_right_context) * modules.subsampling_factor
49
+ fc_d_model = modules.fc_d_model # 512
50
+ spkcache_len = modules.spkcache_len
51
+ fifo_len = modules.fifo_len
52
+
53
+ feat_dim = 128
54
+ pre_encode_out_len = input_chunk_time // modules.subsampling_factor
55
+ total_concat_len = spkcache_len + fifo_len + pre_encode_out_len
56
+
57
+ print(f"\nDimensions:")
58
+ print(f" Input chunk frames: {input_chunk_time}")
59
+ print(f" Pre-encode output: {pre_encode_out_len}")
60
+ print(f" Total concat len: {total_concat_len}")
61
+ print(f" FC d_model: {fc_d_model}")
62
+ print(f" FIFO len: {fifo_len}")
63
+ print(f" Spkcache len: {spkcache_len}")
64
+
65
+ # Calculate audio samples needed for preprocessor
66
+ # NeMo adds internal padding (16 samples each side), so the formula is different
67
+ # Empirically tested: 17920 samples → 112 mel frames, 18160 → 113 frames
68
+ # For 112 frames, we need 17920 samples (not the naive 18160 from stride formula)
69
+ mel_stride = 160
70
+ mel_window = 400
71
+ # Correct formula accounting for NeMo padding
72
+ preprocessor_audio_samples = 17920 # Empirically determined for 112 frames
73
+ print(f" Preprocessor audio samples: {preprocessor_audio_samples}")
74
+
75
+ # Create output directory
76
+ output_dir = "coreml_models_nvidia"
77
+ os.makedirs(output_dir, exist_ok=True)
78
+
79
+ # =========================================================
80
+ # 0. Export Preprocessor (audio → mel features)
81
+ # =========================================================
82
+ print("\n[0/3] Exporting Preprocessor...")
83
+
84
+ from coreml_wrappers import PreprocessorWrapper
85
+
86
+ preprocessor_wrapper = PreprocessorWrapper(model.preprocessor)
87
+ preprocessor_wrapper.eval()
88
+
89
+ # Trace with correct audio sample count
90
+ audio_input = torch.randn(1, preprocessor_audio_samples)
91
+ audio_length = torch.tensor([preprocessor_audio_samples], dtype=torch.long)
92
+
93
+ traced_preprocessor = torch.jit.trace(preprocessor_wrapper, (audio_input, audio_length))
94
+
95
+ preprocessor_ml = ct.convert(
96
+ traced_preprocessor,
97
+ inputs=[
98
+ ct.TensorType(name="audio_signal", shape=audio_input.shape, dtype=np.float32),
99
+ ct.TensorType(name="length", shape=audio_length.shape, dtype=np.int32),
100
+ ],
101
+ outputs=[
102
+ ct.TensorType(name="features", dtype=np.float32),
103
+ ct.TensorType(name="feature_lengths", dtype=np.int32),
104
+ ],
105
+ minimum_deployment_target=ct.target.iOS16,
106
+ compute_precision=ct.precision.FLOAT32,
107
+ compute_units=ct.ComputeUnit.CPU_ONLY # CPU for FP32 precision
108
+ )
109
+
110
+ preprocessor_ml.save(os.path.join(output_dir, "Pipeline_Preprocessor.mlpackage"))
111
+ print(f" Saved {output_dir}/Pipeline_Preprocessor.mlpackage")
112
+
113
+ # =========================================================
114
+ # 1. Export PreEncoder
115
+ # =========================================================
116
+ print("\n[1/3] Exporting PreEncoder...")
117
+
118
+ input_chunk = torch.randn(1, input_chunk_time, feat_dim)
119
+ input_chunk_len = torch.tensor([input_chunk_time], dtype=torch.long)
120
+ input_spkcache = torch.randn(1, spkcache_len, fc_d_model)
121
+ input_spkcache_len = torch.tensor([spkcache_len], dtype=torch.long)
122
+ input_fifo = torch.randn(1, fifo_len, fc_d_model)
123
+ input_fifo_len = torch.tensor([fifo_len], dtype=torch.long)
124
+
125
+ pre_encoder = PreEncoderWrapper(model)
126
+ pre_encoder.eval()
127
+
128
+ traced_pre_encoder = torch.jit.trace(pre_encoder, (
129
+ input_chunk, input_chunk_len,
130
+ input_spkcache, input_spkcache_len,
131
+ input_fifo, input_fifo_len
132
+ ))
133
+
134
+ # Use names that match for pipeline connection
135
+ pre_encoder_ml = ct.convert(
136
+ traced_pre_encoder,
137
+ inputs=[
138
+ ct.TensorType(name="chunk", shape=input_chunk.shape, dtype=np.float32),
139
+ ct.TensorType(name="chunk_lengths", shape=input_chunk_len.shape, dtype=np.int32),
140
+ ct.TensorType(name="spkcache", shape=input_spkcache.shape, dtype=np.float32),
141
+ ct.TensorType(name="spkcache_lengths", shape=input_spkcache_len.shape, dtype=np.int32),
142
+ ct.TensorType(name="fifo", shape=input_fifo.shape, dtype=np.float32),
143
+ ct.TensorType(name="fifo_lengths", shape=input_fifo_len.shape, dtype=np.int32),
144
+ ],
145
+ outputs=[
146
+ ct.TensorType(name="pre_encoder_embs", dtype=np.float32),
147
+ ct.TensorType(name="pre_encoder_lengths", dtype=np.int32),
148
+ ct.TensorType(name="chunk_embs_in", dtype=np.float32),
149
+ ct.TensorType(name="chunk_lens_in", dtype=np.int32),
150
+ ],
151
+ minimum_deployment_target=ct.target.iOS16,
152
+ compute_precision=ct.precision.FLOAT32,
153
+ compute_units=ct.ComputeUnit.ALL
154
+ )
155
+
156
+ pre_encoder_ml.save(os.path.join(output_dir, "Pipeline_PreEncoder.mlpackage"))
157
+ print(f" Saved {output_dir}/Pipeline_PreEncoder.mlpackage")
158
+
159
+ # =========================================================
160
+ # 2. Export Fixed Head (with identity ops to preserve embeddings)
161
+ # =========================================================
162
+ print("\n[2/3] Exporting Fixed Head...")
163
+
164
+
165
+ class FixedSortformerHead(nn.Module):
166
+ """Head wrapper that forces chunk_pre_encoder_embs to be computed."""
167
+
168
+ def __init__(self, model):
169
+ super().__init__()
170
+ self.model = model
171
+ self.identity_scale = nn.Parameter(torch.ones(1), requires_grad=False)
172
+
173
+ def forward(self, pre_encoder_embs, pre_encoder_lengths, chunk_embs_in, chunk_lens_in):
174
+ # Frontend encoder
175
+ spkcache_fifo_chunk_fc_encoder_embs, spkcache_fifo_chunk_fc_encoder_lengths = self.model.frontend_encoder(
176
+ processed_signal=pre_encoder_embs,
177
+ processed_signal_length=pre_encoder_lengths,
178
+ bypass_pre_encode=True,
179
+ )
180
+
181
+ # Forward inference
182
+ speaker_preds = self.model.forward_infer(
183
+ spkcache_fifo_chunk_fc_encoder_embs, spkcache_fifo_chunk_fc_encoder_lengths
184
+ )
185
+
186
+ # Force the embedding to be computed (prevents optimization)
187
+ chunk_pre_encoder_embs = chunk_embs_in * self.identity_scale
188
+ chunk_pre_encoder_lengths = chunk_lens_in + 0
189
+
190
+ return speaker_preds, chunk_pre_encoder_embs, chunk_pre_encoder_lengths
191
+
192
+
193
+ head = FixedSortformerHead(model)
194
+ head.eval()
195
+
196
+ # Input shapes for head - must match PreEncoder output names
197
+ pre_encoder_embs = torch.randn(1, total_concat_len, fc_d_model)
198
+ pre_encoder_lengths = torch.tensor([total_concat_len], dtype=torch.long)
199
+ chunk_embs_in = torch.randn(1, pre_encode_out_len, fc_d_model)
200
+ chunk_lens_in = torch.tensor([pre_encode_out_len], dtype=torch.long)
201
+
202
+ traced_head = torch.jit.trace(head, (
203
+ pre_encoder_embs, pre_encoder_lengths,
204
+ chunk_embs_in, chunk_lens_in
205
+ ))
206
+
207
+ head_ml = ct.convert(
208
+ traced_head,
209
+ inputs=[
210
+ ct.TensorType(name="pre_encoder_embs", shape=pre_encoder_embs.shape, dtype=np.float32),
211
+ ct.TensorType(name="pre_encoder_lengths", shape=pre_encoder_lengths.shape, dtype=np.int32),
212
+ ct.TensorType(name="chunk_embs_in", shape=chunk_embs_in.shape, dtype=np.float32),
213
+ ct.TensorType(name="chunk_lens_in", shape=chunk_lens_in.shape, dtype=np.int32),
214
+ ],
215
+ outputs=[
216
+ ct.TensorType(name="speaker_preds", dtype=np.float32),
217
+ ct.TensorType(name="chunk_pre_encoder_embs", dtype=np.float32),
218
+ ct.TensorType(name="chunk_pre_encoder_lengths", dtype=np.int32),
219
+ ],
220
+ minimum_deployment_target=ct.target.iOS16,
221
+ compute_precision=ct.precision.FLOAT16,
222
+ compute_units=ct.ComputeUnit.ALL
223
+ )
224
+
225
+ head_ml.save(os.path.join(output_dir, "Pipeline_Head_Fixed.mlpackage"))
226
+ print(f" Saved {output_dir}/Pipeline_Head_Fixed.mlpackage")
227
+
228
+ # =========================================================
229
+ # 3. Create Combined Pipeline
230
+ # =========================================================
231
+ print("\n[3/3] Creating combined pipeline...")
232
+
233
+ try:
234
+ pipeline = ct.utils.make_pipeline(pre_encoder_ml, head_ml, compute_units=ct.ComputeUnit.ALL)
235
+ pipeline.save(os.path.join(output_dir, "SortformerPipeline.mlpackage"))
236
+ print(f" Saved {output_dir}/SortformerPipeline.mlpackage")
237
+ except Exception as e:
238
+ print(f" Pipeline creation failed: {e}")
239
+ print(" Note: Call PreEncoder and Head separately to avoid embedding bug")
240
+
241
+ # =========================================================
242
+ # Verification
243
+ # =========================================================
244
+ print("\n" + "=" * 70)
245
+ print("Verification")
246
+ print("=" * 70)
247
+
248
+ # Test PreEncoder
249
+ test_chunk = np.random.randn(1, input_chunk_time, feat_dim).astype(np.float32)
250
+ test_chunk_len = np.array([input_chunk_time], dtype=np.int32)
251
+ test_spkcache = np.zeros((1, spkcache_len, fc_d_model), dtype=np.float32)
252
+ test_spkcache_len = np.array([0], dtype=np.int32)
253
+ test_fifo = np.zeros((1, fifo_len, fc_d_model), dtype=np.float32)
254
+ test_fifo_len = np.array([0], dtype=np.int32)
255
+
256
+ pre_out = pre_encoder_ml.predict({
257
+ 'chunk': test_chunk,
258
+ 'chunk_lengths': test_chunk_len,
259
+ 'spkcache': test_spkcache,
260
+ 'spkcache_lengths': test_spkcache_len,
261
+ 'fifo': test_fifo,
262
+ 'fifo_lengths': test_fifo_len
263
+ })
264
+
265
+ print(f"PreEncoder output shapes:")
266
+ print(f" pre_encoder_embs: {pre_out['pre_encoder_embs'].shape}")
267
+ print(f" chunk_embs_in: {pre_out['chunk_embs_in'].shape}")
268
+ print(f" chunk_embs_in[0,0,0]: {pre_out['chunk_embs_in'][0,0,0]:.6f}")
269
+
270
+ # Test Head
271
+ head_out = head_ml.predict({
272
+ 'pre_encoder_embs': pre_out['pre_encoder_embs'],
273
+ 'pre_encoder_lengths': pre_out['pre_encoder_lengths'],
274
+ 'chunk_embs_in': pre_out['chunk_embs_in'],
275
+ 'chunk_lens_in': pre_out['chunk_lens_in']
276
+ })
277
+
278
+ print(f"\nHead output shapes:")
279
+ print(f" speaker_preds: {head_out['speaker_preds'].shape}")
280
+ print(f" chunk_pre_encoder_embs: {head_out['chunk_pre_encoder_embs'].shape}")
281
+ print(f" chunk_pre_encoder_embs[0,0,0]: {head_out['chunk_pre_encoder_embs'][0,0,0]:.6f}")
282
+
283
+ # Verify embedding preservation
284
+ if np.isclose(pre_out['chunk_embs_in'][0,0,0], head_out['chunk_pre_encoder_embs'][0,0,0], atol=0.01):
285
+ print("\n✓ Embedding [0,0,0] preserved correctly!")
286
+ else:
287
+ print(f"\n✗ WARNING: Embedding [0,0,0] corrupted!")
288
+ print(f" PreEncoder: {pre_out['chunk_embs_in'][0,0,0]:.6f}")
289
+ print(f" Head: {head_out['chunk_pre_encoder_embs'][0,0,0]:.6f}")
290
+
291
+ print("\n" + "=" * 70)
292
+ print("Export Complete!")
293
+ print("=" * 70)
294
+ print(f"Models saved to: {output_dir}/")
295
+ print(f" - Pipeline_PreEncoder.mlpackage")
296
+ print(f" - Pipeline_Head_Fixed.mlpackage")
297
+ print(f" - SortformerPipeline.mlpackage (if pipeline creation succeeded)")
298
+ print(f"\nConfiguration (NVIDIA 1.04s latency):")
299
+ for k, v in NVIDIA_CONFIG.items():
300
+ print(f" {k}: {v}")
301
+
302
+ print(f"\nSwift SortformerConfig should use:")
303
+ print(f" chunkLen = 6")
304
+ print(f" chunkLeftContext = 1")
305
+ print(f" chunkRightContext = 7")
306
+ print(f" fifoLen = 188")
307
+ print(f" spkcacheLen = 188")
308
+ print(f" spkcacheUpdatePeriod = 144")
inference.py ADDED
@@ -0,0 +1,192 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Streaming Sortformer CoreML Inference
4
+
5
+ This script demonstrates how to use the CoreML-converted NVIDIA Streaming Sortformer
6
+ model for real-time speaker diarization on Apple Silicon.
7
+
8
+ Original model: nvidia/diar_streaming_sortformer_4spk-v2.1
9
+ """
10
+
11
+ import os
12
+ import numpy as np
13
+ import coremltools as ct
14
+
15
# Configuration matching NVIDIA's streaming settings
CONFIG = {
    "chunk_len": 6,                  # core chunk length, in encoder frames
    "chunk_left_context": 1,         # encoder frames of left context
    "chunk_right_context": 7,        # encoder frames of right context
    "fifo_len": 188,                 # FIFO embedding buffer length (frames)
    "spkcache_len": 188,             # speaker cache length (frames)
    "spkcache_update_period": 144,   # frames between speaker-cache refreshes
    "subsampling_factor": 8,         # mel frames per encoder frame
    "n_speakers": 4,                 # maximum number of tracked speakers
    "sample_rate": 16000,            # expected input sample rate (Hz)
    "mel_features": 128,             # mel filterbank size
}
28
+
29
+
30
class SortformerCoreML:
    """CoreML Streaming Sortformer diarizer.

    Wraps the exported CoreML models (preprocessor, pre-encoder, head) and
    maintains the streaming state — the speaker cache and the FIFO embedding
    buffer — across successive chunks of one audio session.
    """

    # Fast-Conformer embedding dimension (fc_d_model) used by the state
    # buffers; must match the exported models' embedding width.
    EMB_DIM = 512

    def __init__(self, model_dir: str = ".", compute_units: str = "CPU_ONLY"):
        """
        Initialize the CoreML Sortformer pipeline.

        Args:
            model_dir: Directory containing the .mlpackage files
            compute_units: "CPU_ONLY", "CPU_AND_GPU", or "ALL"
        """
        # Unknown compute-unit names silently fall back to CPU_ONLY.
        cu = getattr(ct.ComputeUnit, compute_units, ct.ComputeUnit.CPU_ONLY)

        # Load the three pipeline stages.
        self.preprocessor = ct.models.MLModel(
            os.path.join(model_dir, "Pipeline_Preprocessor.mlpackage"),
            compute_units=cu,
        )
        self.pre_encoder = ct.models.MLModel(
            os.path.join(model_dir, "Pipeline_PreEncoder.mlpackage"),
            compute_units=cu,
        )
        self.head = ct.models.MLModel(
            os.path.join(model_dir, "Pipeline_Head_Fixed.mlpackage"),
            compute_units=cu,
        )

        # Initialize state buffers for the first session.
        self.reset_state()

    def reset_state(self):
        """Reset streaming state for a new audio session."""
        self.spkcache = np.zeros((1, CONFIG["spkcache_len"], self.EMB_DIM), dtype=np.float32)
        self.fifo = np.zeros((1, CONFIG["fifo_len"], self.EMB_DIM), dtype=np.float32)
        self.spkcache_len = 0   # valid frames currently in spkcache
        self.fifo_len = 0       # valid frames currently in fifo
        self.chunk_idx = 0      # chunks processed so far this session

    def process_chunk(self, mel_features: np.ndarray, chunk_length: int) -> np.ndarray:
        """
        Process a single chunk of mel features.

        Args:
            mel_features: Mel spectrogram chunk [1, T, 128] where T <= the
                model's fixed mel input length (112 with the default config)
            chunk_length: Actual valid length (before padding)

        Returns:
            Speaker predictions [num_frames, 4] with probabilities for each speaker
        """
        # Fixed mel input length expected by the exported PreEncoder:
        # (left context + chunk + right context) encoder frames, times the
        # mel-to-encoder subsampling factor. 112 with the default config.
        mel_input_len = (
            CONFIG["chunk_left_context"] + CONFIG["chunk_len"] + CONFIG["chunk_right_context"]
        ) * CONFIG["subsampling_factor"]

        # Zero-pad short (e.g. final) chunks up to the fixed input length.
        if mel_features.shape[1] < mel_input_len:
            pad_len = mel_input_len - mel_features.shape[1]
            mel_features = np.pad(mel_features, ((0, 0), (0, pad_len), (0, 0)))

        # Stage 1: PreEncoder — concatenates spkcache + fifo + new chunk
        # embeddings and also returns the chunk embeddings separately.
        pre_out = self.pre_encoder.predict({
            "chunk": mel_features.astype(np.float32),
            "chunk_lengths": np.array([chunk_length], dtype=np.int32),
            "spkcache": self.spkcache,
            "spkcache_lengths": np.array([self.spkcache_len], dtype=np.int32),
            "fifo": self.fifo,
            "fifo_lengths": np.array([self.fifo_len], dtype=np.int32)
        })

        # Stage 2: Head — frontend encoder + Sortformer inference.
        head_out = self.head.predict({
            "pre_encoder_embs": pre_out["pre_encoder_embs"],
            "pre_encoder_lengths": pre_out["pre_encoder_lengths"],
            "chunk_embs_in": pre_out["chunk_embs_in"],
            "chunk_lens_in": pre_out["chunk_lens_in"]
        })

        # Extract the predictions belonging to this chunk: skip the frames
        # covering spkcache, fifo, and the left context, and drop the
        # right-context tail. The first chunk has no left context.
        emb_len = int(head_out["chunk_pre_encoder_lengths"][0])
        lc = 0 if self.chunk_idx == 0 else CONFIG["chunk_left_context"]
        rc = CONFIG["chunk_right_context"]
        chunk_pred_len = emb_len - lc - rc

        pred_offset = self.spkcache_len + self.fifo_len + lc
        predictions = head_out["speaker_preds"][0, pred_offset:pred_offset + chunk_pred_len, :]

        # Update state (simplified - full implementation needs NeMo's streaming_update logic)
        self._update_state(pre_out, emb_len)

        self.chunk_idx += 1
        return predictions

    def _update_state(self, pre_out, emb_len):
        """Update spkcache and fifo state buffers.

        Simplified best-effort logic: when the FIFO overflows, the oldest
        frames are moved into the speaker cache; once the speaker cache is
        full, overflowing frames are dropped (NeMo's streaming_update would
        compress the cache instead).

        Assumes emb_len <= CONFIG["fifo_len"] (true for the default config,
        where one chunk yields ~14 encoder frames).
        """
        # Embeddings produced for the new chunk.
        new_embs = pre_out["chunk_embs_in"][0, :emb_len, :]

        if self.fifo_len + emb_len <= CONFIG["fifo_len"]:
            # Fast path: the FIFO has room — just append.
            self.fifo[0, self.fifo_len:self.fifo_len + emb_len, :] = new_embs
            self.fifo_len += emb_len
        else:
            # FIFO overflow - move the oldest frames to spkcache.
            overflow = self.fifo_len + emb_len - CONFIG["fifo_len"]

            # Absorb the overflow into the speaker cache while it has room;
            # otherwise those frames are silently dropped (see docstring).
            if self.spkcache_len + overflow <= CONFIG["spkcache_len"]:
                self.spkcache[0, self.spkcache_len:self.spkcache_len + overflow, :] = \
                    self.fifo[0, :overflow, :]
                self.spkcache_len += overflow

            # Shift the FIFO left by `overflow` and append the new chunk.
            self.fifo[0, :self.fifo_len - overflow, :] = self.fifo[0, overflow:self.fifo_len, :]
            self.fifo_len -= overflow
            self.fifo[0, self.fifo_len:self.fifo_len + emb_len, :] = new_embs
            self.fifo_len += emb_len
141
+
142
+
143
def process_audio(audio_path: str, model_dir: str = ".") -> list:
    """
    Process an audio file and return diarization results.

    Args:
        audio_path: Path to audio file (16kHz mono WAV)
        model_dir: Directory containing CoreML models

    Returns:
        List of (start_time, end_time, speaker_id) tuples
    """
    import torch
    import torchaudio

    # Load the audio and normalize it to 16 kHz mono.
    waveform, sr = torchaudio.load(audio_path)
    if sr != 16000:
        waveform = torchaudio.functional.resample(waveform, sr, 16000)
    if waveform.shape[0] > 1:
        waveform = waveform.mean(dim=0, keepdim=True)

    # Build the CoreML pipeline wrapper (loads all three .mlpackage models).
    model = SortformerCoreML(model_dir)

    # Mel extraction + chunked streaming inference are still TODO:
    # compute NeMo-compatible mel features (via Pipeline_Preprocessor or a
    # native implementation), then feed fixed-size chunks through
    # model.process_chunk() and collect the per-frame predictions.
    print(f"Loaded audio: {waveform.shape}, {sr}Hz")
    print("Processing... (implement chunking logic)")

    return []
177
+
178
+
179
if __name__ == "__main__":
    import sys

    # Require an audio file argument; otherwise print usage and bail out.
    if len(sys.argv) < 2:
        print("Usage: python inference.py <audio_file.wav>")
        print("\nThis script requires:")
        for pkg in ("Pipeline_Preprocessor", "Pipeline_PreEncoder", "Pipeline_Head_Fixed"):
            print(f" - {pkg}.mlpackage")
        sys.exit(1)

    # Run diarization and print one line per detected speaker segment.
    results = process_audio(sys.argv[1])
    for start, end, speaker in results:
        print(f"[{start:.2f}s - {end:.2f}s] Speaker {speaker}")
pyproject.toml ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [project]
2
+ name = "streaming-sortformer-coreml"
3
+ version = "0.1.0"
4
+ description = "CoreML conversion of NVIDIA Streaming Sortformer for Apple Silicon"
5
+ readme = "README.md"
6
+ requires-python = ">=3.10"
7
+ license = "Apache-2.0"
8
+ dependencies = [
9
+ "coremltools>=7.0",
10
+ "torch>=2.0",
11
+ "torchaudio>=2.0",
12
+ "numpy>=1.24",
13
+ ]
14
+
15
+ [project.optional-dependencies]
16
+ convert = [
17
+ "nemo_toolkit[asr]>=2.0",
18
+ ]
19
+
20
+ [build-system]
21
+ requires = ["hatchling"]
22
+ build-backend = "hatchling.build"