alexwengg committed on
Commit
83bddac
·
verified ·
1 Parent(s): 2e41cf8

Add CAM++ CoreML (preprocessor + speaker embedding + card)

Browse files
CamPlusPlus.mlmodelc/analytics/coremldata.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8689da22bfcbf91474daacc0128bba99898fa3104d9920bad4f2f4396b0e5e0c
3
+ size 243
CamPlusPlus.mlmodelc/coremldata.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:96fc1461a01a8a89fa8ff3efed0bb6dc6bc186d44d566f50a02919a957e27778
3
+ size 315
CamPlusPlus.mlmodelc/model.mil ADDED
The diff for this file is too large to render. See raw diff
 
CamPlusPlus.mlmodelc/weights/weight.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:058c317b6a768cac30fcf89b46676d7df1aa1915357c44ecc7ada4e5b7c11590
3
+ size 13854912
CamPlusPreprocessor.mlmodelc/analytics/coremldata.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:11e99d94be389165098b54c5de6f8745ed44de70f6fd294a20ca9548c626df77
3
+ size 243
CamPlusPreprocessor.mlmodelc/coremldata.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:64fb697bb216eddb9f691250e45fee44b2ef87b52b047ede0063fe7275571f00
3
+ size 330
CamPlusPreprocessor.mlmodelc/model.mil ADDED
@@ -0,0 +1,64 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ program(1.0)
2
+ [buildInfo = dict<tensor<string, []>, tensor<string, []>>({{"coremlc-component-MIL", "3520.4.1"}, {"coremlc-version", "3520.5.1"}, {"coremltools-component-torch", "2.5.1"}, {"coremltools-source-dialect", "TorchScript"}, {"coremltools-version", "8.3.0"}})]
3
+ {
4
+ func main<ios17>(tensor<fp32, [1, ?]> waveform) [FlexibleShapeInformation = tuple<tuple<tensor<string, []>, dict<tensor<string, []>, tensor<int32, [?]>>>, tuple<tensor<string, []>, dict<tensor<string, []>, list<tensor<int32, [2]>, ?>>>>((("DefaultShapes", {{"waveform", [1, 64000]}}), ("RangeDims", {{"waveform", [[1, 1], [8000, 1600000]]}})))] {
5
+ tensor<fp32, [400]> window = const()[name = tensor<string, []>("window"), val = tensor<fp32, [400]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(64)))];
6
+ tensor<fp32, [400, 1, 400]> frame_kernel = const()[name = tensor<string, []>("frame_kernel"), val = tensor<fp32, [400, 1, 400]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(1728)))];
7
+ tensor<int32, [1]> var_8_axes_0 = const()[name = tensor<string, []>("op_8_axes_0"), val = tensor<int32, [1]>([1])];
8
+ tensor<fp32, [1, 1, ?]> var_8 = expand_dims(axes = var_8_axes_0, x = waveform)[name = tensor<string, []>("op_8")];
9
+ tensor<string, []> var_24_pad_type_0 = const()[name = tensor<string, []>("op_24_pad_type_0"), val = tensor<string, []>("valid")];
10
+ tensor<int32, [1]> var_24_strides_0 = const()[name = tensor<string, []>("op_24_strides_0"), val = tensor<int32, [1]>([160])];
11
+ tensor<int32, [2]> var_24_pad_0 = const()[name = tensor<string, []>("op_24_pad_0"), val = tensor<int32, [2]>([0, 0])];
12
+ tensor<int32, [1]> var_24_dilations_0 = const()[name = tensor<string, []>("op_24_dilations_0"), val = tensor<int32, [1]>([1])];
13
+ tensor<int32, []> var_24_groups_0 = const()[name = tensor<string, []>("op_24_groups_0"), val = tensor<int32, []>(1)];
14
+ tensor<fp32, [1, 400, ?]> var_24 = conv(dilations = var_24_dilations_0, groups = var_24_groups_0, pad = var_24_pad_0, pad_type = var_24_pad_type_0, strides = var_24_strides_0, weight = frame_kernel, x = var_8)[name = tensor<string, []>("op_24")];
15
+ tensor<int32, [3]> var_27_begin_0 = const()[name = tensor<string, []>("op_27_begin_0"), val = tensor<int32, [3]>([0, 0, 0])];
16
+ tensor<int32, [3]> var_27_end_0 = const()[name = tensor<string, []>("op_27_end_0"), val = tensor<int32, [3]>([1, 400, 0])];
17
+ tensor<bool, [3]> var_27_end_mask_0 = const()[name = tensor<string, []>("op_27_end_mask_0"), val = tensor<bool, [3]>([false, true, true])];
18
+ tensor<bool, [3]> var_27_squeeze_mask_0 = const()[name = tensor<string, []>("op_27_squeeze_mask_0"), val = tensor<bool, [3]>([true, false, false])];
19
+ tensor<fp32, [400, ?]> var_27 = slice_by_index(begin = var_27_begin_0, end = var_27_end_0, end_mask = var_27_end_mask_0, squeeze_mask = var_27_squeeze_mask_0, x = var_24)[name = tensor<string, []>("op_27")];
20
+ tensor<int32, [2]> frames_1_perm_0 = const()[name = tensor<string, []>("frames_1_perm_0"), val = tensor<int32, [2]>([1, 0])];
21
+ tensor<int32, [1]> var_33_axes_0 = const()[name = tensor<string, []>("op_33_axes_0"), val = tensor<int32, [1]>([1])];
22
+ tensor<bool, []> var_33_keep_dims_0 = const()[name = tensor<string, []>("op_33_keep_dims_0"), val = tensor<bool, []>(true)];
23
+ tensor<fp32, [?, 400]> frames_1 = transpose(perm = frames_1_perm_0, x = var_27)[name = tensor<string, []>("transpose_3")];
24
+ tensor<fp32, [?, 1]> var_33 = reduce_mean(axes = var_33_axes_0, keep_dims = var_33_keep_dims_0, x = frames_1)[name = tensor<string, []>("op_33")];
25
+ tensor<fp32, [?, 400]> frames_3 = sub(x = frames_1, y = var_33)[name = tensor<string, []>("frames_3")];
26
+ tensor<int32, [2]> var_45_begin_0 = const()[name = tensor<string, []>("op_45_begin_0"), val = tensor<int32, [2]>([0, 0])];
27
+ tensor<int32, [2]> var_45_end_0 = const()[name = tensor<string, []>("op_45_end_0"), val = tensor<int32, [2]>([0, 1])];
28
+ tensor<bool, [2]> var_45_end_mask_0 = const()[name = tensor<string, []>("op_45_end_mask_0"), val = tensor<bool, [2]>([true, false])];
29
+ tensor<fp32, [?, 1]> var_45 = slice_by_index(begin = var_45_begin_0, end = var_45_end_0, end_mask = var_45_end_mask_0, x = frames_3)[name = tensor<string, []>("op_45")];
30
+ tensor<int32, [2]> var_55_begin_0 = const()[name = tensor<string, []>("op_55_begin_0"), val = tensor<int32, [2]>([0, 0])];
31
+ tensor<int32, [2]> var_55_end_0 = const()[name = tensor<string, []>("op_55_end_0"), val = tensor<int32, [2]>([0, 399])];
32
+ tensor<bool, [2]> var_55_end_mask_0 = const()[name = tensor<string, []>("op_55_end_mask_0"), val = tensor<bool, [2]>([true, false])];
33
+ tensor<fp32, [?, 399]> var_55 = slice_by_index(begin = var_55_begin_0, end = var_55_end_0, end_mask = var_55_end_mask_0, x = frames_3)[name = tensor<string, []>("op_55")];
34
+ tensor<int32, []> var_57 = const()[name = tensor<string, []>("op_57"), val = tensor<int32, []>(1)];
35
+ tensor<bool, []> shifted_interleave_0 = const()[name = tensor<string, []>("shifted_interleave_0"), val = tensor<bool, []>(false)];
36
+ tensor<fp32, [?, 400]> shifted = concat(axis = var_57, interleave = shifted_interleave_0, values = (var_45, var_55))[name = tensor<string, []>("shifted")];
37
+ tensor<fp32, []> var_59 = const()[name = tensor<string, []>("op_59"), val = tensor<fp32, []>(0x1.f0a3d8p-1)];
38
+ tensor<fp32, [?, 400]> var_60 = mul(x = shifted, y = var_59)[name = tensor<string, []>("op_60")];
39
+ tensor<fp32, [?, 400]> frames_5 = sub(x = frames_3, y = var_60)[name = tensor<string, []>("frames_5")];
40
+ tensor<fp32, [?, 400]> input = mul(x = frames_5, y = window)[name = tensor<string, []>("input")];
41
+ tensor<fp32, []> const_0 = const()[name = tensor<string, []>("const_0"), val = tensor<fp32, []>(0x0p+0)];
42
+ tensor<int32, [4]> frames_pad_0 = const()[name = tensor<string, []>("frames_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 112])];
43
+ tensor<string, []> frames_mode_0 = const()[name = tensor<string, []>("frames_mode_0"), val = tensor<string, []>("constant")];
44
+ tensor<fp32, [?, 512]> frames = pad(constant_val = const_0, mode = frames_mode_0, pad = frames_pad_0, x = input)[name = tensor<string, []>("frames")];
45
+ tensor<fp32, [257, 512]> transpose_0 = const()[name = tensor<string, []>("transpose_0"), val = tensor<fp32, [257, 512]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(641792)))];
46
+ tensor<fp32, [257]> re_bias_0 = const()[name = tensor<string, []>("re_bias_0"), val = tensor<fp32, [257]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(1168192)))];
47
+ tensor<fp32, [?, 257]> re = linear(bias = re_bias_0, weight = transpose_0, x = frames)[name = tensor<string, []>("re")];
48
+ tensor<fp32, [257, 512]> transpose_1 = const()[name = tensor<string, []>("transpose_1"), val = tensor<fp32, [257, 512]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(1169344)))];
49
+ tensor<fp32, [?, 257]> im = linear(bias = re_bias_0, weight = transpose_1, x = frames)[name = tensor<string, []>("im")];
50
+ tensor<fp32, [?, 257]> var_72 = mul(x = re, y = re)[name = tensor<string, []>("op_72")];
51
+ tensor<fp32, [?, 257]> var_73 = mul(x = im, y = im)[name = tensor<string, []>("op_73")];
52
+ tensor<fp32, [?, 257]> var_75 = add(x = var_72, y = var_73)[name = tensor<string, []>("op_75")];
53
+ tensor<fp32, []> var_76 = const()[name = tensor<string, []>("op_76"), val = tensor<fp32, []>(0x1p-23)];
54
+ tensor<fp32, []> const_1 = const()[name = tensor<string, []>("const_1"), val = tensor<fp32, []>(0x1.fffffep+127)];
55
+ tensor<fp32, [?, 257]> clip_0 = clip(alpha = var_76, beta = const_1, x = var_75)[name = tensor<string, []>("clip_0")];
56
+ tensor<fp32, [80, 257]> transpose_2 = const()[name = tensor<string, []>("transpose_2"), val = tensor<fp32, [80, 257]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(1695744)))];
57
+ tensor<fp32, [80]> var_79_bias_0 = const()[name = tensor<string, []>("op_79_bias_0"), val = tensor<fp32, [80]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(1778048)))];
58
+ tensor<fp32, [?, 80]> var_79 = linear(bias = var_79_bias_0, weight = transpose_2, x = clip_0)[name = tensor<string, []>("op_79")];
59
+ tensor<fp32, []> fbank_epsilon_0 = const()[name = tensor<string, []>("fbank_epsilon_0"), val = tensor<fp32, []>(0x1p-149)];
60
+ tensor<fp32, [?, 80]> fbank = log(epsilon = fbank_epsilon_0, x = var_79)[name = tensor<string, []>("fbank")];
61
+ tensor<int32, [1]> var_82_axes_0 = const()[name = tensor<string, []>("op_82_axes_0"), val = tensor<int32, [1]>([0])];
62
+ tensor<fp32, [1, ?, 80]> features = expand_dims(axes = var_82_axes_0, x = fbank)[name = tensor<string, []>("op_82")];
63
+ } -> (features);
64
+ }
CamPlusPreprocessor.mlmodelc/weights/weight.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:09f13054339581673c8b5745954471945a61fea54441b6e55f934f041611143a
3
+ size 1778432
README.md ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ license: other
3
+ license_name: campplus-upstream
4
+ license_link: https://github.com/modelscope/FunASR
5
+ language: [zh]
6
+ library_name: coreml
7
+ tags: [coreml, ane, speaker-verification, speaker-diarization, campplus, funasr, fluidaudio]
8
+ pipeline_tag: audio-classification
9
+ ---
10
+
11
+ # CAM++ — CoreML (Apple Neural Engine)
12
+
13
+ CoreML conversion of FunASR's **CAM++** speaker-embedding model (~7.2M params), for
14
+ on-device speaker verification / diarization on Apple Silicon. Upstream:
15
+ [iic/speech_campplus_sv_zh-cn_16k-common](https://www.modelscope.cn/models/iic/speech_campplus_sv_zh-cn_16k-common).
16
+
17
+ ## Files
18
+
19
+ | File | Precision | Compute unit | Role |
20
+ |------|-----------|--------------|------|
21
+ | `CamPlusPreprocessor.mlmodelc` | FP32 | CPU | waveform → 80-d fbank features |
22
+ | `CamPlusPlus.mlmodelc` | FP16 | ANE | fbank → 192-d speaker embedding |
23
+
24
+ ## Pipeline
25
+
26
+ ```
27
+ waveform → [Preprocessor fp32/CPU] → fbank [1,T,80]
28
+ → [CAM++ fp16/ANE] → embedding [1,192] (L2-normalize, then cosine for verification/clustering)
29
+ ```
30
+
31
+ CAM++ normalizes the fbank internally, so pass the preprocessor output through unchanged. The 192-d embedding is used with cosine
32
+ similarity for speaker verification and diarization clustering.
33
+
34
+ Parity: cosine similarity between PyTorch and CoreML embeddings is 0.99998 on random input and 0.99999 on real audio run through the preprocessor.
35
+
36
+ ## License
37
+
38
+ The weights derive from FunASR's CAM++ and remain under the upstream license; this repository only converts the format.