Add CAM++ CoreML (preprocessor + speaker embedding + card)
Browse files
- CamPlusPlus.mlmodelc/analytics/coremldata.bin +3 -0
- CamPlusPlus.mlmodelc/coremldata.bin +3 -0
- CamPlusPlus.mlmodelc/model.mil +0 -0
- CamPlusPlus.mlmodelc/weights/weight.bin +3 -0
- CamPlusPreprocessor.mlmodelc/analytics/coremldata.bin +3 -0
- CamPlusPreprocessor.mlmodelc/coremldata.bin +3 -0
- CamPlusPreprocessor.mlmodelc/model.mil +64 -0
- CamPlusPreprocessor.mlmodelc/weights/weight.bin +3 -0
- README.md +38 -0
CamPlusPlus.mlmodelc/analytics/coremldata.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:8689da22bfcbf91474daacc0128bba99898fa3104d9920bad4f2f4396b0e5e0c
|
| 3 |
+
size 243
|
CamPlusPlus.mlmodelc/coremldata.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:96fc1461a01a8a89fa8ff3efed0bb6dc6bc186d44d566f50a02919a957e27778
|
| 3 |
+
size 315
|
CamPlusPlus.mlmodelc/model.mil
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
CamPlusPlus.mlmodelc/weights/weight.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:058c317b6a768cac30fcf89b46676d7df1aa1915357c44ecc7ada4e5b7c11590
|
| 3 |
+
size 13854912
|
CamPlusPreprocessor.mlmodelc/analytics/coremldata.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:11e99d94be389165098b54c5de6f8745ed44de70f6fd294a20ca9548c626df77
|
| 3 |
+
size 243
|
CamPlusPreprocessor.mlmodelc/coremldata.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:64fb697bb216eddb9f691250e45fee44b2ef87b52b047ede0063fe7275571f00
|
| 3 |
+
size 330
|
CamPlusPreprocessor.mlmodelc/model.mil
ADDED
|
@@ -0,0 +1,64 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
program(1.0)
|
| 2 |
+
[buildInfo = dict<tensor<string, []>, tensor<string, []>>({{"coremlc-component-MIL", "3520.4.1"}, {"coremlc-version", "3520.5.1"}, {"coremltools-component-torch", "2.5.1"}, {"coremltools-source-dialect", "TorchScript"}, {"coremltools-version", "8.3.0"}})]
|
| 3 |
+
{
|
| 4 |
+
func main<ios17>(tensor<fp32, [1, ?]> waveform) [FlexibleShapeInformation = tuple<tuple<tensor<string, []>, dict<tensor<string, []>, tensor<int32, [?]>>>, tuple<tensor<string, []>, dict<tensor<string, []>, list<tensor<int32, [2]>, ?>>>>((("DefaultShapes", {{"waveform", [1, 64000]}}), ("RangeDims", {{"waveform", [[1, 1], [8000, 1600000]]}})))] {
|
| 5 |
+
tensor<fp32, [400]> window = const()[name = tensor<string, []>("window"), val = tensor<fp32, [400]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(64)))];
|
| 6 |
+
tensor<fp32, [400, 1, 400]> frame_kernel = const()[name = tensor<string, []>("frame_kernel"), val = tensor<fp32, [400, 1, 400]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(1728)))];
|
| 7 |
+
tensor<int32, [1]> var_8_axes_0 = const()[name = tensor<string, []>("op_8_axes_0"), val = tensor<int32, [1]>([1])];
|
| 8 |
+
tensor<fp32, [1, 1, ?]> var_8 = expand_dims(axes = var_8_axes_0, x = waveform)[name = tensor<string, []>("op_8")];
|
| 9 |
+
tensor<string, []> var_24_pad_type_0 = const()[name = tensor<string, []>("op_24_pad_type_0"), val = tensor<string, []>("valid")];
|
| 10 |
+
tensor<int32, [1]> var_24_strides_0 = const()[name = tensor<string, []>("op_24_strides_0"), val = tensor<int32, [1]>([160])];
|
| 11 |
+
tensor<int32, [2]> var_24_pad_0 = const()[name = tensor<string, []>("op_24_pad_0"), val = tensor<int32, [2]>([0, 0])];
|
| 12 |
+
tensor<int32, [1]> var_24_dilations_0 = const()[name = tensor<string, []>("op_24_dilations_0"), val = tensor<int32, [1]>([1])];
|
| 13 |
+
tensor<int32, []> var_24_groups_0 = const()[name = tensor<string, []>("op_24_groups_0"), val = tensor<int32, []>(1)];
|
| 14 |
+
tensor<fp32, [1, 400, ?]> var_24 = conv(dilations = var_24_dilations_0, groups = var_24_groups_0, pad = var_24_pad_0, pad_type = var_24_pad_type_0, strides = var_24_strides_0, weight = frame_kernel, x = var_8)[name = tensor<string, []>("op_24")];
|
| 15 |
+
tensor<int32, [3]> var_27_begin_0 = const()[name = tensor<string, []>("op_27_begin_0"), val = tensor<int32, [3]>([0, 0, 0])];
|
| 16 |
+
tensor<int32, [3]> var_27_end_0 = const()[name = tensor<string, []>("op_27_end_0"), val = tensor<int32, [3]>([1, 400, 0])];
|
| 17 |
+
tensor<bool, [3]> var_27_end_mask_0 = const()[name = tensor<string, []>("op_27_end_mask_0"), val = tensor<bool, [3]>([false, true, true])];
|
| 18 |
+
tensor<bool, [3]> var_27_squeeze_mask_0 = const()[name = tensor<string, []>("op_27_squeeze_mask_0"), val = tensor<bool, [3]>([true, false, false])];
|
| 19 |
+
tensor<fp32, [400, ?]> var_27 = slice_by_index(begin = var_27_begin_0, end = var_27_end_0, end_mask = var_27_end_mask_0, squeeze_mask = var_27_squeeze_mask_0, x = var_24)[name = tensor<string, []>("op_27")];
|
| 20 |
+
tensor<int32, [2]> frames_1_perm_0 = const()[name = tensor<string, []>("frames_1_perm_0"), val = tensor<int32, [2]>([1, 0])];
|
| 21 |
+
tensor<int32, [1]> var_33_axes_0 = const()[name = tensor<string, []>("op_33_axes_0"), val = tensor<int32, [1]>([1])];
|
| 22 |
+
tensor<bool, []> var_33_keep_dims_0 = const()[name = tensor<string, []>("op_33_keep_dims_0"), val = tensor<bool, []>(true)];
|
| 23 |
+
tensor<fp32, [?, 400]> frames_1 = transpose(perm = frames_1_perm_0, x = var_27)[name = tensor<string, []>("transpose_3")];
|
| 24 |
+
tensor<fp32, [?, 1]> var_33 = reduce_mean(axes = var_33_axes_0, keep_dims = var_33_keep_dims_0, x = frames_1)[name = tensor<string, []>("op_33")];
|
| 25 |
+
tensor<fp32, [?, 400]> frames_3 = sub(x = frames_1, y = var_33)[name = tensor<string, []>("frames_3")];
|
| 26 |
+
tensor<int32, [2]> var_45_begin_0 = const()[name = tensor<string, []>("op_45_begin_0"), val = tensor<int32, [2]>([0, 0])];
|
| 27 |
+
tensor<int32, [2]> var_45_end_0 = const()[name = tensor<string, []>("op_45_end_0"), val = tensor<int32, [2]>([0, 1])];
|
| 28 |
+
tensor<bool, [2]> var_45_end_mask_0 = const()[name = tensor<string, []>("op_45_end_mask_0"), val = tensor<bool, [2]>([true, false])];
|
| 29 |
+
tensor<fp32, [?, 1]> var_45 = slice_by_index(begin = var_45_begin_0, end = var_45_end_0, end_mask = var_45_end_mask_0, x = frames_3)[name = tensor<string, []>("op_45")];
|
| 30 |
+
tensor<int32, [2]> var_55_begin_0 = const()[name = tensor<string, []>("op_55_begin_0"), val = tensor<int32, [2]>([0, 0])];
|
| 31 |
+
tensor<int32, [2]> var_55_end_0 = const()[name = tensor<string, []>("op_55_end_0"), val = tensor<int32, [2]>([0, 399])];
|
| 32 |
+
tensor<bool, [2]> var_55_end_mask_0 = const()[name = tensor<string, []>("op_55_end_mask_0"), val = tensor<bool, [2]>([true, false])];
|
| 33 |
+
tensor<fp32, [?, 399]> var_55 = slice_by_index(begin = var_55_begin_0, end = var_55_end_0, end_mask = var_55_end_mask_0, x = frames_3)[name = tensor<string, []>("op_55")];
|
| 34 |
+
tensor<int32, []> var_57 = const()[name = tensor<string, []>("op_57"), val = tensor<int32, []>(1)];
|
| 35 |
+
tensor<bool, []> shifted_interleave_0 = const()[name = tensor<string, []>("shifted_interleave_0"), val = tensor<bool, []>(false)];
|
| 36 |
+
tensor<fp32, [?, 400]> shifted = concat(axis = var_57, interleave = shifted_interleave_0, values = (var_45, var_55))[name = tensor<string, []>("shifted")];
|
| 37 |
+
tensor<fp32, []> var_59 = const()[name = tensor<string, []>("op_59"), val = tensor<fp32, []>(0x1.f0a3d8p-1)];
|
| 38 |
+
tensor<fp32, [?, 400]> var_60 = mul(x = shifted, y = var_59)[name = tensor<string, []>("op_60")];
|
| 39 |
+
tensor<fp32, [?, 400]> frames_5 = sub(x = frames_3, y = var_60)[name = tensor<string, []>("frames_5")];
|
| 40 |
+
tensor<fp32, [?, 400]> input = mul(x = frames_5, y = window)[name = tensor<string, []>("input")];
|
| 41 |
+
tensor<fp32, []> const_0 = const()[name = tensor<string, []>("const_0"), val = tensor<fp32, []>(0x0p+0)];
|
| 42 |
+
tensor<int32, [4]> frames_pad_0 = const()[name = tensor<string, []>("frames_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 112])];
|
| 43 |
+
tensor<string, []> frames_mode_0 = const()[name = tensor<string, []>("frames_mode_0"), val = tensor<string, []>("constant")];
|
| 44 |
+
tensor<fp32, [?, 512]> frames = pad(constant_val = const_0, mode = frames_mode_0, pad = frames_pad_0, x = input)[name = tensor<string, []>("frames")];
|
| 45 |
+
tensor<fp32, [257, 512]> transpose_0 = const()[name = tensor<string, []>("transpose_0"), val = tensor<fp32, [257, 512]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(641792)))];
|
| 46 |
+
tensor<fp32, [257]> re_bias_0 = const()[name = tensor<string, []>("re_bias_0"), val = tensor<fp32, [257]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(1168192)))];
|
| 47 |
+
tensor<fp32, [?, 257]> re = linear(bias = re_bias_0, weight = transpose_0, x = frames)[name = tensor<string, []>("re")];
|
| 48 |
+
tensor<fp32, [257, 512]> transpose_1 = const()[name = tensor<string, []>("transpose_1"), val = tensor<fp32, [257, 512]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(1169344)))];
|
| 49 |
+
tensor<fp32, [?, 257]> im = linear(bias = re_bias_0, weight = transpose_1, x = frames)[name = tensor<string, []>("im")];
|
| 50 |
+
tensor<fp32, [?, 257]> var_72 = mul(x = re, y = re)[name = tensor<string, []>("op_72")];
|
| 51 |
+
tensor<fp32, [?, 257]> var_73 = mul(x = im, y = im)[name = tensor<string, []>("op_73")];
|
| 52 |
+
tensor<fp32, [?, 257]> var_75 = add(x = var_72, y = var_73)[name = tensor<string, []>("op_75")];
|
| 53 |
+
tensor<fp32, []> var_76 = const()[name = tensor<string, []>("op_76"), val = tensor<fp32, []>(0x1p-23)];
|
| 54 |
+
tensor<fp32, []> const_1 = const()[name = tensor<string, []>("const_1"), val = tensor<fp32, []>(0x1.fffffep+127)];
|
| 55 |
+
tensor<fp32, [?, 257]> clip_0 = clip(alpha = var_76, beta = const_1, x = var_75)[name = tensor<string, []>("clip_0")];
|
| 56 |
+
tensor<fp32, [80, 257]> transpose_2 = const()[name = tensor<string, []>("transpose_2"), val = tensor<fp32, [80, 257]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(1695744)))];
|
| 57 |
+
tensor<fp32, [80]> var_79_bias_0 = const()[name = tensor<string, []>("op_79_bias_0"), val = tensor<fp32, [80]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(1778048)))];
|
| 58 |
+
tensor<fp32, [?, 80]> var_79 = linear(bias = var_79_bias_0, weight = transpose_2, x = clip_0)[name = tensor<string, []>("op_79")];
|
| 59 |
+
tensor<fp32, []> fbank_epsilon_0 = const()[name = tensor<string, []>("fbank_epsilon_0"), val = tensor<fp32, []>(0x1p-149)];
|
| 60 |
+
tensor<fp32, [?, 80]> fbank = log(epsilon = fbank_epsilon_0, x = var_79)[name = tensor<string, []>("fbank")];
|
| 61 |
+
tensor<int32, [1]> var_82_axes_0 = const()[name = tensor<string, []>("op_82_axes_0"), val = tensor<int32, [1]>([0])];
|
| 62 |
+
tensor<fp32, [1, ?, 80]> features = expand_dims(axes = var_82_axes_0, x = fbank)[name = tensor<string, []>("op_82")];
|
| 63 |
+
} -> (features);
|
| 64 |
+
}
|
CamPlusPreprocessor.mlmodelc/weights/weight.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:09f13054339581673c8b5745954471945a61fea54441b6e55f934f041611143a
|
| 3 |
+
size 1778432
|
README.md
ADDED
|
@@ -0,0 +1,38 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
license: other
|
| 3 |
+
license_name: campplus-upstream
|
| 4 |
+
license_link: https://github.com/modelscope/FunASR
|
| 5 |
+
language: [zh]
|
| 6 |
+
library_name: coreml
|
| 7 |
+
tags: [coreml, ane, speaker-verification, speaker-diarization, campplus, funasr, fluidaudio]
|
| 8 |
+
pipeline_tag: audio-classification
|
| 9 |
+
---
|
| 10 |
+
|
| 11 |
+
# CAM++ — CoreML (Apple Neural Engine)
|
| 12 |
+
|
| 13 |
+
CoreML conversion of FunASR's **CAM++** speaker-embedding model (~7.2M params), for
|
| 14 |
+
on-device speaker verification / diarization on Apple Silicon. Upstream:
|
| 15 |
+
[iic/speech_campplus_sv_zh-cn_16k-common](https://www.modelscope.cn/models/iic/speech_campplus_sv_zh-cn_16k-common).
|
| 16 |
+
|
| 17 |
+
## Files
|
| 18 |
+
|
| 19 |
+
| File | Precision | Compute unit | Role |
|
| 20 |
+
|------|-----------|--------------|------|
|
| 21 |
+
| `CamPlusPreprocessor.mlmodelc` | FP32 | CPU | 16 kHz mono waveform `[1, N]` → 80-d fbank features |
|
| 22 |
+
| `CamPlusPlus.mlmodelc` | FP16 | ANE | fbank → 192-d speaker embedding |
|
| 23 |
+
|
| 24 |
+
## Pipeline
|
| 25 |
+
|
| 26 |
+
```
|
| 27 |
+
waveform → [Preprocessor fp32/CPU] → fbank [1,T,80]
|
| 28 |
+
→ [CAM++ fp16/ANE] → embedding [1,192] (L2-normalize, then cosine for verification/clustering)
|
| 29 |
+
```
|
| 30 |
+
|
| 31 |
+
CAM++ normalizes the fbank internally. The 192-d embedding is used with cosine
|
| 32 |
+
similarity for speaker verification and diarization clustering.
|
| 33 |
+
|
| 34 |
+
Parity: the cosine similarity between PyTorch and CoreML embeddings is 0.99998 on random input and 0.99999 on real audio run through the preprocessor.
|
| 35 |
+
|
| 36 |
+
## License
|
| 37 |
+
|
| 38 |
+
The weights are derived from FunASR's CAM++ model, so the upstream license applies. This repository contains a format conversion only.
|