leduclinh aufklarer committed on
Commit
6949b43
·
0 Parent(s):

Duplicate from aufklarer/Pyannote-Segmentation-MLX

Browse files

Co-authored-by: Ivan <aufklarer@users.noreply.huggingface.co>

Files changed (4) hide show
  1. .gitattributes +35 -0
  2. README.md +78 -0
  3. config.json +45 -0
  4. model.safetensors +3 -0
.gitattributes ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,78 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ license: mit
3
+ tags:
4
+ - mlx
5
+ - voice-activity-detection
6
+ - speaker-segmentation
7
+ - speaker-diarization
8
+ - pyannote
9
+ - apple-silicon
10
+ base_model: pyannote/segmentation-3.0
11
+ library_name: mlx
12
+ pipeline_tag: voice-activity-detection
13
+ ---
14
+
15
+ # Pyannote Segmentation 3.0 — MLX
16
+
17
+ MLX-compatible weights for [pyannote/segmentation-3.0](https://huggingface.co/pyannote/segmentation-3.0) (PyanNet), converted from the official PyTorch Lightning checkpoint. The SincNet filters are pre-computed during conversion and stored as ordinary convolution weights.
18
+
19
+ ## Model
20
+
21
+ PyanNet is a speaker segmentation model (~1.5M params) that processes 10-second audio windows and outputs 7-class powerset probabilities covering up to 3 speakers per window, with at most 2 of them speaking simultaneously. It is used for both voice activity detection (binary speech/non-speech) and speaker diarization (per-speaker activity).
22
+
23
+ **Architecture:** SincNet → BiLSTM(4 layers) → Linear(2 layers) → 7-class softmax
24
+
25
+ **Output classes:** non-speech, spk1, spk2, spk3, spk1+2, spk1+3, spk2+3
26
+
27
+ ## Usage (Swift / MLX)
28
+
29
+ ```swift
30
+ import SpeechVAD
31
+
32
+ // Voice Activity Detection
33
+ let vad = try await PyannoteVADModel.fromPretrained()
34
+ let segments = vad.detectSpeech(audio: samples, sampleRate: 16000)
35
+ for seg in segments {
36
+ print("Speech: \(seg.startTime)s - \(seg.endTime)s")
37
+ }
38
+
39
+ // Speaker Diarization (with WeSpeaker embeddings)
40
+ let pipeline = try await DiarizationPipeline.fromPretrained()
41
+ let result = pipeline.diarize(audio: samples, sampleRate: 16000)
42
+ for seg in result.segments {
43
+ print("Speaker \(seg.speakerId): \(seg.startTime)s - \(seg.endTime)s")
44
+ }
45
+ ```
46
+
47
+ Part of [qwen3-asr-swift](https://github.com/ivan-digital/qwen3-asr-swift).
48
+
49
+ ## Conversion
50
+
51
+ ```bash
52
+ python3 scripts/convert_pyannote.py --token YOUR_HF_TOKEN --upload
53
+ ```
54
+
55
+ Converts the gated pyannote/segmentation-3.0 checkpoint using a custom unpickler (no pyannote.audio dependency required). Key transformations:
56
+
57
+ - **SincNet**: pre-compute 80 sinc bandpass filters (40 cos + 40 sin) from 40 learned `(low_hz, band_hz)` parameter pairs
58
+ - **Conv1d**: transpose weights `[O, I, K]` → `[O, K, I]` for MLX channels-last
59
+ - **BiLSTM**: split into forward/backward stacks, sum `bias_ih + bias_hh`
60
+ - **Linear/classifier**: kept as-is
61
+
62
+ ## Weight Mapping
63
+
64
+ | PyTorch Key | MLX Key | Shape |
65
+ |-------------|---------|-------|
66
+ | `sincnet.conv1d.0.filterbank.*` (computed) | `sincnet.conv.0.weight` | [80, 251, 1] |
67
+ | `sincnet.conv1d.{1,2}.weight` | `sincnet.conv.{1,2}.weight` | [O, K, I] |
68
+ | `sincnet.norm1d.{0-2}.*` | `sincnet.norm.{0-2}.*` | varies |
69
+ | `lstm.weight_ih_l{i}` | `lstm_fwd.layers.{i}.Wx` | [512, I] |
70
+ | `lstm.weight_hh_l{i}` | `lstm_fwd.layers.{i}.Wh` | [512, 128] |
71
+ | `lstm.bias_ih_l{i} + bias_hh_l{i}` | `lstm_fwd.layers.{i}.bias` | [512] |
72
+ | `lstm.*_l{i}_reverse` | `lstm_bwd.layers.{i}.*` | same as forward |
73
+ | `linear.{0,1}.*` | `linear.{0,1}.*` | varies |
74
+ | `classifier.*` | `classifier.*` | weight [7, 128], bias [7] |
75
+
76
+ ## License
77
+
78
+ The original pyannote segmentation model is released under the [MIT License](https://github.com/pyannote/pyannote-audio/blob/develop/LICENSE).
config.json ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model_type": "pyannote-segmentation",
3
+ "sample_rate": 16000,
4
+ "sincnet": {
5
+ "n_filters": [
6
+ 80,
7
+ 60,
8
+ 60
9
+ ],
10
+ "kernel_sizes": [
11
+ 251,
12
+ 5,
13
+ 5
14
+ ],
15
+ "strides": [
16
+ 10,
17
+ 1,
18
+ 1
19
+ ],
20
+ "pool_sizes": [
21
+ 3,
22
+ 3,
23
+ 3
24
+ ]
25
+ },
26
+ "lstm": {
27
+ "hidden_size": 128,
28
+ "num_layers": 4,
29
+ "bidirectional": true
30
+ },
31
+ "linear": {
32
+ "hidden_size": 128,
33
+ "num_layers": 2
34
+ },
35
+ "num_classes": 7,
36
+ "max_speakers": 3,
37
+ "powerset_max_classes": 2,
38
+ "num_frames_per_chunk": 589,
39
+ "chunk_duration": 10.0,
40
+ "chunk_step_ratio": 0.1,
41
+ "warm_up": [
42
+ 0.0,
43
+ 0.0
44
+ ]
45
+ }
model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d1630fa2c22f47e4c89034f8d5e3aff99884f55347d48ce70dd306328b4421f5
3
+ size 5960404