BMP commited on
Commit
2e82ca2
·
verified ·
1 Parent(s): 91d46f7

Convert iic/speech_campplus_sv_zh_en_16k-common_advanced to MLX format

Browse files
Files changed (6) hide show
  1. README.md +53 -0
  2. __pycache__/model.cpython-312.pyc +0 -0
  3. config.json +12 -0
  4. model.py +372 -0
  5. usage_example.py +43 -0
  6. weights.npz +3 -0
README.md ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # CAM++ Speaker Recognition Model (MLX)
2
+
3
+ Converted from: `iic/speech_campplus_sv_zh_en_16k-common_advanced`
4
+
5
+ ## Model Details
6
+
7
+ - **Architecture**: CAM++ (Context-Aware Masking++)
8
+ - **Framework**: MLX (Apple Silicon optimized)
9
+ - **Input**: Mel-spectrogram features (320 dimensions)
10
+ - **Output**: Speaker embedding (192 dimensions)
11
+ - **Quantized**: False
12
+
13
+ ## Usage
14
+
15
+ ```python
16
+ from huggingface_hub import snapshot_download
17
+ import mlx.core as mx
18
+ import sys
19
+
20
+ # Download model
21
+ model_path = snapshot_download("mlx-community/campp-mlx")
22
+ sys.path.append(model_path)
23
+
24
+ from model import CAMPPModelScopeV2
+ import json
+
+ # Load model
+ with open(f"{model_path}/config.json") as f:
+     config = json.load(f)
+
+ model = CAMPPModelScopeV2(
+     input_dim=config["input_dim"],
+     embedding_dim=config["embedding_dim"],
+     channels=config.get("channels", 512)
+ )
+ weights = mx.load(f"{model_path}/weights.npz")
+ model.load_weights(list(weights.items()))
38
+
39
+ # Use model
+ audio_features = mx.random.normal((1, 320, 200))  # Your audio features
+ embedding = model.extract_embedding(audio_features)  # (1, 192) pooled speaker vector
42
+ ```
43
+
44
+ ## Performance
45
+
46
+ - Optimized for Apple Silicon (M1/M2/M3/M4)
47
+ - Faster inference than PyTorch on Mac
48
+ - Lower memory usage with MLX unified memory
49
+
50
+ ## Original Paper
51
+
52
+ CAM++: A Fast and Efficient Network for Speaker Verification Using Context-Aware Masking
53
+ https://arxiv.org/abs/2303.00332
__pycache__/model.cpython-312.pyc ADDED
Binary file (13.1 kB). View file
 
config.json ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model_type": "campp",
3
+ "architecture": "d-tdnn",
4
+ "framework": "mlx",
5
+ "input_dim": 320,
6
+ "input_channels": 64,
7
+ "embedding_dim": 192,
8
+ "num_classes": null,
9
+ "converted_from": "iic/speech_campplus_sv_zh_en_16k-common_advanced",
10
+ "quantized": false,
11
+ "conversion_date": "2026-01-16T12:06:47.419878"
12
+ }
model.py ADDED
@@ -0,0 +1,372 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ MLX implementation of CAM++ model - ModelScope architecture (Clean implementation)
3
+
4
+ Based on analysis of iic/speech_campplus_sv_zh_en_16k-common_advanced:
5
+ - Dense connections: each layer's output is concatenated with all previous outputs
6
+ - TDNN layers use kernel_size=1 (no temporal context in main conv)
7
+ - CAM layers provide the actual feature extraction
8
+ - Architecture: Input → Dense Blocks (with CAM) → Transitions → Dense Layer
9
+ """
10
+
11
+ import mlx.core as mx
12
+ import mlx.nn as nn
13
+ from typing import Dict, List, Optional
14
+ import json
15
+
16
+
17
class EmbeddedCAM(nn.Module):
    """
    Context-Aware Masking module embedded within TDNN layers.

    Two parallel paths over the same channels-last input:
    - context path: 1x1 conv -> ReLU -> 1x1 conv
      (in_channels -> cam_channels//2 -> cam_channels//4), both biased
    - local path: kernel-3 conv with same-length padding
      (in_channels -> cam_channels//4), no bias

    The context path is squashed through a sigmoid and applied as a
    per-frame mask on the local path, so the output carries
    cam_channels // 4 channels (e.g. 32 for cam_channels=128).
    """

    def __init__(self, in_channels: int, cam_channels: int = 128):
        super().__init__()

        half = cam_channels // 2
        quarter = cam_channels // 4

        # Context path: two pointwise (kernel_size=1) convolutions with bias.
        self.linear1 = nn.Conv1d(
            in_channels=in_channels,
            out_channels=half,
            kernel_size=1,
            bias=True,
        )
        self.linear2 = nn.Conv1d(
            in_channels=half,
            out_channels=quarter,
            kernel_size=1,
            bias=True,
        )

        # Local path: one kernel-3 convolution, padding=1 keeps the length.
        self.linear_local = nn.Conv1d(
            in_channels=in_channels,
            out_channels=quarter,
            kernel_size=3,
            padding=1,
            bias=False,
        )

    def __call__(self, x: mx.array) -> mx.array:
        """
        Apply context-aware masking.

        Args:
            x: Input (batch, length, in_channels) - channels_last format.

        Returns:
            Masked features (batch, length, cam_channels // 4).
        """
        # Sigmoid of the context path gates the local-path features.
        mask = nn.sigmoid(self.linear2(nn.relu(self.linear1(x))))
        return self.linear_local(x) * mask
78
+
79
+
80
class TDNNLayerWithCAM(nn.Module):
    """
    TDNN layer with an embedded CAM block.

    Pipeline: 1x1 conv (channel projection, no bias) -> BatchNorm -> ReLU -> CAM.

    The conv projects to a fixed width (`out_channels`, e.g. 128); the CAM
    then reduces that to cam_channels // 4 (e.g. 32), which is the growth
    rate consumed by the dense concatenations in the surrounding blocks.
    """

    def __init__(
        self,
        in_channels: int,
        out_channels: int = 128,
        cam_channels: int = 128
    ):
        super().__init__()

        # Pointwise projection: kernel_size=1 means no temporal context here.
        self.conv = nn.Conv1d(
            in_channels=in_channels,
            out_channels=out_channels,
            kernel_size=1,
            padding=0,
            bias=False,
        )

        # Normalize then activate the projected features.
        self.bn = nn.BatchNorm(out_channels, affine=True)
        self.activation = nn.ReLU()

        # CAM consumes the projected features and emits cam_channels // 4.
        self.cam = EmbeddedCAM(
            in_channels=out_channels,
            cam_channels=cam_channels,
        )

    def __call__(self, x: mx.array) -> mx.array:
        """
        Project, normalize, activate, then apply context-aware masking.

        Args:
            x: Input (batch, length, in_channels).

        Returns:
            CAM output (batch, length, cam_channels // 4).
        """
        projected = self.activation(self.bn(self.conv(x)))
        return self.cam(projected)
142
+
143
+
144
class TransitionLayer(nn.Module):
    """
    Transition layer between dense blocks: BatchNorm -> ReLU -> 1x1 conv.

    Compresses the channels accumulated by dense concatenation back down
    to a fixed width before the next block.
    """

    def __init__(self, in_channels: int, out_channels: int):
        super().__init__()

        self.bn = nn.BatchNorm(in_channels, affine=True)
        self.activation = nn.ReLU()
        # Pointwise channel reduction, no bias (BatchNorm precedes it).
        self.conv = nn.Conv1d(
            in_channels=in_channels,
            out_channels=out_channels,
            kernel_size=1,
            bias=False,
        )

    def __call__(self, x: mx.array) -> mx.array:
        """Return conv(relu(bn(x))) — same length, out_channels channels."""
        return self.conv(self.activation(self.bn(x)))
169
+
170
+
171
class CAMPPModelScopeV2(nn.Module):
    """
    Clean CAM++ implementation matching ModelScope architecture.

    Key features:
    - Dense connections: each TDNN+CAM layer's output (growth_rate channels)
      is concatenated onto the running feature map along the channel axis
    - TDNN layers use kernel_size=1; temporal context comes only from the
      input conv (kernel input_kernel_size) and the CAM's kernel-3 local path
    - CAM provides feature extraction (outputs cam_channels//4 per layer)
    - Transitions reduce accumulated channels back to a fixed width

    NOTE: layers are registered via setattr with names like 'block0_3';
    these attribute names define the weight keys in weights.npz, so they
    must not be renamed.

    Args:
        input_dim: Input feature dimension (e.g., 80 or 320)
        channels: Base channel count (e.g., 128 or 512)
        block_layers: Layers per block (default [4, 9, 16])
        embedding_dim: Output embedding dimension (e.g., 192)
        cam_channels: CAM channel count (e.g., 128); growth rate is cam_channels // 4
        input_kernel_size: Input layer kernel size (e.g., 5)
    """

    def __init__(
        self,
        input_dim: int = 80,
        channels: int = 512,
        block_layers: Optional[List[int]] = None,
        embedding_dim: int = 192,
        cam_channels: int = 128,
        input_kernel_size: int = 5
    ):
        super().__init__()

        # Default block sizes; None avoids a mutable default argument.
        if block_layers is None:
            block_layers = [4, 9, 16]

        self.input_dim = input_dim
        self.channels = channels
        self.block_layers = block_layers
        self.embedding_dim = embedding_dim
        self.cam_channels = cam_channels
        self.growth_rate = cam_channels // 4  # Each layer adds this many channels

        # Input layer: the only wide-kernel conv in the stem ("same" padding).
        self.input_conv = nn.Conv1d(
            in_channels=input_dim,
            out_channels=channels,
            kernel_size=input_kernel_size,
            padding=input_kernel_size // 2,
            bias=False
        )
        self.input_bn = nn.BatchNorm(channels, affine=True)
        self.input_activation = nn.ReLU()

        # Dense Block 0: layer i sees channels + i * growth_rate inputs
        # because each previous layer's output was concatenated on.
        for i in range(block_layers[0]):
            in_ch = channels + i * self.growth_rate
            layer = TDNNLayerWithCAM(
                in_channels=in_ch,
                out_channels=channels,
                cam_channels=cam_channels
            )
            setattr(self, f'block0_{i}', layer)  # attribute name == weight key prefix
        self._block0_size = block_layers[0]

        # Transition 1 - compresses the accumulated channels, doubling the base width.
        transit1_in = channels + block_layers[0] * self.growth_rate
        transit1_out = channels * 2
        self.transit1 = TransitionLayer(transit1_in, transit1_out)

        # Dense Block 1 - starts with doubled channels.
        for i in range(block_layers[1]):
            in_ch = transit1_out + i * self.growth_rate
            layer = TDNNLayerWithCAM(
                in_channels=in_ch,
                out_channels=channels,
                cam_channels=cam_channels
            )
            setattr(self, f'block1_{i}', layer)
        self._block1_size = block_layers[1]

        # Transition 2 - doubles the width again.
        transit2_in = transit1_out + block_layers[1] * self.growth_rate
        transit2_out = transit1_out * 2  # 4x original channels
        self.transit2 = TransitionLayer(transit2_in, transit2_out)

        # Dense Block 2 - starts with quadrupled channels.
        for i in range(block_layers[2]):
            in_ch = transit2_out + i * self.growth_rate
            layer = TDNNLayerWithCAM(
                in_channels=in_ch,
                out_channels=channels,
                cam_channels=cam_channels
            )
            setattr(self, f'block2_{i}', layer)
        self._block2_size = block_layers[2]

        # Final pointwise projection to the per-frame embedding dimension.
        dense_in = transit2_out + block_layers[2] * self.growth_rate
        self.dense = nn.Conv1d(
            in_channels=dense_in,
            out_channels=embedding_dim,
            kernel_size=1,
            bias=False
        )

    def __call__(self, x: mx.array) -> mx.array:
        """
        Forward pass producing frame-level embeddings.

        Args:
            x: Input (batch, length, in_channels) - channels_last format.
               A 2-D input is treated as a single unbatched example.

        Returns:
            Embeddings (batch, length, embedding_dim).
        """
        # Promote unbatched (length, channels) input to batch size 1.
        if x.ndim == 2:
            x = mx.expand_dims(x, axis=0)

        # MLX Conv1d expects (batch, length, in_channels).
        # NOTE(review): this transpose heuristic misfires when the sequence
        # length happens to equal input_dim — callers should pass
        # channels-last input to be safe.
        if x.shape[2] != self.input_dim:
            x = mx.transpose(x, (0, 2, 1))

        # Input stem: conv -> bn -> relu.
        out = self.input_conv(x)
        out = self.input_bn(out)
        out = self.input_activation(out)

        # Dense Block 0: concatenate each layer's output on the channel axis.
        for i in range(self._block0_size):
            layer = getattr(self, f'block0_{i}')
            layer_out = layer(out)
            out = mx.concatenate([out, layer_out], axis=2)

        # Transition 1
        out = self.transit1(out)

        # Dense Block 1
        for i in range(self._block1_size):
            layer = getattr(self, f'block1_{i}')
            layer_out = layer(out)
            out = mx.concatenate([out, layer_out], axis=2)

        # Transition 2
        out = self.transit2(out)

        # Dense Block 2
        for i in range(self._block2_size):
            layer = getattr(self, f'block2_{i}')
            layer_out = layer(out)
            out = mx.concatenate([out, layer_out], axis=2)

        # Final dense layer: per-frame embedding projection.
        embeddings = self.dense(out)

        return embeddings

    def extract_embedding(self, x: mx.array, pooling: str = "mean") -> mx.array:
        """
        Extract a fixed-size speaker embedding by pooling over time.

        Args:
            x: Input (batch, length, in_channels).
            pooling: "mean", "max", or "both" ("both" concatenates mean and
                max pools, giving 2 * embedding_dim features).

        Returns:
            Embedding (batch, embedding_dim) — or (batch, 2 * embedding_dim)
            for pooling="both".

        Raises:
            ValueError: if `pooling` is not one of the supported modes.
        """
        frame_embeddings = self(x)  # (batch, length, embedding_dim)

        if pooling == "mean":
            embedding = mx.mean(frame_embeddings, axis=1)
        elif pooling == "max":
            embedding = mx.max(frame_embeddings, axis=1)
        elif pooling == "both":
            mean_pool = mx.mean(frame_embeddings, axis=1)
            max_pool = mx.max(frame_embeddings, axis=1)
            embedding = mx.concatenate([mean_pool, max_pool], axis=1)
        else:
            raise ValueError(f"Unknown pooling: {pooling}")

        return embedding
351
+
352
+
353
def load_model(weights_path: str, config_path: Optional[str] = None) -> CAMPPModelScopeV2:
    """
    Load a CAM++ model from an .npz weights file and optional JSON config.

    Args:
        weights_path: Path to the MLX .npz weights file.
        config_path: Optional path to a JSON config. Metadata keys the
            constructor does not accept (e.g. "model_type", "framework",
            "converted_from", "quantized") are ignored; missing keys fall
            back to the defaults below.

    Returns:
        A CAMPPModelScopeV2 with the weights loaded.
    """
    defaults = {
        'input_dim': 80,
        'channels': 512,
        'block_layers': [4, 9, 16],
        'embedding_dim': 192,
        'cam_channels': 128,
        'input_kernel_size': 5
    }

    if config_path:
        with open(config_path, 'r') as f:
            raw = json.load(f)
        # Shipped config.json files carry metadata keys that
        # CAMPPModelScopeV2.__init__ does not accept; passing the raw dict
        # via **config would raise TypeError. Keep only known, non-null keys.
        config = dict(defaults)
        config.update(
            {k: v for k, v in raw.items() if k in defaults and v is not None}
        )
    else:
        config = defaults

    model = CAMPPModelScopeV2(**config)
    weights = mx.load(weights_path)
    # load_weights expects a list of (name, array) pairs, not a dict.
    model.load_weights(list(weights.items()))

    return model
usage_example.py ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # CAM++ MLX Model Usage Example (ModelScope Architecture)
2
+
3
+ import mlx.core as mx
4
+ import numpy as np
5
+ from model import CAMPPModelScopeV2
6
+ import json
7
+
8
def load_model(model_path="."):
    """
    Build a CAMPPModelScopeV2 from config.json and weights.npz in model_path.

    Args:
        model_path: Directory containing config.json and weights.npz.

    Returns:
        The model with weights loaded.
    """
    # Load config
    with open(f"{model_path}/config.json", "r") as f:
        config = json.load(f)

    # Initialize model
    model = CAMPPModelScopeV2(
        input_dim=config["input_dim"],
        channels=config.get("channels", 512),
        block_layers=config.get("block_layers", [4, 9, 16]),
        embedding_dim=config["embedding_dim"],
        cam_channels=config.get("cam_channels", 128),
        input_kernel_size=config.get("input_kernel_size", 5)
    )

    # Load weights. mx.load returns a dict, but MLX's Module.load_weights
    # expects a file path or a list of (name, array) pairs — passing the
    # dict directly raises. Mirror model.py's loader.
    weights = mx.load(f"{model_path}/weights.npz")
    model.load_weights(list(weights.items()))

    return model
28
+
29
def extract_speaker_embedding(model, audio_features):
    """
    Run the model on audio features and return its embeddings.

    Args:
        model: A loaded CAMPPModelScopeV2.
        audio_features: Features, e.g. (batch, features, time) mel-spectrogram.

    Returns:
        Frame-level embeddings from model(...) — (batch, time, embedding_dim).
        For a single pooled speaker vector, use model.extract_embedding(...).
    """
    mx.eval(model.parameters())  # Ensure weights are loaded

    # BUGFIX: the original used `with mx.no_grad():`, a PyTorch idiom that
    # does not exist in MLX and raises AttributeError. MLX only computes
    # gradients through explicit transforms (mx.grad / mx.value_and_grad),
    # so a plain forward call is already gradient-free.
    embedding = model(audio_features)

    return embedding
38
+
39
+ # Example usage:
40
+ # model = load_model()
41
+ # features = mx.random.normal((1, 320, 200)) # Example input
42
+ # embedding = extract_speaker_embedding(model, features)
43
+ # print(f"Speaker embedding shape: {embedding.shape}")
weights.npz ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9f7e173eb843c4cca555801b82a5358fb8a51279ada455b7b9cb7924ab3b868a
3
+ size 24886146