Convert iic/speech_campplus_sv_zh_en_16k-common_advanced to MLX format

Browse files

Files changed (6) hide show

README.md +53 -0
__pycache__/model.cpython-312.pyc +0 -0
config.json +12 -0
model.py +372 -0
usage_example.py +43 -0
weights.npz +3 -0

README.md ADDED Viewed

	@@ -0,0 +1,53 @@

+# CAM++ Speaker Recognition Model (MLX)
+Converted from: `iic/speech_campplus_sv_zh_en_16k-common_advanced`
+## Model Details
+- **Architecture**: CAM++ (Context-Aware Masking++)
+- **Framework**: MLX (Apple Silicon optimized)
+- **Input**: Mel-spectrogram features (320 dimensions)
+- **Output**: Speaker embedding (192 dimensions)
+- **Quantized**: False
+## Usage
+```python
+from huggingface_hub import snapshot_download
+import mlx.core as mx
+import sys
+# Download model
+model_path = snapshot_download("mlx-community/campp-mlx")
+sys.path.append(model_path)
+from model import CAMPPModelScopeV2
+import json
+# Load model
+with open(f"{model_path}/config.json") as f:
+    config = json.load(f)
+model = CAMPPModelScopeV2(
+    input_dim=config["input_dim"],
+    embedding_dim=config["embedding_dim"],
+    channels=config.get("channels", 512)
+)
+weights = mx.load(f"{model_path}/weights.npz")
+model.load_weights(list(weights.items()))
+model.eval()  # use BatchNorm running statistics for inference
+# Use model
+audio_features = mx.random.normal((1, 320, 200))  # Your audio features
+embedding = model.extract_embedding(audio_features)  # (1, 192)
+```
+## Performance
+- Optimized for Apple Silicon (M1/M2/M3/M4)
+- Faster inference than PyTorch on Mac
+- Lower memory usage with MLX unified memory
+## Original Paper
+CAM++: A Fast and Efficient Network for Speaker Verification Using Context-Aware Masking
+https://arxiv.org/abs/2303.00332

__pycache__/model.cpython-312.pyc ADDED Viewed

Binary file (13.1 kB). View file

config.json ADDED Viewed

	@@ -0,0 +1,12 @@

+{
+  "model_type": "campp",
+  "architecture": "d-tdnn",
+  "framework": "mlx",
+  "input_dim": 320,
+  "input_channels": 64,
+  "embedding_dim": 192,
+  "num_classes": null,
+  "converted_from": "iic/speech_campplus_sv_zh_en_16k-common_advanced",
+  "quantized": false,
+  "conversion_date": "2026-01-16T12:06:47.419878"
+}

model.py ADDED Viewed

	@@ -0,0 +1,372 @@

+"""
+MLX implementation of CAM++ model - ModelScope architecture (Clean implementation)
+Based on analysis of iic/speech_campplus_sv_zh_en_16k-common_advanced:
+- Dense connections: each layer's output is concatenated with all previous outputs
+- TDNN layers use kernel_size=1 (no temporal context in main conv)
+- CAM layers provide the actual feature extraction
+- Architecture: Input → Dense Blocks (with CAM) → Transitions → Dense Layer
+"""
+import mlx.core as mx
+import mlx.nn as nn
+from typing import Dict, List, Optional
+import json
+class EmbeddedCAM(nn.Module):
+    """
+    Context-Aware Masking module embedded within TDNN layers
+    Architecture (verified from ModelScope weights):
+    - linear1: 1x1 Conv (in_channels → cam_channels//2) with bias
+    - linear2: 1x1 Conv (cam_channels//2 → cam_channels//4) with bias
+    - linear_local: 3x3 Conv (in_channels → cam_channels//4) without bias
+    - Output: cam_channels//4 channels (e.g., 32 for cam_channels=128)
+    """
+    def __init__(self, in_channels: int, cam_channels: int = 128):
+        super().__init__()
+        # Global context path: 1x1 → 1x1
+        self.linear1 = nn.Conv1d(
+            in_channels=in_channels,
+            out_channels=cam_channels // 2,  # 128 → 64
+            kernel_size=1,
+            bias=True
+        )
+        self.linear2 = nn.Conv1d(
+            in_channels=cam_channels // 2,  # 64
+            out_channels=cam_channels // 4,  # 64 → 32
+            kernel_size=1,
+            bias=True
+        )
+        # Local context path: 3x3 conv
+        self.linear_local = nn.Conv1d(
+            in_channels=in_channels,
+            out_channels=cam_channels // 4,  # 128 → 32
+            kernel_size=3,
+            padding=1,
+            bias=False
+        )
+    def __call__(self, x: mx.array) -> mx.array:
+        """
+        Apply context-aware masking
+        Args:
+            x: Input (batch, length, in_channels) - channels_last format
+        Returns:
+            Output (batch, length, cam_channels//4)
+        """
+        # Global context: 1x1 → relu → 1x1
+        global_context = self.linear1(x)
+        global_context = nn.relu(global_context)
+        global_context = self.linear2(global_context)
+        # Local context: 3x3 conv
+        local_context = self.linear_local(x)
+        # Apply sigmoid mask
+        mask = nn.sigmoid(global_context)
+        output = local_context * mask
+        return output
+class TDNNLayerWithCAM(nn.Module):
+    """
+    TDNN layer with embedded CAM (verified architecture)
+    Flow:
+    1. Main conv: kernel_size=1 (channels projection)
+    2. BatchNorm
+    3. ReLU
+    4. CAM: extracts features and outputs cam_channels//4
+    Note: The main conv projects to a fixed channel size (e.g., 128),
+    then CAM reduces to cam_channels//4 (e.g., 32) for dense connection.
+    """
+    def __init__(
+        self,
+        in_channels: int,
+        out_channels: int = 128,
+        cam_channels: int = 128
+    ):
+        super().__init__()
+        # Main TDNN: 1x1 conv (no temporal context)
+        self.conv = nn.Conv1d(
+            in_channels=in_channels,
+            out_channels=out_channels,
+            kernel_size=1,
+            padding=0,
+            bias=False
+        )
+        # BatchNorm on the conv output
+        self.bn = nn.BatchNorm(out_channels, affine=True)
+        # ReLU activation
+        self.activation = nn.ReLU()
+        # Embedded CAM (takes conv output, produces cam_channels//4)
+        self.cam = EmbeddedCAM(
+            in_channels=out_channels,
+            cam_channels=cam_channels
+        )
+    def __call__(self, x: mx.array) -> mx.array:
+        """
+        Forward pass
+        Args:
+            x: Input (batch, length, in_channels)
+        Returns:
+            CAM output (batch, length, cam_channels//4)
+        """
+        # Main conv + bn + relu
+        out = self.conv(x)
+        out = self.bn(out)
+        out = self.activation(out)
+        # CAM feature extraction
+        out = self.cam(out)
+        return out
+class TransitionLayer(nn.Module):
+    """
+    Transition layer between dense blocks
+    Reduces the accumulated channels back to base channel count.
+    Architecture: BatchNorm → ReLU → 1x1 Conv
+    """
+    def __init__(self, in_channels: int, out_channels: int):
+        super().__init__()
+        self.bn = nn.BatchNorm(in_channels, affine=True)
+        self.activation = nn.ReLU()
+        self.conv = nn.Conv1d(
+            in_channels=in_channels,
+            out_channels=out_channels,
+            kernel_size=1,
+            bias=False
+        )
+    def __call__(self, x: mx.array) -> mx.array:
+        out = self.bn(x)
+        out = self.activation(out)
+        out = self.conv(out)
+        return out
+class CAMPPModelScopeV2(nn.Module):
+    """
+    Clean CAM++ implementation matching ModelScope architecture
+    Key features:
+    - Dense connections: each layer's output is concatenated
+    - TDNN layers use kernel_size=1
+    - CAM provides feature extraction (outputs cam_channels//4 per layer)
+    - Transitions reduce accumulated channels back to base
+    Args:
+        input_dim: Input feature dimension (e.g., 80 or 320)
+        channels: Base channel count (e.g., 128 or 512)
+        block_layers: Layers per block (e.g., [12, 24, 16])
+        embedding_dim: Output embedding dimension (e.g., 192)
+        cam_channels: CAM channel count (e.g., 128)
+        input_kernel_size: Input layer kernel size (e.g., 5)
+    """
+    def __init__(
+        self,
+        input_dim: int = 80,
+        channels: int = 512,
+        block_layers: List[int] = None,
+        embedding_dim: int = 192,
+        cam_channels: int = 128,
+        input_kernel_size: int = 5
+    ):
+        super().__init__()
+        if block_layers is None:
+            block_layers = [4, 9, 16]
+        self.input_dim = input_dim
+        self.channels = channels
+        self.block_layers = block_layers
+        self.embedding_dim = embedding_dim
+        self.cam_channels = cam_channels
+        self.growth_rate = cam_channels // 4  # Each layer adds this many channels
+        # Input layer
+        self.input_conv = nn.Conv1d(
+            in_channels=input_dim,
+            out_channels=channels,
+            kernel_size=input_kernel_size,
+            padding=input_kernel_size // 2,
+            bias=False
+        )
+        self.input_bn = nn.BatchNorm(channels, affine=True)
+        self.input_activation = nn.ReLU()
+        # Dense Block 0
+        for i in range(block_layers[0]):
+            in_ch = channels + i * self.growth_rate
+            layer = TDNNLayerWithCAM(
+                in_channels=in_ch,
+                out_channels=channels,
+                cam_channels=cam_channels
+            )
+            setattr(self, f'block0_{i}', layer)
+        self._block0_size = block_layers[0]
+        # Transition 1 - doubles channel count
+        transit1_in = channels + block_layers[0] * self.growth_rate
+        transit1_out = channels * 2
+        self.transit1 = TransitionLayer(transit1_in, transit1_out)
+        # Dense Block 1 - starts with doubled channels
+        for i in range(block_layers[1]):
+            in_ch = transit1_out + i * self.growth_rate
+            layer = TDNNLayerWithCAM(
+                in_channels=in_ch,
+                out_channels=channels,
+                cam_channels=cam_channels
+            )
+            setattr(self, f'block1_{i}', layer)
+        self._block1_size = block_layers[1]
+        # Transition 2 - doubles channel count again
+        transit2_in = transit1_out + block_layers[1] * self.growth_rate
+        transit2_out = transit1_out * 2  # 4x original channels
+        self.transit2 = TransitionLayer(transit2_in, transit2_out)
+        # Dense Block 2 - starts with quadrupled channels
+        for i in range(block_layers[2]):
+            in_ch = transit2_out + i * self.growth_rate
+            layer = TDNNLayerWithCAM(
+                in_channels=in_ch,
+                out_channels=channels,
+                cam_channels=cam_channels
+            )
+            setattr(self, f'block2_{i}', layer)
+        self._block2_size = block_layers[2]
+        # Final dense layer
+        dense_in = transit2_out + block_layers[2] * self.growth_rate
+        self.dense = nn.Conv1d(
+            in_channels=dense_in,
+            out_channels=embedding_dim,
+            kernel_size=1,
+            bias=False
+        )
+    def __call__(self, x: mx.array) -> mx.array:
+        """
+        Forward pass
+        Args:
+            x: Input (batch, length, in_channels) - channels_last format
+        Returns:
+            Embeddings (batch, length, embedding_dim)
+        """
+        # Handle input format
+        if x.ndim == 2:
+            x = mx.expand_dims(x, axis=0)
+        # MLX Conv1d expects (batch, length, in_channels)
+        if x.shape[2] != self.input_dim:
+            x = mx.transpose(x, (0, 2, 1))
+        # Input layer
+        out = self.input_conv(x)
+        out = self.input_bn(out)
+        out = self.input_activation(out)
+        # Dense Block 0 (with concatenation)
+        for i in range(self._block0_size):
+            layer = getattr(self, f'block0_{i}')
+            layer_out = layer(out)
+            out = mx.concatenate([out, layer_out], axis=2)
+        # Transition 1
+        out = self.transit1(out)
+        # Dense Block 1
+        for i in range(self._block1_size):
+            layer = getattr(self, f'block1_{i}')
+            layer_out = layer(out)
+            out = mx.concatenate([out, layer_out], axis=2)
+        # Transition 2
+        out = self.transit2(out)
+        # Dense Block 2
+        for i in range(self._block2_size):
+            layer = getattr(self, f'block2_{i}')
+            layer_out = layer(out)
+            out = mx.concatenate([out, layer_out], axis=2)
+        # Final dense layer
+        embeddings = self.dense(out)
+        return embeddings
+    def extract_embedding(self, x: mx.array, pooling: str = "mean") -> mx.array:
+        """
+        Extract fixed-size speaker embedding
+        Args:
+            x: Input (batch, length, in_channels)
+            pooling: "mean", "max", or "both"
+        Returns:
+            Embedding (batch, embedding_dim)
+        """
+        frame_embeddings = self(x)  # (batch, length, embedding_dim)
+        if pooling == "mean":
+            embedding = mx.mean(frame_embeddings, axis=1)
+        elif pooling == "max":
+            embedding = mx.max(frame_embeddings, axis=1)
+        elif pooling == "both":
+            mean_pool = mx.mean(frame_embeddings, axis=1)
+            max_pool = mx.max(frame_embeddings, axis=1)
+            embedding = mx.concatenate([mean_pool, max_pool], axis=1)
+        else:
+            raise ValueError(f"Unknown pooling: {pooling}")
+        return embedding
+def load_model(weights_path: str, config_path: Optional[str] = None) -> CAMPPModelScopeV2:
+    """Load model from weights and config"""
+    if config_path:
+        with open(config_path, 'r') as f:
+            config = json.load(f)
+    else:
+        config = {
+            'input_dim': 80,
+            'channels': 512,
+            'block_layers': [4, 9, 16],
+            'embedding_dim': 192,
+            'cam_channels': 128,
+            'input_kernel_size': 5
+        }
+    model = CAMPPModelScopeV2(**config)
+    weights = mx.load(weights_path)
+    model.load_weights(list(weights.items()))
+    return model

usage_example.py ADDED Viewed

	@@ -0,0 +1,43 @@

+# CAM++ MLX Model Usage Example (ModelScope Architecture)
+import mlx.core as mx
+import numpy as np
+from model import CAMPPModelScopeV2
+import json
+def load_model(model_path="."):
+    # Load config
+    with open(f"{model_path}/config.json", "r") as f:
+        config = json.load(f)
+    # Initialize model
+    model = CAMPPModelScopeV2(
+        input_dim=config["input_dim"],
+        channels=config.get("channels", 512),
+        block_layers=config.get("block_layers", [4, 9, 16]),
+        embedding_dim=config["embedding_dim"],
+        cam_channels=config.get("cam_channels", 128),
+        input_kernel_size=config.get("input_kernel_size", 5)
+    )
+    # Load weights
+    weights = mx.load(f"{model_path}/weights.npz")
+    model.load_weights(weights)
+    return model
+def extract_speaker_embedding(model, audio_features):
+    # audio_features: (batch, features, time) - e.g., mel-spectrogram
+    # Returns: speaker embedding vector
+    mx.eval(model.parameters())  # Ensure weights are loaded
+    with mx.no_grad():
+        embedding = model(audio_features)
+    return embedding
+# Example usage:
+# model = load_model()
+# features = mx.random.normal((1, 320, 200))  # Example input
+# embedding = extract_speaker_embedding(model, features)
+# print(f"Speaker embedding shape: {embedding.shape}")

weights.npz ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:9f7e173eb843c4cca555801b82a5358fb8a51279ada455b7b9cb7924ab3b868a
+size 24886146