Text-to-Speech
MLX
Safetensors
English
dots_tts
tts
quantized
4-bit precision
8-bit precision
apple-silicon
dots.tts
Instructions to use smcleod/dots.tts-soar-mlx with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- MLX
How to use smcleod/dots.tts-soar-mlx with MLX:
# Download the model from the Hub pip install huggingface_hub[hf_xet] huggingface-cli download --local-dir dots.tts-soar-mlx smcleod/dots.tts-soar-mlx
- Notebooks
- Google Colab
- Kaggle
- Local Apps Settings
- LM Studio
| #!/usr/bin/env python3 | |
| """Convert dots.tts-soar CAM++ speaker encoder weights to an MLX-friendly safetensors. | |
| Source : speaker_encoder.safetensors (PyTorch state dict, key prefix `model.`) | |
| Output : speaker_encoder_mlx.safetensors | |
| Transforms applied: | |
| - Strip the leading `model.` prefix from every CAM++ key (the torch wrapper | |
| nests the CAMPPlus under `self.model`). The torchaudio resample buffer | |
| `resample.kernel` is dropped: it is NOT used at inference for the soar model | |
| because the encoder is built with sample_rate == vocoder.sample_rate and the | |
| real path resamples 48k -> 16k via this buffer. We document the resample in | |
| the spec and the Swift port performs resampling itself (see spec); the kernel | |
| is a torchaudio sinc design we reproduce independently, so it is not exported. | |
| - Conv1d weight (out, in/groups, k) -> MLX Conv1d (out, k, in/groups): permute (0,2,1). | |
| - Conv2d weight (out, in, kH, kW) -> MLX Conv2d (out, kH, kW, in): permute (0,2,3,1). | |
| - BatchNorm/Linear/LayerNorm 1D and 2D tensors are copied unchanged. | |
| - `num_batches_tracked` int64 scalars are dropped (not needed for inference). | |
| Run on CPU only. No MPS / MLX import here. | |
| """ | |
| from __future__ import annotations | |
| import collections | |
| from pathlib import Path | |
| import torch | |
| from safetensors import safe_open | |
| from safetensors.torch import save_file | |
| SNAPSHOT = Path( | |
| "/Users/samm/.cache/huggingface/hub/models--rednote-hilab--dots.tts-soar/" | |
| "snapshots/1fd9452e55c2c9f38fe1a8ee09eaf7448c222d35" | |
| ) | |
| SRC = SNAPSHOT / "speaker_encoder.safetensors" | |
| DST = Path("/Users/samm/git/dots-mlx-spike/speaker_encoder_mlx.safetensors") | |
| def is_conv1d_weight(key: str, tensor: torch.Tensor) -> bool: | |
| return key.endswith(".weight") and tensor.ndim == 3 | |
| def is_conv2d_weight(key: str, tensor: torch.Tensor) -> bool: | |
| return key.endswith(".weight") and tensor.ndim == 4 | |
| def main() -> None: | |
| f = safe_open(str(SRC), "pt") | |
| out: dict[str, torch.Tensor] = {} | |
| stats = collections.Counter() | |
| for key in f.keys(): | |
| t = f.get_tensor(key).contiguous() | |
| # Drop torchaudio resample kernel (reproduced independently in Swift). | |
| if key.startswith("resample."): | |
| stats["dropped_resample"] += 1 | |
| continue | |
| # Drop BN step counters (inference uses running stats only). | |
| if key.endswith("num_batches_tracked"): | |
| stats["dropped_num_batches_tracked"] += 1 | |
| continue | |
| # Strip the `model.` wrapper prefix. | |
| clean = key[len("model."):] if key.startswith("model.") else key | |
| if is_conv2d_weight(clean, t): | |
| # (out, in, kH, kW) -> (out, kH, kW, in) | |
| t = t.permute(0, 2, 3, 1).contiguous() | |
| stats["conv2d_transposed"] += 1 | |
| elif is_conv1d_weight(clean, t): | |
| # (out, in/groups, k) -> (out, k, in/groups) | |
| t = t.permute(0, 2, 1).contiguous() | |
| stats["conv1d_transposed"] += 1 | |
| else: | |
| stats["copied"] += 1 | |
| out[clean] = t.to(torch.float32) | |
| DST.parent.mkdir(parents=True, exist_ok=True) | |
| save_file(out, str(DST)) | |
| total_params = sum(v.numel() for v in out.values()) | |
| print(f"wrote {DST}") | |
| print(f"output tensors: {len(out)}") | |
| print(f"output params : {total_params}") | |
| print("stats:", dict(stats)) | |
| # spot-check a couple of transposed shapes | |
| for k in ("head.conv1.weight", "xvector.tdnn.linear.weight", | |
| "xvector.dense.linear.weight"): | |
| if k in out: | |
| print(f" {k}: {tuple(out[k].shape)}") | |
| if __name__ == "__main__": | |
| main() | |