MeanVC (code, models, paper)
Browse files- .gitattributes +3 -0
- MeanVC. Lightweight and Streaming Zero-Shot Voice Conversion via Mean Flows.pdf +3 -0
- code/MeanVC [Zemacs] +1 Streaming ONNX export.zip +3 -0
- code/MeanVC [sutungpo] +5 -2.zip +3 -0
- code/MeanVC [y12rf] +6.zip +3 -0
- code/MeanVC-experiments [benhsampson] +1 -1 Notebook.zip +3 -0
- code/MeanVC.zip +3 -0
- models/en,zh/MeanVC/.gitattributes +37 -0
- models/en,zh/MeanVC/README.md +109 -0
- models/en,zh/MeanVC/config.json +14 -0
- models/en,zh/MeanVC/fastu2++.pt +3 -0
- models/en,zh/MeanVC/figs/model.png +3 -0
- models/en,zh/MeanVC/figs/npu@aslp.jpeg +3 -0
- models/en,zh/MeanVC/meanvc_200ms.pt +3 -0
- models/en,zh/MeanVC/model_200ms.safetensors +3 -0
- models/en,zh/MeanVC/source.txt +1 -0
- models/en,zh/MeanVC/vocos.pt +3 -0
- models/wavlm_large_finetune.pth +3 -0
.gitattributes
CHANGED
|
@@ -33,3 +33,6 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
|
|
|
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
| 36 |
+
MeanVC.[[:space:]]Lightweight[[:space:]]and[[:space:]]Streaming[[:space:]]Zero-Shot[[:space:]]Voice[[:space:]]Conversion[[:space:]]via[[:space:]]Mean[[:space:]]Flows.pdf filter=lfs diff=lfs merge=lfs -text
|
| 37 |
+
models/en,zh/MeanVC/figs/model.png filter=lfs diff=lfs merge=lfs -text
|
| 38 |
+
models/en,zh/MeanVC/figs/npu@aslp.jpeg filter=lfs diff=lfs merge=lfs -text
|
MeanVC. Lightweight and Streaming Zero-Shot Voice Conversion via Mean Flows.pdf
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:bfef007767b7af2c7d0cd0d12a3069518568fdb17394fad19df0eba85cece251
|
| 3 |
+
size 852776
|
code/MeanVC [Zemacs] +1 Streaming ONNX export.zip
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:79e94feed1289797fe18e501452897893748ca29be12681063c41597abde1a05
|
| 3 |
+
size 5401663
|
code/MeanVC [sutungpo] +5 -2.zip
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:83412d4c866b0c7410651fd26e88790599bf639999af6a09c2a07fd1d389e5dc
|
| 3 |
+
size 28387187
|
code/MeanVC [y12rf] +6.zip
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:18d821f32793c9c16206d5f2b7780b392457202c745f2a938ee425ec6f330d0a
|
| 3 |
+
size 5434841
|
code/MeanVC-experiments [benhsampson] +1 -1 Notebook.zip
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:6fc1771c4a78f11307e61d18d4baa1b2e30d8c121f90873bf02d789ece5a8b0c
|
| 3 |
+
size 6456970
|
code/MeanVC.zip
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:b9bb04097804f772c2ce104b31418cb5cc7333b61edf48ccad4830c1bd334b58
|
| 3 |
+
size 29525745
|
models/en,zh/MeanVC/.gitattributes
ADDED
|
@@ -0,0 +1,37 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
*.7z filter=lfs diff=lfs merge=lfs -text
|
| 2 |
+
*.arrow filter=lfs diff=lfs merge=lfs -text
|
| 3 |
+
*.bin filter=lfs diff=lfs merge=lfs -text
|
| 4 |
+
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
| 5 |
+
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
| 6 |
+
*.ftz filter=lfs diff=lfs merge=lfs -text
|
| 7 |
+
*.gz filter=lfs diff=lfs merge=lfs -text
|
| 8 |
+
*.h5 filter=lfs diff=lfs merge=lfs -text
|
| 9 |
+
*.joblib filter=lfs diff=lfs merge=lfs -text
|
| 10 |
+
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
| 11 |
+
*.mlmodel filter=lfs diff=lfs merge=lfs -text
|
| 12 |
+
*.model filter=lfs diff=lfs merge=lfs -text
|
| 13 |
+
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
| 14 |
+
*.npy filter=lfs diff=lfs merge=lfs -text
|
| 15 |
+
*.npz filter=lfs diff=lfs merge=lfs -text
|
| 16 |
+
*.onnx filter=lfs diff=lfs merge=lfs -text
|
| 17 |
+
*.ot filter=lfs diff=lfs merge=lfs -text
|
| 18 |
+
*.parquet filter=lfs diff=lfs merge=lfs -text
|
| 19 |
+
*.pb filter=lfs diff=lfs merge=lfs -text
|
| 20 |
+
*.pickle filter=lfs diff=lfs merge=lfs -text
|
| 21 |
+
*.pkl filter=lfs diff=lfs merge=lfs -text
|
| 22 |
+
*.pt filter=lfs diff=lfs merge=lfs -text
|
| 23 |
+
*.pth filter=lfs diff=lfs merge=lfs -text
|
| 24 |
+
*.rar filter=lfs diff=lfs merge=lfs -text
|
| 25 |
+
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
| 26 |
+
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
| 27 |
+
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
| 28 |
+
*.tar filter=lfs diff=lfs merge=lfs -text
|
| 29 |
+
*.tflite filter=lfs diff=lfs merge=lfs -text
|
| 30 |
+
*.tgz filter=lfs diff=lfs merge=lfs -text
|
| 31 |
+
*.wasm filter=lfs diff=lfs merge=lfs -text
|
| 32 |
+
*.xz filter=lfs diff=lfs merge=lfs -text
|
| 33 |
+
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
+
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
+
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
| 36 |
+
*.png filter=lfs diff=lfs merge=lfs -text
|
| 37 |
+
*.jpeg filter=lfs diff=lfs merge=lfs -text
|
models/en,zh/MeanVC/README.md
ADDED
|
@@ -0,0 +1,109 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
language:
|
| 3 |
+
- en
|
| 4 |
+
- zh
|
| 5 |
+
license: apache-2.0
|
| 6 |
+
pipeline_tag: audio-to-audio
|
| 7 |
+
---
|
| 8 |
+
|
| 9 |
+
# MeanVC: Lightweight and Streaming Zero-Shot Voice Conversion via Mean Flows
|
| 10 |
+
|
| 11 |
+
<div align="center">
|
| 12 |
+
|
| 13 |
+
[Paper (arXiv:2510.08392)](https://arxiv.org/pdf/2510.08392)
|
| 14 |
+
[GitHub repository](https://github.com/ASLP-lab/MeanVC)
|
| 15 |
+
[Demo page](https://aslp-lab.github.io/MeanVC/)
|
| 16 |
+
|
| 17 |
+
</div>
|
| 18 |
+
|
| 19 |
+
**MeanVC** is a lightweight and streaming zero-shot voice conversion system that enables real-time timbre transfer from any source speaker to any target speaker while preserving linguistic content. The system introduces a diffusion transformer with a chunk-wise autoregressive denoising strategy and mean flows for efficient single-step inference.
|
| 20 |
+
|
| 21 |
+

|
| 22 |
+
|
| 23 |
+
## ✨ Key Features
|
| 24 |
+
|
| 25 |
+
- **🚀 Streaming Inference**: Real-time voice conversion with chunk-wise processing.
|
| 26 |
+
- **⚡ Single-Step Generation**: Direct mapping from start to endpoint via mean flows for fast generation.
|
| 27 |
+
- **🎯 Zero-Shot Capability**: Convert to any unseen target speaker without re-training.
|
| 28 |
+
- **💾 Lightweight**: Significantly fewer parameters than existing methods.
|
| 29 |
+
- **🔊 High Fidelity**: Superior speech quality and speaker similarity.
|
| 30 |
+
|
| 31 |
+
## 💻 Sample Usage
|
| 32 |
+
|
| 33 |
+
### 1. Environment Setup
|
| 34 |
+
First, follow these steps to clone the repository and install the required environment.
|
| 35 |
+
|
| 36 |
+
```bash
|
| 37 |
+
# Clone the repository and enter the directory
|
| 38 |
+
git clone https://github.com/ASLP-lab/MeanVC.git
|
| 39 |
+
cd MeanVC
|
| 40 |
+
|
| 41 |
+
# Create and activate a Conda environment
|
| 42 |
+
conda create -n meanvc python=3.11 -y
|
| 43 |
+
conda activate meanvc
|
| 44 |
+
|
| 45 |
+
# Install dependencies
|
| 46 |
+
pip install -r requirements.txt
|
| 47 |
+
```
|
| 48 |
+
|
| 49 |
+
### 2. Download Pre-trained Models
|
| 50 |
+
Run the provided script to automatically download all necessary pre-trained models.
|
| 51 |
+
|
| 52 |
+
```bash
|
| 53 |
+
python download_ckpt.py
|
| 54 |
+
```
|
| 55 |
+
|
| 56 |
+
This will download the main VC model, vocoder, and ASR model into the `src/ckpt/` directory.
|
| 57 |
+
The speaker verification model (`wavlm_large_finetune.pth`) must be downloaded manually from Google Drive. Download the file from [this link](https://drive.google.com/file/d/1-aE1NfzpRCLxA4GUxX9ITI3F9LlbtEGP/view). Place the downloaded `wavlm_large_finetune.pth` file into the `src/runtime/speaker_verification/ckpt/` directory.
|
| 58 |
+
|
| 59 |
+
### 3. Real-Time Voice Conversion
|
| 60 |
+
This script captures audio from your microphone and converts it in real-time to the voice of a target speaker.
|
| 61 |
+
|
| 62 |
+
```bash
|
| 63 |
+
python src/runtime/run_rt.py --target-path "path/to/target_voice.wav"
|
| 64 |
+
```
|
| 65 |
+
|
| 66 |
+
- `--target-path`: Path to a clean audio file of the target speaker. This voice will be used as the conversion target. An example file is provided at `src/runtime/example/test.wav`.
|
| 67 |
+
|
| 68 |
+
When you run the script, you will be prompted to select your audio input (microphone) and output (speaker) devices from a list.
|
| 69 |
+
|
| 70 |
+
### 4. Offline Voice Conversion
|
| 71 |
+
For batch processing or converting pre-recorded audio files, use the offline conversion script.
|
| 72 |
+
|
| 73 |
+
```bash
|
| 74 |
+
bash scripts/infer_ref.sh
|
| 75 |
+
```
|
| 76 |
+
|
| 77 |
+
Before running the script, you need to configure the following paths in `scripts/infer_ref.sh`:
|
| 78 |
+
|
| 79 |
+
- `source_path`: Path to the source audio file or directory containing multiple audio files to be converted
|
| 80 |
+
- `reference_path`: Path to a clean audio file of the target speaker (used as voice reference)
|
| 81 |
+
- `output_dir`: Directory where converted audio files will be saved (default: `src/outputs`)
|
| 82 |
+
- `steps`: Number of denoising steps (default: 2)
|
| 83 |
+
|
| 84 |
+
## 📜 License & Disclaimer
|
| 85 |
+
|
| 86 |
+
MeanVC is released under the Apache License 2.0. This open-source license allows you to freely use, modify, and distribute the model, as long as you include the appropriate copyright notice and disclaimer.
|
| 87 |
+
|
| 88 |
+
MeanVC is designed for research and legitimate applications in voice conversion technology. Users must obtain proper consent from individuals whose voices are being converted or used as references. We strongly discourage any malicious use including impersonation, fraud, or creating misleading audio content. Users are solely responsible for ensuring their use cases comply with ethical standards and legal requirements.
|
| 89 |
+
|
| 90 |
+
## 📚 Citation
|
| 91 |
+
|
| 92 |
+
If you find our work helpful, please cite our paper:
|
| 93 |
+
|
| 94 |
+
```bibtex
|
| 95 |
+
@article{ma2025meanvc,
|
| 96 |
+
title={MeanVC: Lightweight and Streaming Zero-Shot Voice Conversion via Mean Flows},
|
| 97 |
+
author={Ma, Guobin and Yao, Jixun and Ning, Ziqian and Jiang, Yuepeng and Xiong, Lingxin and Xie, Lei and Zhu, Pengcheng},
|
| 98 |
+
journal={arXiv preprint arXiv:2510.08392},
|
| 99 |
+
year={2025}
|
| 100 |
+
}
|
| 101 |
+
```
|
| 102 |
+
|
| 103 |
+
## 📧 Contact
|
| 104 |
+
|
| 105 |
+
If you are interested in leaving a message to our research team, feel free to email <guobin.ma@mail.nwpu.edu.cn>.
|
| 106 |
+
|
| 107 |
+
<p align="center">
|
| 108 |
+
<img src="https://huggingface.co/ASLP-lab/MeanVC/resolve/main/figs/npu@aslp.jpeg" width="500"/>
|
| 109 |
+
</p>
|
models/en,zh/MeanVC/config.json
ADDED
|
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"model_type": "DiT",
|
| 3 |
+
"model": {
|
| 4 |
+
"dim": 512,
|
| 5 |
+
"depth": 4,
|
| 6 |
+
"heads": 2,
|
| 7 |
+
"ff_mult": 2,
|
| 8 |
+
"bn_dim": 256,
|
| 9 |
+
"conv_layers": 4,
|
| 10 |
+
"chunk_size": 20,
|
| 11 |
+
"dropout": 0.0,
|
| 12 |
+
"qk_norm": "rms_norm"
|
| 13 |
+
}
|
| 14 |
+
}
|
models/en,zh/MeanVC/fastu2++.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:4d6bc4290c4d489ed50b6ffbbcda33bd3ba9551506852c7f2fa683f9fe9512a1
|
| 3 |
+
size 91972768
|
models/en,zh/MeanVC/figs/model.png
ADDED
|
Git LFS Details
|
models/en,zh/MeanVC/figs/npu@aslp.jpeg
ADDED
|
Git LFS Details
|
models/en,zh/MeanVC/meanvc_200ms.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:17a234944c7e63bfc94e71eea7de8dfb9f7f2e990cde9fd8df12ddad5237c68f
|
| 3 |
+
size 56355402
|
models/en,zh/MeanVC/model_200ms.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:5c2d9ed6c8c149d4fdf9ba6f17ebbc675784010585344448136261c874decb0f
|
| 3 |
+
size 56271424
|
models/en,zh/MeanVC/source.txt
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
https://huggingface.co/ASLP-lab/MeanVC
|
models/en,zh/MeanVC/vocos.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:9e8aba28aa9cea0813e571a25ee33ef35bd74c803da34a65e683d9b0f7e2f281
|
| 3 |
+
size 33223980
|
models/wavlm_large_finetune.pth
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:51f07e3b94d9e0262a6a675ef5a087be3dd09e8c62e9d886827f44f82fe7f94b
|
| 3 |
+
size 1301926579
|