niobures committed on
Commit
3b71815
·
verified ·
1 Parent(s): b6d1c6b

MeanVC (code, models, paper)

Browse files
.gitattributes CHANGED
@@ -33,3 +33,6 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ MeanVC.[[:space:]]Lightweight[[:space:]]and[[:space:]]Streaming[[:space:]]Zero-Shot[[:space:]]Voice[[:space:]]Conversion[[:space:]]via[[:space:]]Mean[[:space:]]Flows.pdf filter=lfs diff=lfs merge=lfs -text
37
+ models/en,zh/MeanVC/figs/model.png filter=lfs diff=lfs merge=lfs -text
38
+ models/en,zh/MeanVC/figs/npu@aslp.jpeg filter=lfs diff=lfs merge=lfs -text
MeanVC. Lightweight and Streaming Zero-Shot Voice Conversion via Mean Flows.pdf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bfef007767b7af2c7d0cd0d12a3069518568fdb17394fad19df0eba85cece251
3
+ size 852776
code/MeanVC [Zemacs] +1 Streaming ONNX export.zip ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:79e94feed1289797fe18e501452897893748ca29be12681063c41597abde1a05
3
+ size 5401663
code/MeanVC [sutungpo] +5 -2.zip ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:83412d4c866b0c7410651fd26e88790599bf639999af6a09c2a07fd1d389e5dc
3
+ size 28387187
code/MeanVC [y12rf] +6.zip ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:18d821f32793c9c16206d5f2b7780b392457202c745f2a938ee425ec6f330d0a
3
+ size 5434841
code/MeanVC-experiments [benhsampson] +1 -1 Notebook.zip ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6fc1771c4a78f11307e61d18d4baa1b2e30d8c121f90873bf02d789ece5a8b0c
3
+ size 6456970
code/MeanVC.zip ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b9bb04097804f772c2ce104b31418cb5cc7333b61edf48ccad4830c1bd334b58
3
+ size 29525745
models/en,zh/MeanVC/.gitattributes ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ *.png filter=lfs diff=lfs merge=lfs -text
37
+ *.jpeg filter=lfs diff=lfs merge=lfs -text
models/en,zh/MeanVC/README.md ADDED
@@ -0,0 +1,109 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ language:
3
+ - en
4
+ - zh
5
+ license: apache-2.0
6
+ pipeline_tag: audio-to-audio
7
+ ---
8
+
9
+ # MeanVC: Lightweight and Streaming Zero-Shot Voice Conversion via Mean Flows
10
+
11
+ <div align="center">
12
+
13
+ [![Paper](https://img.shields.io/badge/arXiv-2510.08392-b31b1b.svg)](https://arxiv.org/pdf/2510.08392)
14
+ [![Github](https://img.shields.io/badge/Github-Page-green)](https://github.com/ASLP-lab/MeanVC)
15
+ [![Demo Page](https://img.shields.io/badge/Demo-Audio%20Samples-green)](https://aslp-lab.github.io/MeanVC/)
16
+
17
+ </div>
18
+
19
+ **MeanVC** is a lightweight and streaming zero-shot voice conversion system that enables real-time timbre transfer from any source speaker to any target speaker while preserving linguistic content. The system introduces a diffusion transformer with a chunk-wise autoregressive denoising strategy and mean flows for efficient single-step inference.
20
+
21
+ ![img](https://huggingface.co/ASLP-lab/MeanVC/resolve/main/figs/model.png)
22
+
23
+ ## ✨ Key Features
24
+
25
+ - **🚀 Streaming Inference**: Real-time voice conversion with chunk-wise processing.
26
+ - **⚡ Single-Step Generation**: Direct mapping from start to endpoint via mean flows for fast generation.
27
+ - **🎯 Zero-Shot Capability**: Convert to any unseen target speaker without re-training.
28
+ - **💾 Lightweight**: Significantly fewer parameters than existing methods.
29
+ - **🔊 High Fidelity**: Superior speech quality and speaker similarity.
30
+
31
+ ## 💻 Sample Usage
32
+
33
+ ### 1. Environment Setup
34
+ First, follow these steps to clone the repository and install the required environment.
35
+
36
+ ```bash
37
+ # Clone the repository and enter the directory
38
+ git clone https://github.com/ASLP-lab/MeanVC.git
39
+ cd MeanVC
40
+
41
+ # Create and activate a Conda environment
42
+ conda create -n meanvc python=3.11 -y
43
+ conda activate meanvc
44
+
45
+ # Install dependencies
46
+ pip install -r requirements.txt
47
+ ```
48
+
49
+ ### 2. Download Pre-trained Models
50
+ Run the provided script to automatically download all necessary pre-trained models.
51
+
52
+ ```bash
53
+ python download_ckpt.py
54
+ ```
55
+
56
+ This will download the main VC model, vocoder, and ASR model into the `src/ckpt/` directories.
57
+ The speaker verification model (`wavlm_large_finetune.pth`) must be downloaded manually from Google Drive. Download the file from [this link](https://drive.google.com/file/d/1-aE1NfzpRCLxA4GUxX9ITI3F9LlbtEGP/view). Place the downloaded `wavlm_large_finetune.pth` file into the `src/runtime/speaker_verification/ckpt/` directory.
58
+
59
+ ### 3. Real-Time Voice Conversion
60
+ This script captures audio from your microphone and converts it in real-time to the voice of a target speaker.
61
+
62
+ ```bash
63
+ python src/runtime/run_rt.py --target-path "path/to/target_voice.wav"
64
+ ```
65
+
66
+ - `--target-path`: Path to a clean audio file of the target speaker. This voice will be used as the conversion target. An example file is provided at `src/runtime/example/test.wav`.
67
+
68
+ When you run the script, you will be prompted to select your audio input (microphone) and output (speaker) devices from a list.
69
+
70
+ ### 4. Offline Voice Conversion
71
+ For batch processing or converting pre-recorded audio files, use the offline conversion script.
72
+
73
+ ```bash
74
+ bash scripts/infer_ref.sh
75
+ ```
76
+
77
+ Before running the script, you need to configure the following paths in `scripts/infer_ref.sh`:
78
+
79
+ - `source_path`: Path to the source audio file or directory containing multiple audio files to be converted
80
+ - `reference_path`: Path to a clean audio file of the target speaker (used as voice reference)
81
+ - `output_dir`: Directory where converted audio files will be saved (default: `src/outputs`)
82
+ - `steps`: Number of denoising steps (default: 2)
83
+
84
+ ## 📜 License & Disclaimer
85
+
86
+ MeanVC is released under the Apache License 2.0. This open-source license allows you to freely use, modify, and distribute the model, as long as you include the appropriate copyright notice and disclaimer.
87
+
88
+ MeanVC is designed for research and legitimate applications in voice conversion technology. Users must obtain proper consent from individuals whose voices are being converted or used as references. We strongly discourage any malicious use including impersonation, fraud, or creating misleading audio content. Users are solely responsible for ensuring their use cases comply with ethical standards and legal requirements.
89
+
90
+ ## 📄 Citation
91
+
92
+ If you find our work helpful, please cite our paper:
93
+
94
+ ```bibtex
95
+ @article{ma2025meanvc,
96
+ title={MeanVC: Lightweight and Streaming Zero-Shot Voice Conversion via Mean Flows},
97
+ author={Ma, Guobin and Yao, Jixun and Ning, Ziqian and Jiang, Yuepeng and Xiong, Lingxin and Xie, Lei and Zhu, Pengcheng},
98
+ journal={arXiv preprint arXiv:2510.08392},
99
+ year={2025}
100
+ }
101
+ ```
102
+
103
+ ## 📧 Contact
104
+
105
+ If you are interested in leaving a message to our research team, feel free to email guobin.ma@mail.nwpu.edu.cn
106
+
107
+ <p align="center">
108
+ <img src="https://huggingface.co/ASLP-lab/MeanVC/resolve/main/figs/npu@aslp.jpeg" width="500"/>
109
+ </p>
models/en,zh/MeanVC/config.json ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model_type": "DiT",
3
+ "model": {
4
+ "dim": 512,
5
+ "depth": 4,
6
+ "heads": 2,
7
+ "ff_mult": 2,
8
+ "bn_dim": 256,
9
+ "conv_layers": 4,
10
+ "chunk_size": 20,
11
+ "dropout": 0.0,
12
+ "qk_norm": "rms_norm"
13
+ }
14
+ }
models/en,zh/MeanVC/fastu2++.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4d6bc4290c4d489ed50b6ffbbcda33bd3ba9551506852c7f2fa683f9fe9512a1
3
+ size 91972768
models/en,zh/MeanVC/figs/model.png ADDED

Git LFS Details

  • SHA256: 0514278f969b291eceab1c2fa4c4171008ba47688798e9ecaa4c6d3cb9c2b826
  • Pointer size: 131 Bytes
  • Size of remote file: 292 kB
models/en,zh/MeanVC/figs/npu@aslp.jpeg ADDED

Git LFS Details

  • SHA256: 41eae6df7b8458e13ffd2de14876f95cd3f7fad91f8527a751a7dd63347c6a71
  • Pointer size: 132 Bytes
  • Size of remote file: 1.56 MB
models/en,zh/MeanVC/meanvc_200ms.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:17a234944c7e63bfc94e71eea7de8dfb9f7f2e990cde9fd8df12ddad5237c68f
3
+ size 56355402
models/en,zh/MeanVC/model_200ms.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5c2d9ed6c8c149d4fdf9ba6f17ebbc675784010585344448136261c874decb0f
3
+ size 56271424
models/en,zh/MeanVC/source.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ https://huggingface.co/ASLP-lab/MeanVC
models/en,zh/MeanVC/vocos.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9e8aba28aa9cea0813e571a25ee33ef35bd74c803da34a65e683d9b0f7e2f281
3
+ size 33223980
models/wavlm_large_finetune.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:51f07e3b94d9e0262a6a675ef5a087be3dd09e8c62e9d886827f44f82fe7f94b
3
+ size 1301926579