Update README.md
Browse files
README.md
CHANGED
|
@@ -17,11 +17,45 @@ tags:
|
|
| 17 |
- music
|
| 18 |
---
|
| 19 |
|
| 20 |
-
# Vevo2
|
| 21 |
|
| 22 |
-
[:
|
| 40 |
-
if timbre_ref_wav_path is None:
|
| 41 |
-
timbre_ref_wav_path = ref_wav_path
|
| 42 |
-
|
| 43 |
-
gen_audio = inference_pipeline.inference_ar_and_fm(
|
| 44 |
-
target_text=tgt_text,
|
| 45 |
-
style_ref_wav_path=ref_wav_path,
|
| 46 |
-
style_ref_wav_text=ref_text,
|
| 47 |
-
timbre_ref_wav_path=timbre_ref_wav_path,
|
| 48 |
-
use_prosody_code=False,
|
| 49 |
-
)
|
| 50 |
-
|
| 51 |
-
assert output_path is not None
|
| 52 |
-
save_audio(gen_audio, output_path=output_path)
|
| 53 |
-
|
| 54 |
-
|
| 55 |
def vevo2_editing(
|
| 56 |
tgt_text,
|
| 57 |
raw_wav_path,
|
|
@@ -71,54 +83,11 @@ def vevo2_editing(
|
|
| 71 |
save_audio(gen_audio, output_path=output_path)
|
| 72 |
|
| 73 |
|
| 74 |
-
def vevo2_singing_style_conversion(
|
| 75 |
-
raw_wav_path,
|
| 76 |
-
style_ref_wav_path,
|
| 77 |
-
output_path=None,
|
| 78 |
-
raw_text=None,
|
| 79 |
-
style_ref_text=None,
|
| 80 |
-
):
|
| 81 |
-
gen_audio = inference_pipeline.inference_ar_and_fm(
|
| 82 |
-
target_text=raw_text,
|
| 83 |
-
prosody_wav_path=raw_wav_path,
|
| 84 |
-
style_ref_wav_path=style_ref_wav_path,
|
| 85 |
-
style_ref_wav_text=style_ref_text,
|
| 86 |
-
timbre_ref_wav_path=raw_wav_path,
|
| 87 |
-
use_prosody_code=True,
|
| 88 |
-
use_pitch_shift=True,
|
| 89 |
-
)
|
| 90 |
-
|
| 91 |
-
assert output_path is not None
|
| 92 |
-
save_audio(gen_audio, output_path=output_path)
|
| 93 |
-
|
| 94 |
-
|
| 95 |
-
def vevo2_melody_control(
|
| 96 |
-
tgt_text,
|
| 97 |
-
tgt_melody_wav_path,
|
| 98 |
-
output_path=None,
|
| 99 |
-
style_ref_wav_path=None,
|
| 100 |
-
style_ref_text=None,
|
| 101 |
-
timbre_ref_wav_path=None,
|
| 102 |
-
):
|
| 103 |
-
gen_audio = inference_pipeline.inference_ar_and_fm(
|
| 104 |
-
target_text=tgt_text,
|
| 105 |
-
prosody_wav_path=tgt_melody_wav_path,
|
| 106 |
-
style_ref_wav_path=style_ref_wav_path,
|
| 107 |
-
style_ref_wav_text=style_ref_text,
|
| 108 |
-
timbre_ref_wav_path=timbre_ref_wav_path,
|
| 109 |
-
use_prosody_code=True,
|
| 110 |
-
use_pitch_shift=True,
|
| 111 |
-
)
|
| 112 |
-
|
| 113 |
-
assert output_path is not None
|
| 114 |
-
save_audio(gen_audio, output_path=output_path)
|
| 115 |
-
|
| 116 |
-
|
| 117 |
def load_inference_pipeline():
|
| 118 |
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
|
| 119 |
|
| 120 |
local_dir = snapshot_download(
|
| 121 |
-
repo_id="
|
| 122 |
repo_type="model",
|
| 123 |
local_dir="./ckpts/Vevo2",
|
| 124 |
resume_download=True,
|
|
@@ -166,47 +135,6 @@ if __name__ == "__main__":
|
|
| 166 |
output_dir = "./models/svc/vevo2/output"
|
| 167 |
os.makedirs(output_dir, exist_ok=True)
|
| 168 |
|
| 169 |
-
### Zero-shot Text-to-Speech and Text-to-Singing ###
|
| 170 |
-
tgt_text = "I don't really care what you call me. I've been a silent spectator, watching species evolve, empires rise and fall. But always remember, I am mighty and enduring. Respect me and I'll nurture you; ignore me and you shall face the consequences."
|
| 171 |
-
ref_wav_path = "./models/vc/vevo/wav/arabic_male.wav"
|
| 172 |
-
ref_text = "Flip stood undecided, his ears strained to catch the slightest sound."
|
| 173 |
-
|
| 174 |
-
jaychou_path = "./models/svc/vevosing/wav/jaychou.wav"
|
| 175 |
-
jaychou_text = (
|
| 176 |
-
"对这个世界如果你有太多的抱怨,跌倒了就不该继续往前走,为什么,人要这么的脆弱堕"
|
| 177 |
-
)
|
| 178 |
-
taiyizhenren_path = "./models/svc/vevosing/wav/taiyizhenren.wav"
|
| 179 |
-
taiyizhenren_text = (
|
| 180 |
-
"对,这就是我,万人敬仰的太乙真人。虽然有点婴儿肥,但也掩不住我,逼人的帅气。"
|
| 181 |
-
)
|
| 182 |
-
|
| 183 |
-
# the style reference and timbre reference are same
|
| 184 |
-
vevo2_tts(
|
| 185 |
-
tgt_text=tgt_text,
|
| 186 |
-
ref_wav_path=ref_wav_path,
|
| 187 |
-
timbre_ref_wav_path=ref_wav_path,
|
| 188 |
-
output_path=os.path.join(output_dir, "zstts.wav"),
|
| 189 |
-
ref_text=ref_text,
|
| 190 |
-
)
|
| 191 |
-
|
| 192 |
-
# the style reference and timbre reference are different
|
| 193 |
-
vevo2_tts(
|
| 194 |
-
tgt_text=tgt_text,
|
| 195 |
-
ref_wav_path=ref_wav_path,
|
| 196 |
-
timbre_ref_wav_path=jaychou_path,
|
| 197 |
-
output_path=os.path.join(output_dir, "zstts_disentangled.wav"),
|
| 198 |
-
ref_text=ref_text,
|
| 199 |
-
)
|
| 200 |
-
|
| 201 |
-
# the style reference is a singing voice
|
| 202 |
-
vevo2_tts(
|
| 203 |
-
tgt_text="顿时,气氛变得沉郁起来。乍看之下,一切的困扰仿佛都围绕在我身边。我皱着眉头,感受着那份压力,但我知道我不能放弃,不能认输。于是,我深吸一口气,心底的声音告诉我:“无论如何,都要冷静下来,重新开始。”",
|
| 204 |
-
ref_wav_path=jaychou_path,
|
| 205 |
-
ref_text=jaychou_text,
|
| 206 |
-
timbre_ref_wav_path=taiyizhenren_path,
|
| 207 |
-
output_path=os.path.join(output_dir, "zstts_singing.wav"),
|
| 208 |
-
)
|
| 209 |
-
|
| 210 |
### Zero-shot Singing Editing ###
|
| 211 |
adele_path = "./models/svc/vevosing/wav/adele.wav"
|
| 212 |
adele_text = "Never mind, I'll find someone like you. I wish nothing but."
|
|
@@ -224,46 +152,6 @@ if __name__ == "__main__":
|
|
| 224 |
raw_text=jaychou_text, # "对这个世界如果你有太多的抱怨,跌倒了就不该继续往前走,为什么,人要这么的脆弱堕"
|
| 225 |
output_path=os.path.join(output_dir, "editing_jaychou.wav"),
|
| 226 |
)
|
| 227 |
-
|
| 228 |
-
### Zero-shot Singing Style Conversion ###
|
| 229 |
-
breathy_path = "./models/svc/vevosing/wav/breathy.wav"
|
| 230 |
-
breathy_text = "离别没说再见你是否心酸"
|
| 231 |
-
|
| 232 |
-
vibrato_path = "./models/svc/vevosing/wav/vibrato.wav"
|
| 233 |
-
vibrato_text = "玫瑰的红,容易受伤的梦,握在手中却流失于指缝"
|
| 234 |
-
|
| 235 |
-
vevo2_singing_style_conversion(
|
| 236 |
-
raw_wav_path=breathy_path,
|
| 237 |
-
raw_text=breathy_text,
|
| 238 |
-
style_ref_wav_path=vibrato_path,
|
| 239 |
-
style_ref_text=vibrato_text,
|
| 240 |
-
output_path=os.path.join(output_dir, "ssc_breathy2vibrato.wav"),
|
| 241 |
-
)
|
| 242 |
-
|
| 243 |
-
### Melody Control for Singing Synthesis ##
|
| 244 |
-
humming_path = "./models/svc/vevosing/wav/humming.wav"
|
| 245 |
-
piano_path = "./models/svc/vevosing/wav/piano.wav"
|
| 246 |
-
|
| 247 |
-
# Humming to control the melody
|
| 248 |
-
vevo2_melody_control(
|
| 249 |
-
tgt_text="你是我的小呀小苹果,怎么爱,不嫌多",
|
| 250 |
-
tgt_melody_wav_path=humming_path,
|
| 251 |
-
output_path=os.path.join(output_dir, "melody_humming.wav"),
|
| 252 |
-
style_ref_wav_path=taiyizhenren_path,
|
| 253 |
-
style_ref_text=taiyizhenren_text,
|
| 254 |
-
timbre_ref_wav_path=taiyizhenren_path,
|
| 255 |
-
)
|
| 256 |
-
|
| 257 |
-
# Piano to control the melody
|
| 258 |
-
vevo2_melody_control(
|
| 259 |
-
tgt_text="你是我的小呀小苹果,怎么爱,不嫌多",
|
| 260 |
-
tgt_melody_wav_path=piano_path,
|
| 261 |
-
output_path=os.path.join(output_dir, "melody_piano.wav"),
|
| 262 |
-
style_ref_wav_path=taiyizhenren_path,
|
| 263 |
-
style_ref_text=taiyizhenren_text,
|
| 264 |
-
timbre_ref_wav_path=taiyizhenren_path,
|
| 265 |
-
)
|
| 266 |
-
|
| 267 |
```
|
| 268 |
|
| 269 |
## Citations
|
|
|
|
| 17 |
- music
|
| 18 |
---
|
| 19 |
|
| 20 |
+
# Vevo2: A Unified and Controllable Framework for Speech and Singing Voice Generation
|
| 21 |
|
| 22 |
+
[arXiv Paper (2508.16332)](https://arxiv.org/abs/2508.16332)
|
| 23 |
+
[🤗 Hugging Face Model: RMSnow/Vevo2](https://huggingface.co/RMSnow/Vevo2)
|
| 24 |
+
[Demo Page](https://versasinger.github.io/)
|
| 25 |
+
|
| 26 |
+
We present **Vevo2**, a unified and controllable framework for speech and singing voice generation. Vevo2 bridges controllable speech and singing voice generation via unified prosody learning, and supports a comprehensive set of generation tasks, including:
|
| 27 |
+
|
| 28 |
+
1. Zero-shot Text-to-Speech (TTS), Text-to-Singing, and Singing Voice Synthesis (SVS)
|
| 29 |
+
2. Style-preserved Voice/Singing Voice Conversion (VC/SVC)
|
| 30 |
+
3. Style-converted Voice/Singing Voice Conversion (VC/SVC)
|
| 31 |
+
4. Speech/Singing Voice Editing
|
| 32 |
+
5. Singing Style Conversion
|
| 33 |
+
6. Humming-to-Singing and Instrument-to-Singing
|
| 34 |
+
|
| 35 |
+

|
| 36 |
+
|
| 37 |
+
## Pre-trained Models
|
| 38 |
+
|
| 39 |
+
The following pre-trained models are available at [🤗 RMSnow/Vevo2](https://huggingface.co/RMSnow/Vevo2):
|
| 40 |
+
|
| 41 |
+
| Model | Description | Pre-trained Data and Checkpoint |
|
| 42 |
+
| ------------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------ |
|
| 43 |
+
| **Prosody Tokenizer** | Converting speech/singing waveform to **coarse-grained prosody tokens** (which can also be interpreted as *melody contour* from a musical perspective). It is a single-codebook VQ-VAE with a vocabulary size of 512. The frame rate is 6.25 Hz (i.e., **56.25 bps**). | [🤗 Emilia-101k, SingNet-7k](https://huggingface.co/RMSnow/Vevo2/tree/main/tokenizer/prosody_fvq512_6.25hz) |
|
| 44 |
+
| **Content-Style Tokenizer** | Converting speech/singing waveform to **fine-grained content-style tokens**. It is a single-codebook VQ-VAE with a vocabulary size of 16384. The frame rate is 12.5 Hz (i.e., **175 bps**). | [🤗 Emilia-101k, SingNet-7k](https://huggingface.co/RMSnow/Vevo2/tree/main/tokenizer/contentstyle_fvq16384_12.5hz) |
|
| 45 |
+
| **AR Model** | A Qwen-based (Qwen2.5-0.5B) large language model post-trained to predict content-style tokens from text tokens and optionally prosody tokens, with unified prosody learning across speech and singing. | [🤗 Emilia-101k, SingNet-7k](https://huggingface.co/RMSnow/Vevo2/tree/main/contentstyle_modeling/posttrained) |
|
| 46 |
+
| **Flow-matching Transformer** | Predicting mel-spectrogram from content-style tokens with a flow-matching transformer (350M). | [🤗 Emilia-101k, SingNet-7k](https://huggingface.co/RMSnow/Vevo2/tree/main/acoustic_modeling/fm_emilia101k_singnet7k_repa) |
|
| 47 |
+
| **Vocoder** | Predicting audio from mel-spectrogram with a Vocos-based vocoder (250M). | [🤗 Emilia-101k, SingNet-7k](https://huggingface.co/RMSnow/Vevo2/tree/main/vocoder) |
|
| 48 |
+
|
| 49 |
+
The training data includes:
|
| 50 |
+
|
| 51 |
+
- **Emilia-101k**: about 101k hours of speech data
|
| 52 |
+
|
| 53 |
+
- **SingNet-7k**: about 7,000 hours of internal singing voice data, preprocessed using the [SingNet pipeline](https://openreview.net/pdf?id=X6ffdf6nh3).
|
| 54 |
|
| 55 |
## Usage
|
| 56 |
+
|
| 57 |
+
Refer to our [recipe](https://github.com/open-mmlab/Amphion/blob/main/models/svc/vevo2/README.md) on GitHub for more usage details. For example, to run speech/singing voice editing, clone the Amphion GitHub repository and then use a script like the following:
|
| 58 |
+
|
| 59 |
```python
|
| 60 |
import os
|
| 61 |
import torch
|
|
|
|
| 64 |
from models.svc.vevo2.vevo2_utils import *
|
| 65 |
|
| 66 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 67 |
def vevo2_editing(
|
| 68 |
tgt_text,
|
| 69 |
raw_wav_path,
|
|
|
|
| 83 |
save_audio(gen_audio, output_path=output_path)
|
| 84 |
|
| 85 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 86 |
def load_inference_pipeline():
|
| 87 |
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
|
| 88 |
|
| 89 |
local_dir = snapshot_download(
|
| 90 |
+
repo_id="RMSnow/Vevo2",
|
| 91 |
repo_type="model",
|
| 92 |
local_dir="./ckpts/Vevo2",
|
| 93 |
resume_download=True,
|
|
|
|
| 135 |
output_dir = "./models/svc/vevo2/output"
|
| 136 |
os.makedirs(output_dir, exist_ok=True)
|
| 137 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 138 |
### Zero-shot Singing Editing ###
|
| 139 |
adele_path = "./models/svc/vevosing/wav/adele.wav"
|
| 140 |
adele_text = "Never mind, I'll find someone like you. I wish nothing but."
|
|
|
|
| 152 |
raw_text=jaychou_text, # "对这个世界如果你有太多的抱怨,跌倒了就不该继续往前走,为什么,人要这么的脆弱堕"
|
| 153 |
output_path=os.path.join(output_dir, "editing_jaychou.wav"),
|
| 154 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 155 |
```
|
| 156 |
|
| 157 |
## Citations
|