Safetensors
tts
vc
svs
svc
music
RMSnow committed on
Commit
0c97490
·
verified ·
1 Parent(s): fa99f66

Update README.md

Browse files
Files changed (1) hide show
  1. README.md +37 -149
README.md CHANGED
@@ -17,11 +17,45 @@ tags:
17
  - music
18
  ---
19
 
20
- # Vevo2
21
 
22
- [![arXiv](https://img.shields.io/badge/Vevo-Paper-COLOR.svg)](https://arxiv.org/abs/2508.16332)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
23
 
24
  ## Usage
 
 
 
25
  ```python
26
  import os
27
  import torch
@@ -30,28 +64,6 @@ from huggingface_hub import snapshot_download
30
  from models.svc.vevo2.vevo2_utils import *
31
 
32
 
33
- def vevo2_tts(
34
- tgt_text,
35
- ref_wav_path,
36
- ref_text=None,
37
- timbre_ref_wav_path=None,
38
- output_path=None,
39
- ):
40
- if timbre_ref_wav_path is None:
41
- timbre_ref_wav_path = ref_wav_path
42
-
43
- gen_audio = inference_pipeline.inference_ar_and_fm(
44
- target_text=tgt_text,
45
- style_ref_wav_path=ref_wav_path,
46
- style_ref_wav_text=ref_text,
47
- timbre_ref_wav_path=timbre_ref_wav_path,
48
- use_prosody_code=False,
49
- )
50
-
51
- assert output_path is not None
52
- save_audio(gen_audio, output_path=output_path)
53
-
54
-
55
  def vevo2_editing(
56
  tgt_text,
57
  raw_wav_path,
@@ -71,54 +83,11 @@ def vevo2_editing(
71
  save_audio(gen_audio, output_path=output_path)
72
 
73
 
74
- def vevo2_singing_style_conversion(
75
- raw_wav_path,
76
- style_ref_wav_path,
77
- output_path=None,
78
- raw_text=None,
79
- style_ref_text=None,
80
- ):
81
- gen_audio = inference_pipeline.inference_ar_and_fm(
82
- target_text=raw_text,
83
- prosody_wav_path=raw_wav_path,
84
- style_ref_wav_path=style_ref_wav_path,
85
- style_ref_wav_text=style_ref_text,
86
- timbre_ref_wav_path=raw_wav_path,
87
- use_prosody_code=True,
88
- use_pitch_shift=True,
89
- )
90
-
91
- assert output_path is not None
92
- save_audio(gen_audio, output_path=output_path)
93
-
94
-
95
- def vevo2_melody_control(
96
- tgt_text,
97
- tgt_melody_wav_path,
98
- output_path=None,
99
- style_ref_wav_path=None,
100
- style_ref_text=None,
101
- timbre_ref_wav_path=None,
102
- ):
103
- gen_audio = inference_pipeline.inference_ar_and_fm(
104
- target_text=tgt_text,
105
- prosody_wav_path=tgt_melody_wav_path,
106
- style_ref_wav_path=style_ref_wav_path,
107
- style_ref_wav_text=style_ref_text,
108
- timbre_ref_wav_path=timbre_ref_wav_path,
109
- use_prosody_code=True,
110
- use_pitch_shift=True,
111
- )
112
-
113
- assert output_path is not None
114
- save_audio(gen_audio, output_path=output_path)
115
-
116
-
117
  def load_inference_pipeline():
118
  device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
119
 
120
  local_dir = snapshot_download(
121
- repo_id="amphion/Vevo2",
122
  repo_type="model",
123
  local_dir="./ckpts/Vevo2",
124
  resume_download=True,
@@ -166,47 +135,6 @@ if __name__ == "__main__":
166
  output_dir = "./models/svc/vevo2/output"
167
  os.makedirs(output_dir, exist_ok=True)
168
 
169
- ### Zero-shot Text-to-Speech and Text-to-Singing ###
170
- tgt_text = "I don't really care what you call me. I've been a silent spectator, watching species evolve, empires rise and fall. But always remember, I am mighty and enduring. Respect me and I'll nurture you; ignore me and you shall face the consequences."
171
- ref_wav_path = "./models/vc/vevo/wav/arabic_male.wav"
172
- ref_text = "Flip stood undecided, his ears strained to catch the slightest sound."
173
-
174
- jaychou_path = "./models/svc/vevosing/wav/jaychou.wav"
175
- jaychou_text = (
176
- "对这个世界如果你有太多的抱怨,跌倒了就不该继续往前走,为什么,人要这么的脆弱堕"
177
- )
178
- taiyizhenren_path = "./models/svc/vevosing/wav/taiyizhenren.wav"
179
- taiyizhenren_text = (
180
- "对,这就是我,万人敬仰的太乙真人。虽然有点婴儿肥,但也掩不住我,逼人的帅气。"
181
- )
182
-
183
- # the style reference and timbre reference are same
184
- vevo2_tts(
185
- tgt_text=tgt_text,
186
- ref_wav_path=ref_wav_path,
187
- timbre_ref_wav_path=ref_wav_path,
188
- output_path=os.path.join(output_dir, "zstts.wav"),
189
- ref_text=ref_text,
190
- )
191
-
192
- # the style reference and timbre reference are different
193
- vevo2_tts(
194
- tgt_text=tgt_text,
195
- ref_wav_path=ref_wav_path,
196
- timbre_ref_wav_path=jaychou_path,
197
- output_path=os.path.join(output_dir, "zstts_disentangled.wav"),
198
- ref_text=ref_text,
199
- )
200
-
201
- # the style reference is a singing voice
202
- vevo2_tts(
203
- tgt_text="顿时,气氛变得沉郁起来。乍看之下,一切的困扰仿佛都围绕在我身边。我皱着眉头,感受着那份压力,但我知道我不能放弃,不能认输。于是,我深吸一口气,心底的声音告诉我:“无论如何,都要冷静下来,重新开始。”",
204
- ref_wav_path=jaychou_path,
205
- ref_text=jaychou_text,
206
- timbre_ref_wav_path=taiyizhenren_path,
207
- output_path=os.path.join(output_dir, "zstts_singing.wav"),
208
- )
209
-
210
  ### Zero-shot Singing Editing ###
211
  adele_path = "./models/svc/vevosing/wav/adele.wav"
212
  adele_text = "Never mind, I'll find someone like you. I wish nothing but."
@@ -224,46 +152,6 @@ if __name__ == "__main__":
224
  raw_text=jaychou_text, # "对这个世界如果你有太多的抱怨,跌倒了就不该继续往前走,为什么,人要这么的脆弱堕"
225
  output_path=os.path.join(output_dir, "editing_jaychou.wav"),
226
  )
227
-
228
- ### Zero-shot Singing Style Conversion ###
229
- breathy_path = "./models/svc/vevosing/wav/breathy.wav"
230
- breathy_text = "离别没说再见你是否心酸"
231
-
232
- vibrato_path = "./models/svc/vevosing/wav/vibrato.wav"
233
- vibrato_text = "玫瑰的红,容易受伤的梦,握在手中却流失于指缝"
234
-
235
- vevo2_singing_style_conversion(
236
- raw_wav_path=breathy_path,
237
- raw_text=breathy_text,
238
- style_ref_wav_path=vibrato_path,
239
- style_ref_text=vibrato_text,
240
- output_path=os.path.join(output_dir, "ssc_breathy2vibrato.wav"),
241
- )
242
-
243
- ### Melody Control for Singing Synthesis ##
244
- humming_path = "./models/svc/vevosing/wav/humming.wav"
245
- piano_path = "./models/svc/vevosing/wav/piano.wav"
246
-
247
- # Humming to control the melody
248
- vevo2_melody_control(
249
- tgt_text="你是我的小呀小苹果,怎么爱,不嫌多",
250
- tgt_melody_wav_path=humming_path,
251
- output_path=os.path.join(output_dir, "melody_humming.wav"),
252
- style_ref_wav_path=taiyizhenren_path,
253
- style_ref_text=taiyizhenren_text,
254
- timbre_ref_wav_path=taiyizhenren_path,
255
- )
256
-
257
- # Piano to control the melody
258
- vevo2_melody_control(
259
- tgt_text="你是我的小呀小苹果,怎么爱,不嫌多",
260
- tgt_melody_wav_path=piano_path,
261
- output_path=os.path.join(output_dir, "melody_piano.wav"),
262
- style_ref_wav_path=taiyizhenren_path,
263
- style_ref_text=taiyizhenren_text,
264
- timbre_ref_wav_path=taiyizhenren_path,
265
- )
266
-
267
  ```
268
 
269
  ## Citations
 
17
  - music
18
  ---
19
 
20
+ # Vevo2: A Unified and Controllable Framework for Speech and Singing Voice Generation
21
 
22
+ [![arXiv](https://img.shields.io/badge/arXiv-2508.16332-brightgreen.svg?style=flat-square)](https://arxiv.org/abs/2508.16332)
23
+ [![hf](https://img.shields.io/badge/%F0%9F%A4%97%20HuggingFace-model-yellow)](https://huggingface.co/RMSnow/Vevo2)
24
+ [![vevo](https://img.shields.io/badge/WebPage-Demo-red.svg)](https://versasinger.github.io/)
25
+
26
+ We present **Vevo2**, a unified and controllable framework for speech and singing voice generation. Vevo2 bridges controllable speech and singing voice generation via unified prosody learning, and supports a comprehensive set of generation tasks, including:
27
+
28
+ 1. Zero-shot Text-to-Speech (TTS), Text-to-Singing, and Singing Voice Synthesis (SVS)
29
+ 2. Style-preserved Voice/Singing Voice Conversion (VC/SVC)
30
+ 3. Style-converted Voice/Singing Voice Conversion (VC/SVC)
31
+ 4. Speech/Singing Voice Editing
32
+ 5. Singing Style Conversion
33
+ 6. Humming-to-Singing and Instrument-to-Singing
34
+
35
+ ![Vevo2](../../../imgs/svc/vevo1.5.png)
36
+
37
+ ## Pre-trained Models
38
+
39
+ We have included the following pre-trained models at [🤗 RMSnow/Vevo2](https://huggingface.co/RMSnow/Vevo2):
40
+
41
+ | Model | Description | Pre-trained Data and Checkpoint |
42
+ | ------------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------ |
43
+ | **Prosody Tokenizer** | Converting speech/singing waveform to **coarse-grained prosody tokens** (which can also be interpreted as *melody contour* from a musical perspective). It is a single codebook VQ-VAE with a vocabulary size of 512. The frame rate is 6.25 Hz. (i.e., **56.25 bps**) | [🤗 Emilia-101k, SingNet-7k](https://huggingface.co/RMSnow/Vevo2/tree/main/tokenizer/prosody_fvq512_6.25hz) |
44
+ | **Content-Style Tokenizer** | Converting speech/singing waveform to **fine-grained content-style tokens**. It is a single codebook VQ-VAE with a vocabulary size of 16384. The frame rate is 12.5 Hz. (i.e., **175 bps**) | [🤗 Emilia-101k, SingNet-7k](https://huggingface.co/RMSnow/Vevo2/tree/main/tokenizer/contentstyle_fvq16384_12.5hz) |
45
+ | **AR Model** | A Qwen-based (Qwen2.5-0.5B) large language model post-trained to predict content-style tokens from text tokens and optionally prosody tokens, with unified prosody learning across speech and singing. | [🤗 Emilia-101k, SingNet-7k](https://huggingface.co/RMSnow/Vevo2/tree/main/contentstyle_modeling/posttrained) |
46
+ | **Flow-matching Transformer** | Predicting mel-spectrogram from content-style tokens with a flow-matching transformer (350M). | [🤗 Emilia-101k, SingNet-7k](https://huggingface.co/RMSnow/Vevo2/tree/main/acoustic_modeling/fm_emilia101k_singnet7k_repa) |
47
+ | **Vocoder** | Predicting audio from mel-spectrogram with a Vocos-based vocoder (250M). | [🤗 Emilia-101k, SingNet-7k](https://huggingface.co/RMSnow/Vevo2/tree/main/vocoder) |
48
+
49
+ The training data includes:
50
+
51
+ - **Emilia-101k**: about 101k hours of speech data
52
+
53
+ - **SingNet-7k**: about 7,000 hours of internal singing voice data, preprocessed using the [SingNet pipeline](https://openreview.net/pdf?id=X6ffdf6nh3).
54
 
55
  ## Usage
56
+
57
+ You can refer to our [recipe](https://github.com/open-mmlab/Amphion/blob/main/models/svc/vevo2/README.md) on GitHub for more usage details. For example, to perform speech/singing voice editing after cloning the Amphion GitHub repository, you can use a script like:
58
+
59
  ```python
60
  import os
61
  import torch
 
64
  from models.svc.vevo2.vevo2_utils import *
65
 
66
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
67
  def vevo2_editing(
68
  tgt_text,
69
  raw_wav_path,
 
83
  save_audio(gen_audio, output_path=output_path)
84
 
85
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
86
  def load_inference_pipeline():
87
  device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
88
 
89
  local_dir = snapshot_download(
90
+ repo_id="RMSnow/Vevo2",
91
  repo_type="model",
92
  local_dir="./ckpts/Vevo2",
93
  resume_download=True,
 
135
  output_dir = "./models/svc/vevo2/output"
136
  os.makedirs(output_dir, exist_ok=True)
137
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
138
  ### Zero-shot Singing Editing ###
139
  adele_path = "./models/svc/vevosing/wav/adele.wav"
140
  adele_text = "Never mind, I'll find someone like you. I wish nothing but."
 
152
  raw_text=jaychou_text, # "对这个世界如果你有太多的抱怨,跌倒了就不该继续往前走,为什么,人要这么的脆弱堕"
153
  output_path=os.path.join(output_dir, "editing_jaychou.wav"),
154
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
155
  ```
156
 
157
  ## Citations