Safetensors
tts
vc
svs
svc
music
RMSnow committed on
Commit
0c97490
·
verified ·
1 Parent(s): fa99f66

Update README.md

Browse files
Files changed (1) hide show
  1. README.md +37 -149
README.md CHANGED
@@ -17,11 +17,45 @@ tags:
17
  - music
18
  ---
19
 
20
- # Vevo2
21
 
22
- [![arXiv](https://img.shields.io/badge/Vevo-Paper-COLOR.svg)](https://arxiv.org/abs/2508.16332)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
23
 
24
  ## Usage
 
 
 
25
  ```python
26
  import os
27
  import torch
@@ -30,28 +64,6 @@ from huggingface_hub import snapshot_download
30
  from models.svc.vevo2.vevo2_utils import *
31
 
32
 
33
- def vevo2_tts(
34
- tgt_text,
35
- ref_wav_path,
36
- ref_text=None,
37
- timbre_ref_wav_path=None,
38
- output_path=None,
39
- ):
40
- if timbre_ref_wav_path is None:
41
- timbre_ref_wav_path = ref_wav_path
42
-
43
- gen_audio = inference_pipeline.inference_ar_and_fm(
44
- target_text=tgt_text,
45
- style_ref_wav_path=ref_wav_path,
46
- style_ref_wav_text=ref_text,
47
- timbre_ref_wav_path=timbre_ref_wav_path,
48
- use_prosody_code=False,
49
- )
50
-
51
- assert output_path is not None
52
- save_audio(gen_audio, output_path=output_path)
53
-
54
-
55
  def vevo2_editing(
56
  tgt_text,
57
  raw_wav_path,
@@ -71,54 +83,11 @@ def vevo2_editing(
71
  save_audio(gen_audio, output_path=output_path)
72
 
73
 
74
- def vevo2_singing_style_conversion(
75
- raw_wav_path,
76
- style_ref_wav_path,
77
- output_path=None,
78
- raw_text=None,
79
- style_ref_text=None,
80
- ):
81
- gen_audio = inference_pipeline.inference_ar_and_fm(
82
- target_text=raw_text,
83
- prosody_wav_path=raw_wav_path,
84
- style_ref_wav_path=style_ref_wav_path,
85
- style_ref_wav_text=style_ref_text,
86
- timbre_ref_wav_path=raw_wav_path,
87
- use_prosody_code=True,
88
- use_pitch_shift=True,
89
- )
90
-
91
- assert output_path is not None
92
- save_audio(gen_audio, output_path=output_path)
93
-
94
-
95
- def vevo2_melody_control(
96
- tgt_text,
97
- tgt_melody_wav_path,
98
- output_path=None,
99
- style_ref_wav_path=None,
100
- style_ref_text=None,
101
- timbre_ref_wav_path=None,
102
- ):
103
- gen_audio = inference_pipeline.inference_ar_and_fm(
104
- target_text=tgt_text,
105
- prosody_wav_path=tgt_melody_wav_path,
106
- style_ref_wav_path=style_ref_wav_path,
107
- style_ref_wav_text=style_ref_text,
108
- timbre_ref_wav_path=timbre_ref_wav_path,
109
- use_prosody_code=True,
110
- use_pitch_shift=True,
111
- )
112
-
113
- assert output_path is not None
114
- save_audio(gen_audio, output_path=output_path)
115
-
116
-
117
  def load_inference_pipeline():
118
  device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
119
 
120
  local_dir = snapshot_download(
121
- repo_id="amphion/Vevo2",
122
  repo_type="model",
123
  local_dir="./ckpts/Vevo2",
124
  resume_download=True,
@@ -166,47 +135,6 @@ if __name__ == "__main__":
166
  output_dir = "./models/svc/vevo2/output"
167
  os.makedirs(output_dir, exist_ok=True)
168
 
169
- ### Zero-shot Text-to-Speech and Text-to-Singing ###
170
- tgt_text = "I don't really care what you call me. I've been a silent spectator, watching species evolve, empires rise and fall. But always remember, I am mighty and enduring. Respect me and I'll nurture you; ignore me and you shall face the consequences."
171
- ref_wav_path = "./models/vc/vevo/wav/arabic_male.wav"
172
- ref_text = "Flip stood undecided, his ears strained to catch the slightest sound."
173
-
174
- jaychou_path = "./models/svc/vevosing/wav/jaychou.wav"
175
- jaychou_text = (
176
- "对这个世界如果你有太多的抱怨,跌倒了就不该继续往前走,为什么,人要这么的脆弱堕"
177
- )
178
- taiyizhenren_path = "./models/svc/vevosing/wav/taiyizhenren.wav"
179
- taiyizhenren_text = (
180
- "对,这就是我,万人敬仰的太乙真人。虽然有点婴儿肥,但也掩不住我,逼人的帅气。"
181
- )
182
-
183
- # the style reference and timbre reference are same
184
- vevo2_tts(
185
- tgt_text=tgt_text,
186
- ref_wav_path=ref_wav_path,
187
- timbre_ref_wav_path=ref_wav_path,
188
- output_path=os.path.join(output_dir, "zstts.wav"),
189
- ref_text=ref_text,
190
- )
191
-
192
- # the style reference and timbre reference are different
193
- vevo2_tts(
194
- tgt_text=tgt_text,
195
- ref_wav_path=ref_wav_path,
196
- timbre_ref_wav_path=jaychou_path,
197
- output_path=os.path.join(output_dir, "zstts_disentangled.wav"),
198
- ref_text=ref_text,
199
- )
200
-
201
- # the style reference is a singing voice
202
- vevo2_tts(
203
- tgt_text="顿时,气氛变得沉郁起来。乍看之下,一切的困扰仿佛都围绕在我身边。我皱着眉头,感受着那份压力,但我知道我不能放弃,不能认输。于是,我深吸一口气,心底的声音告诉我:“无论如何,都要冷静下来,重新开始。”",
204
- ref_wav_path=jaychou_path,
205
- ref_text=jaychou_text,
206
- timbre_ref_wav_path=taiyizhenren_path,
207
- output_path=os.path.join(output_dir, "zstts_singing.wav"),
208
- )
209
-
210
  ### Zero-shot Singing Editing ###
211
  adele_path = "./models/svc/vevosing/wav/adele.wav"
212
  adele_text = "Never mind, I'll find someone like you. I wish nothing but."
@@ -224,46 +152,6 @@ if __name__ == "__main__":
224
  raw_text=jaychou_text, # "对这个世界如果你有太多的抱怨,跌倒了就不该继续往前走,为什么,人要这么的脆弱堕"
225
  output_path=os.path.join(output_dir, "editing_jaychou.wav"),
226
  )
227
-
228
- ### Zero-shot Singing Style Conversion ###
229
- breathy_path = "./models/svc/vevosing/wav/breathy.wav"
230
- breathy_text = "离别没说再见你是否心酸"
231
-
232
- vibrato_path = "./models/svc/vevosing/wav/vibrato.wav"
233
- vibrato_text = "玫瑰的红,容易受伤的梦,握在手中却流失于指缝"
234
-
235
- vevo2_singing_style_conversion(
236
- raw_wav_path=breathy_path,
237
- raw_text=breathy_text,
238
- style_ref_wav_path=vibrato_path,
239
- style_ref_text=vibrato_text,
240
- output_path=os.path.join(output_dir, "ssc_breathy2vibrato.wav"),
241
- )
242
-
243
- ### Melody Control for Singing Synthesis ##
244
- humming_path = "./models/svc/vevosing/wav/humming.wav"
245
- piano_path = "./models/svc/vevosing/wav/piano.wav"
246
-
247
- # Humming to control the melody
248
- vevo2_melody_control(
249
- tgt_text="你是我的小呀小苹果,怎么爱,不嫌多",
250
- tgt_melody_wav_path=humming_path,
251
- output_path=os.path.join(output_dir, "melody_humming.wav"),
252
- style_ref_wav_path=taiyizhenren_path,
253
- style_ref_text=taiyizhenren_text,
254
- timbre_ref_wav_path=taiyizhenren_path,
255
- )
256
-
257
- # Piano to control the melody
258
- vevo2_melody_control(
259
- tgt_text="你是我的小呀小苹果,怎么爱,不嫌多",
260
- tgt_melody_wav_path=piano_path,
261
- output_path=os.path.join(output_dir, "melody_piano.wav"),
262
- style_ref_wav_path=taiyizhenren_path,
263
- style_ref_text=taiyizhenren_text,
264
- timbre_ref_wav_path=taiyizhenren_path,
265
- )
266
-
267
  ```
268
 
269
  ## Citations
 
17
  - music
18
  ---
19
 
20
+ # Vevo2: A Unified and Controllable Framework for Speech and Singing Voice Generation
21
 
22
+ [![arXiv](https://img.shields.io/badge/arXiv-2508.16332-brightgreen.svg?style=flat-square)](https://arxiv.org/abs/2508.16332)
23
+ [![hf](https://img.shields.io/badge/%F0%9F%A4%97%20HuggingFace-model-yellow)](https://huggingface.co/RMSnow/Vevo2)
24
+ [![vevo](https://img.shields.io/badge/WebPage-Demo-red.svg)](https://versasinger.github.io/)
25
+
26
+ We present **Vevo2**, a unified and controllable framework for speech and singing voice generation. Vevo2 bridges controllable speech and singing voice generation via unified prosody learning, and supports a comprehensive set of generation tasks, including:
27
+
28
+ 1. Zero-shot Text-to-Speech (TTS), Text-to-Singing, and Singing Voice Synthesis (SVS)
29
+ 2. Style-preserved Voice/Singing Voice Conversion (VC/SVC)
30
+ 3. Style-converted Voice/Singing Voice Conversion (VC/SVC)
31
+ 4. Speech/Singing Voice Editing
32
+ 5. Singing Style Conversion
33
+ 6. Humming-to-Singing and Instrument-to-Singing
34
+
35
+ ![Vevo2](../../../imgs/svc/vevo1.5.png)
36
+
37
+ ## Pre-trained Models
38
+
39
+ We have included the following pre-trained models at [🤗 RMSnow/Vevo2](https://huggingface.co/RMSnow/Vevo2):
40
+
41
+ | Model | Description | Pre-trained Data and Checkpoint |
42
+ | ------------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------ |
43
+ | **Prosody Tokenizer** | Converting speech/singing waveform to **coarse-grained prosody tokens** (which can also be interpreted as *melody contour* from a musical perspective). It is a single codebook VQ-VAE with a vocabulary size of 512. The frame rate is 6.25 Hz. (i.e., **56.25 bps**) | [🤗 Emilia-101k, SingNet-7k](https://huggingface.co/RMSnow/Vevo2/tree/main/tokenizer/prosody_fvq512_6.25hz) |
44
+ | **Content-Style Tokenizer** | Converting speech/singing waveform to **fine-grained content-style tokens**. It is a single codebook VQ-VAE with a vocabulary size of 16384. The frame rate is 12.5 Hz. (i.e., **175 bps**) | [🤗 Emilia-101k, SingNet-7k](https://huggingface.co/RMSnow/Vevo2/tree/main/tokenizer/contentstyle_fvq16384_12.5hz) |
45
+ | **AR Model** | A Qwen-based (Qwen2.5-0.5B) large language model post-trained to predict content-style tokens from text tokens and optionally prosody tokens, with unified prosody learning across speech and singing. | [🤗 Emilia-101k, SingNet-7k](https://huggingface.co/RMSnow/Vevo2/tree/main/contentstyle_modeling/posttrained) |
46
+ | **Flow-matching Transformer** | Predicting mel-spectrogram from content-style tokens with a flow-matching transformer (350M). | [🤗 Emilia-101k, SingNet-7k](https://huggingface.co/RMSnow/Vevo2/tree/main/acoustic_modeling/fm_emilia101k_singnet7k_repa) |
47
+ | **Vocoder** | Predicting audio from mel-spectrogram with a Vocos-based vocoder (250M). | [🤗 Emilia-101k, SingNet-7k](https://huggingface.co/RMSnow/Vevo2/tree/main/vocoder) |
48
+
49
+ The training data includes:
50
+
51
+ - **Emilia-101k**: about 101k hours of speech data
52
+
53
+ - **SingNet-7k**: about 7,000 hours of internal singing voice data, preprocessed using the [SingNet pipeline](https://openreview.net/pdf?id=X6ffdf6nh3).
54
 
55
  ## Usage
56
+
57
+ You can refer to our [recipe](https://github.com/open-mmlab/Amphion/blob/main/models/svc/vevo2/README.md) on GitHub for more usage details. For example, to perform speech/singing voice editing after cloning the Amphion GitHub repository, you can use a script like:
58
+
59
  ```python
60
  import os
61
  import torch
 
64
  from models.svc.vevo2.vevo2_utils import *
65
 
66
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
67
  def vevo2_editing(
68
  tgt_text,
69
  raw_wav_path,
 
83
  save_audio(gen_audio, output_path=output_path)
84
 
85
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
86
  def load_inference_pipeline():
87
  device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
88
 
89
  local_dir = snapshot_download(
90
+ repo_id="RMSnow/Vevo2",
91
  repo_type="model",
92
  local_dir="./ckpts/Vevo2",
93
  resume_download=True,
 
135
  output_dir = "./models/svc/vevo2/output"
136
  os.makedirs(output_dir, exist_ok=True)
137
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
138
  ### Zero-shot Singing Editing ###
139
  adele_path = "./models/svc/vevosing/wav/adele.wav"
140
  adele_text = "Never mind, I'll find someone like you. I wish nothing but."
 
152
  raw_text=jaychou_text, # "对这个世界如果你有太多的抱怨,跌倒了就不该继续往前走,为什么,人要这么的脆弱堕"
153
  output_path=os.path.join(output_dir, "editing_jaychou.wav"),
154
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
155
  ```
156
 
157
  ## Citations