Spaces:

Difficult-Burger
/

vevo-test

Build error

App Files Files Community

积极的屁孩 committed on Apr 10, 2025

Commit

c3e56e6

1 Parent(s): 5e1a778

test

Browse files

Files changed (2) hide show

app.py +378 -4
requirements.txt +12 -0

app.py CHANGED Viewed

@@ -1,7 +1,381 @@
 import gradio as gr
-def greet(name):
-    return "Hello " + name + "!!"
-demo = gr.Interface(fn=greet, inputs="text", outputs="text")
-demo.launch()

+import os
+import sys
 import gradio as gr
+import torch
+import tempfile
+from pathlib import Path
+from huggingface_hub import snapshot_download, hf_hub_download
+# Put the current directory on sys.path so the project-local `models` package
+# (imported on the next statement) can be resolved.
+sys.path.append(".")
+# Vevo helpers used by the app: the inference pipeline class and the audio writer.
+from models.vc.vevo.vevo_utils import VevoInferencePipeline, save_audio
+# Model constants: the Hugging Face repo holding the checkpoints and the
+# directory passed as `cache_dir` to every snapshot download.
+REPO_ID = "amphion/Vevo"
+CACHE_DIR = "./ckpts/Vevo"
+class VevoGradioApp:
+    def __init__(self):
+        # 设备设置
+        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+        self.pipelines = {}
+        # 配置文件路径
+        self.config_paths = {
+            "vq32tovq8192": "./models/vc/vevo/config/Vq32ToVq8192.json",
+            "vq8192tomels": "./models/vc/vevo/config/Vq8192ToMels.json",
+            "phonetovq8192": "./models/vc/vevo/config/PhoneToVq8192.json",
+            "vocoder": "./models/vc/vevo/config/Vocoder.json"
+        }
+        # 确保配置文件存在
+        self.download_configs()
+    def download_configs(self):
+        """下载必要的配置文件"""
+        os.makedirs("./models/vc/vevo/config", exist_ok=True)
+        config_files = {
+            "Vq32ToVq8192.json": "https://raw.githubusercontent.com/open-mmlab/Amphion/main/models/vc/vevo/config/Vq32ToVq8192.json",
+            "Vq8192ToMels.json": "https://raw.githubusercontent.com/open-mmlab/Amphion/main/models/vc/vevo/config/Vq8192ToMels.json",
+            "PhoneToVq8192.json": "https://raw.githubusercontent.com/open-mmlab/Amphion/main/models/vc/vevo/config/PhoneToVq8192.json",
+            "Vocoder.json": "https://raw.githubusercontent.com/open-mmlab/Amphion/main/models/vc/vevo/config/Vocoder.json"
+        }
+        for filename, url in config_files.items():
+            target_path = f"./models/vc/vevo/config/{filename}"
+            if not os.path.exists(target_path):
+                try:
+                    hf_hub_download(repo_id="Amphion/Vevo-configs", filename=filename, repo_type="dataset", local_dir="./models/vc/vevo/config/")
+                except:
+                    # 如果从Hugging Face下载失败，创建一个占位符文件
+                    with open(target_path, 'w') as f:
+                        f.write('{}')
+                    print(f"无法下载配置文件 {filename}，已创建占位符。请手动添加配置。")
+    def init_voice_conversion_pipeline(self):
+        """初始化语音转换管道"""
+        if "voice" not in self.pipelines:
+            # 内容标记器
+            local_dir = snapshot_download(
+                repo_id=REPO_ID,
+                repo_type="model",
+                cache_dir=CACHE_DIR,
+                allow_patterns=["tokenizer/vq32/*"],
+            )
+            content_tokenizer_ckpt_path = os.path.join(
+                local_dir, "tokenizer/vq32/hubert_large_l18_c32.pkl"
+            )
+            # 内容-风格标记器
+            local_dir = snapshot_download(
+                repo_id=REPO_ID,
+                repo_type="model",
+                cache_dir=CACHE_DIR,
+                allow_patterns=["tokenizer/vq8192/*"],
+            )
+            content_style_tokenizer_ckpt_path = os.path.join(local_dir, "tokenizer/vq8192")
+            # 自回归变换器
+            local_dir = snapshot_download(
+                repo_id=REPO_ID,
+                repo_type="model",
+                cache_dir=CACHE_DIR,
+                allow_patterns=["contentstyle_modeling/Vq32ToVq8192/*"],
+            )
+            ar_ckpt_path = os.path.join(local_dir, "contentstyle_modeling/Vq32ToVq8192")
+            # 流匹配变换器
+            local_dir = snapshot_download(
+                repo_id=REPO_ID,
+                repo_type="model",
+                cache_dir=CACHE_DIR,
+                allow_patterns=["acoustic_modeling/Vq8192ToMels/*"],
+            )
+            fmt_ckpt_path = os.path.join(local_dir, "acoustic_modeling/Vq8192ToMels")
+            # 声码器
+            local_dir = snapshot_download(
+                repo_id=REPO_ID,
+                repo_type="model",
+                cache_dir=CACHE_DIR,
+                allow_patterns=["acoustic_modeling/Vocoder/*"],
+            )
+            vocoder_ckpt_path = os.path.join(local_dir, "acoustic_modeling/Vocoder")
+            # 创建推理管道
+            self.pipelines["voice"] = VevoInferencePipeline(
+                content_tokenizer_ckpt_path=content_tokenizer_ckpt_path,
+                content_style_tokenizer_ckpt_path=content_style_tokenizer_ckpt_path,
+                ar_cfg_path=self.config_paths["vq32tovq8192"],
+                ar_ckpt_path=ar_ckpt_path,
+                fmt_cfg_path=self.config_paths["vq8192tomels"],
+                fmt_ckpt_path=fmt_ckpt_path,
+                vocoder_cfg_path=self.config_paths["vocoder"],
+                vocoder_ckpt_path=vocoder_ckpt_path,
+                device=self.device,
+            )
+        return self.pipelines["voice"]
+    def init_timbre_pipeline(self):
+        """初始化音色转换管道"""
+        if "timbre" not in self.pipelines:
+            # 内容-风格标记器
+            local_dir = snapshot_download(
+                repo_id=REPO_ID,
+                repo_type="model",
+                cache_dir=CACHE_DIR,
+                allow_patterns=["tokenizer/vq8192/*"],
+            )
+            tokenizer_ckpt_path = os.path.join(local_dir, "tokenizer/vq8192")
+            # 流匹配变换器
+            local_dir = snapshot_download(
+                repo_id=REPO_ID,
+                repo_type="model",
+                cache_dir=CACHE_DIR,
+                allow_patterns=["acoustic_modeling/Vq8192ToMels/*"],
+            )
+            fmt_ckpt_path = os.path.join(local_dir, "acoustic_modeling/Vq8192ToMels")
+            # 声码器
+            local_dir = snapshot_download(
+                repo_id=REPO_ID,
+                repo_type="model",
+                cache_dir=CACHE_DIR,
+                allow_patterns=["acoustic_modeling/Vocoder/*"],
+            )
+            vocoder_ckpt_path = os.path.join(local_dir, "acoustic_modeling/Vocoder")
+            # 创建推理管道
+            self.pipelines["timbre"] = VevoInferencePipeline(
+                content_style_tokenizer_ckpt_path=tokenizer_ckpt_path,
+                fmt_cfg_path=self.config_paths["vq8192tomels"],
+                fmt_ckpt_path=fmt_ckpt_path,
+                vocoder_cfg_path=self.config_paths["vocoder"],
+                vocoder_ckpt_path=vocoder_ckpt_path,
+                device=self.device,
+            )
+        return self.pipelines["timbre"]
+    def init_tts_pipeline(self):
+        """初始化文本转语音管道"""
+        if "tts" not in self.pipelines:
+            # 内容-风格标记器
+            local_dir = snapshot_download(
+                repo_id=REPO_ID,
+                repo_type="model",
+                cache_dir=CACHE_DIR,
+                allow_patterns=["tokenizer/vq8192/*"],
+            )
+            content_style_tokenizer_ckpt_path = os.path.join(local_dir, "tokenizer/vq8192")
+            # 自回归变换器
+            local_dir = snapshot_download(
+                repo_id=REPO_ID,
+                repo_type="model",
+                cache_dir=CACHE_DIR,
+                allow_patterns=["contentstyle_modeling/PhoneToVq8192/*"],
+            )
+            ar_ckpt_path = os.path.join(local_dir, "contentstyle_modeling/PhoneToVq8192")
+            # 流匹配变换器
+            local_dir = snapshot_download(
+                repo_id=REPO_ID,
+                repo_type="model",
+                cache_dir=CACHE_DIR,
+                allow_patterns=["acoustic_modeling/Vq8192ToMels/*"],
+            )
+            fmt_ckpt_path = os.path.join(local_dir, "acoustic_modeling/Vq8192ToMels")
+            # 声码器
+            local_dir = snapshot_download(
+                repo_id=REPO_ID,
+                repo_type="model",
+                cache_dir=CACHE_DIR,
+                allow_patterns=["acoustic_modeling/Vocoder/*"],
+            )
+            vocoder_ckpt_path = os.path.join(local_dir, "acoustic_modeling/Vocoder")
+            # 创建推理管道
+            self.pipelines["tts"] = VevoInferencePipeline(
+                content_style_tokenizer_ckpt_path=content_style_tokenizer_ckpt_path,
+                ar_cfg_path=self.config_paths["phonetovq8192"],
+                ar_ckpt_path=ar_ckpt_path,
+                fmt_cfg_path=self.config_paths["vq8192tomels"],
+                fmt_ckpt_path=fmt_ckpt_path,
+                vocoder_cfg_path=self.config_paths["vocoder"],
+                vocoder_ckpt_path=vocoder_ckpt_path,
+                device=self.device,
+            )
+        return self.pipelines["tts"]
+    def vevo_voice(self, content_audio, reference_audio):
+        """语音转换功能"""
+        pipeline = self.init_voice_conversion_pipeline()
+        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as content_file, \
+             tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as reference_file, \
+             tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as output_file:
+            content_path = content_file.name
+            reference_path = reference_file.name
+            output_path = output_file.name
+            # 保存上传的音频文件
+            content_audio.save(content_path)
+            reference_audio.save(reference_path)
+            # 执行语音转换
+            gen_audio = pipeline.inference_ar_and_fm(
+                src_wav_path=content_path,
+                src_text=None,
+                style_ref_wav_path=reference_path,
+                timbre_ref_wav_path=reference_path,
+            )
+            save_audio(gen_audio, output_path=output_path)
+            return output_path
+    def vevo_style(self, content_audio, style_audio):
+        """风格转换功能"""
+        pipeline = self.init_voice_conversion_pipeline()
+        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as content_file, \
+             tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as style_file, \
+             tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as output_file:
+            content_path = content_file.name
+            style_path = style_file.name
+            output_path = output_file.name
+            # 保存上传的音频文件
+            content_audio.save(content_path)
+            style_audio.save(style_path)
+            # 执行风格转换
+            gen_audio = pipeline.inference_ar_and_fm(
+                src_wav_path=content_path,
+                src_text=None,
+                style_ref_wav_path=style_path,
+                timbre_ref_wav_path=content_path,
+            )
+            save_audio(gen_audio, output_path=output_path)
+            return output_path
+    def vevo_timbre(self, content_audio, reference_audio):
+        """音色转换功能"""
+        pipeline = self.init_timbre_pipeline()
+        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as content_file, \
+             tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as reference_file, \
+             tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as output_file:
+            content_path = content_file.name
+            reference_path = reference_file.name
+            output_path = output_file.name
+            # 保存上传的音频文件
+            content_audio.save(content_path)
+            reference_audio.save(reference_path)
+            # 执行音色转换
+            gen_audio = pipeline.inference_fm(
+                src_wav_path=content_path,
+                timbre_ref_wav_path=reference_path,
+                flow_matching_steps=32,
+            )
+            save_audio(gen_audio, output_path=output_path)
+            return output_path
+    def vevo_tts(self, text, ref_audio, src_language, ref_language, ref_text):
+        """文本转语音功能"""
+        pipeline = self.init_tts_pipeline()
+        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as ref_file, \
+             tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as output_file:
+            ref_path = ref_file.name
+            output_path = output_file.name
+            # 保存上传的音频文件
+            ref_audio.save(ref_path)
+            # 执行文本转语音
+            gen_audio = pipeline.inference_ar_and_fm(
+                src_wav_path=None,
+                src_text=text,
+                style_ref_wav_path=ref_path,
+                timbre_ref_wav_path=ref_path,
+                style_ref_wav_text=ref_text if ref_text else None,
+                src_text_language=src_language,
+                style_ref_wav_text_language=ref_language,
+            )
+            save_audio(gen_audio, output_path=output_path)
+            return output_path
+def create_interface():
+    app = VevoGradioApp()
+    with gr.Blocks(title="Vevo 语音转换演示") as demo:
+        gr.Markdown("# Vevo 语音转换模型演示")
+        gr.Markdown("Vevo是一个强大的语音转换模型，支持语音转换、风格转换、音色转换和文本转语音功能。")
+        with gr.Tab("语音转换"):
+            gr.Markdown("## 语音转换 (VevoVoice)")
+            gr.Markdown("将内容音频的内容转换为参考音频的风格和音色。")
+            with gr.Row():
+                content_audio_voice = gr.Audio(label="内容音频", type="filepath")
+                reference_audio_voice = gr.Audio(label="参考音频", type="filepath")
+            voice_btn = gr.Button("转换")
+            voice_output = gr.Audio(label="转换结果")
+            voice_btn.click(fn=app.vevo_voice, inputs=[content_audio_voice, reference_audio_voice], outputs=voice_output)
+        with gr.Tab("风格转换"):
+            gr.Markdown("## 风格转换 (VevoStyle)")
+            gr.Markdown("将内容音频的风格转换为参考音频的风格，保留原始音色。")
+            with gr.Row():
+                content_audio_style = gr.Audio(label="内容音频", type="filepath")
+                style_audio = gr.Audio(label="风格参考音频", type="filepath")
+            style_btn = gr.Button("转换")
+            style_output = gr.Audio(label="转换结果")
+            style_btn.click(fn=app.vevo_style, inputs=[content_audio_style, style_audio], outputs=style_output)
+        with gr.Tab("音色转换"):
+            gr.Markdown("## 音色转换 (VevoTimbre)")
+            gr.Markdown("将内容音频的音色转换为参考音频的音色，保留内容和风格。")
+            with gr.Row():
+                content_audio_timbre = gr.Audio(label="内容音频", type="filepath")
+                reference_audio_timbre = gr.Audio(label="音色参考音频", type="filepath")
+            timbre_btn = gr.Button("转换")
+            timbre_output = gr.Audio(label="转换结果")
+            timbre_btn.click(fn=app.vevo_timbre, inputs=[content_audio_timbre, reference_audio_timbre], outputs=timbre_output)
+        with gr.Tab("文本转语音"):
+            gr.Markdown("## 文本转语音 (VevoTTS)")
+            gr.Markdown("将输入文本转换为语音，使用参考音频的风格和音色。")
+            text_input = gr.Textbox(label="输入文本", lines=3)
+            with gr.Row():
+                ref_audio_tts = gr.Audio(label="参考音频", type="filepath")
+                src_language = gr.Dropdown(["en", "zh", "ja", "ko"], label="源文本语言", value="en")
+            with gr.Row():
+                ref_language = gr.Dropdown(["en", "zh", "ja", "ko"], label="参考文本语言", value="en")
+                ref_text = gr.Textbox(label="参考文本（可选）", lines=2)
+            tts_btn = gr.Button("生成")
+            tts_output = gr.Audio(label="生成结果")
+            tts_btn.click(fn=app.vevo_tts, inputs=[text_input, ref_audio_tts, src_language, ref_language, ref_text], outputs=tts_output)
+        gr.Markdown("## 关于")
+        gr.Markdown("本演示基于 [Vevo模型](https://huggingface.co/amphion/Vevo)，由[Amphion](https://github.com/open-mmlab/Amphion)开发。")
+    return demo
+# Script entry point: build the UI and start the Gradio server.
+if __name__ == "__main__":
+    # The Blocks object is bound to the module-level name `demo` before launching.
+    demo = create_interface()
+    demo.launch()

requirements.txt ADDED Viewed

	@@ -0,0 +1,12 @@

+gradio>=4.14.0
+huggingface_hub>=0.20.0
+torch>=2.0.0
+torchaudio>=2.0.0
+numpy>=1.23.0
+librosa>=0.10.0
+accelerate>=0.21.0
+# `soundfile` is the maintained distribution; `PySoundFile` is its obsolete former name.
+soundfile>=0.12.1
+safetensors>=0.4.0
+# `import yaml` is provided by the PyYAML distribution, not by a package named `yaml`.
+PyYAML>=6.0
+# OpenAI Whisper is published as `openai-whisper`; PyPI's `whisper` is an unrelated Graphite library.
+openai-whisper
+IPython>=8.0.0