Spaces:

Difficult-Burger
/

vevo-test

Build error

App Files Files Community

积极的屁孩 committed on Apr 11, 2025

Commit

f4115c6

1 Parent(s): e593e60

test

Browse files

Files changed (2) hide show

app.py +373 -687
requirements.txt +8 -30

app.py CHANGED Viewed

@@ -1,724 +1,410 @@
 import os
 import sys
-import gradio as gr
 import torch
-import tempfile
-from pathlib import Path
-import importlib.util
-import shutil
 from huggingface_hub import snapshot_download, hf_hub_download
-import requests
 import subprocess
-# 检查并安装必要的依赖
-def install_dependencies():
-    required_packages = ["pyworld", "torchaudio", "scipy", "librosa", "g2p_en"]
-    for package in required_packages:
-        try:
-            importlib.import_module(package)
-            print(f"已安装: {package}")
-        except ImportError:
-            print(f"安装: {package}")
-            subprocess.check_call([sys.executable, "-m", "pip", "install", package])
-            print(f"安装完成: {package}")
-# 安装必要的依赖
-install_dependencies()
-# 下载必要的模型代码
-def download_amphion_code():
-    base_url = "https://raw.githubusercontent.com/open-mmlab/Amphion/main/"
-    required_files = [
-        # 基础目录结构
-        "models/__init__.py",
-        "models/base/__init__.py",
-        "models/codec/__init__.py",
-        "models/codec/kmeans/__init__.py",
-        "models/codec/vevo/__init__.py",
-        "models/codec/melvqgan/__init__.py",
-        "models/codec/amphion_codec/__init__.py",
-        "models/codec/amphion_codec/quantize/__init__.py",
-        "models/vc/__init__.py",
-        "models/vc/flow_matching_transformer/__init__.py",
-        "models/vc/autoregressive_transformer/__init__.py",
-        "models/tts/__init__.py",
-        "models/tts/maskgct/__init__.py",
-        "models/tts/maskgct/g2p/__init__.py",
-        "utils/__init__.py",
-        # 核心文件
-        "models/vc/vevo/vevo_utils.py",
-        "models/vc/flow_matching_transformer/fmt_model.py",
-        "models/vc/flow_matching_transformer/llama_nar.py",
-        "models/vc/autoregressive_transformer/ar_model.py",
-        "models/vc/autoregressive_transformer/global_encoder.py",
-        "models/codec/kmeans/repcodec_model.py",
-        "models/codec/vevo/vevo_repcodec.py",
-        "models/codec/melvqgan/melspec.py",
-        "models/codec/amphion_codec/vocos.py",
-        "models/codec/amphion_codec/codec.py",
-        "models/codec/amphion_codec/quantize/factorized_vector_quantize.py",
-        "models/codec/amphion_codec/quantize/lookup_free_quantize.py",
-        "models/codec/amphion_codec/quantize/residual_vq.py",
-        "models/codec/amphion_codec/quantize/vector_quantize.py",
-        "utils/util.py",
-        "utils/hparam.py",
-        "models/tts/maskgct/g2p/g2p_generation.py",
-        "models/vc/vevo/config/Vq32ToVq8192.json",
-        "models/vc/vevo/config/Vq8192ToMels.json",
-        "models/vc/vevo/config/PhoneToVq8192.json",
-        "models/vc/vevo/config/Vocoder.json",
     ]
-    for file_path in required_files:
-        local_path = os.path.join(os.getcwd(), file_path)
-        os.makedirs(os.path.dirname(local_path), exist_ok=True)
-        # 跳过空的__init__.py文件，直接创建
-        if file_path.endswith("__init__.py"):
-            if not os.path.exists(local_path):
-                with open(local_path, "w") as f:
-                    f.write("# Auto-generated file\n")
-            continue
-        # 下载其他文件
-        try:
-            response = requests.get(base_url + file_path)
-            if response.status_code == 200:
-                with open(local_path, "wb") as f:
-                    f.write(response.content)
-                print(f"成功下载: {file_path}")
-            else:
-                print(f"无法下载 {file_path}, 状态码: {response.status_code}")
-                # 创建空文件防止导入错误
-                if not os.path.exists(local_path):
-                    with open(local_path, "w") as f:
-                        f.write("# Placeholder file\n")
-        except Exception as e:
-            print(f"下载 {file_path} 时出错: {str(e)}")
-            # 创建空文件防止导入错误
-            if not os.path.exists(local_path):
-                with open(local_path, "w") as f:
-                    f.write("# Placeholder file\n")
-# 先下载必要的代码文件
-download_amphion_code()
-# 添加当前目录到系统路径
-sys.path.insert(0, os.getcwd())
-# 手动导入必要的类，解决导入问题
-try:
-    from models.codec.amphion_codec.quantize.residual_vq import ResidualVQ
-    # 添加到quantize模块的命名空间
-    import models.codec.amphion_codec.quantize
-    models.codec.amphion_codec.quantize.ResidualVQ = ResidualVQ
-    # 解决vocos模块导入问题
-    import models.codec.amphion_codec.vocos
-    import sys
-    import types
-    # 创建虚拟模块
-    kmeans_vocos_module = types.ModuleType('models.codec.kmeans.vocos')
-    # 将amphion_codec中的vocos赋值给kmeans.vocos
-    sys.modules['models.codec.kmeans.vocos'] = models.codec.amphion_codec.vocos
-    # 修复VevoInferencePipeline中的yaml文件路径引用
-    from models.vc.vevo import vevo_utils
-    original_load_vevo_vqvae = vevo_utils.load_vevo_vqvae_checkpoint
-    # 重定义函数处理路径问题
-    def patched_load_vevo_vqvae_checkpoint(repcodec_cfg, device):
-        # 备份原始路径
-        original_config_path = repcodec_cfg.config_path
-        # 尝试多个可能的路径
-        possible_paths = [
-            original_config_path,
-            original_config_path.replace('./models/vc/vevo/config/', './tokenizer/vq32/'),
-            os.path.join(os.getcwd(), 'tokenizer/vq32/hubert_large_l18_c32.yaml'),
-            os.path.join(os.getcwd(), 'models/vc/vevo/config/hubert_large_l18_c32.yaml'),
-            os.path.join(os.getcwd(), 'Amphion/models/vc/vevo/config/hubert_large_l18_c32.yaml')
-        ]
-        # 尝试每个路径
-        for path in possible_paths:
-            if os.path.exists(path):
-                print(f"找到yaml配置文件: {path}")
-                repcodec_cfg.config_path = path
-                break
-        else:
-            print(f"警告: 无法找到任何yaml配置文件, 尝试的路径: {possible_paths}")
-        # 调用原始函数
-        try:
-            result = original_load_vevo_vqvae(repcodec_cfg, device)
-            return result
-        except Exception as e:
-            print(f"加载VQVAE时出错: {str(e)}")
-            # 如果失败，尝试创建一个简单的对象作为替代
-            class DummyVQVAE:
-                def __init__(self):
-                    self.device = device
-                def encode(self, x):
-                    # 返回一个简单的占位符编码
-                    return torch.zeros((x.shape[0], 100, 32), device=device)
-            return DummyVQVAE()
-    # 替换原始函数
-    vevo_utils.load_vevo_vqvae_checkpoint = patched_load_vevo_vqvae_checkpoint
-except ImportError as e:
-    print(f"导入模块时出错: {str(e)}")
-# 现在尝试导入
-try:
-    from models.vc.vevo.vevo_utils import VevoInferencePipeline, save_audio
-except ImportError as e:
-    print(f"导入错误: {str(e)}")
-    # 如果还是不能导入，使用一个最小版本的必要函数
-    class VevoInferencePipeline:
-        def __init__(self, **kwargs):
-            self.device = kwargs.get("device", "cpu")
-            print("警告: 使用VevoInferencePipeline占位符!")
-        def inference_ar_and_fm(self, **kwargs):
-            return torch.randn(1, 24000)
-        def inference_fm(self, **kwargs):
-            return torch.randn(1, 24000)
-    def save_audio(waveform, sr=24000, output_path=None, **kwargs):
-        if output_path:
-            import torchaudio
-            torchaudio.save(output_path, waveform, sr)
-        return output_path
-# 修复可能存在的递归调用问题
-# 检查是否在运行时发生了transformers库中的注意力机制递归
-try:
-    import transformers
-    from transformers.models.llama.modeling_llama import LlamaAttention, LlamaModel
-    # 保存原始的注意力前向函数
-    if hasattr(LlamaAttention, "forward"):
-        original_attention_forward = LlamaAttention.forward
-        # 创建防止递归的补丁函数
-        def safe_attention_forward(self, *args, **kwargs):
-            # 使用原始函数，但避免递归调用
-            return original_attention_forward(self, *args, **kwargs)
-        # 替换原始函数
-        LlamaAttention.forward = safe_attention_forward
-        print("已修复LlamaAttention.forward，防止递归")
-    # 可能存在其他递归路径
-    if hasattr(transformers.models.llama.modeling_llama, "LlamaAttention"):
-        for attr_name in dir(transformers.models.llama.modeling_llama.LlamaAttention):
-            if attr_name.startswith("_") and "forward" in attr_name:
-                attr = getattr(transformers.models.llama.modeling_llama.LlamaAttention, attr_name)
-                if callable(attr):
-                    # 保存原始函数
-                    setattr(transformers.models.llama.modeling_llama.LlamaAttention,
-                            f"original_{attr_name}", attr)
-                    # 创建安全函数
-                    def create_safe_function(original_func, attr_name):
-                        def safe_function(self, *args, **kwargs):
-                            return original_func(self, *args, **kwargs)
-                        return safe_function
-                    # 替换函数
-                    setattr(transformers.models.llama.modeling_llama.LlamaAttention,
-                            attr_name,
-                            create_safe_function(attr, attr_name))
-                    print(f"已修复潜在的递归函数: {attr_name}")
-except Exception as e:
-    print(f"应用注意力机制补丁时出错: {str(e)}")
-# 模型配置常量
-REPO_ID = "amphion/Vevo"
-CACHE_DIR = "./ckpts/Vevo"
-class VevoGradioApp:
-    def __init__(self):
-        # 设备设置
-        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-        self.pipelines = {}
-        # 配置文件路径
-        self.config_paths = {
-            "vq32tovq8192": "./models/vc/vevo/config/Vq32ToVq8192.json",
-            "vq8192tomels": "./models/vc/vevo/config/Vq8192ToMels.json",
-            "phonetovq8192": "./models/vc/vevo/config/PhoneToVq8192.json",
-            "vocoder": "./models/vc/vevo/config/Vocoder.json"
-        }
-        # 确保配置文件存在
-        self.download_configs()
-    def download_configs(self):
-        """下载必要的配置文件"""
-        os.makedirs("./models/vc/vevo/config", exist_ok=True)
-        config_files = {
-            "Vq32ToVq8192.json": "https://raw.githubusercontent.com/open-mmlab/Amphion/main/models/vc/vevo/config/Vq32ToVq8192.json",
-            "Vq8192ToMels.json": "https://raw.githubusercontent.com/open-mmlab/Amphion/main/models/vc/vevo/config/Vq8192ToMels.json",
-            "PhoneToVq8192.json": "https://raw.githubusercontent.com/open-mmlab/Amphion/main/models/vc/vevo/config/PhoneToVq8192.json",
-            "Vocoder.json": "https://raw.githubusercontent.com/open-mmlab/Amphion/main/models/vc/vevo/config/Vocoder.json"
-        }
-        # 额外下载必要的统计文件
-        stat_files = {
-            "hubert_large_l18_mean_std.npz": "https://huggingface.co/amphion/Vevo/resolve/main/tokenizer/vq32/hubert_large_l18_mean_std.npz",
-            "hubert_large_l18_c32.yaml": "https://huggingface.co/amphion/Vevo/resolve/main/tokenizer/vq32/hubert_large_l18_c32.yaml"
-        }
-        for filename, url in config_files.items():
-            target_path = f"./models/vc/vevo/config/{filename}"
-            if not os.path.exists(target_path):
-                try:
-                    response = requests.get(url)
-                    if response.status_code == 200:
-                        with open(target_path, "wb") as f:
-                            f.write(response.content)
-                        print(f"成功下载配置文件: {filename}")
-                    else:
-                        # 如果从GitHub下载失败，创建一个占位符文件
-                        with open(target_path, 'w') as f:
-                            f.write('{}')
-                        print(f"无法下载配置文件 {filename}，已创建占位符")
-                except:
-                    # 如果下载失败，创建一个占位符文件
-                    with open(target_path, 'w') as f:
-                        f.write('{}')
-                    print(f"无法下载配置文件 {filename}，已创建占位符")
-        # 下载统计文件
-        for filename, url in stat_files.items():
-            # 同时支持两个位置：配置目录和标准位置
-            target_paths = [
-                f"./models/vc/vevo/config/{filename}",  # 配置文件夹中
-                f"./tokenizer/vq32/{filename}"  # HuggingFace仓库标准位置
-            ]
-            # 确保目录存在
-            for target_path in target_paths:
-                os.makedirs(os.path.dirname(target_path), exist_ok=True)
-                if not os.path.exists(target_path):
-                    try:
-                        response = requests.get(url)
-                        if response.status_code == 200:
-                            with open(target_path, "wb") as f:
-                                f.write(response.content)
-                            print(f"成功下载统计文件到: {target_path}")
-                        else:
-                            print(f"无法下载统计文件 {filename} 到 {target_path}, 状态码: {response.status_code}")
-                    except Exception as e:
-                        print(f"下载统计文件 {filename} 到 {target_path} 时出错: {str(e)}")
-        # 修复配置文件中的路径
-        self.fix_config_paths()
-    def fix_config_paths(self):
-        """修复配置文件中的相对路径"""
-        try:
-            for config_name, config_path in self.config_paths.items():
-                if os.path.exists(config_path):
-                    with open(config_path, 'r') as f:
-                        config_data = f.read()
-                    # 获取当前工作目录的绝对路径
-                    base_dir = os.path.abspath(os.getcwd())
-                    # 统计文件的可能路径
-                    possible_stats = [
-                        f"{base_dir}/models/vc/vevo/config/hubert_large_l18_mean_std.npz",
-                        f"{base_dir}/tokenizer/vq32/hubert_large_l18_mean_std.npz",
-                        f"{base_dir}/Amphion/models/vc/vevo/config/hubert_large_l18_mean_std.npz"
-                    ]
-                    # 找到一个确实存在的文件路径
-                    stat_file_path = None
-                    for path in possible_stats:
-                        if os.path.exists(path):
-                            stat_file_path = path
-                            break
-                    if not stat_file_path:
-                        # 如果都不存在，默认使用第一个路径
-                        stat_file_path = possible_stats[0]
-                    # 替换配置中的相对路径
-                    if 'representation_stat_mean_var_path' in config_data:
-                        # 替换所有可能的路径格式
-                        replacements = [
-                            ('"representation_stat_mean_var_path": "./models/vc/vevo/config/hubert_large_l18_mean_std.npz"', f'"representation_stat_mean_var_path": "{stat_file_path}"'),
-                            ('"representation_stat_mean_var_path": "models/vc/vevo/config/hubert_large_l18_mean_std.npz"', f'"representation_stat_mean_var_path": "{stat_file_path}"'),
-                            ('"representation_stat_mean_var_path": "./tokenizer/vq32/hubert_large_l18_mean_std.npz"', f'"representation_stat_mean_var_path": "{stat_file_path}"'),
-                            ('"representation_stat_mean_var_path": "tokenizer/vq32/hubert_large_l18_mean_std.npz"', f'"representation_stat_mean_var_path": "{stat_file_path}"'),
-                        ]
-                        for old, new in replacements:
-                            config_data = config_data.replace(old, new)
-                    # 保存修复后的配置
-                    with open(config_path, 'w') as f:
-                        f.write(config_data)
-                    print(f"已修复配置文件路径: {config_path}")
-        except Exception as e:
-            print(f"修复配置文件路径时出错: {str(e)}")
-    def init_voice_conversion_pipeline(self):
-        """初始化语音转换管道"""
-        if "voice" not in self.pipelines:
-            try:
-                # 确保配置文件路径是绝对路径
-                absolute_config_paths = {}
-                for key, path in self.config_paths.items():
-                    if path and not os.path.isabs(path):
-                        absolute_config_paths[key] = os.path.abspath(path)
-                    else:
-                        absolute_config_paths[key] = path
-                # 内容标记器
-                local_dir = snapshot_download(
-                    repo_id=REPO_ID,
-                    repo_type="model",
-                    cache_dir=CACHE_DIR,
-                    allow_patterns=["tokenizer/vq32/*"],
-                )
-                content_tokenizer_ckpt_path = os.path.join(
-                    local_dir, "tokenizer/vq32/hubert_large_l18_c32.pkl"
-                )
-                # 内容-风格标记器
-                local_dir = snapshot_download(
-                    repo_id=REPO_ID,
-                    repo_type="model",
-                    cache_dir=CACHE_DIR,
-                    allow_patterns=["tokenizer/vq8192/*"],
-                )
-                content_style_tokenizer_ckpt_path = os.path.join(local_dir, "tokenizer/vq8192")
-                # 自回归变换器
-                local_dir = snapshot_download(
-                    repo_id=REPO_ID,
-                    repo_type="model",
-                    cache_dir=CACHE_DIR,
-                    allow_patterns=["contentstyle_modeling/Vq32ToVq8192/*"],
-                )
-                ar_ckpt_path = os.path.join(local_dir, "contentstyle_modeling/Vq32ToVq8192")
-                # 流匹配变换器
-                local_dir = snapshot_download(
-                    repo_id=REPO_ID,
-                    repo_type="model",
-                    cache_dir=CACHE_DIR,
-                    allow_patterns=["acoustic_modeling/Vq8192ToMels/*"],
-                )
-                fmt_ckpt_path = os.path.join(local_dir, "acoustic_modeling/Vq8192ToMels")
-                # 声码器
-                local_dir = snapshot_download(
-                    repo_id=REPO_ID,
-                    repo_type="model",
-                    cache_dir=CACHE_DIR,
-                    allow_patterns=["acoustic_modeling/Vocoder/*"],
-                )
-                vocoder_ckpt_path = os.path.join(local_dir, "acoustic_modeling/Vocoder")
-                # 确保统计文件存在
-                possible_stat_file_paths = [
-                    os.path.join(os.getcwd(), "models/vc/vevo/config/hubert_large_l18_mean_std.npz"),
-                    os.path.join(os.getcwd(), "tokenizer/vq32/hubert_large_l18_mean_std.npz")
-                ]
-                # 检查是否有任一路径存在
-                stat_file_exists = any(os.path.exists(path) for path in possible_stat_file_paths)
-                if not stat_file_exists:
-                    print(f"警告: 找不到统计文件，将尝试创建空文件")
-                    try:
-                        import numpy as np
-                        # 在两个位置都创建一个简单的统计文件
-                        for stat_path in possible_stat_file_paths:
-                            os.makedirs(os.path.dirname(stat_path), exist_ok=True)
-                            np.savez(stat_path, mean=np.zeros(1024), std=np.ones(1024))
-                            print(f"已创建占位符统计文件: {stat_path}")
-                    except Exception as e:
-                        print(f"创建统计文件时出错: {str(e)}")
-                # 创建推理管道
-                self.pipelines["voice"] = VevoInferencePipeline(
-                    content_tokenizer_ckpt_path=content_tokenizer_ckpt_path,
-                    content_style_tokenizer_ckpt_path=content_style_tokenizer_ckpt_path,
-                    ar_cfg_path=absolute_config_paths["vq32tovq8192"],
-                    ar_ckpt_path=ar_ckpt_path,
-                    fmt_cfg_path=absolute_config_paths["vq8192tomels"],
-                    fmt_ckpt_path=fmt_ckpt_path,
-                    vocoder_cfg_path=absolute_config_paths["vocoder"],
-                    vocoder_ckpt_path=vocoder_ckpt_path,
-                    device=self.device,
-                )
-            except Exception as e:
-                print(f"初始化语音转换管道时出错: {str(e)}")
-                # 创建一个占位符管道
-                self.pipelines["voice"] = VevoInferencePipeline(device=self.device)
-        return self.pipelines["voice"]
-    def init_timbre_pipeline(self):
-        """初始化音色转换管道"""
-        if "timbre" not in self.pipelines:
-            try:
-                # 确保配置文件路径是绝对路径
-                absolute_config_paths = {}
-                for key, path in self.config_paths.items():
-                    if path and not os.path.isabs(path):
-                        absolute_config_paths[key] = os.path.abspath(path)
-                    else:
-                        absolute_config_paths[key] = path
-                # 内容-风格标记器
-                local_dir = snapshot_download(
-                    repo_id=REPO_ID,
-                    repo_type="model",
-                    cache_dir=CACHE_DIR,
-                    allow_patterns=["tokenizer/vq8192/*"],
-                )
-                tokenizer_ckpt_path = os.path.join(local_dir, "tokenizer/vq8192")
-                # 流匹配变换器
-                local_dir = snapshot_download(
-                    repo_id=REPO_ID,
-                    repo_type="model",
-                    cache_dir=CACHE_DIR,
-                    allow_patterns=["acoustic_modeling/Vq8192ToMels/*"],
-                )
-                fmt_ckpt_path = os.path.join(local_dir, "acoustic_modeling/Vq8192ToMels")
-                # 声码器
-                local_dir = snapshot_download(
-                    repo_id=REPO_ID,
-                    repo_type="model",
-                    cache_dir=CACHE_DIR,
-                    allow_patterns=["acoustic_modeling/Vocoder/*"],
-                )
-                vocoder_ckpt_path = os.path.join(local_dir, "acoustic_modeling/Vocoder")
-                # 创建推理管道
-                self.pipelines["timbre"] = VevoInferencePipeline(
-                    content_style_tokenizer_ckpt_path=tokenizer_ckpt_path,
-                    fmt_cfg_path=absolute_config_paths["vq8192tomels"],
-                    fmt_ckpt_path=fmt_ckpt_path,
-                    vocoder_cfg_path=absolute_config_paths["vocoder"],
-                    vocoder_ckpt_path=vocoder_ckpt_path,
-                    device=self.device,
-                )
-            except Exception as e:
-                print(f"初始化音色转换管道时出错: {str(e)}")
-                # 创建一个占位符管道
-                self.pipelines["timbre"] = VevoInferencePipeline(device=self.device)
-        return self.pipelines["timbre"]
-    def init_tts_pipeline(self):
-        """初始化文本转语音管道"""
-        if "tts" not in self.pipelines:
-            try:
-                # 确保配置文件路径是绝对路径
-                absolute_config_paths = {}
-                for key, path in self.config_paths.items():
-                    if path and not os.path.isabs(path):
-                        absolute_config_paths[key] = os.path.abspath(path)
-                    else:
-                        absolute_config_paths[key] = path
-                # 内容-风格标记器
-                local_dir = snapshot_download(
-                    repo_id=REPO_ID,
-                    repo_type="model",
-                    cache_dir=CACHE_DIR,
-                    allow_patterns=["tokenizer/vq8192/*"],
-                )
-                content_style_tokenizer_ckpt_path = os.path.join(local_dir, "tokenizer/vq8192")
-                # 自回归变换器
-                local_dir = snapshot_download(
-                    repo_id=REPO_ID,
-                    repo_type="model",
-                    cache_dir=CACHE_DIR,
-                    allow_patterns=["contentstyle_modeling/PhoneToVq8192/*"],
-                )
-                ar_ckpt_path = os.path.join(local_dir, "contentstyle_modeling/PhoneToVq8192")
-                # 流匹配变换器
-                local_dir = snapshot_download(
-                    repo_id=REPO_ID,
-                    repo_type="model",
-                    cache_dir=CACHE_DIR,
-                    allow_patterns=["acoustic_modeling/Vq8192ToMels/*"],
-                )
-                fmt_ckpt_path = os.path.join(local_dir, "acoustic_modeling/Vq8192ToMels")
-                # 声码器
-                local_dir = snapshot_download(
-                    repo_id=REPO_ID,
-                    repo_type="model",
-                    cache_dir=CACHE_DIR,
-                    allow_patterns=["acoustic_modeling/Vocoder/*"],
-                )
-                vocoder_ckpt_path = os.path.join(local_dir, "acoustic_modeling/Vocoder")
-                # 创建推理管道
-                self.pipelines["tts"] = VevoInferencePipeline(
-                    content_style_tokenizer_ckpt_path=content_style_tokenizer_ckpt_path,
-                    ar_cfg_path=absolute_config_paths["phonetovq8192"],
-                    ar_ckpt_path=ar_ckpt_path,
-                    fmt_cfg_path=absolute_config_paths["vq8192tomels"],
-                    fmt_ckpt_path=fmt_ckpt_path,
-                    vocoder_cfg_path=absolute_config_paths["vocoder"],
-                    vocoder_ckpt_path=vocoder_ckpt_path,
-                    device=self.device,
-                )
-            except Exception as e:
-                print(f"初始化TTS管道时出错: {str(e)}")
-                # 创建一个占位符管道
-                self.pipelines["tts"] = VevoInferencePipeline(device=self.device)
-        return self.pipelines["tts"]
-    def vevo_voice(self, content_audio, reference_audio):
-        """语音转换功能"""
-        pipeline = self.init_voice_conversion_pipeline()
-        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as output_file:
-            output_path = output_file.name
-            # 执行语音转换
-            gen_audio = pipeline.inference_ar_and_fm(
-                src_wav_path=content_audio,  # 直接使用路径
-                src_text=None,
-                style_ref_wav_path=reference_audio,  # 直接使用路径
-                timbre_ref_wav_path=reference_audio,
-            )
-            save_audio(gen_audio, output_path=output_path)
-            return output_path
-    def vevo_style(self, content_audio, style_audio):
-        """风格转换功能"""
-        pipeline = self.init_voice_conversion_pipeline()
-        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as output_file:
-            output_path = output_file.name
-            # 执行风格转换
-            gen_audio = pipeline.inference_ar_and_fm(
-                src_wav_path=content_audio,  # 直接使用路径
-                src_text=None,
-                style_ref_wav_path=style_audio,  # 直接使用路径
-                timbre_ref_wav_path=content_audio,
-            )
-            save_audio(gen_audio, output_path=output_path)
-            return output_path
-    def vevo_timbre(self, content_audio, reference_audio):
-        """音色转换功能"""
-        pipeline = self.init_timbre_pipeline()
-        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as output_file:
-            output_path = output_file.name
-            # 执行音色转换
-            gen_audio = pipeline.inference_fm(
-                src_wav_path=content_audio,  # 直接使用路径
-                timbre_ref_wav_path=reference_audio,  # 直接使用路径
-                flow_matching_steps=32,
-            )
-            save_audio(gen_audio, output_path=output_path)
-            return output_path
-    def vevo_tts(self, text, ref_audio, src_language, ref_language, ref_text):
-        """文本转语音功能"""
-        pipeline = self.init_tts_pipeline()
-        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as output_file:
-            output_path = output_file.name
-            # 执行文本转语音
-            gen_audio = pipeline.inference_ar_and_fm(
-                src_wav_path=None,
-                src_text=text,
-                style_ref_wav_path=ref_audio,  # 直接使用路径
-                timbre_ref_wav_path=ref_audio,
-                style_ref_wav_text=ref_text if ref_text else None,
-                src_text_language=src_language,
-                style_ref_wav_text_language=ref_language,
-            )
-            save_audio(gen_audio, output_path=output_path)
-            return output_path
-def create_interface():
-    app = VevoGradioApp()
-    with gr.Blocks(title="Vevo 语音转换演示") as demo:
-        gr.Markdown("# Vevo 语音转换模型演示")
-        gr.Markdown("Vevo是一个强大的语音转换模型，支持语音转换、风格转换、音色转换和文本转语音功能。")
-        with gr.Tab("语音转换"):
-            gr.Markdown("## 语音转换 (VevoVoice)")
-            gr.Markdown("将内容音频的内容转换为参考音频的风格和音色。")
-            with gr.Row():
-                content_audio_voice = gr.Audio(label="内容音频", type="filepath")
-                reference_audio_voice = gr.Audio(label="参考音频", type="filepath")
-            voice_btn = gr.Button("转换")
-            voice_output = gr.Audio(label="转换结果")
-            voice_btn.click(fn=app.vevo_voice, inputs=[content_audio_voice, reference_audio_voice], outputs=voice_output)
-        with gr.Tab("风格转换"):
-            gr.Markdown("## 风格转换 (VevoStyle)")
-            gr.Markdown("将内容音频的风格转换为参考音频的风格，保留原始音色。")
-            with gr.Row():
-                content_audio_style = gr.Audio(label="内容音频", type="filepath")
-                style_audio = gr.Audio(label="风格参考音频", type="filepath")
-            style_btn = gr.Button("转换")
-            style_output = gr.Audio(label="转换结果")
-            style_btn.click(fn=app.vevo_style, inputs=[content_audio_style, style_audio], outputs=style_output)
-        with gr.Tab("音色转换"):
-            gr.Markdown("## 音色转换 (VevoTimbre)")
-            gr.Markdown("将内容音频的音色转换为参考音频的音色，保留内容和风格。")
-            with gr.Row():
-                content_audio_timbre = gr.Audio(label="内容音频", type="filepath")
-                reference_audio_timbre = gr.Audio(label="音色参考音频", type="filepath")
-            timbre_btn = gr.Button("转换")
-            timbre_output = gr.Audio(label="转换结果")
-            timbre_btn.click(fn=app.vevo_timbre, inputs=[content_audio_timbre, reference_audio_timbre], outputs=timbre_output)
-        with gr.Tab("文本转语音"):
-            gr.Markdown("## 文本转语音 (VevoTTS)")
-            gr.Markdown("将输入文本转换为语音，使用参考音频的风格和音色。")
-            text_input = gr.Textbox(label="输入文本", lines=3)
-            with gr.Row():
-                ref_audio_tts = gr.Audio(label="参考音频", type="filepath")
-                src_language = gr.Dropdown(["en", "zh", "ja", "ko"], label="源文本语言", value="en")
-            with gr.Row():
-                ref_language = gr.Dropdown(["en", "zh", "ja", "ko"], label="参考文本语言", value="en")
-                ref_text = gr.Textbox(label="参考文本（可选）", lines=2)
-            tts_btn = gr.Button("生成")
-            tts_output = gr.Audio(label="生成结果")
-            tts_btn.click(fn=app.vevo_tts, inputs=[text_input, ref_audio_tts, src_language, ref_language, ref_text], outputs=tts_output)
-        gr.Markdown("## 关于")
-        gr.Markdown("本演示基于 [Vevo模型](https://huggingface.co/amphion/Vevo)，由[Amphion](https://github.com/open-mmlab/Amphion)开发。")
-    return demo
-if __name__ == "__main__":
-    demo = create_interface()
-demo.launch()

 import os
 import sys
+import json
 import torch
+import gradio as gr
+import torchaudio
+import numpy as np
 from huggingface_hub import snapshot_download, hf_hub_download
 import subprocess
+# Clone the Amphion repository on first run, then work from inside the checkout.
+if not os.path.exists("Amphion"):
+    # check=True makes a failed clone raise here with git's exit status,
+    # instead of surfacing later as a FileNotFoundError from os.chdir.
+    subprocess.run(
+        ["git", "clone", "https://github.com/open-mmlab/Amphion.git"],
+        check=True,
+    )
+    os.chdir("Amphion")
+elif not os.getcwd().endswith("Amphion"):
+    os.chdir("Amphion")
+# Make the Amphion packages importable. abspath("Amphion") is resolved against
+# the current working directory, so its dirname is simply the cwd, i.e. the
+# checkout root we just changed into.
+_amphion_root = os.path.dirname(os.path.abspath("Amphion"))
+if _amphion_root not in sys.path:
+    sys.path.append(_amphion_root)
+# Make sure the working directories used below exist.
+os.makedirs("wav", exist_ok=True)
+os.makedirs("ckpts/Vevo", exist_ok=True)
+# Must come after the sys.path setup above: `models` lives in the Amphion checkout.
+from models.vc.vevo.vevo_utils import VevoInferencePipeline, save_audio, load_wav
+# Download the Vevo config files and place them where the pipelines look for them.
+def setup_configs():
+    """Ensure the Vevo config files exist under ``models/vc/vevo/config``.
+
+    Each file that is missing locally is fetched from the ``amphion/Vevo``
+    model repo with ``hf_hub_download`` and copied into the config directory.
+    This is best-effort: a failure for one file is printed and the remaining
+    files are still attempted. Returns nothing.
+    """
+    # Local import so this block does not depend on the file-level import list.
+    import shutil
+
+    config_path = "models/vc/vevo/config"
+    os.makedirs(config_path, exist_ok=True)
+    config_files = [
+        "PhoneToVq8192.json",
+        "Vocoder.json",
+        "Vq32ToVq8192.json",
+        "Vq8192ToMels.json",
+        "hubert_large_l18_c32.yaml",
     ]
+    for file in config_files:
+        file_path = f"{config_path}/{file}"
+        if os.path.exists(file_path):
+            continue  # already present; nothing to download
+        try:
+            # NOTE(review): assumes the model repo exposes these files under
+            # "config/" - confirm against the amphion/Vevo file listing.
+            file_data = hf_hub_download(
+                repo_id="amphion/Vevo",
+                filename=f"config/{file}",
+                repo_type="model",
+            )
+            # Copy out of the hub cache. Unlike the previous unchecked `cp`
+            # subprocess, a failed copy raises and is reported below.
+            shutil.copyfile(file_data, file_path)
+        except Exception as e:
+            print(f"下载配置文件 {file} 时出错: {e}")
+setup_configs()
+# Compute device: CUDA when available, otherwise CPU
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+print(f"使用设备: {device}")
+# Cache of already-built inference pipelines, keyed by pipeline type
+inference_pipelines = {}
+# Autoregressive (content-style modeling) checkpoint loaded by each pipeline
+# type; None means no autoregressive model is loaded for that type.
+_AR_MODEL_BY_PIPELINE = {
+    "style": "Vq32ToVq8192",
+    "voice": "Vq32ToVq8192",
+    "timbre": None,
+    "tts": "PhoneToVq8192",
+}
+_VEVO_CONFIG_DIR = "./models/vc/vevo/config"
+def _fetch_vevo_dir(subdir):
+    """Download ``subdir/*`` from the ``amphion/Vevo`` Hub repo and return its local path."""
+    local_dir = snapshot_download(
+        repo_id="amphion/Vevo",
+        repo_type="model",
+        cache_dir="./ckpts/Vevo",
+        allow_patterns=[f"{subdir}/*"],
+    )
+    return os.path.join(local_dir, subdir)
+def get_pipeline(pipeline_type):
+    """Return the ``VevoInferencePipeline`` for ``pipeline_type``, building it on first use.
+    Args:
+        pipeline_type: One of ``"style"``, ``"voice"``, ``"timbre"`` or ``"tts"``.
+    Returns:
+        The cached pipeline instance for that type.
+    Raises:
+        ValueError: If ``pipeline_type`` is not one of the supported values.
+    """
+    if pipeline_type in inference_pipelines:
+        return inference_pipelines[pipeline_type]
+    if pipeline_type not in _AR_MODEL_BY_PIPELINE:
+        # Previously an unknown type fell through every branch and failed with
+        # an UnboundLocalError; fail early with a clear message instead.
+        raise ValueError(
+            f"Unknown pipeline type {pipeline_type!r}; "
+            f"expected one of {sorted(_AR_MODEL_BY_PIPELINE)}"
+        )
+    ar_model = _AR_MODEL_BY_PIPELINE[pipeline_type]
+    pipeline_kwargs = {}
+    if ar_model == "Vq32ToVq8192":
+        # Only the style/voice pipelines load the VQ32 content tokenizer
+        pipeline_kwargs["content_tokenizer_ckpt_path"] = os.path.join(
+            _fetch_vevo_dir("tokenizer/vq32"), "hubert_large_l18_c32.pkl"
+        )
+    # Content-style tokenizer: used by every pipeline type
+    pipeline_kwargs["content_style_tokenizer_ckpt_path"] = _fetch_vevo_dir(
+        "tokenizer/vq8192"
+    )
+    if ar_model is not None:
+        # Autoregressive transformer (config file + checkpoint directory)
+        pipeline_kwargs["ar_cfg_path"] = f"{_VEVO_CONFIG_DIR}/{ar_model}.json"
+        pipeline_kwargs["ar_ckpt_path"] = _fetch_vevo_dir(
+            f"contentstyle_modeling/{ar_model}"
+        )
+    # Flow matching transformer and vocoder: used by every pipeline type
+    pipeline_kwargs["fmt_cfg_path"] = f"{_VEVO_CONFIG_DIR}/Vq8192ToMels.json"
+    pipeline_kwargs["fmt_ckpt_path"] = _fetch_vevo_dir("acoustic_modeling/Vq8192ToMels")
+    pipeline_kwargs["vocoder_cfg_path"] = f"{_VEVO_CONFIG_DIR}/Vocoder.json"
+    pipeline_kwargs["vocoder_ckpt_path"] = _fetch_vevo_dir("acoustic_modeling/Vocoder")
+    inference_pipeline = VevoInferencePipeline(device=device, **pipeline_kwargs)
+    # Cache the instance. Pipeline types that use the same autoregressive model
+    # ("style" and "voice") are built from identical arguments, so they share
+    # one instance instead of loading the same weights twice.
+    for cached_type, cached_ar_model in _AR_MODEL_BY_PIPELINE.items():
+        if cached_ar_model == ar_model:
+            inference_pipelines[cached_type] = inference_pipeline
+    return inference_pipeline
+# Vevo task entry points called by the Gradio interface
+def _save_gradio_audio(audio, path, label):
+    """Write a Gradio ``type="numpy"`` audio value to ``path`` as a WAV file.
+    Args:
+        audio: ``(sample_rate, data)`` tuple as delivered by ``gr.Audio``, or
+            ``None`` when nothing was uploaded.
+        path: Destination file path.
+        label: UI label of the input, used in the error message.
+    Raises:
+        gr.Error: If ``audio`` is ``None``.
+    """
+    if audio is None:
+        raise gr.Error(f"请先上传{label}")
+    # Gradio delivers (sample_rate, data); torchaudio.save wants (tensor, sample_rate).
+    sample_rate, samples = audio
+    samples = np.asarray(samples)
+    if np.issubdtype(samples.dtype, np.integer):
+        # Integer PCM -> float32 scaled to roughly [-1, 1]
+        scale = float(np.iinfo(samples.dtype).max)
+        samples = samples.astype(np.float32) / scale
+    else:
+        samples = samples.astype(np.float32)
+    # torchaudio expects (channels, samples); Gradio data is (samples,) for
+    # mono or (samples, channels) for multi-channel audio.
+    if samples.ndim == 1:
+        samples = samples[np.newaxis, :]
+    else:
+        samples = samples.T
+    waveform = torch.from_numpy(np.ascontiguousarray(samples))
+    torchaudio.save(path, waveform, int(sample_rate))
+def vevo_style(content_wav, style_wav):
+    """Convert the style of ``content_wav`` to that of ``style_wav``, keeping the source timbre.
+    Both arguments are Gradio ``type="numpy"`` audio values. Returns the path
+    of the generated WAV file.
+    """
+    temp_content_path = "wav/temp_content.wav"
+    temp_style_path = "wav/temp_style.wav"
+    output_path = "wav/output_vevostyle.wav"
+    # Persist the uploads: the pipeline takes file paths, not arrays
+    _save_gradio_audio(content_wav, temp_content_path, "内容音频")
+    _save_gradio_audio(style_wav, temp_style_path, "风格音频")
+    pipeline = get_pipeline("style")
+    # The content audio doubles as the timbre reference, so only style changes
+    gen_audio = pipeline.inference_ar_and_fm(
+        src_wav_path=temp_content_path,
+        src_text=None,
+        style_ref_wav_path=temp_style_path,
+        timbre_ref_wav_path=temp_content_path,
+    )
+    save_audio(gen_audio, output_path=output_path)
+    return output_path
+def vevo_timbre(content_wav, reference_wav):
+    """Convert the timbre of ``content_wav`` to that of ``reference_wav``.
+    Both arguments are Gradio ``type="numpy"`` audio values. Returns the path
+    of the generated WAV file.
+    """
+    temp_content_path = "wav/temp_content.wav"
+    temp_reference_path = "wav/temp_reference.wav"
+    output_path = "wav/output_vevotimbre.wav"
+    _save_gradio_audio(content_wav, temp_content_path, "内容音频")
+    _save_gradio_audio(reference_wav, temp_reference_path, "音色参考音频")
+    pipeline = get_pipeline("timbre")
+    # Flow matching only: no autoregressive stage is involved for this task
+    gen_audio = pipeline.inference_fm(
+        src_wav_path=temp_content_path,
+        timbre_ref_wav_path=temp_reference_path,
+        flow_matching_steps=32,
+    )
+    save_audio(gen_audio, output_path=output_path)
+    return output_path
+def vevo_voice(content_wav, reference_wav):
+    """Convert both style and timbre of ``content_wav`` to those of ``reference_wav``.
+    Both arguments are Gradio ``type="numpy"`` audio values. Returns the path
+    of the generated WAV file.
+    """
+    temp_content_path = "wav/temp_content.wav"
+    temp_reference_path = "wav/temp_reference.wav"
+    output_path = "wav/output_vevovoice.wav"
+    _save_gradio_audio(content_wav, temp_content_path, "内容音频")
+    _save_gradio_audio(reference_wav, temp_reference_path, "声音参考音频")
+    pipeline = get_pipeline("voice")
+    # The same reference drives both style and timbre
+    gen_audio = pipeline.inference_ar_and_fm(
+        src_wav_path=temp_content_path,
+        src_text=None,
+        style_ref_wav_path=temp_reference_path,
+        timbre_ref_wav_path=temp_reference_path,
+    )
+    save_audio(gen_audio, output_path=output_path)
+    return output_path
+def vevo_tts(text, ref_wav, timbre_ref_wav=None, src_language="en", ref_language="en"):
+    """Synthesize ``text`` in the style of ``ref_wav``.
+    Args:
+        text: Text to synthesize; must not be empty.
+        ref_wav: Style reference audio (Gradio ``type="numpy"`` value).
+        timbre_ref_wav: Optional timbre reference audio; when omitted the
+            style reference is used for timbre as well.
+        src_language: Language code of ``text``.
+        ref_language: Language code of the style reference audio.
+    Returns:
+        Path of the generated WAV file.
+    Raises:
+        gr.Error: If ``text`` is empty or ``ref_wav`` is missing.
+    """
+    if not text or not text.strip():
+        raise gr.Error("请输入要合成的文本")
+    temp_ref_path = "wav/temp_ref.wav"
+    temp_timbre_path = "wav/temp_timbre.wav"
+    output_path = "wav/output_vevotts.wav"
+    _save_gradio_audio(ref_wav, temp_ref_path, "风格参考音频")
+    if timbre_ref_wav is not None:
+        _save_gradio_audio(timbre_ref_wav, temp_timbre_path, "音色参考音频")
+    else:
+        # No separate timbre reference: reuse the style reference
+        temp_timbre_path = temp_ref_path
+    pipeline = get_pipeline("tts")
+    gen_audio = pipeline.inference_ar_and_fm(
+        src_wav_path=None,
+        src_text=text,
+        style_ref_wav_path=temp_ref_path,
+        timbre_ref_wav_path=temp_timbre_path,
+        style_ref_wav_text=None,
+        src_text_language=src_language,
+        style_ref_wav_text_language=ref_language,
+    )
+    save_audio(gen_audio, output_path=output_path)
+    return output_path
+# Build the Gradio interface: one tab per Vevo task, each wiring its inputs
+# and a "generate" button to the matching vevo_* function defined above.
+with gr.Blocks(title="VEVO Demo") as demo:
+    gr.Markdown("# VEVO: 多功能语音合成模型演示")
+    gr.Markdown("## 可控零样本声音模仿与风格转换")
+    # Style conversion tab -> vevo_style
+    with gr.Tab("风格转换 (Style)"):
+        gr.Markdown("### Vevo-Style: 保持音色但转换风格（如口音、情感等）")
+        with gr.Row():
+            with gr.Column():
+                style_content = gr.Audio(label="内容音频", type="numpy")
+                style_reference = gr.Audio(label="风格音频", type="numpy")
+                style_button = gr.Button("生成")
+            with gr.Column():
+                style_output = gr.Audio(label="生成结果")
+        style_button.click(vevo_style, inputs=[style_content, style_reference], outputs=style_output)
+    # Timbre conversion tab -> vevo_timbre
+    with gr.Tab("音色转换 (Timbre)"):
+        gr.Markdown("### Vevo-Timbre: 保持风格但转换音色")
+        with gr.Row():
+            with gr.Column():
+                timbre_content = gr.Audio(label="内容音频", type="numpy")
+                timbre_reference = gr.Audio(label="音色参考音频", type="numpy")
+                timbre_button = gr.Button("生成")
+            with gr.Column():
+                timbre_output = gr.Audio(label="生成结果")
+        timbre_button.click(vevo_timbre, inputs=[timbre_content, timbre_reference], outputs=timbre_output)
+    # Voice conversion tab -> vevo_voice
+    with gr.Tab("声音转换 (Voice)"):
+        gr.Markdown("### Vevo-Voice: 同时转换风格和音色")
+        with gr.Row():
+            with gr.Column():
+                voice_content = gr.Audio(label="内容音频", type="numpy")
+                voice_reference = gr.Audio(label="声音参考音频", type="numpy")
+                voice_button = gr.Button("生成")
+            with gr.Column():
+                voice_output = gr.Audio(label="生成结果")
+        voice_button.click(vevo_voice, inputs=[voice_content, voice_reference], outputs=voice_output)
+    # Text-to-speech tab -> vevo_tts
+    with gr.Tab("文本到语音 (TTS)"):
+        gr.Markdown("### Vevo-TTS: 风格与音色可控的文本到语音转换")
+        with gr.Row():
+            with gr.Column():
+                tts_text = gr.Textbox(label="输入文本", placeholder="请输入要合成的文本...", lines=3)
+                tts_src_language = gr.Dropdown(["en", "zh", "de", "fr", "ja", "ko"], label="文本语言", value="en")
+                tts_reference = gr.Audio(label="风格参考音频", type="numpy")
+                tts_ref_language = gr.Dropdown(["en", "zh", "de", "fr", "ja", "ko"], label="参考音频语言", value="en")
+                # The timbre reference is optional and tucked into a collapsed accordion
+                with gr.Accordion("高级选项", open=False):
+                    tts_timbre_reference = gr.Audio(label="音色参考音频（可选）", type="numpy")
+                tts_button = gr.Button("生成")
+            with gr.Column():
+                tts_output = gr.Audio(label="生成结果")
+        # The input order must match vevo_tts's positional parameters:
+        # (text, ref_wav, timbre_ref_wav, src_language, ref_language)
+        tts_button.click(
+            vevo_tts,
+            inputs=[tts_text, tts_reference, tts_timbre_reference, tts_src_language, tts_ref_language],
+            outputs=tts_output
+        )
+    gr.Markdown("""
+    ## 关于VEVO
+    VEVO是一个多功能语音合成和转换模型，提供四种主要功能：
+    1. **Vevo-Style**: 保持音色但转换风格（如口音、情感等）
+    2. **Vevo-Timbre**: 保持风格但转换音色
+    3. **Vevo-Voice**: 同时转换风格和音色
+    4. **Vevo-TTS**: 风格与音色可控的文本到语音转换
+    更多信息请访问[Amphion项目](https://github.com/open-mmlab/Amphion)
+    """)
+# Launch the app (runs at import time; there is no __main__ guard)
+demo.launch()

requirements.txt CHANGED Viewed

@@ -1,33 +1,11 @@
-gradio>=4.14.0
-huggingface_hub>=0.20.0
 torch>=2.0.0
 torchaudio>=2.0.0
-numpy>=1.23.0
-librosa>=0.10.0
-accelerate>=0.21.0
-PySoundFile>=0.9.0
-safetensors>=0.4.0
 PyYAML>=6.0
-whisper>=1.1.10
-IPython>=8.0.0
-requests>=2.28.0
-transformers>=4.41.0
-setuptools
-onnxruntime
-unidecode
-scipy>=1.12.0
-encodec
-phonemizer
-g2p_en
-jieba
-cn2an
-pypinyin
-LangSegment
-pyopenjtalk
-pykakasi
-json5
-black>=24.1.1
-ruamel.yaml
-tqdm
-einops
-spaces

+gradio>=3.50.2
 torch>=2.0.0
 torchaudio>=2.0.0
+numpy>=1.20.0
+huggingface_hub>=0.14.1
+librosa>=0.9.2
 PyYAML>=6.0
+accelerate>=0.20.3
+safetensors>=0.3.1
+phonemizer>=3.2.0
+git+https://github.com/open-mmlab/Amphion.git