import os
import tempfile
from argparse import Namespace
from pathlib import Path

import numpy as np
import torch
import torchaudio

from shared.utils import files_locator as fl
from shared.utils.download import process_download_defs

# Conversion mode identifiers (keys of the per-mode defaults table below).
SEEDVC_MODE_SPEECH = 1
SEEDVC_MODE_SINGING = 2
SEEDVC_MODE_ACCENT = 3

# Asset file names, relative to the SeedVC root folder.
SEEDVC_CAMPPLUS_FILENAME = "campplus_cn_common.bin"
SEEDVC_SPEECH_CHECKPOINT_FILENAME = "DiT_seed_v2_uvit_whisper_small_wavenet_bigvgan_pruned.pth"
SEEDVC_SPEECH_CONFIG_FILENAME = "config_dit_mel_seed_uvit_whisper_small_wavenet.yml"
SEEDVC_SINGING_CHECKPOINT_FILENAME = "DiT_seed_v2_uvit_whisper_base_f0_44k_bigvgan_pruned_ft_ema_v2.pth"
SEEDVC_SINGING_CONFIG_FILENAME = "config_dit_mel_seed_uvit_whisper_base_f0_44k.yml"
SEEDVC_RMVPE_FILENAME = "rmvpe.pt"
SEEDVC_V2_AR_CHECKPOINT_FILENAME = "v2/ar_base.pth"
SEEDVC_V2_CFM_CHECKPOINT_FILENAME = "v2/cfm_small.pth"
SEEDVC_V2_NARROW_CHECKPOINT_FILENAME = "bsq32/bsq32_light.pth"
SEEDVC_V2_WIDE_CHECKPOINT_FILENAME = "bsq2048/bsq2048_light.pth"

# Un-prefixed aliases that point at the speech-mode assets.
SEEDVC_CHECKPOINT_FILENAME = SEEDVC_SPEECH_CHECKPOINT_FILENAME
SEEDVC_CONFIG_FILENAME = SEEDVC_SPEECH_CONFIG_FILENAME

# Module-level defaults; the values match the speech-mode entry of the defaults table.
SEEDVC_DEFAULT_STEPS = 25
SEEDVC_DEFAULT_CFG_RATE = 0.5
SEEDVC_SAMPLE_RATE = 22050
# Upper bound (in seconds) applied to the reference clip when it is written to disk.
SEEDVC_MAX_REFERENCE_SECONDS = 25.0

# Repository id used in the download definitions, and the default asset root folder.
SEEDVC_REPO_ID = "DeepBeepMeep/LTX-2"
SEEDVC_ROOT = "seed-vc"
SEEDVC_CHECKPOINT_DIR = SEEDVC_ROOT
# SeedVC v2 style/AR conversion changes timing, which breaks video remux and speaker masks.
SEEDVC_V2_CONVERT_STYLE = False

# Folders (resolved independently of the SeedVC root) holding the auxiliary models.
SEEDVC_BIGVGAN_DIR = "bigvgan_v2_22khz_80band_256x"
SEEDVC_BIGVGAN_44K_DIR = "bigvgan_v2_44khz_128band_512x"
SEEDVC_WHISPER_DIR = "whisper-small"
SEEDVC_HUBERT_DIR = "hubert-large-ll60k"

# Files expected inside each auxiliary model folder.
SEEDVC_BIGVGAN_FILES = ["config.json", "bigvgan_generator.pt"]
SEEDVC_WHISPER_FILES = [
    "added_tokens.json",
    "config.json",
    "generation_config.json",
    "merges.txt",
    "model.safetensors",
    "normalizer.json",
    "preprocessor_config.json",
    "special_tokens_map.json",
    "tokenizer.json",
    "tokenizer_config.json",
    "vocab.json",
]
SEEDVC_HUBERT_FILES = ["config.json", "preprocessor_config.json", "pytorch_model.bin"]

# Per-mode UI label and default sampling settings.
_MODE_DEFAULTS = {
    SEEDVC_MODE_SPEECH: {"label": "v1.0 Speech", "steps": 25, "cfg_rate": 0.5},
    SEEDVC_MODE_SINGING: {"label": "v1.0 Singing / F0 44k", "steps": 10, "cfg_rate": 0.7},
    SEEDVC_MODE_ACCENT: {"label": "v2 Speech", "steps": 30, "cfg_rate": 0.7},
}


def normalize_mode(mode: int | str | None) -> int:
    """Coerce *mode* to a known mode id, falling back to speech mode.

    Falsy values, values that cannot be converted with ``int()`` and ids that
    are not keys of ``_MODE_DEFAULTS`` all map to ``SEEDVC_MODE_SPEECH``.
    """
    try:
        mode = int(mode or SEEDVC_MODE_SPEECH)
    except (TypeError, ValueError):
        mode = SEEDVC_MODE_SPEECH
    return mode if mode in _MODE_DEFAULTS else SEEDVC_MODE_SPEECH


def mode_label(mode: int | str | None) -> str:
    """Return the display label of the (normalized) mode."""
    return _MODE_DEFAULTS[normalize_mode(mode)]["label"]


def get_default_steps(mode: int | str | None = SEEDVC_MODE_SPEECH) -> int:
    """Return the default number of diffusion steps of the (normalized) mode."""
    return int(_MODE_DEFAULTS[normalize_mode(mode)]["steps"])


def get_default_cfg_rate(mode: int | str | None = SEEDVC_MODE_SPEECH) -> float:
    """Return the default CFG rate of the (normalized) mode."""
    return float(_MODE_DEFAULTS[normalize_mode(mode)]["cfg_rate"])


def query_required_files(mode: int | str | None = SEEDVC_MODE_SPEECH, root: str = SEEDVC_ROOT) -> list[str]:
    """List the relative paths of every file the given mode needs.

    Checkpoints and configs are joined onto *root*; the BigVGAN, Whisper and
    HuBERT files are joined onto their own folder names (not onto *root*).
    """
    mode = normalize_mode(mode)
    if mode == SEEDVC_MODE_SINGING:
        return [
            os.path.join(root, SEEDVC_SINGING_CHECKPOINT_FILENAME),
            os.path.join(root, SEEDVC_SINGING_CONFIG_FILENAME),
            os.path.join(root, SEEDVC_CAMPPLUS_FILENAME),
            os.path.join(root, SEEDVC_RMVPE_FILENAME),
            *[os.path.join(SEEDVC_BIGVGAN_44K_DIR, filename) for filename in SEEDVC_BIGVGAN_FILES],
            *[os.path.join(SEEDVC_WHISPER_DIR, filename) for filename in SEEDVC_WHISPER_FILES],
        ]
    if mode == SEEDVC_MODE_ACCENT:
        return [
            os.path.join(root, SEEDVC_V2_AR_CHECKPOINT_FILENAME),
            os.path.join(root, SEEDVC_V2_CFM_CHECKPOINT_FILENAME),
            os.path.join(root, SEEDVC_V2_NARROW_CHECKPOINT_FILENAME),
            os.path.join(root, SEEDVC_V2_WIDE_CHECKPOINT_FILENAME),
            os.path.join(root, SEEDVC_CAMPPLUS_FILENAME),
            *[os.path.join(SEEDVC_BIGVGAN_DIR, filename) for filename in SEEDVC_BIGVGAN_FILES],
            *[os.path.join(SEEDVC_WHISPER_DIR, filename) for filename in SEEDVC_WHISPER_FILES],
            *[os.path.join(SEEDVC_HUBERT_DIR, filename) for filename in SEEDVC_HUBERT_FILES],
        ]
    return [
        os.path.join(root, SEEDVC_SPEECH_CHECKPOINT_FILENAME),
        os.path.join(root, SEEDVC_SPEECH_CONFIG_FILENAME),
        os.path.join(root, SEEDVC_CAMPPLUS_FILENAME),
        *[os.path.join(SEEDVC_BIGVGAN_DIR, filename) for filename in SEEDVC_BIGVGAN_FILES],
        *[os.path.join(SEEDVC_WHISPER_DIR, filename) for filename in SEEDVC_WHISPER_FILES],
    ]


def query_download_def(mode: int | str | None = SEEDVC_MODE_SPEECH, root: str = SEEDVC_ROOT) -> list[dict]:
    """Build the download definitions (repo id, source folder, file list) for a mode.

    Returns one entry for the root folder, one for the BigVGAN folder matching
    the mode, one for Whisper and, in accent (v2) mode only, one for HuBERT.
    """
    mode = normalize_mode(mode)
    root_files = [SEEDVC_CAMPPLUS_FILENAME]
    if mode == SEEDVC_MODE_SINGING:
        root_files += [SEEDVC_SINGING_CHECKPOINT_FILENAME, SEEDVC_SINGING_CONFIG_FILENAME, SEEDVC_RMVPE_FILENAME]
        bigvgan_dir = SEEDVC_BIGVGAN_44K_DIR
    elif mode == SEEDVC_MODE_ACCENT:
        root_files += [
            SEEDVC_V2_AR_CHECKPOINT_FILENAME,
            SEEDVC_V2_CFM_CHECKPOINT_FILENAME,
            SEEDVC_V2_NARROW_CHECKPOINT_FILENAME,
            SEEDVC_V2_WIDE_CHECKPOINT_FILENAME,
        ]
        bigvgan_dir = SEEDVC_BIGVGAN_DIR
    else:
        root_files += [SEEDVC_SPEECH_CHECKPOINT_FILENAME, SEEDVC_SPEECH_CONFIG_FILENAME]
        bigvgan_dir = SEEDVC_BIGVGAN_DIR
    download_def = [
        {"repoId": SEEDVC_REPO_ID, "sourceFolderList": [root], "fileList": [root_files]},
        {"repoId": SEEDVC_REPO_ID, "sourceFolderList": [bigvgan_dir], "fileList": [SEEDVC_BIGVGAN_FILES]},
        {"repoId": SEEDVC_REPO_ID, "sourceFolderList": [SEEDVC_WHISPER_DIR], "fileList": [SEEDVC_WHISPER_FILES]},
    ]
    if mode == SEEDVC_MODE_ACCENT:
        download_def.append({"repoId": SEEDVC_REPO_ID, "sourceFolderList": [SEEDVC_HUBERT_DIR], "fileList": [SEEDVC_HUBERT_FILES]})
    return download_def


def download_assets(mode: int | str | None = SEEDVC_MODE_SPEECH, root: str = SEEDVC_ROOT) -> list[dict]:
    """Hand the mode's download definitions to ``process_download_defs`` and return them."""
    download_def = query_download_def(mode, root)
    process_download_defs(download_def)
    return download_def


def _asset_paths(mode: int | str | None = SEEDVC_MODE_SPEECH, root: str = SEEDVC_ROOT) -> dict[str, str]:
    """Resolve the local asset locations for a mode via the file locator.

    The returned keys match the keyword arguments of the converter class
    that ``get_model`` instantiates for the same mode.
    """
    mode = normalize_mode(mode)
    common = {
        "campplus_path": fl.locate_file(os.path.join(root, SEEDVC_CAMPPLUS_FILENAME)),
        "whisper_folder": fl.locate_folder(SEEDVC_WHISPER_DIR),
    }
    if mode == SEEDVC_MODE_SINGING:
        return {
            **common,
            "checkpoint_path": fl.locate_file(os.path.join(root, SEEDVC_SINGING_CHECKPOINT_FILENAME)),
            "config_path": fl.locate_file(os.path.join(root, SEEDVC_SINGING_CONFIG_FILENAME)),
            "rmvpe_path": fl.locate_file(os.path.join(root, SEEDVC_RMVPE_FILENAME)),
            "bigvgan_folder": fl.locate_folder(SEEDVC_BIGVGAN_44K_DIR),
        }
    if mode == SEEDVC_MODE_ACCENT:
        return {
            **common,
            "ar_checkpoint_path": fl.locate_file(os.path.join(root, SEEDVC_V2_AR_CHECKPOINT_FILENAME)),
            "cfm_checkpoint_path": fl.locate_file(os.path.join(root, SEEDVC_V2_CFM_CHECKPOINT_FILENAME)),
            "narrow_checkpoint_path": fl.locate_file(os.path.join(root, SEEDVC_V2_NARROW_CHECKPOINT_FILENAME)),
            "wide_checkpoint_path": fl.locate_file(os.path.join(root, SEEDVC_V2_WIDE_CHECKPOINT_FILENAME)),
            "bigvgan_folder": fl.locate_folder(SEEDVC_BIGVGAN_DIR),
            "hubert_folder": fl.locate_folder(SEEDVC_HUBERT_DIR),
        }
    return {
        **common,
        "checkpoint_path": fl.locate_file(os.path.join(root, SEEDVC_SPEECH_CHECKPOINT_FILENAME)),
        "config_path": fl.locate_file(os.path.join(root, SEEDVC_SPEECH_CONFIG_FILENAME)),
        "bigvgan_folder": fl.locate_folder(SEEDVC_BIGVGAN_DIR),
    }


def _closure_modules(fn) -> list[torch.nn.Module]:
    """Collect the ``torch.nn.Module`` objects captured in *fn*'s closure cells."""
    modules = []
    for cell in fn.__closure__ or []:
        try:
            value = cell.cell_contents
        except ValueError:
            # An empty cell raises ValueError on access; nothing to collect.
            continue
        if isinstance(value, torch.nn.Module):
            modules.append(value)
    return modules


def _make_mono(waveform: torch.Tensor) -> torch.Tensor:
    """Return *waveform* as a detached CPU float tensor with one leading channel.

    A 1-D tensor gets a leading dimension added; anything else is averaged
    over dim 0 (kept as a size-1 dimension).
    """
    waveform = waveform.detach().cpu().float()
    if waveform.ndim == 1:
        return waveform.unsqueeze(0)
    return waveform.mean(dim=0, keepdim=True)


def _torch_mono_to_numpy(waveform: torch.Tensor) -> np.ndarray:
    """Down-mix *waveform* with ``_make_mono`` and return it as a 1-D float32 array."""
    return _make_mono(waveform).squeeze(0).numpy().astype(np.float32, copy=False)


def _save_mono_resampled(path: str, waveform: torch.Tensor, source_rate: int, target_rate: int = SEEDVC_SAMPLE_RATE, max_seconds: float | None = None) -> None:
    """Write *waveform* to *path* as mono audio at *target_rate*.

    Args:
        path: Destination file path passed to ``soundfile.write``.
        waveform: Input audio tensor (down-mixed with ``_make_mono``).
        source_rate: Sample rate of *waveform*.
        target_rate: Sample rate to resample to and to write with.
        max_seconds: When given, the audio is truncated to this duration.
    """
    import soundfile as sf

    waveform = _make_mono(waveform)
    if int(source_rate) != int(target_rate):
        waveform = torchaudio.functional.resample(waveform, int(source_rate), int(target_rate))
    if max_seconds is not None:
        waveform = waveform[:, : int(round(float(max_seconds) * int(target_rate)))]
    # Clamp out of place: for a 1-D CPU float32 input with no resampling, the
    # mono tensor is only a view of the caller's data, so an in-place clamp
    # would silently modify the caller's tensor.
    sf.write(path, waveform.squeeze(0).clamp(-1.0, 1.0).numpy(), int(target_rate))


def _register_unmanaged_seedvc_tensors(modules) -> None:
    """Turn selected plain tensor attributes into non-persistent buffers.

    For every submodule, each of ``freqs_cis``, ``causal_mask``,
    ``mask_cache`` and ``input_pos`` that is a tensor but not yet a registered
    buffer is re-registered with ``persistent=False``.
    """
    for module in modules:
        for submodule in module.modules():
            for attr in ("freqs_cis", "causal_mask", "mask_cache", "input_pos"):
                value = getattr(submodule, attr, None)
                if isinstance(value, torch.Tensor) and attr not in submodule._buffers:
                    # The plain attribute has to go first, otherwise
                    # register_buffer refuses the already-existing name.
                    delattr(submodule, attr)
                    submodule.register_buffer(attr, value, persistent=False)


def _reset_lock_dtype(modules) -> None:
    """Set ``_lock_dtype = None`` on every submodule of *modules*.

    NOTE(review): ``_lock_dtype`` is not read anywhere in this file; it is
    presumably consumed by the external model-offloading code - confirm.
    """
    for module in modules:
        for submodule in module.modules():
            submodule._lock_dtype = None


def _module_device(module: torch.nn.Module) -> torch.device:
    """Return the device of the first parameter (else first buffer) of *module*, or CPU."""
    # Iterate lazily instead of materialising every parameter and buffer.
    for tensor in module.parameters(recurse=True):
        return tensor.device
    for tensor in module.buffers(recurse=True):
        return tensor.device
    return torch.device("cpu")


def _runtime_device(pipe: dict[str, torch.nn.Module]) -> torch.device:
    """Pick the device on which inference should run for *pipe*.

    If any submodule carries a ``_mm_manager`` attribute the answer is CUDA
    when available (CPU otherwise); if none does, the device of the first
    module in *pipe* is used. An empty *pipe* yields CPU.

    NOTE(review): ``_mm_manager`` is presumably attached by an external
    memory manager that moves weights on demand - not visible in this file.
    """
    for module in pipe.values():
        for submodule in module.modules():
            if hasattr(submodule, "_mm_manager"):
                return torch.device("cuda" if torch.cuda.is_available() else "cpu")
    for module in pipe.values():
        return _module_device(module)
    return torch.device("cpu")


def _normalise_output(samples: np.ndarray) -> np.ndarray:
    """Return *samples* as float32, scaled so the peak does not exceed 1.0.

    ``int16`` input is divided by 32768; other dtypes are cast directly. The
    result is divided by its peak only when that peak is above 1.0.
    """
    if samples.dtype == np.int16:
        samples = samples.astype(np.float32) / 32768.0
    elif samples.dtype != np.float32:
        samples = samples.astype(np.float32)
    peak = np.abs(samples).max(initial=0.0)
    return samples / peak if peak > 1.0 else samples


def _audio_tuple_to_stereo_tensor(audio_tuple: tuple[int, np.ndarray], output_rate: int) -> torch.Tensor:
    """Convert a ``(sample_rate, samples)`` pair into a two-row tensor at *output_rate*.

    The samples are normalised, resampled when the rates differ, and the
    single channel is duplicated into two identical rows.
    """
    converted_rate, samples = audio_tuple
    converted_tensor = torch.from_numpy(_normalise_output(samples)).float().unsqueeze(0)
    if int(converted_rate) != int(output_rate):
        converted_tensor = torchaudio.functional.resample(converted_tensor, int(converted_rate), int(output_rate))
    return converted_tensor.repeat(2, 1)


def _consume_generator_return(generator):
    """Exhaust *generator*, discarding yielded items, and return its ``return`` value."""
    try:
        while True:
            next(generator)
    except StopIteration as stop:
        return stop.value


def _configure_pydub_ffmpeg() -> None:
    """Point pydub at the resolved ffmpeg/ffprobe binaries.

    The ffmpeg directory is prepended to ``PATH`` unless it is already one of
    its entries, and ``AudioSegment.converter`` / ``AudioSegment.ffprobe`` are
    set when the corresponding binary was resolved.
    """
    from shared.utils.video_decode import resolve_media_binary

    ffmpeg_path = resolve_media_binary("ffmpeg")
    ffprobe_path = resolve_media_binary("ffprobe")
    if ffmpeg_path:
        ffmpeg_dir = os.path.dirname(os.fspath(ffmpeg_path))
        current_path = os.environ.get("PATH", "")
        # Compare against whole PATH entries; a substring test would treat
        # "/opt/ff" as present when PATH only contains "/opt/ffmpeg-old/bin".
        path_entries = current_path.split(os.pathsep) if current_path else []
        if ffmpeg_dir and ffmpeg_dir not in path_entries:
            os.environ["PATH"] = os.pathsep.join([ffmpeg_dir, *path_entries])
    # Imported only after PATH has been updated, as in the original ordering.
    from pydub import AudioSegment

    if ffmpeg_path:
        AudioSegment.converter = ffmpeg_path
    if ffprobe_path:
        AudioSegment.ffprobe = ffprobe_path


def _load_seedvc_app():
    """Import and return the bundled ``app_vc`` module.

    Raises:
        ImportError: When the bundled package files cannot be imported.
    """
    try:
        from . import app_vc
    except ImportError as exc:
        raise ImportError("SeedVC support requires the bundled `postprocessing/seedvc` package files.") from exc
    return app_vc


def _load_seedvc_svc_app():
    """Import and return the bundled ``app_svc`` (singing) module.

    Raises:
        ImportError: When the bundled package files cannot be imported.
    """
    try:
        from . import app_svc
    except ImportError as exc:
        raise ImportError("SeedVC singing support requires the bundled `postprocessing/seedvc` package files.") from exc
    return app_svc


class SeedVCVoiceConverter:
    """Speech-mode converter that drives the bundled ``app_vc`` module with local assets."""

    mode = SEEDVC_MODE_SPEECH
    default_steps = 25
    default_cfg_rate = 0.5
    sample_rate = 22050

    def __init__(
        self,
        checkpoint_path: str,
        config_path: str,
        campplus_path: str,
        bigvgan_folder: str,
        whisper_folder: str,
        dtype: torch.dtype = torch.float16,
    ) -> None:
        """Store the asset locations and load the models immediately."""
        self.checkpoint_path = os.fspath(checkpoint_path)
        self.config_path = os.fspath(config_path)
        self.campplus_path = os.fspath(campplus_path)
        self.bigvgan_folder = os.fspath(bigvgan_folder)
        self.whisper_folder = os.fspath(whisper_folder)
        self.dtype = dtype
        self._app_vc = None
        self._patched_config_path = None
        self._load()

    def _build_local_config(self) -> str:
        """Write a copy of the model config that points at local folders.

        The vocoder and speech-tokenizer names are replaced by the local
        BigVGAN and Whisper folders.

        Returns:
            Path of the temporary YAML file (also kept in
            ``self._patched_config_path``).
        """
        import yaml

        with open(self.config_path, "r", encoding="utf-8") as reader:
            config = yaml.safe_load(reader)
        config["model_params"]["vocoder"]["name"] = self.bigvgan_folder
        config["model_params"]["speech_tokenizer"]["name"] = self.whisper_folder
        # NOTE(review): created with delete=False and not removed anywhere in
        # this file, so one temp file stays behind per loaded converter.
        tmp = tempfile.NamedTemporaryFile("w", suffix=".yml", encoding="utf-8", delete=False)
        with tmp:
            yaml.safe_dump(config, tmp, sort_keys=False)
        self._patched_config_path = tmp.name
        return tmp.name

    def _load(self) -> None:
        """Load the models on CPU through ``app_vc`` and expose them as attributes."""
        _configure_pydub_ffmpeg()
        app_vc = _load_seedvc_app()
        app_vc.device = torch.device("cpu")
        # Redirect the module's hub loader to the locally resolved files.
        app_vc.load_custom_model_from_hf = self._load_custom_model_from_local_assets
        os.environ.setdefault("HF_HUB_CACHE", str(Path(self.campplus_path).parent / "hf_cache"))
        args = Namespace(checkpoint=self.checkpoint_path, config=self._build_local_config(), fp16=self.dtype == torch.float16, gpu=0)
        (
            app_vc.model,
            app_vc.semantic_fn,
            app_vc.vocoder_fn,
            app_vc.campplus_model,
            app_vc.to_mel,
            app_vc.mel_fn_args,
        ) = app_vc.load_models(args)
        app_vc.max_context_window = app_vc.sr // app_vc.hop_length * 30
        app_vc.overlap_wave_len = app_vc.overlap_frame_len * app_vc.hop_length
        self._app_vc = app_vc
        self.seedvc_model = torch.nn.ModuleDict({str(name): module for name, module in app_vc.model.items() if isinstance(module, torch.nn.Module)})
        self.semantic_modules = torch.nn.ModuleList(_closure_modules(app_vc.semantic_fn))
        self.campplus_model = app_vc.campplus_model
        self.vocoder_fn = app_vc.vocoder_fn
        _register_unmanaged_seedvc_tensors(self.pipe_modules().values())
        _reset_lock_dtype(self.pipe_modules().values())

    def pipe_modules(self) -> dict[str, torch.nn.Module]:
        """Return the loaded modules keyed by ``seedvc_``-prefixed names."""
        pipe = {f"seedvc_{name}": module for name, module in self.seedvc_model.items()}
        if len(self.semantic_modules) == 1:
            pipe["seedvc_whisper_small"] = self.semantic_modules[0]
        else:
            pipe.update({f"seedvc_speech_tokenizer_{idx + 1}": module for idx, module in enumerate(self.semantic_modules)})
        if isinstance(self.campplus_model, torch.nn.Module):
            pipe["seedvc_campplus"] = self.campplus_model
        if isinstance(self.vocoder_fn, torch.nn.Module):
            pipe["seedvc_bigvgan"] = self.vocoder_fn
        return pipe

    def _load_custom_model_from_local_assets(self, repo_id, model_filename, config_filename=None):
        """Map a hub ``(repo_id, model_filename)`` request to a local file path.

        Raises:
            FileNotFoundError: When the requested asset is not one this class declares.
        """
        if repo_id == "funasr/campplus" and model_filename == SEEDVC_CAMPPLUS_FILENAME:
            return self.campplus_path
        raise FileNotFoundError(f"SeedVC asset is not declared for local loading: {repo_id}/{model_filename}")

    def forward(
        self,
        source_wav_path: str,
        target_wav_path: str,
        diffusion_steps: int | None = None,
        cfg_rate: float | None = None,
    ) -> tuple[int, np.ndarray]:
        """Convert the source file towards the voice of the target file.

        Args:
            source_wav_path: Audio file to convert.
            target_wav_path: Reference audio file.
            diffusion_steps: Overrides ``default_steps`` when given.
            cfg_rate: Overrides ``default_cfg_rate`` when given.

        Returns:
            ``(sample_rate, samples)`` with the samples normalised to float32.

        Raises:
            RuntimeError: When the model is not loaded or nothing was produced.
        """
        if self._app_vc is None:
            raise RuntimeError("SeedVC is not loaded.")
        _configure_pydub_ffmpeg()
        self._app_vc.device = _runtime_device(self.pipe_modules())
        audio_tuple = None
        for result in self._app_vc.voice_conversion(
            source=source_wav_path,
            target=target_wav_path,
            diffusion_steps=self.default_steps if diffusion_steps is None else int(diffusion_steps),
            length_adjust=1.0,
            inference_cfg_rate=self.default_cfg_rate if cfg_rate is None else float(cfg_rate),
        ):
            # Keep the second element of the most recent 2-tuple result.
            if isinstance(result, tuple) and len(result) == 2:
                _, audio_tuple = result
        if audio_tuple is None:
            raise RuntimeError("SeedVC produced no output.")
        sample_rate, samples = audio_tuple
        return int(sample_rate), _normalise_output(samples)

    def convert_tensor(
        self,
        source_audio: torch.Tensor,
        source_rate: int,
        reference_audio: torch.Tensor,
        reference_rate: int,
        output_rate: int,
        diffusion_steps: int | None = None,
        cfg_rate: float | None = None,
    ) -> torch.Tensor:
        """Convert in-memory audio and return a two-row tensor at *output_rate*.

        Both inputs are written as mono files at ``self.sample_rate`` into a
        temporary directory (the reference capped at
        ``SEEDVC_MAX_REFERENCE_SECONDS``) and passed to ``forward``.
        """
        with tempfile.TemporaryDirectory() as tmpdir:
            source_path = os.path.join(tmpdir, "source_22k.wav")
            target_path = os.path.join(tmpdir, "target_22k.wav")
            _save_mono_resampled(source_path, source_audio, source_rate, target_rate=self.sample_rate)
            _save_mono_resampled(target_path, reference_audio, reference_rate, target_rate=self.sample_rate, max_seconds=SEEDVC_MAX_REFERENCE_SECONDS)
            converted = self.forward(source_path, target_path, diffusion_steps=diffusion_steps, cfg_rate=cfg_rate)
        return _audio_tuple_to_stereo_tensor(converted, output_rate)


class SeedVCSingingConverter(SeedVCVoiceConverter):
    """Singing-mode converter that drives the bundled ``app_svc`` module."""

    mode = SEEDVC_MODE_SINGING
    default_steps = 10
    default_cfg_rate = 0.7
    sample_rate = 44100

    def __init__(
        self,
        checkpoint_path: str,
        config_path: str,
        campplus_path: str,
        rmvpe_path: str,
        bigvgan_folder: str,
        whisper_folder: str,
        dtype: torch.dtype = torch.float16,
    ) -> None:
        """Store the RMVPE path, then defer to the base initialiser (which loads)."""
        # Must be set before super().__init__, because that call runs _load().
        self.rmvpe_path = os.fspath(rmvpe_path)
        super().__init__(checkpoint_path, config_path, campplus_path, bigvgan_folder, whisper_folder, dtype=dtype)

    def _load(self) -> None:
        """Load the models on CPU through ``app_svc`` and expose them as attributes."""
        _configure_pydub_ffmpeg()
        app_svc = _load_seedvc_svc_app()
        app_svc.device = torch.device("cpu")
        # Redirect the module's hub loader to the locally resolved files.
        app_svc.load_custom_model_from_hf = self._load_custom_model_from_local_assets
        os.environ.setdefault("HF_HUB_CACHE", str(Path(self.campplus_path).parent / "hf_cache"))
        args = Namespace(checkpoint=self.checkpoint_path, config=self._build_local_config(), fp16=self.dtype == torch.float16, gpu=0)
        (
            app_svc.model_f0,
            app_svc.semantic_fn,
            app_svc.vocoder_fn,
            app_svc.campplus_model,
            app_svc.to_mel_f0,
            app_svc.mel_fn_args,
            app_svc.f0_fn,
        ) = app_svc.load_models(args)
        app_svc.max_context_window = app_svc.sr // app_svc.hop_length * 30
        app_svc.overlap_wave_len = app_svc.overlap_frame_len * app_svc.hop_length
        self._app_vc = app_svc
        self.seedvc_model = torch.nn.ModuleDict({str(name): module for name, module in app_svc.model_f0.items() if isinstance(module, torch.nn.Module)})
        self.semantic_modules = torch.nn.ModuleList(_closure_modules(app_svc.semantic_fn))
        self.campplus_model = app_svc.campplus_model
        self.vocoder_fn = app_svc.vocoder_fn
        # When f0_fn is a bound method, keep the object it is bound to.
        self.f0_extractor = getattr(app_svc.f0_fn, "__self__", None)
        _register_unmanaged_seedvc_tensors(self.pipe_modules().values())
        _reset_lock_dtype(self.pipe_modules().values())

    def _load_custom_model_from_local_assets(self, repo_id, model_filename, config_filename=None):
        """Map a hub ``(repo_id, model_filename)`` request to a local file path.

        Raises:
            FileNotFoundError: When the requested asset is not one this class declares.
        """
        if repo_id == "funasr/campplus" and model_filename == SEEDVC_CAMPPLUS_FILENAME:
            return self.campplus_path
        if repo_id == "lj1995/VoiceConversionWebUI" and model_filename == SEEDVC_RMVPE_FILENAME:
            return self.rmvpe_path
        raise FileNotFoundError(f"SeedVC singing asset is not declared for local loading: {repo_id}/{model_filename}")

    def pipe_modules(self) -> dict[str, torch.nn.Module]:
        """Return the base modules plus the F0 extractor's module attributes."""
        pipe = super().pipe_modules()
        if self.f0_extractor is not None:
            for attr in ("mel_extractor", "model"):
                module = getattr(self.f0_extractor, attr, None)
                if isinstance(module, torch.nn.Module):
                    pipe[f"seedvc_f0_{attr}"] = module
        return pipe

    def forward(
        self,
        source_wav_path: str,
        target_wav_path: str,
        diffusion_steps: int | None = None,
        cfg_rate: float | None = None,
    ) -> tuple[int, np.ndarray]:
        """Convert the source file towards the voice of the target file.

        Same contract as the base class; additionally passes
        ``auto_f0_adjust=True`` and ``pitch_shift=0`` to the conversion call.

        Returns:
            ``(sample_rate, samples)`` with the samples normalised to float32.

        Raises:
            RuntimeError: When the model is not loaded or nothing was produced.
        """
        if self._app_vc is None:
            raise RuntimeError("SeedVC singing model is not loaded.")
        _configure_pydub_ffmpeg()
        self._app_vc.device = _runtime_device(self.pipe_modules())
        if self.f0_extractor is not None:
            # Keep the F0 extractor on the same device as the rest of the run.
            self.f0_extractor.device = self._app_vc.device
        audio_tuple = None
        for result in self._app_vc.voice_conversion(
            source=source_wav_path,
            target=target_wav_path,
            diffusion_steps=self.default_steps if diffusion_steps is None else int(diffusion_steps),
            length_adjust=1.0,
            inference_cfg_rate=self.default_cfg_rate if cfg_rate is None else float(cfg_rate),
            auto_f0_adjust=True,
            pitch_shift=0,
        ):
            # Keep the second element of the most recent 2-tuple result.
            if isinstance(result, tuple) and len(result) == 2:
                _, audio_tuple = result
        if audio_tuple is None:
            raise RuntimeError("SeedVC singing model produced no output.")
        sample_rate, samples = audio_tuple
        return int(sample_rate), _normalise_output(samples)


class SeedVCAccentConverter:
    """Accent-mode ("v2 Speech") converter built around the bundled v2 ``vc_wrapper``."""

    mode = SEEDVC_MODE_ACCENT
    default_steps = 30
    default_cfg_rate = 0.7
    sample_rate = 22050

    def __init__(
        self,
        ar_checkpoint_path: str,
        cfm_checkpoint_path: str,
        narrow_checkpoint_path: str,
        wide_checkpoint_path: str,
        campplus_path: str,
        bigvgan_folder: str,
        whisper_folder: str,
        hubert_folder: str,
        dtype: torch.dtype = torch.float16,
    ) -> None:
        """Store the asset locations and load the wrapper immediately."""
        self.ar_checkpoint_path = os.fspath(ar_checkpoint_path)
        self.cfm_checkpoint_path = os.fspath(cfm_checkpoint_path)
        self.narrow_checkpoint_path = os.fspath(narrow_checkpoint_path)
        self.wide_checkpoint_path = os.fspath(wide_checkpoint_path)
        self.campplus_path = os.fspath(campplus_path)
        self.bigvgan_folder = os.fspath(bigvgan_folder)
        self.whisper_folder = os.fspath(whisper_folder)
        self.hubert_folder = os.fspath(hubert_folder)
        self.dtype = dtype
        self.vc_wrapper = None
        self._patched_config_path = None
        self._load()

    def _build_local_config(self) -> str:
        """Write a copy of the bundled v2 wrapper config that points at local folders.

        Returns:
            Path of the temporary YAML file (also kept in
            ``self._patched_config_path``).
        """
        import yaml

        config_path = Path(__file__).resolve().parent / "configs" / "v2" / "vc_wrapper.yaml"
        with open(config_path, "r", encoding="utf-8") as reader:
            config = yaml.safe_load(reader)
        config["vocoder"]["pretrained_model_name_or_path"] = self.bigvgan_folder
        for key in ("content_extractor_narrow", "content_extractor_wide"):
            config[key]["tokenizer_name"] = self.whisper_folder
            config[key]["ssl_model_name"] = self.hubert_folder
        # NOTE(review): created with delete=False and not removed anywhere in
        # this file, so one temp file stays behind per loaded converter.
        tmp = tempfile.NamedTemporaryFile("w", suffix=".yaml", encoding="utf-8", delete=False)
        with tmp:
            yaml.safe_dump(config, tmp, sort_keys=False)
        self._patched_config_path = tmp.name
        return tmp.name

    def _load(self) -> None:
        """Instantiate the v2 wrapper from the patched config and load it on CPU."""
        import yaml
        from hydra.utils import instantiate
        from omegaconf import DictConfig

        _configure_pydub_ffmpeg()
        from .modules.v2 import vc_wrapper as vc_wrapper_module

        # Redirect the module's hub loader to the locally resolved files.
        vc_wrapper_module.load_custom_model_from_hf = self._load_custom_model_from_local_assets
        os.environ.setdefault("HF_HUB_CACHE", str(Path(self.campplus_path).parent / "hf_cache"))
        with open(self._build_local_config(), "r", encoding="utf-8") as reader:
            cfg = DictConfig(yaml.safe_load(reader))
        self.vc_wrapper = instantiate(cfg)
        self.vc_wrapper.load_checkpoints(ar_checkpoint_path=self.ar_checkpoint_path, cfm_checkpoint_path=self.cfm_checkpoint_path)
        self.vc_wrapper.to(torch.device("cpu"))
        self.vc_wrapper.eval()
        self.vc_wrapper.setup_ar_caches(max_batch_size=1, max_seq_len=4096, dtype=self.dtype, device=torch.device("cpu"))
        _register_unmanaged_seedvc_tensors(self.pipe_modules().values())
        _reset_lock_dtype(self.pipe_modules().values())

    def _load_custom_model_from_local_assets(self, repo_id, model_filename, config_filename=None):
        """Map a hub ``(repo_id, model_filename)`` request to a local file path.

        Raises:
            FileNotFoundError: When the requested asset is not one this class declares.
        """
        if repo_id == "Plachta/ASTRAL-quantization" and model_filename == SEEDVC_V2_NARROW_CHECKPOINT_FILENAME:
            return self.narrow_checkpoint_path
        if repo_id == "Plachta/ASTRAL-quantization" and model_filename == SEEDVC_V2_WIDE_CHECKPOINT_FILENAME:
            return self.wide_checkpoint_path
        if repo_id == "funasr/campplus" and model_filename == SEEDVC_CAMPPLUS_FILENAME:
            return self.campplus_path
        raise FileNotFoundError(f"SeedVC v2 asset is not declared for local loading: {repo_id}/{model_filename}")

    def pipe_modules(self) -> dict[str, torch.nn.Module]:
        """Return the wrapper's direct child modules keyed by ``seedvc_v2_`` names.

        Returns an empty dict while the wrapper has not been created yet.
        """
        if self.vc_wrapper is None:
            return {}
        return {f"seedvc_v2_{name}": module for name, module in self.vc_wrapper.named_children() if isinstance(module, torch.nn.Module)}

    def convert_tensor(
        self,
        source_audio: torch.Tensor,
        source_rate: int,
        reference_audio: torch.Tensor,
        reference_rate: int,
        output_rate: int,
        diffusion_steps: int | None = None,
        cfg_rate: float | None = None,
    ) -> torch.Tensor:
        """Convert in-memory audio and return a two-row tensor at *output_rate*.

        Args:
            source_audio: Audio to convert.
            source_rate: Sample rate of *source_audio*.
            reference_audio: Reference audio.
            reference_rate: Sample rate of *reference_audio*.
            output_rate: Sample rate of the returned tensor.
            diffusion_steps: Overrides ``default_steps`` when given.
            cfg_rate: Overrides ``default_cfg_rate`` when given; used for both
                CFG rates passed to the wrapper.

        Raises:
            RuntimeError: When the model is not loaded or nothing was produced.
        """
        if self.vc_wrapper is None:
            raise RuntimeError("SeedVC v2 model is not loaded.")
        device = _runtime_device(self.pipe_modules())
        # The configured dtype is only used on CUDA; other devices run in float32.
        dtype = self.dtype if device.type == "cuda" else torch.float32
        generator = self.vc_wrapper.convert_voice_arrays(
            source_wave=_torch_mono_to_numpy(source_audio),
            target_wave=_torch_mono_to_numpy(reference_audio),
            source_sr=int(source_rate),
            target_sr=int(reference_rate),
            diffusion_steps=self.default_steps if diffusion_steps is None else int(diffusion_steps),
            length_adjust=1.0,
            # Keyword spelling is kept as the callee is invoked with it.
            intelligebility_cfg_rate=self.default_cfg_rate if cfg_rate is None else float(cfg_rate),
            similarity_cfg_rate=self.default_cfg_rate if cfg_rate is None else float(cfg_rate),
            top_p=0.9,
            temperature=1.0,
            repetition_penalty=1.0,
            convert_style=SEEDVC_V2_CONVERT_STYLE,
            anonymization_only=False,
            device=device,
            dtype=dtype,
        )
        audio_tuple = _consume_generator_return(generator)
        if audio_tuple is None:
            raise RuntimeError("SeedVC v2 produced no output.")
        return _audio_tuple_to_stereo_tensor(audio_tuple, output_rate)


def get_model(dtype: torch.dtype = torch.float16, root: str = SEEDVC_ROOT, mode: int | str | None = SEEDVC_MODE_SPEECH):
    """Instantiate (and thereby load) the converter class matching *mode*."""
    mode = normalize_mode(mode)
    converter_cls = {
        SEEDVC_MODE_SPEECH: SeedVCVoiceConverter,
        SEEDVC_MODE_SINGING: SeedVCSingingConverter,
        SEEDVC_MODE_ACCENT: SeedVCAccentConverter,
    }[mode]
    return converter_cls(**_asset_paths(mode, root), dtype=dtype)


def get_pipe(profile_no=None, dtype: torch.dtype = torch.float16, root: str = SEEDVC_ROOT, model=None, mode: int | str | None = SEEDVC_MODE_SPEECH) -> dict[str, torch.nn.Module]:
    """Return the module dictionary of *model*, loading a new converter when it is None.

    ``profile_no`` is accepted but not used by this function.
    """
    seedvc_model = get_model(dtype=dtype, root=root, mode=mode) if model is None else model
    return seedvc_model.pipe_modules()


def get_cotenants_map(pipe: dict[str, torch.nn.Module]) -> dict[str, list[str]]:
    """Map every ``seedvc_``-prefixed key of *pipe* to the list of all such keys."""
    seedvc_keys = [key for key in pipe if str(key).startswith("seedvc_")]
    return {key: list(seedvc_keys) for key in seedvc_keys}