diff --git a/infer/__init__.py b/infer/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..4db79bd0416ee76bb36e7238e9e374d6596ed425
--- /dev/null
+++ b/infer/__init__.py
@@ -0,0 +1,17 @@
+# -*- coding: utf-8 -*-
+"""
+Inference module.
+"""
+from .f0_extractor import (
+    F0Extractor,
+    get_f0_extractor,
+    shift_f0,
+    F0Method
+)
+
+__all__ = [
+    "F0Extractor",
+    "get_f0_extractor",
+    "shift_f0",
+    "F0Method"
+]
diff --git a/infer/advanced_dereverb.py b/infer/advanced_dereverb.py
new file mode 100644
index 0000000000000000000000000000000000000000..c08143bf4108b5f9c14f342e9c00365695c0ca53
--- /dev/null
+++ b/infer/advanced_dereverb.py
@@ -0,0 +1,280 @@
+# -*- coding: utf-8 -*-
+"""
+Advanced dereverberation - binary residual masking with time-domain consistency.
+Reference: arXiv 2510.00356 - Dereverberation Using Binary Residual Masking
+"""
+import numpy as np
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from typing import Tuple, Optional
+
+
+class BinaryResidualMask(nn.Module):
+    """
+    Binary residual mask network - suppresses reverberation rather than
+    predicting the full spectrum.
+
+    Core ideas:
+    1. Learn to identify and suppress late reflections
+    2. Preserve the direct-path signal
+    3. Learn phase implicitly through a time-domain consistency loss
+
+    Note: the DSP routine advanced_dereverb() below does not use this network.
+    """
+
+    def __init__(self, n_fft=2048, hop_length=512):
+        super().__init__()
+        self.n_fft = n_fft
+        self.hop_length = hop_length
+        self.freq_bins = n_fft // 2 + 1
+
+        # U-Net encoder
+        self.encoder1 = nn.Sequential(
+            nn.Conv2d(1, 32, kernel_size=3, padding=1),
+            nn.BatchNorm2d(32),
+            nn.ReLU(),
+            nn.Conv2d(32, 32, kernel_size=3, padding=1),
+            nn.BatchNorm2d(32),
+            nn.ReLU()
+        )
+
+        self.encoder2 = nn.Sequential(
+            nn.MaxPool2d(2),
+            nn.Conv2d(32, 64, kernel_size=3, padding=1),
+            nn.BatchNorm2d(64),
+            nn.ReLU(),
+            nn.Conv2d(64, 64, kernel_size=3, padding=1),
+            nn.BatchNorm2d(64),
+            nn.ReLU()
+        )
+
+        self.encoder3 = nn.Sequential(
+            nn.MaxPool2d(2),
+            nn.Conv2d(64, 128, kernel_size=3, padding=1),
+            nn.BatchNorm2d(128),
+            nn.ReLU(),
+            nn.Conv2d(128, 128, kernel_size=3, padding=1),
+            nn.BatchNorm2d(128),
+            nn.ReLU()
+        )
+
+        # Bottleneck
+        self.bottleneck = nn.Sequential(
+            nn.MaxPool2d(2),
+            nn.Conv2d(128, 256, kernel_size=3, padding=1),
+            nn.BatchNorm2d(256),
+            nn.ReLU()
+        )
+
+        # U-Net decoder: upsample, concatenate the encoder skip, then refine.
+        self.up3 = nn.ConvTranspose2d(256, 128, kernel_size=2, stride=2)
+        self.dec3 = nn.Sequential(
+            nn.Conv2d(256, 128, kernel_size=3, padding=1),
+            nn.BatchNorm2d(128),
+            nn.ReLU()
+        )
+
+        self.up2 = nn.ConvTranspose2d(128, 64, kernel_size=2, stride=2)
+        self.dec2 = nn.Sequential(
+            nn.Conv2d(128, 64, kernel_size=3, padding=1),
+            nn.BatchNorm2d(64),
+            nn.ReLU()
+        )
+
+        self.up1 = nn.ConvTranspose2d(64, 32, kernel_size=2, stride=2)
+        self.dec1 = nn.Sequential(
+            nn.Conv2d(64, 32, kernel_size=3, padding=1),
+            nn.BatchNorm2d(32),
+            nn.ReLU()
+        )
+
+        # Output layer - residual mask
+        self.output = nn.Sequential(
+            nn.Conv2d(32, 1, kernel_size=1),
+            nn.Sigmoid()  # mask values in [0, 1]
+        )
+
+    def forward(self, x):
+        """
+        Args:
+            x: [B, 1, F, T] - input magnitude spectrogram
+        Returns:
+            mask: [B, 1, F, T] - residual mask
+        """
+        # Encode
+        e1 = self.encoder1(x)
+        e2 = self.encoder2(e1)
+        e3 = self.encoder3(e2)
+
+        # Bottleneck
+        b = self.bottleneck(e3)
+
+        # Decode with skip connections: upsample, resize to the encoder feature
+        # size (F = n_fft // 2 + 1 is odd, so strided upsampling can be off by
+        # one bin), concatenate, then convolve.
+        d3 = self.up3(b)
+        d3 = F.interpolate(d3, size=e3.shape[-2:], mode="nearest")
+        d3 = self.dec3(torch.cat([d3, e3], dim=1))
+
+        d2 = self.up2(d3)
+        d2 = F.interpolate(d2, size=e2.shape[-2:], mode="nearest")
+        d2 = self.dec2(torch.cat([d2, e2], dim=1))
+
+        d1 = self.up1(d2)
+        d1 = F.interpolate(d1, size=e1.shape[-2:], mode="nearest")
+        d1 = self.dec1(torch.cat([d1, e1], dim=1))
+
+        # Output mask
+        mask = self.output(d1)
+        return mask
+
+
+def advanced_dereverb(
+    audio: np.ndarray,
+    sr: int = 16000,
+    n_fft: int = 2048,
+    hop_length: int = 512,
+    device: str = "cuda"
+) -> Tuple[np.ndarray, np.ndarray]:
+    """
+    高级去混响 - 分离干声和混响
+
+    Args:
+        audio: 输入音频 [samples]
+        sr: 采样率
+        n_fft: FFT大小
+        hop_length: 跳跃长度
+        device: 计算设备
+
+    Returns:
+        dry_signal: 干声(直达声)
+ 
reverb_tail: 混响尾巴 + """ + import librosa + + # STFT + spec = librosa.stft(audio, n_fft=n_fft, hop_length=hop_length, win_length=n_fft) + mag = np.abs(spec).astype(np.float32) + phase = np.angle(spec) + + # 基于能量的混响检测 + # 1. 计算时域RMS能量 + rms = librosa.feature.rms(y=audio, frame_length=n_fft, hop_length=hop_length, center=True)[0] + rms_db = 20.0 * np.log10(rms + 1e-8) + ref_db = float(np.percentile(rms_db, 90)) + + # 2. 检测晚期反射(late reflections) + # 晚期反射特征:能量衰减 + 时间延迟 + late_reflections = np.zeros_like(mag, dtype=np.float32) + + for t in range(2, mag.shape[1]): + # 递归估计:衰减的历史 + 延迟的观测 + late_reflections[:, t] = np.maximum( + late_reflections[:, t - 1] * 0.92, # 衰减系数 + mag[:, t - 2] * 0.80 # 延迟观测 + ) + + # 3. 计算直达声(direct path) + # 直达声 = 总能量 - 晚期反射 + direct_path = np.maximum(mag - 0.75 * late_reflections, 0.0) + + # 4. 动态floor:保护有声段 + # 扩展RMS到频谱帧数 + if len(rms) < mag.shape[1]: + rms_extended = np.pad(rms, (0, mag.shape[1] - len(rms)), mode='edge') + else: + rms_extended = rms[:mag.shape[1]] + + # 有声段(高能量):vocal_strength接近1 + # 无声段(低能量/混响尾):vocal_strength接近0 + vocal_strength = np.clip((rms_db[:len(rms_extended)] - (ref_db - 35.0)) / 25.0, 0.0, 1.0) + + # 动态floor系数 + reverb_ratio = np.clip(late_reflections / (mag + 1e-8), 0.0, 1.0) + floor_coef = 0.08 + 0.12 * vocal_strength[np.newaxis, :] + floor = (1.0 - reverb_ratio) * floor_coef * mag + direct_path = np.maximum(direct_path, floor) + + # 5. 时域平滑(避免音乐噪声) + kernel = np.array([1, 2, 3, 2, 1], dtype=np.float32) + kernel /= np.sum(kernel) + direct_path = np.apply_along_axis( + lambda row: np.convolve(row, kernel, mode="same"), + axis=1, + arr=direct_path, + ) + direct_path = np.clip(direct_path, 0.0, mag) + + # 6. 计算混响残差 + reverb_mag = mag - direct_path + reverb_mag = np.maximum(reverb_mag, 0.0) + + # 7. 重建音频 + # 干声:使用原始相位(保留音色) + dry_spec = direct_path * np.exp(1j * phase) + dry_signal = librosa.istft(dry_spec, hop_length=hop_length, win_length=n_fft, length=len(audio)) + + # 混响:使用原始相位 + reverb_spec = reverb_mag * np.exp(1j * phase) + reverb_tail = librosa.istft(reverb_spec, hop_length=hop_length, win_length=n_fft, length=len(audio)) + + return dry_signal.astype(np.float32), reverb_tail.astype(np.float32) + + +def apply_reverb_to_converted( + converted_dry: np.ndarray, + original_reverb: np.ndarray, + mix_ratio: float = 0.8 +) -> np.ndarray: + """ + 将原始混响重新应用到转换后的干声上 + + Args: + converted_dry: 转换后的干声 + original_reverb: 原始混响尾巴 + mix_ratio: 混响混合比例 (0-1) + + Returns: + wet_signal: 带混响的转换结果 + """ + # 对齐长度 + min_len = min(len(converted_dry), len(original_reverb)) + converted_dry = converted_dry[:min_len] + original_reverb = original_reverb[:min_len] + + # 混合 + wet_signal = converted_dry + mix_ratio * original_reverb + + # 软限幅 + from lib.audio import soft_clip + wet_signal = soft_clip(wet_signal, threshold=0.9, ceiling=0.99) + + return wet_signal.astype(np.float32) + + +if __name__ == "__main__": + # 测试 + print("Testing advanced dereverberation...") + + # 生成测试信号:干声 + 混响 + sr = 16000 + duration = 2.0 + t = np.linspace(0, duration, int(sr * duration)) + + # 干声:440Hz正弦波 + dry = np.sin(2 * np.pi * 440 * t).astype(np.float32) + + # 混响:衰减的延迟 + reverb = np.zeros_like(dry) + delay_samples = int(0.05 * sr) # 50ms延迟 + for i in range(3): + delay = delay_samples * (i + 1) + decay = 0.5 ** (i + 1) + if delay < len(reverb): + reverb[delay:] += dry[:-delay] * decay + + # 混合信号 + wet = dry + reverb * 0.5 + + # 去混响 + dry_extracted, reverb_extracted = advanced_dereverb(wet, sr) + + print(f"Input RMS: {np.sqrt(np.mean(wet**2)):.4f}") + print(f"Dry RMS: 
{np.sqrt(np.mean(dry_extracted**2)):.4f}") + print(f"Reverb RMS: {np.sqrt(np.mean(reverb_extracted**2)):.4f}") + print(f"Separation ratio: {np.sqrt(np.mean(dry_extracted**2)) / (np.sqrt(np.mean(reverb_extracted**2)) + 1e-8):.2f}") + + print("\n[OK] Advanced dereverberation test passed!") diff --git a/infer/cover_pipeline.py b/infer/cover_pipeline.py new file mode 100644 index 0000000000000000000000000000000000000000..37b853bc8e6b8fcd095f31731637ea18599bb445 --- /dev/null +++ b/infer/cover_pipeline.py @@ -0,0 +1,2241 @@ +# -*- coding: utf-8 -*- +""" +翻唱流水线 - 整合人声分离、RVC转换、混音的完整流程 +""" +import os +import gc +import re +import uuid +import shutil +import torch +import numpy as np +from pathlib import Path +from typing import Optional, Callable, Dict, Tuple, List + +from infer.separator import ( + VocalSeparator, + RoformerSeparator, + KaraokeSeparator, + ROFORMER_DEFAULT_MODEL, + KARAOKE_DEFAULT_MODEL, + check_demucs_available, + check_roformer_available, + get_available_models, +) +from infer.official_adapter import ( + setup_official_env, + separate_uvr5, + separate_uvr5_official_upstream, + convert_vocals_official, + convert_vocals_official_upstream, +) +from infer.advanced_dereverb import advanced_dereverb, apply_reverb_to_converted +from lib.audio import soft_clip +from lib.mixer import mix_vocals_and_accompaniment +from lib.logger import log +from lib.device import get_device, empty_device_cache + + +def _format_size(size_bytes: int) -> str: + """格式化文件大小""" + for unit in ['B', 'KB', 'MB', 'GB']: + if size_bytes < 1024: + return f"{size_bytes:.2f} {unit}" + size_bytes /= 1024 + return f"{size_bytes:.2f} TB" + + +def _get_audio_duration(file_path: str) -> float: + """获取音频时长(秒)""" + try: + import soundfile as sf + info = sf.info(file_path) + return info.duration + except: + return 0.0 + + +def _format_duration(seconds: float) -> str: + """格式化时长""" + minutes = int(seconds // 60) + secs = int(seconds % 60) + return f"{minutes}:{secs:02d}" + + +class CoverPipeline: + """AI 翻唱流水线""" + + def __init__(self, device: str = "cuda"): + """ + 初始化流水线 + + Args: + device: 计算设备 + """ + self.device = str(get_device(device)) + self.separator = None + self.karaoke_separator = None + self.rvc_pipeline = None + self.temp_dir = Path(__file__).parent.parent / "temp" / "cover" + self._last_vc_preprocess_mode = "direct" + + def _get_session_dir(self, session_id: str = None) -> Path: + """获取会话临时目录""" + if session_id is None: + session_id = str(uuid.uuid4())[:8] + session_dir = self.temp_dir / session_id + session_dir.mkdir(parents=True, exist_ok=True) + return session_dir + + @staticmethod + def _get_available_uvr_deecho_model() -> Optional[str]: + """优先使用学习型 DeEcho / DeReverb,而不是手工频谱去回声。""" + root = Path(__file__).parent.parent / "assets" / "uvr5_weights" + candidates = [ + ("VR-DeEchoDeReverb", root / "VR-DeEchoDeReverb.pth"), + ("onnx_dereverb_By_FoxJoy", root / "onnx_dereverb_By_FoxJoy" / "vocals.onnx"), + ("VR-DeEchoNormal", root / "VR-DeEchoNormal.pth"), + ("VR-DeEchoAggressive", root / "VR-DeEchoAggressive.pth"), + ] + for model_name, model_path in candidates: + if model_path.exists(): + return model_name + return None + + def _apply_uvr_deecho_for_vc(self, vocals_path: str, session_dir: Path) -> Optional[str]: + """如果本地已有 UVR DeEcho 模型,则优先用学习型方法清理回声。""" + model_name = self._get_available_uvr_deecho_model() + if not model_name: + return None + + from infer.modules.uvr5.modules import uvr + + root = Path(__file__).parent.parent + os.environ["weight_uvr5_root"] = str(root / "assets" / "uvr5_weights") + + 
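+        # Assumption: the upstream uvr() entry point resolves its model weights from
+        # the "weight_uvr5_root" environment variable, so it is exported above before
+        # the call below. The three folders created next are its working directories
+        # for input, vocal output and instrumental/echo output respectively.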
input_dir = session_dir / "vc_deecho_input" + vocal_dir = session_dir / "vc_deecho_vocal" + ins_dir = session_dir / "vc_deecho_ins" + input_dir.mkdir(parents=True, exist_ok=True) + vocal_dir.mkdir(parents=True, exist_ok=True) + ins_dir.mkdir(parents=True, exist_ok=True) + + input_file = input_dir / Path(vocals_path).name + shutil.copy2(vocals_path, input_file) + + log.model(f"VC预处理使用UVR DeEcho模型: {model_name}") + for _ in uvr(model_name, str(input_dir), str(vocal_dir), [], str(ins_dir), 10, "wav"): + pass + + candidate_files = sorted( + list(vocal_dir.glob("*.wav")) + list(ins_dir.glob("*.wav")), + key=lambda path: path.stat().st_mtime, + ) + if not candidate_files: + log.warning("UVR DeEcho produced no usable vocal output; falling back to direct lead input") + return None + + selected_file = self._select_best_uvr_deecho_output(vocals_path, candidate_files) + if selected_file is None: + selected_file = candidate_files[-1] + log.audio(f"UVR DeEcho selected vocal output: {selected_file.name}") + return str(selected_file) + + @staticmethod + def _score_uvr_deecho_candidate(reference_path: str, candidate_path: Path) -> Optional[Tuple[float, Dict[str, float]]]: + """Score UVR DeEcho candidate for VC: keep direct lead, minimize quiet residuals.""" + import librosa + + try: + reference_audio, reference_sr = librosa.load(reference_path, sr=None, mono=True) + candidate_audio, candidate_sr = librosa.load(str(candidate_path), sr=None, mono=True) + except Exception: + return None + + reference_audio = np.asarray(reference_audio, dtype=np.float32) + candidate_audio = np.asarray(candidate_audio, dtype=np.float32) + if reference_audio.size == 0 or candidate_audio.size == 0: + return None + + if candidate_sr != reference_sr: + candidate_audio = librosa.resample( + candidate_audio, + orig_sr=candidate_sr, + target_sr=reference_sr, + ).astype(np.float32) + + aligned_len = min(reference_audio.size, candidate_audio.size) + if aligned_len <= 2048: + return None + + reference_audio = reference_audio[:aligned_len] + candidate_audio = candidate_audio[:aligned_len] + + frame_length = 2048 + hop_length = 512 + eps = 1e-8 + frame_rms = librosa.feature.rms( + y=reference_audio, + frame_length=frame_length, + hop_length=hop_length, + center=True, + )[0] + if frame_rms.size == 0: + return None + + frame_db = 20.0 * np.log10(frame_rms + eps) + ref_db = float(np.percentile(frame_db, 95)) + active_frames = frame_db > (ref_db - 24.0) + quiet_frames = frame_db < (ref_db - 36.0) + + active_mask = np.repeat(active_frames.astype(np.float32), hop_length) + quiet_mask = np.repeat(quiet_frames.astype(np.float32), hop_length) + if active_mask.size < aligned_len: + active_mask = np.pad(active_mask, (0, aligned_len - active_mask.size), mode="edge") + if quiet_mask.size < aligned_len: + quiet_mask = np.pad(quiet_mask, (0, aligned_len - quiet_mask.size), mode="edge") + active_mask = active_mask[:aligned_len] > 0.5 + quiet_mask = quiet_mask[:aligned_len] > 0.5 + + if not np.any(active_mask): + return None + + active_rms = float(np.sqrt(np.mean(np.square(candidate_audio[active_mask])) + 1e-12)) + quiet_rms = float(np.sqrt(np.mean(np.square(candidate_audio[quiet_mask])) + 1e-12)) if np.any(quiet_mask) else 1e-6 + ref_active_rms = float(np.sqrt(np.mean(np.square(reference_audio[active_mask])) + 1e-12)) + corr = 0.0 + if np.sum(active_mask) > 32: + corr_val = np.corrcoef(reference_audio[active_mask], candidate_audio[active_mask])[0, 1] + if np.isfinite(corr_val): + corr = float(np.clip(corr_val, -1.0, 1.0)) + + separation_db = float(20.0 * 
np.log10((active_rms + 1e-12) / (quiet_rms + 1e-12))) + active_ratio = float(active_rms / (ref_active_rms + 1e-12)) + ratio_penalty = abs(float(np.log2(max(active_ratio, 1e-4)))) + score = separation_db + 18.0 * corr - 6.0 * ratio_penalty + + return score, { + "score": score, + "separation_db": separation_db, + "corr": corr, + "active_ratio": active_ratio, + } + + def _select_best_uvr_deecho_output(self, reference_path: str, candidate_files: List[Path]) -> Optional[Path]: + """Pick the UVR DeEcho branch best suited for VC input.""" + best_path = None + best_score = None + + for candidate_path in candidate_files: + scored = self._score_uvr_deecho_candidate(reference_path, candidate_path) + if scored is None: + continue + + score, metrics = scored + log.detail( + "UVR DeEcho candidate: " + f"{candidate_path.name}, score={metrics['score']:.2f}, " + f"sep={metrics['separation_db']:.2f}dB, corr={metrics['corr']:.3f}, " + f"ratio={metrics['active_ratio']:.3f}" + ) + if best_score is None or score > best_score: + best_score = score + best_path = candidate_path + + return best_path + + def _init_separator( + self, + model_name: str = "htdemucs", + shifts: int = 2, + overlap: float = 0.25, + split: bool = True + ): + """初始化人声分离器 (Demucs 或 Roformer)""" + # Roformer 模式 + if model_name == "roformer": + if not check_roformer_available(): + raise ImportError( + "请安装 audio-separator: pip install audio-separator[gpu]" + ) + if ( + self.separator is not None + and isinstance(self.separator, RoformerSeparator) + ): + return + if self.separator is not None: + self.separator.unload_model() + self.separator = None + self.separator = RoformerSeparator(device=self.device) + return + + # Demucs 模式 + if not check_demucs_available(): + raise ImportError("请安装 demucs: pip install demucs") + + available = {m["name"] for m in get_available_models() if m["name"] != "roformer"} + if model_name not in available: + log.warning( + f"未知的 Demucs 模型 '{model_name}',回退到 'htdemucs'" + ) + model_name = "htdemucs" + + if ( + self.separator is not None + and isinstance(self.separator, VocalSeparator) + and getattr(self.separator, "model_name", None) == model_name + and getattr(self.separator, "shifts", None) == shifts + and getattr(self.separator, "overlap", None) == overlap + and getattr(self.separator, "split", None) == split + ): + return + + if self.separator is not None: + self.separator.unload_model() + self.separator = None + + self.separator = VocalSeparator( + model_name=model_name, + device=self.device, + shifts=shifts, + overlap=overlap, + split=split + ) + + def _init_karaoke_separator(self, model_name: str = KARAOKE_DEFAULT_MODEL): + """初始化主唱/和声分离器""" + if not check_roformer_available(): + raise ImportError("请安装 audio-separator: pip install audio-separator[gpu]") + + if ( + self.karaoke_separator is not None + and isinstance(self.karaoke_separator, KaraokeSeparator) + and model_name in getattr(self.karaoke_separator, "model_candidates", []) + ): + return + + if self.karaoke_separator is not None: + self.karaoke_separator.unload_model() + self.karaoke_separator = None + + self.karaoke_separator = KaraokeSeparator( + model_filename=model_name, + device=self.device, + ) + + def _separate_karaoke( + self, + vocals_path: str, + session_dir: Path, + karaoke_model: str = KARAOKE_DEFAULT_MODEL, + ) -> Tuple[str, str]: + """分离主唱与和声,并在分离后立即释放显存""" + karaoke_dir = session_dir / "karaoke" + karaoke_dir.mkdir(parents=True, exist_ok=True) + + self._init_karaoke_separator(karaoke_model) + lead_vocals_path, backing_vocals_path = 
self.karaoke_separator.separate( + vocals_path, + str(karaoke_dir), + ) + + if self.karaoke_separator is not None: + self.karaoke_separator.unload_model() + self.karaoke_separator = None + gc.collect() + empty_device_cache() + + return lead_vocals_path, backing_vocals_path + + @staticmethod + def _ensure_2d(audio: np.ndarray) -> np.ndarray: + if audio.ndim == 1: + return audio[np.newaxis, :] + return audio + + @staticmethod + def _match_channels(audio: np.ndarray, channels: int) -> np.ndarray: + if audio.shape[0] == channels: + return audio + if audio.shape[0] == 1 and channels == 2: + return np.repeat(audio, 2, axis=0) + if audio.shape[0] == 2 and channels == 1: + return np.mean(audio, axis=0, keepdims=True) + if audio.shape[0] > channels: + return audio[:channels] + repeats = channels - audio.shape[0] + if repeats <= 0: + return audio + return np.concatenate([audio, np.repeat(audio[-1:, :], repeats, axis=0)], axis=0) + + @staticmethod + def _resample_audio(audio: np.ndarray, orig_sr: int, target_sr: int) -> np.ndarray: + if orig_sr == target_sr: + return audio + import librosa + + if audio.ndim == 1: + return librosa.resample(audio, orig_sr=orig_sr, target_sr=target_sr) + return np.stack( + [librosa.resample(ch, orig_sr=orig_sr, target_sr=target_sr) for ch in audio], + axis=0, + ) + + @staticmethod + def _estimate_echo_metric(audio: np.ndarray, sr: int) -> float: + """Estimate echo/reverb amount from RMS-envelope autocorrelation.""" + import librosa + + if audio.size == 0: + return 1.0 + rms = librosa.feature.rms(y=audio, frame_length=1024, hop_length=256, center=True)[0] + if rms.size < 8: + return 1.0 + rms = rms - float(np.mean(rms)) + denom = float(np.dot(rms, rms) + 1e-8) + if denom <= 0: + return 1.0 + ac = np.correlate(rms, rms, mode="full")[len(rms) - 1 :] / denom + lag_min = max(1, int(0.03 * sr / 256)) # 30ms + lag_max = max(lag_min + 1, int(0.12 * sr / 256)) # 120ms + lag_max = min(lag_max, len(ac)) + if lag_min >= lag_max: + return 1.0 + return float(np.max(ac[lag_min:lag_max])) + + def _select_mono_for_vc(self, audio: np.ndarray, sr: int) -> np.ndarray: + """ + Pick the least-echo mono candidate from {L, R, Mid} to avoid phase-mix artifacts. 
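+        # Each candidate is scored with _estimate_echo_metric (RMS-envelope
+        # autocorrelation over 30-120 ms lags); the lowest-scoring one is returned.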
+ """ + audio = self._ensure_2d(audio).astype(np.float32) + if audio.shape[0] == 1: + return audio[0] + + left = audio[0] + right = audio[1] if audio.shape[0] > 1 else audio[0] + mid = 0.5 * (left + right) + candidates = { + "left": left, + "right": right, + "mid": mid, + } + best_name = None + best_score = None + for name, cand in candidates.items(): + score = self._estimate_echo_metric(cand, sr) + if best_score is None or score < best_score: + best_name = name + best_score = score + + if log: + log.detail( + f"VC输入单声道选择: {best_name}, 回声指标={best_score:.4f}" + ) + return candidates[best_name] + + @staticmethod + def _dereverb_for_vc(audio: np.ndarray, sr: int) -> np.ndarray: + """ + 智能去混响:区分自然混响和真实回声,动态调整抑制强度 + """ + import librosa + + if audio.size == 0: + return audio + x = audio.astype(np.float32) + n_fft = 2048 + hop = 512 + win = 2048 + eps = 1e-8 + + spec = librosa.stft(x, n_fft=n_fft, hop_length=hop, win_length=win) + mag = np.abs(spec).astype(np.float32) + phase = np.exp(1j * np.angle(spec)) + + if mag.shape[1] < 4: + return x + + # 计算RMS能量曲线,用于区分高能量段和低能量段 + rms = librosa.feature.rms(y=x, frame_length=win, hop_length=hop, center=True)[0] + rms_db = 20.0 * np.log10(rms + eps) + ref_db = float(np.percentile(rms_db, 90)) + + # 高能量段(主唱强的地方):vocal_strength接近1 + # 低能量段(回声尾巴):vocal_strength接近0 + vocal_strength = np.clip((rms_db - (ref_db - 35.0)) / 25.0, 0.0, 1.0) + vocal_strength = np.pad(vocal_strength, (0, mag.shape[1] - len(vocal_strength)), mode='edge') + + late = np.zeros_like(mag, dtype=np.float32) + # Recursive late-reverb estimate: decayed history + delayed observation. + for t in range(2, mag.shape[1]): + late[:, t] = np.maximum( + late[:, t - 1] * 0.94, + mag[:, t - 2] * 0.86, + ) + + # 动态抑制系数:高能量段保守(0.65),低能量段激进(0.82) + suppress_coef = 0.65 + 0.17 * (1.0 - vocal_strength) + direct = np.maximum(mag - suppress_coef[np.newaxis, :] * late, 0.0) + + # Dynamic floor: pure-echo frames get floor≈0, direct-voice frames keep more + echo_ratio = np.clip(late / (mag + eps), 0.0, 1.0) + # 高能量段保留更多原始信号(floor系数0.22),低能量段少保留(0.12) + floor_coef = 0.12 + 0.10 * vocal_strength + floor = (1.0 - echo_ratio) * floor_coef[np.newaxis, :] * mag + direct = np.maximum(direct, floor) + + # Smooth in time to avoid musical noise. 
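+        # (5-tap triangular kernel applied per frequency bin along the time axis)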
+ kernel = np.array([1, 2, 3, 2, 1], dtype=np.float32) + kernel /= np.sum(kernel) + direct = np.apply_along_axis( + lambda row: np.convolve(row, kernel, mode="same"), + axis=1, + arr=direct, + ) + direct = np.clip(direct, 0.0, mag + eps) + + # Dynamic dry blend: 高能量段混合更多原始信号(0.30),低能量段少混合(0.10) + frame_echo = np.mean(echo_ratio, axis=0, keepdims=True) # [1, T] + blend = (1.0 - frame_echo) * (0.10 + 0.20 * vocal_strength[np.newaxis, :]) + out_spec = direct * phase + dry_spec = mag * phase + blended_spec = (1.0 - blend) * out_spec + blend * dry_spec + out = librosa.istft(blended_spec, hop_length=hop, win_length=win, length=len(x)).astype(np.float32) + + out = soft_clip(out, threshold=0.9, ceiling=0.99) + return out.astype(np.float32) + + @staticmethod + def _compute_echo_tail_sample_gain( + original: np.ndarray, + dereverbed: np.ndarray, + sr: int, + ) -> Tuple[np.ndarray, int, int]: + """根据 original 与 dereverbed 的差异估计回声尾段抑制增益。""" + import librosa + + if original.size == 0 or dereverbed.size == 0: + return np.ones_like(dereverbed, dtype=np.float32), 0, 0 + + frame_length = 2048 + hop_length = 512 + orig_rms = librosa.feature.rms( + y=original, frame_length=frame_length, hop_length=hop_length, center=True + )[0] + derev_rms = librosa.feature.rms( + y=dereverbed, frame_length=frame_length, hop_length=hop_length, center=True + )[0] + + eps = 1e-8 + orig_rms_db = 20.0 * np.log10(orig_rms + eps) + ref_db = float(np.percentile(orig_rms_db, 95)) + + attenuation_ratio = derev_rms / (orig_rms + eps) + + vocal_activity = np.clip((orig_rms_db - (ref_db - 30.0)) / 18.0, 0.0, 1.0) + hold_frames = max(1, int(0.28 * sr / hop_length)) + vocal_activity = CoverPipeline._hold_activity_curve(vocal_activity, hold_frames) + + # Mark frames: original is quiet (echo tail) AND dereverb removed a lot + quiet_mask = ( + (orig_rms_db < (ref_db - 40.0)) + & (attenuation_ratio < 0.25) + & (vocal_activity < 0.15) + ) + + # Enforce minimum duration of 100ms + min_frames = max(1, int(0.1 * sr / hop_length)) + # Dilate: only keep runs >= min_frames + gate = quiet_mask.astype(np.float32) + # Simple run-length filter + filtered = np.zeros_like(gate) + run_start = 0 + in_run = False + for i in range(len(gate)): + if gate[i] > 0.5: + if not in_run: + run_start = i + in_run = True + else: + if in_run: + if (i - run_start) >= min_frames: + filtered[run_start:i] = 1.0 + in_run = False + if in_run and (len(gate) - run_start) >= min_frames: + filtered[run_start:len(gate)] = 1.0 + + # 50ms sigmoid transition + transition_frames = max(1, int(0.05 * sr / hop_length)) + kernel = np.ones(transition_frames, dtype=np.float32) / transition_frames + filtered = np.convolve(filtered, kernel, mode="same") + filtered = np.clip(filtered, 0.0, 1.0) + + # Apply: gated frames attenuated to 0.18x,保留更多尾音避免不自然断裂 + gain_curve = 1.0 - filtered * 0.82 # 1.0 for normal, 0.18 for gated + + # Expand frame-level gain to sample-level + sample_gain = CoverPipeline._frame_curve_to_sample_gain( + gain_curve, + len(dereverbed), + hop_length, + ) + + gated_count = int(np.sum(filtered > 0.5)) + return sample_gain.astype(np.float32), gated_count, len(filtered) + + @staticmethod + def _fit_frame_curve(curve: np.ndarray, target_len: int) -> np.ndarray: + """Pad/truncate frame curves to the target frame count.""" + curve = np.asarray(curve, dtype=np.float32).reshape(-1) + if target_len <= 0: + return np.zeros(0, dtype=np.float32) + if curve.size == target_len: + return curve + if curve.size == 0: + return np.zeros(target_len, dtype=np.float32) + if curve.size > 
target_len: + return curve[:target_len].astype(np.float32) + pad_width = target_len - curve.size + return np.pad(curve, (0, pad_width), mode="edge").astype(np.float32) + + @staticmethod + def _hold_activity_curve(curve: np.ndarray, hold_frames: int) -> np.ndarray: + """Keep recent vocal activity for a short trailing window.""" + curve = np.asarray(curve, dtype=np.float32).reshape(-1) + if curve.size == 0: + return curve + + hold_frames = max(1, int(hold_frames)) + if hold_frames <= 1: + return curve.astype(np.float32) + + held = np.empty_like(curve, dtype=np.float32) + window = [] + for index, value in enumerate(curve): + while window and window[-1][1] <= value: + window.pop() + window.append((index, float(value))) + min_index = index - hold_frames + 1 + while window and window[0][0] < min_index: + window.pop(0) + held[index] = window[0][1] if window else float(value) + return held.astype(np.float32) + + @staticmethod + def _frame_curve_to_sample_gain( + frame_curve: np.ndarray, + n_samples: int, + hop_length: int, + ) -> np.ndarray: + """Interpolate frame-domain gains to sample-domain gains.""" + if n_samples <= 0: + return np.zeros(0, dtype=np.float32) + + frame_curve = np.asarray(frame_curve, dtype=np.float32).reshape(-1) + if frame_curve.size == 0: + return np.ones(n_samples, dtype=np.float32) + + sample_indices = np.arange(n_samples, dtype=np.float32) + frame_indices = np.clip(sample_indices / float(hop_length), 0, frame_curve.size - 1) + return np.interp( + frame_indices, + np.arange(frame_curve.size, dtype=np.float32), + frame_curve, + ).astype(np.float32) + + + @staticmethod + def _compute_activity_sample_weights( + reference_audio: np.ndarray, + sr: int, + frame_length: int = 2048, + hop_length: int = 512, + ) -> np.ndarray: + """Build sample-domain weights from active vocal regions only.""" + import librosa + + reference_audio = np.asarray(reference_audio, dtype=np.float32).reshape(-1) + if reference_audio.size == 0: + return np.zeros(0, dtype=np.float32) + + eps = 1e-8 + frame_rms = librosa.feature.rms( + y=reference_audio, + frame_length=frame_length, + hop_length=hop_length, + center=True, + )[0] + frame_rms = np.asarray(frame_rms, dtype=np.float32) + frame_db = 20.0 * np.log10(frame_rms + eps) + ref_db = float(np.percentile(frame_db, 95)) + + activity = np.clip((frame_db - (ref_db - 30.0)) / 18.0, 0.0, 1.0) + kernel = np.array([1, 2, 3, 2, 1], dtype=np.float32) + kernel /= np.sum(kernel) + activity = np.convolve(activity, kernel, mode="same") + activity = CoverPipeline._hold_activity_curve( + activity, + max(1, int(0.24 * sr / hop_length)), + ) + frame_weights = np.clip(activity * activity, 0.0, 1.0) + return CoverPipeline._frame_curve_to_sample_gain( + frame_weights, + len(reference_audio), + hop_length, + ) + + @staticmethod + def _weighted_rms(audio: np.ndarray, weights: np.ndarray) -> float: + """Compute RMS under sample-domain weights.""" + audio = np.asarray(audio, dtype=np.float32).reshape(-1) + weights = np.asarray(weights, dtype=np.float32).reshape(-1) + if audio.size == 0 or weights.size == 0: + return 0.0 + + aligned_len = min(audio.size, weights.size) + if aligned_len <= 0: + return 0.0 + + audio = audio[:aligned_len] + weights = np.clip(weights[:aligned_len], 0.0, 1.0) + total = float(np.sum(weights)) + if total <= 1e-6: + return 0.0 + return float(np.sqrt(np.sum((audio * audio) * weights) / total + 1e-12)) + + def _apply_source_gap_suppression( + self, + source_vocals_path: str, + converted_vocals_path: str, + ) -> None: + """Suppress hallucinated noise in 
sustained no-vocal gaps only.""" + import librosa + import soundfile as sf + + source_audio, source_sr = librosa.load(source_vocals_path, sr=None, mono=True) + converted_audio, converted_sr = sf.read(converted_vocals_path) + if converted_audio.ndim > 1: + converted_audio = converted_audio.mean(axis=1) + source_audio = np.asarray(source_audio, dtype=np.float32) + converted_audio = np.asarray(converted_audio, dtype=np.float32) + + if source_sr != converted_sr: + source_audio = librosa.resample( + source_audio, + orig_sr=source_sr, + target_sr=converted_sr, + ).astype(np.float32) + + aligned_len = min(len(source_audio), len(converted_audio)) + if aligned_len <= 0: + return + + source_audio = source_audio[:aligned_len] + converted_main = converted_audio[:aligned_len] + gain, gated_frames, total_frames = self._compute_quiet_gap_sample_gain( + source_audio, + converted_sr, + ) + gain = np.clip(gain[:aligned_len], 0.0, 1.0).astype(np.float32) + suppressed = converted_main * gain + + attenuated_samples = int(np.sum(gain < 0.08)) + if attenuated_samples > 0: + log.detail( + f"Source gap suppression: attenuated {attenuated_samples}/{aligned_len} samples in no-vocal regions" + ) + if gated_frames > 0: + log.detail( + f"Source gap suppression: detected {gated_frames}/{total_frames} sustained quiet frames" + ) + + if len(converted_audio) > aligned_len: + tail = converted_audio[aligned_len:] * 0.0 + converted_audio = np.concatenate([suppressed, tail.astype(np.float32)]) + else: + converted_audio = suppressed + + sf.write(converted_vocals_path, converted_audio.astype(np.float32), converted_sr) + + @staticmethod + def _compute_quiet_gap_sample_gain( + reference_audio: np.ndarray, + sr: int, + frame_length: int = 2048, + hop_length: int = 512, + ) -> Tuple[np.ndarray, int, int]: + """Build a deep attenuation curve for sustained quiet gaps between vocal phrases.""" + import librosa + + reference_audio = np.asarray(reference_audio, dtype=np.float32).reshape(-1) + if reference_audio.size == 0: + return np.zeros(0, dtype=np.float32), 0, 0 + + eps = 1e-8 + frame_rms = librosa.feature.rms( + y=reference_audio, + frame_length=frame_length, + hop_length=hop_length, + center=True, + )[0] + frame_rms = np.asarray(frame_rms, dtype=np.float32) + if frame_rms.size == 0: + return np.ones(reference_audio.size, dtype=np.float32), 0, 0 + + frame_db = 20.0 * np.log10(frame_rms + eps) + ref_db = float(np.percentile(frame_db, 95)) + + activity = np.clip((frame_db - (ref_db - 28.0)) / 14.0, 0.0, 1.0) + kernel = np.array([1, 2, 3, 2, 1], dtype=np.float32) + kernel /= np.sum(kernel) + activity = np.convolve(activity, kernel, mode="same") + activity = CoverPipeline._hold_activity_curve( + activity, + max(1, int(0.08 * sr / hop_length)), + ) + + quiet_mask = ( + (frame_db < (ref_db - 36.0)) + & (activity < 0.12) + ) + + min_frames = max(1, int(0.12 * sr / hop_length)) + gate = quiet_mask.astype(np.float32) + filtered = np.zeros_like(gate) + run_start = 0 + in_run = False + for i in range(len(gate)): + if gate[i] > 0.5: + if not in_run: + run_start = i + in_run = True + else: + if in_run: + if (i - run_start) >= min_frames: + filtered[run_start:i] = 1.0 + in_run = False + if in_run and (len(gate) - run_start) >= min_frames: + filtered[run_start:len(gate)] = 1.0 + + transition_frames = max(1, int(0.04 * sr / hop_length)) + smooth_kernel = np.ones(transition_frames, dtype=np.float32) / transition_frames + filtered = np.convolve(filtered, smooth_kernel, mode="same") + filtered = np.clip(filtered, 0.0, 1.0) + + gain_curve = 1.0 - 
filtered * 0.92 + sample_gain = CoverPipeline._frame_curve_to_sample_gain( + gain_curve, + len(reference_audio), + hop_length, + ) + + gated_count = int(np.sum(filtered > 0.5)) + return sample_gain.astype(np.float32), gated_count, len(filtered) + + def _compute_active_rms_gain( + self, + reference_audio: np.ndarray, + target_audio: np.ndarray, + sr: int, + min_gain: float = 0.7, + max_gain: float = 1.8, + ) -> Tuple[float, float, float, np.ndarray]: + """Estimate active-region gain and its sample-domain weight curve.""" + reference_audio = np.asarray(reference_audio, dtype=np.float32).reshape(-1) + target_audio = np.asarray(target_audio, dtype=np.float32).reshape(-1) + aligned_len = min(reference_audio.size, target_audio.size) + if aligned_len <= 0: + return 1.0, 0.0, 0.0, np.zeros(0, dtype=np.float32) + + reference_audio = reference_audio[:aligned_len] + target_audio = target_audio[:aligned_len] + weights = self._compute_activity_sample_weights(reference_audio, sr)[:aligned_len] + ref_rms = self._weighted_rms(reference_audio, weights) + out_rms = self._weighted_rms(target_audio, weights) + if ref_rms <= 1e-6 or out_rms <= 1e-6: + return 1.0, ref_rms, out_rms, weights + + gain = float(np.clip(ref_rms / out_rms, min_gain, max_gain)) + return gain, ref_rms, out_rms, weights + + @staticmethod + def _apply_weighted_gain( + audio: np.ndarray, + weights: np.ndarray, + gain: float, + ) -> np.ndarray: + """Apply gain mainly on active vocal regions, not on tails/gaps.""" + audio = np.asarray(audio, dtype=np.float32).reshape(-1) + weights = np.asarray(weights, dtype=np.float32).reshape(-1) + aligned_len = min(audio.size, weights.size) + if aligned_len <= 0: + return audio.astype(np.float32) + + output = audio.copy().astype(np.float32) + gain_curve = 1.0 + np.clip(weights[:aligned_len], 0.0, 1.0) * float(gain - 1.0) + output[:aligned_len] *= gain_curve.astype(np.float32) + return output.astype(np.float32) + + @staticmethod + def _gate_echo_tails( + original: np.ndarray, dereverbed: np.ndarray, sr: int + ) -> np.ndarray: + """ + Gate echo-tail segments where dereverb removed most energy but + residual noise would still trigger HuBERT feature extraction. 
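+        # Gated frames are attenuated rather than hard-muted, using the per-sample
+        # gain curve from _compute_echo_tail_sample_gain below.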
+ """ + sample_gain, gated_count, total_frames = CoverPipeline._compute_echo_tail_sample_gain( + original, + dereverbed, + sr, + ) + if gated_count > 0: + log.detail(f"回声尾音门控: {gated_count}/{total_frames} 帧被衰减") + + return (dereverbed * sample_gain).astype(np.float32) + + def _should_apply_source_constraint( + self, + vc_preprocessed: bool, + source_constraint_mode: str, + ) -> bool: + """Decide whether to run source-guided post constraint.""" + normalized_mode = str(source_constraint_mode or "auto").strip().lower() + if normalized_mode == "on": + return vc_preprocessed + if normalized_mode == "auto": + return vc_preprocessed and self._last_vc_preprocess_mode in {"uvr_deecho", "legacy"} + return False + + def _refine_source_constrained_output( + self, + source_vocals_path: str, + converted_vocals_path: str, + source_constraint_mode: str, + f0_method: str, + ) -> None: + """Apply extra cleanup passes for mature UVR DeEcho routing.""" + normalized_mode = str(source_constraint_mode or "auto").strip().lower() + if normalized_mode != "auto": + return + if self._last_vc_preprocess_mode != "uvr_deecho": + return + + self._apply_silence_gate_official( + vocals_path=source_vocals_path, + converted_path=converted_vocals_path, + f0_method=f0_method, + silence_threshold_db=-42.0, + silence_smoothing_ms=35.0, + silence_min_duration_ms=80.0, + protect=0.0, + ) + log.detail("Low-energy unvoiced cleanup: applied after source-guided reconstruction") + + self._apply_source_gap_suppression( + source_vocals_path=source_vocals_path, + converted_vocals_path=converted_vocals_path, + ) + log.detail("Source gap suppression: refined after source-guided reconstruction") + + @staticmethod + def _blend_direct_with_deecho( + direct_mono: np.ndarray, + deecho_mono: np.ndarray, + sr: int, + ) -> np.ndarray: + """Blend direct lead with DeEcho result, using echo presence detection. + + Previous logic only applied DeEcho in low-activity (silent) regions, + which meant echo during active singing passed straight through to HuBERT. + Now we detect echo presence per-frame by comparing direct vs deecho energy: + large energy difference = strong echo = higher DeEcho weight even while singing. + """ + import librosa + + direct_mono = np.asarray(direct_mono, dtype=np.float32).reshape(-1) + deecho_mono = np.asarray(deecho_mono, dtype=np.float32).reshape(-1) + aligned_len = min(direct_mono.size, deecho_mono.size) + if aligned_len <= 0: + return direct_mono.astype(np.float32) + + direct_main = direct_mono[:aligned_len] + deecho_main = deecho_mono[:aligned_len] + + frame_length = 2048 + hop_length = 512 + eps = 1e-8 + smooth_kernel = np.array([1, 2, 3, 2, 1], dtype=np.float32) + smooth_kernel /= np.sum(smooth_kernel) + + # --- Activity detection (unchanged) --- + frame_rms = librosa.feature.rms( + y=direct_main, + frame_length=frame_length, + hop_length=hop_length, + center=True, + )[0] + frame_db = 20.0 * np.log10(frame_rms + eps) + ref_db = float(np.percentile(frame_db, 95)) if frame_db.size > 0 else -20.0 + + activity = np.clip((frame_db - (ref_db - 32.0)) / 14.0, 0.0, 1.0) + activity = np.convolve(activity, smooth_kernel, mode="same") + activity = CoverPipeline._hold_activity_curve( + activity, + max(1, int(0.04 * sr / hop_length)), + ) + activity = np.clip(activity, 0.0, 1.0) + + # --- Echo presence detection --- + # Compare per-frame RMS of direct vs deecho: if deecho removed a lot + # of energy, that energy was echo/reverb. 
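+        # Both RMS curves use the same frame/hop settings, so they can be compared
+        # frame-by-frame after trimming to a common length.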
+ deecho_rms = librosa.feature.rms( + y=deecho_main, + frame_length=frame_length, + hop_length=hop_length, + center=True, + )[0] + n_frames = min(frame_rms.shape[-1], deecho_rms.shape[-1]) + frame_rms_aligned = frame_rms[..., :n_frames] + deecho_rms_aligned = deecho_rms[..., :n_frames] + + # echo_ratio: how much energy was removed by deecho (0=none, 1=all) + echo_ratio = np.clip( + 1.0 - (deecho_rms_aligned / (frame_rms_aligned + eps)), + 0.0, + 1.0, + ) + # Smooth to avoid frame-level jitter + if echo_ratio.ndim > 1: + echo_ratio = echo_ratio[0] + echo_ratio = np.convolve(echo_ratio, smooth_kernel, mode="same") + # Widen with a hold window to cover reverb tails + echo_ratio = CoverPipeline._hold_activity_curve( + echo_ratio, + max(1, int(0.08 * sr / hop_length)), + ) + echo_ratio = np.clip(echo_ratio, 0.0, 1.0) + + # Align to activity length + n_blend = min(len(activity), len(echo_ratio)) + activity = activity[:n_blend] + echo_ratio = echo_ratio[:n_blend] + + # --- Blending weight --- + # Base: original low-activity weight (for silent gaps) + base_weight = 0.65 * np.square(1.0 - activity[:n_blend]) + # Echo boost: even during active singing, apply DeEcho proportional + # to detected echo. Max additional contribution capped at 0.55. + echo_boost = 0.55 * echo_ratio * activity[:n_blend] + deecho_weight = base_weight + echo_boost + deecho_weight = np.convolve(deecho_weight, smooth_kernel, mode="same") + deecho_weight = np.clip(deecho_weight, 0.0, 0.80) + deecho_weight = CoverPipeline._frame_curve_to_sample_gain( + deecho_weight, + aligned_len, + hop_length, + ) + + blended = direct_main * (1.0 - deecho_weight) + deecho_main * deecho_weight + if direct_mono.size > aligned_len: + blended = np.concatenate([blended, direct_mono[aligned_len:]]) + return blended.astype(np.float32) + + def _prepare_vocals_for_vc( + self, + vocals_path: str, + session_dir: Path, + preprocess_mode: str = "auto", + ) -> str: + """ + Prepare vocals for VC using a mature-project-friendly routing strategy. 
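+        The routing that is actually taken is recorded in self._last_vc_preprocess_mode
+        so later post-processing (e.g. the source-constrained cleanup) can adapt to it.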
+ + Modes: + - auto: prefer learned UVR DeEcho/DeReverb, otherwise advanced dereverb -> RVC + - direct: pass separated lead directly to RVC + - uvr_deecho: require learned UVR DeEcho if available, else fallback to advanced dereverb + - advanced_dereverb: use binary residual masking to separate dry/wet, convert dry only + - legacy: old hand-crafted dereverb + tail gating chain + """ + import librosa + import soundfile as sf + + preprocess_mode = str(preprocess_mode or "auto").strip().lower() + if preprocess_mode not in {"auto", "direct", "uvr_deecho", "advanced_dereverb", "legacy"}: + preprocess_mode = "auto" + + # 保存原始混响用于后处理 + self._original_reverb_path = None + + if preprocess_mode == "advanced_dereverb": + # 使用高级去混响:分离干声和混响 + audio, sr = librosa.load(vocals_path, sr=None, mono=False) + audio = self._ensure_2d(audio).astype(np.float32) + mono = self._select_mono_for_vc(audio, sr) + + log.detail("VC preprocess: advanced dereverb (binary residual masking)") + dry_signal, reverb_tail = advanced_dereverb(mono, sr) + + # 保存混响用于后处理 + reverb_path = session_dir / "original_reverb.wav" + sf.write(str(reverb_path), reverb_tail, sr) + self._original_reverb_path = str(reverb_path) + + mono = dry_signal + self._last_vc_preprocess_mode = "advanced_dereverb" + log.detail(f"Dry/Wet separation: dry RMS={np.sqrt(np.mean(dry_signal**2)):.4f}, reverb RMS={np.sqrt(np.mean(reverb_tail**2)):.4f}") + + elif preprocess_mode == "legacy": + audio, sr = librosa.load(vocals_path, sr=None, mono=False) + audio = self._ensure_2d(audio).astype(np.float32) + mono = self._select_mono_for_vc(audio, sr) + mono_dry = mono.copy() + mono = self._dereverb_for_vc(mono, sr) + mono = self._gate_echo_tails(mono_dry, mono, sr) + self._last_vc_preprocess_mode = "legacy" + log.detail("VC preprocess: legacy dereverb chain -> mono select") + else: + preprocess_input = vocals_path + if preprocess_mode in {"auto", "uvr_deecho"}: + preprocess_input = self._apply_uvr_deecho_for_vc(vocals_path, session_dir) or vocals_path + + if preprocess_input == vocals_path: + # 如果UVR DeEcho不可用,在auto模式下使用advanced dereverb + if preprocess_mode == "auto": + audio, sr = librosa.load(vocals_path, sr=None, mono=False) + audio = self._ensure_2d(audio).astype(np.float32) + mono = self._select_mono_for_vc(audio, sr) + + log.detail("VC preprocess: UVR DeEcho not available, using advanced dereverb") + dry_signal, reverb_tail = advanced_dereverb(mono, sr) + + # 保存混响用于后处理 + reverb_path = session_dir / "original_reverb.wav" + sf.write(str(reverb_path), reverb_tail, sr) + self._original_reverb_path = str(reverb_path) + + mono = dry_signal + self._last_vc_preprocess_mode = "advanced_dereverb" + log.detail(f"Dry/Wet separation: dry RMS={np.sqrt(np.mean(dry_signal**2)):.4f}, reverb RMS={np.sqrt(np.mean(reverb_tail**2)):.4f}") + else: + self._last_vc_preprocess_mode = "direct" + if preprocess_mode == "uvr_deecho": + log.warning("Official DeEcho model not found, falling back to direct lead input") + log.detail("VC preprocess: direct lead -> mono select") + audio, sr = librosa.load(preprocess_input, sr=None, mono=False) + audio = self._ensure_2d(audio).astype(np.float32) + mono = self._select_mono_for_vc(audio, sr) + else: + self._last_vc_preprocess_mode = "uvr_deecho" + log.detail("VC preprocess: UVR learned DeEcho/DeReverb -> mono select") + + if preprocess_input == vocals_path: + audio, sr = librosa.load(preprocess_input, sr=None, mono=False) + audio = self._ensure_2d(audio).astype(np.float32) + mono = self._select_mono_for_vc(audio, sr) + else: + direct_audio, sr = 
librosa.load(vocals_path, sr=None, mono=False) + deecho_audio, deecho_sr = librosa.load(preprocess_input, sr=None, mono=False) + direct_audio = self._ensure_2d(direct_audio).astype(np.float32) + deecho_audio = self._ensure_2d(deecho_audio).astype(np.float32) + direct_mono = self._select_mono_for_vc(direct_audio, sr) + deecho_mono = self._select_mono_for_vc(deecho_audio, deecho_sr) + if deecho_sr != sr: + deecho_mono = librosa.resample( + deecho_mono, + orig_sr=deecho_sr, + target_sr=sr, + ).astype(np.float32) + mono = self._blend_direct_with_deecho(direct_mono, deecho_mono, sr) + log.detail("VC preprocess: blended direct lead with UVR DeEcho") + + mono = soft_clip(mono, threshold=0.9, ceiling=0.99) + + out_path = session_dir / "vocals_for_vc.wav" + sf.write(str(out_path), mono, sr) + return str(out_path) + + def _suppress_lead_bleed_from_backing( + self, + lead_audio: np.ndarray, + backing_audio: np.ndarray, + ) -> np.ndarray: + """ + 抑制 backing 里残留的主唱,减少 converted lead + 原主唱残留造成的重音。 + """ + import librosa + + n_fft = 4096 + hop_length = 1024 + suppression = 0.9 + min_mask = 0.08 + eps = 1e-8 + + cleaned = np.zeros_like(backing_audio, dtype=np.float32) + for ch in range(backing_audio.shape[0]): + backing_ch = backing_audio[ch] + lead_ch = lead_audio[ch] + backing_spec = librosa.stft( + backing_ch, n_fft=n_fft, hop_length=hop_length, win_length=n_fft + ) + lead_spec = librosa.stft( + lead_ch, n_fft=n_fft, hop_length=hop_length, win_length=n_fft + ) + + backing_mag = np.abs(backing_spec) + lead_mag = np.abs(lead_spec) + residual_mag = np.maximum(backing_mag - suppression * lead_mag, 0.0) + soft_mask = residual_mag / (backing_mag + eps) + soft_mask = np.clip(soft_mask, min_mask, 1.0) + + cleaned_spec = backing_spec * soft_mask + cleaned[ch] = librosa.istft( + cleaned_spec, hop_length=hop_length, win_length=n_fft, length=len(backing_ch) + ) + + return cleaned.astype(np.float32) + + def _merge_backing_into_accompaniment( + self, + backing_vocals_path: str, + accompaniment_path: str, + session_dir: Path, + lead_vocals_path: Optional[str] = None, + ) -> str: + """将和声轨混入伴奏轨;可选抑制 backing 内残留主唱""" + import librosa + import soundfile as sf + + backing, backing_sr = librosa.load(backing_vocals_path, sr=None, mono=False) + accompaniment, accompaniment_sr = librosa.load(accompaniment_path, sr=None, mono=False) + + backing = self._ensure_2d(backing).astype(np.float32) + accompaniment = self._ensure_2d(accompaniment).astype(np.float32) + + if backing_sr != accompaniment_sr: + backing = self._resample_audio(backing, orig_sr=backing_sr, target_sr=accompaniment_sr) + + if lead_vocals_path: + lead, lead_sr = librosa.load(lead_vocals_path, sr=None, mono=False) + lead = self._ensure_2d(lead).astype(np.float32) + if lead_sr != accompaniment_sr: + lead = self._resample_audio(lead, orig_sr=lead_sr, target_sr=accompaniment_sr) + lead = self._match_channels(lead, backing.shape[0]) + + min_len = min(backing.shape[1], lead.shape[1]) + backing = backing[:, :min_len] + lead = lead[:, :min_len] + backing = self._suppress_lead_bleed_from_backing( + lead_audio=lead, + backing_audio=backing, + ) + + accompaniment = self._match_channels(accompaniment, backing.shape[0]) + max_len = max(accompaniment.shape[1], backing.shape[1]) + if accompaniment.shape[1] < max_len: + accompaniment = np.pad( + accompaniment, ((0, 0), (0, max_len - accompaniment.shape[1])), mode="constant" + ) + if backing.shape[1] < max_len: + backing = np.pad(backing, ((0, 0), (0, max_len - backing.shape[1])), mode="constant") + + backing_gain = 1.00 + 
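+        # backing_gain is currently unity (no trim); lower it here if the harmony bus
+        # should sit quieter before being summed into the accompaniment.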
backing = backing * backing_gain + log.detail(f"和声混入伴奏增益: {backing_gain:.2f}") + mixed = accompaniment + backing + mixed = soft_clip(mixed, threshold=0.92, ceiling=0.98) + + out_path = session_dir / "accompaniment_with_backing.wav" + sf.write(str(out_path), mixed.T, accompaniment_sr) + return str(out_path) + + def _init_rvc_pipeline(self): + """初始化 RVC 管道""" + if self.rvc_pipeline is not None: + return + + from infer.pipeline import VoiceConversionPipeline + + self.rvc_pipeline = VoiceConversionPipeline(device=self.device) + + def _apply_silence_gate_official( + self, + vocals_path: str, + converted_path: str, + f0_method: str, + silence_threshold_db: float, + silence_smoothing_ms: float, + silence_min_duration_ms: float, + protect: float + ): + """对官方转换后的人声应用静音门限(可选)""" + from lib.audio import load_audio, save_audio + from infer.pipeline import VoiceConversionPipeline + import soundfile as sf + + # Load original vocals at 16k for RMS/F0 reference + audio_in = load_audio(vocals_path, sr=16000) + + # Extract F0 using the configured method + gate_pipe = VoiceConversionPipeline(device=self.device) + root_dir = Path(__file__).parent.parent + rmvpe_path = root_dir / "assets" / "rmvpe" / "rmvpe.pt" + if f0_method in ("rmvpe", "hybrid"): + if not rmvpe_path.exists(): + raise FileNotFoundError(f"RMVPE 模型未找到: {rmvpe_path}") + gate_pipe.load_f0_extractor(f0_method, str(rmvpe_path)) + else: + gate_pipe.load_f0_extractor(f0_method, None) + f0 = gate_pipe.f0_extractor.extract(audio_in) + gate_pipe.unload_f0_extractor() + + # Load converted vocals (keep original sample rate) + audio_out, sr_out = sf.read(converted_path) + if audio_out.ndim > 1: + audio_out = audio_out.mean(axis=1) + audio_out = audio_out.astype(np.float32) + + audio_out = gate_pipe._apply_silence_gate( + audio_out=audio_out, + audio_in=audio_in, + f0=f0, + sr_out=sr_out, + sr_in=16000, + hop_length=160, + threshold_db=silence_threshold_db, + smoothing_ms=silence_smoothing_ms, + min_silence_ms=silence_min_duration_ms, + protect=protect + ) + + save_audio(converted_path, audio_out, sr=sr_out) + + def _blend_backing_vocals( + self, + converted_path: str, + original_vocals_path: str, + mix_ratio: float, + output_path: Optional[str] = None + ) -> str: + """混入原始人声以恢复和声层""" + if mix_ratio <= 0: + return converted_path + + import librosa + import soundfile as sf + + conv, sr = librosa.load(converted_path, sr=None, mono=True) + orig, sr_orig = librosa.load(original_vocals_path, sr=None, mono=True) + if sr_orig != sr: + orig = librosa.resample(orig, orig_sr=sr_orig, target_sr=sr) + + min_len = min(len(conv), len(orig)) + conv = conv[:min_len] + orig = orig[:min_len] + + mixed = conv * (1.0 - mix_ratio) + orig * mix_ratio + mixed = soft_clip(mixed, threshold=0.9, ceiling=0.98) + + if output_path is None: + output_path = str(Path(converted_path).with_suffix("").as_posix() + "_blend.wav") + + sf.write(output_path, mixed, sr) + return output_path + + def _constrain_converted_to_source( + self, + source_vocals_path: str, + converted_vocals_path: str, + original_vocals_path: str = None, + output_path: Optional[str] = None, + ) -> str: + """ + Use source-vocal-guided spectral constraint to suppress artifacts that are + absent from the source lead (e.g. spurious echo/noise produced by VC). 
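+        # Only the magnitude is pulled toward the source envelope; the converted
+        # signal's phase is kept throughout to avoid cancellation artifacts.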
+ """ + import librosa + import soundfile as sf + + src, src_sr = librosa.load(source_vocals_path, sr=None, mono=True) + conv, conv_sr = librosa.load(converted_vocals_path, sr=None, mono=True) + src = src.astype(np.float32) + conv = conv.astype(np.float32) + + if src_sr != conv_sr: + src = librosa.resample(src, orig_sr=src_sr, target_sr=conv_sr).astype(np.float32) + + aligned_len = min(len(src), len(conv)) + if aligned_len <= 0: + raise ValueError("源主唱或转换人声为空,无法执行源约束") + + src = src[:aligned_len] + conv_main = conv[:aligned_len] + conv_tail = conv[aligned_len:] + + n_fft = 2048 + hop_length = 512 + win_length = 2048 + eps = 1e-8 + + src_spec = librosa.stft( + src, n_fft=n_fft, hop_length=hop_length, win_length=win_length + ) + conv_spec = librosa.stft( + conv_main, n_fft=n_fft, hop_length=hop_length, win_length=win_length + ) + src_mag = np.abs(src_spec).astype(np.float32) + conv_mag = np.abs(conv_spec).astype(np.float32) + + frame_count = conv_spec.shape[1] + + # Echo-like component tends to persist from previous frames. + prev_mag = np.concatenate([src_mag[:, :1], src_mag[:, :-1]], axis=1) + echo_like = np.minimum(src_mag, 0.92 * prev_mag) + echo_ratio = np.clip(echo_like / (src_mag + eps), 0.0, 1.0) + direct_floor = (1.0 - echo_ratio) * 0.18 * src_mag + direct_ref = np.maximum(src_mag - 0.60 * echo_like, direct_floor) + + extra_mag = np.maximum(conv_mag - direct_ref, 0.0) + soft_mask = direct_ref / (direct_ref + 0.7 * extra_mag + eps) + + frame_ref = np.mean(direct_ref, axis=0) + frame_conv = np.mean(conv_mag, axis=0) + frame_mask = np.clip((frame_ref + eps) / (frame_conv + eps), 0.0, 1.0) + frame_kernel = np.array([1, 2, 3, 2, 1], dtype=np.float32) + frame_kernel /= np.sum(frame_kernel) + frame_mask = np.convolve(frame_mask, frame_kernel, mode="same") + soft_mask *= frame_mask[np.newaxis, :] + + time_kernel = np.array([1, 2, 3, 2, 1], dtype=np.float32) + time_kernel /= np.sum(time_kernel) + soft_mask = np.apply_along_axis( + lambda row: np.convolve(row, time_kernel, mode="same"), + axis=1, + arr=soft_mask, + ) + soft_mask = np.clip(soft_mask, 0.0, 1.0) + src_frame_rms = librosa.feature.rms( + y=src, + frame_length=win_length, + hop_length=hop_length, + center=True, + )[0] + src_frame_rms = self._fit_frame_curve(src_frame_rms, frame_count) + src_frame_db = 20.0 * np.log10(src_frame_rms + eps) + ref_db = float(np.percentile(src_frame_db, 95)) + frame_src_mag = np.mean(src_mag, axis=0) + direct_ratio = np.clip(frame_ref / (frame_src_mag + eps), 0.0, 1.0) + direct_ratio = self._fit_frame_curve(direct_ratio, frame_count) + + orig = None + orig_frame_rms = src_frame_rms.copy() + orig_frame_db = src_frame_db.copy() + orig_ref_db = ref_db + if original_vocals_path is not None: + orig, orig_sr = librosa.load(original_vocals_path, sr=None, mono=True) + if orig_sr != conv_sr: + orig = librosa.resample(orig, orig_sr=orig_sr, target_sr=conv_sr).astype(np.float32) + orig = orig[:aligned_len].astype(np.float32) + orig_frame_rms = librosa.feature.rms( + y=orig, + frame_length=win_length, + hop_length=hop_length, + center=True, + )[0] + orig_frame_rms = self._fit_frame_curve(orig_frame_rms, frame_count) + orig_frame_db = 20.0 * np.log10(orig_frame_rms + eps) + orig_ref_db = float(np.percentile(orig_frame_db, 95)) + + # Use time-domain RMS activity instead of STFT mean magnitude. + # Echo-only frames often keep wide-band STFT energy but very low direct vocal RMS. 
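+        # The activity curves below map frame dB onto [0, 1] over an 18 dB ramp that
+        # starts 30 dB below the 95th-percentile reference level.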
+ direct_activity = np.clip((src_frame_db - (ref_db - 30.0)) / 18.0, 0.0, 1.0) + direct_activity = np.convolve(direct_activity, frame_kernel, mode="same") + direct_activity = self._fit_frame_curve(direct_activity, frame_count) + + vocal_activity = np.clip((orig_frame_db - (orig_ref_db - 30.0)) / 18.0, 0.0, 1.0) + vocal_activity = np.convolve(vocal_activity, frame_kernel, mode="same") + vocal_activity = self._fit_frame_curve(vocal_activity, frame_count) + phrase_activity = self._hold_activity_curve( + vocal_activity, + max(1, int(0.28 * conv_sr / hop_length)), + ) + + activity = np.maximum(direct_activity, phrase_activity) + + mask_floor = 0.02 + 0.14 * (0.25 * direct_activity + 0.20 * direct_ratio + 0.55 * phrase_activity) + mask_floor = np.convolve(mask_floor, frame_kernel, mode="same") + mask_floor = self._fit_frame_curve(mask_floor, frame_count) + soft_mask = np.maximum(soft_mask, mask_floor[np.newaxis, :]) + soft_mask = np.clip(soft_mask, 0.0, 1.0) + + # Step 1: Magnitude-only constraint in STFT domain + # Instead of mixing source and converted complex spectra (which causes + # phase interference / tearing artifacts), we only constrain the + # MAGNITUDE toward the source envelope while preserving the converted + # signal's phase. This eliminates phase cancellation. + source_replace = 0.85 * (1.0 - activity)[np.newaxis, :] * (1.0 - soft_mask) + source_replace = np.clip(source_replace, 0.0, 0.70) + + # Target magnitude: blend toward source magnitude, keep converted phase + target_mag = conv_mag * (1.0 - source_replace) + src_mag * source_replace + # Compute gain per bin: how much to scale converted magnitude + mag_gain = target_mag / (conv_mag + eps) + mag_gain = np.clip(mag_gain, 0.05, 2.0) + constrained_spec = conv_spec * mag_gain + + replaced_frames = int(np.sum(np.mean(source_replace, axis=0) > 0.05)) + if replaced_frames > 0: + log.detail( + f"源低活动段幅度约束: {replaced_frames}/{frame_count} 帧抑制幻觉噪声(相位保留)" + ) + + # Step 2: istft to get constrained main body + constrained = librosa.istft( + constrained_spec, + hop_length=hop_length, + win_length=win_length, + length=aligned_len, + ).astype(np.float32) + + # Step 3: Symmetric global gain (only on main body, before tail concat) + # 增益目标用原始主唱(未去混响),避免目标偏低 + gain, ref_rms, out_rms, gain_weights = self._compute_active_rms_gain( + reference_audio=orig if orig is not None else src, + target_audio=constrained, + sr=conv_sr, + min_gain=0.95, # 放宽到0.95,只降低5%(从0.80改为0.95) + max_gain=1.30, # 允许更大的提升(从1.25改为1.30) + ) + if abs(gain - 1.0) > 1e-3 and out_rms > 1e-6 and ref_rms > 1e-6: + constrained = self._apply_weighted_gain(constrained, gain_weights, gain) + log.detail( + f"Source-constrained active RMS: ref={ref_rms:.6f}, out={out_rms:.6f}, gain={gain:.3f}" + ) + + constrained_frame_rms = librosa.feature.rms( + y=constrained, + frame_length=win_length, + hop_length=hop_length, + center=True, + )[0] + constrained_frame_rms = self._fit_frame_curve(constrained_frame_rms, frame_count) + base_budget_rms = np.maximum(src_frame_rms, orig_frame_rms) + ref_frame_rms = float(np.percentile(base_budget_rms, 95)) + energy_guard = np.clip(0.20 * direct_activity + 0.15 * direct_ratio + 0.65 * phrase_activity, 0.0, 1.0) + allowed_boost = 0.50 + 1.50 * energy_guard # 提高基础boost(从0.35改为0.50,从1.20改为1.50) + noise_floor = ref_frame_rms * (0.002 + 0.005 * (1.0 - phrase_activity)) # 降低noise_floor + frame_budget = base_budget_rms * allowed_boost + noise_floor + cleanup_gain = np.clip( + frame_budget / (constrained_frame_rms + eps), + 0.75 + 0.20 * phrase_activity, # 
提高最小增益(从0.55改为0.75) + 1.0, + ) + cleanup_gain = np.convolve(cleanup_gain, frame_kernel, mode="same") + cleanup_gain = self._fit_frame_curve(cleanup_gain, frame_count) + attenuated_frames = int(np.sum(cleanup_gain < 0.98)) + if attenuated_frames > 0: + constrained = constrained * self._frame_curve_to_sample_gain( + cleanup_gain, + len(constrained), + hop_length, + ) + log.detail( + f"源能量预算清理: {attenuated_frames}/{frame_count} 帧抑制超额转换残留" + ) + + if original_vocals_path is not None: + try: + orig_gate, orig_gate_sr = librosa.load(original_vocals_path, sr=None, mono=True) + if orig_gate_sr != conv_sr: + orig_gate = librosa.resample( + orig_gate, + orig_sr=orig_gate_sr, + target_sr=conv_sr, + ).astype(np.float32) + orig_gate = orig_gate[:aligned_len].astype(np.float32) + echo_tail_gain, gated_count, total_frames = self._compute_echo_tail_sample_gain( + original=orig_gate, + dereverbed=src, + sr=conv_sr, + ) + if gated_count > 0: + constrained = constrained * echo_tail_gain[:len(constrained)] + log.detail( + f"源回声尾段同步抑制: {gated_count}/{total_frames} 帧应用到转换人声" + ) + except Exception as e: + log.warning(f"源回声尾段同步抑制失败,跳过: {e}") + + # Step 4: Append tail with fade-out (tail is likely noise from VC overshoot) + if conv_tail.size > 0: + tail_fade = np.linspace(1.0, 0.0, len(conv_tail)).astype(np.float32) + constrained = np.concatenate([constrained, conv_tail * tail_fade * 0.18]) + + constrained = soft_clip(constrained, threshold=0.9, ceiling=0.99) + + if output_path is None: + output_path = converted_vocals_path + sf.write(output_path, constrained, conv_sr) + return output_path + + def process( + self, + input_audio: str, + model_path: str, + index_path: Optional[str] = None, + pitch_shift: int = 0, + index_ratio: float = 0.5, + filter_radius: int = 3, + rms_mix_rate: float = 0.25, + protect: float = 0.33, + speaker_id: int = 0, + f0_method: str = "rmvpe", + demucs_model: str = "htdemucs", + demucs_shifts: int = 2, + demucs_overlap: float = 0.25, + demucs_split: bool = True, + separator: str = "uvr5", + uvr5_model: Optional[str] = None, + uvr5_agg: int = 10, + uvr5_format: str = "wav", + use_official: bool = True, + hubert_layer: int = 12, + silence_gate: bool = False, + silence_threshold_db: float = -40.0, + silence_smoothing_ms: float = 50.0, + silence_min_duration_ms: float = 200.0, + vocals_volume: float = 1.0, + accompaniment_volume: float = 1.0, + reverb_amount: float = 0.0, + backing_mix: float = 0.0, + karaoke_separation: bool = True, + karaoke_model: str = KARAOKE_DEFAULT_MODEL, + karaoke_merge_backing_into_accompaniment: bool = True, + vc_preprocess_mode: str = "auto", + source_constraint_mode: str = "auto", + vc_pipeline_mode: str = "current", + singing_repair: bool = False, + output_dir: Optional[str] = None, + model_display_name: Optional[str] = None, + progress_callback: Optional[Callable[[str, int, int], None]] = None + ) -> Dict[str, str]: + """ + 执行完整的翻唱流程 + + Args: + input_audio: 输入歌曲路径 + model_path: RVC 模型路径 + index_path: 索引文件路径 (可选) + pitch_shift: 音调偏移 (半音) + index_ratio: 索引混合比率 + index_ratio: 索引混合比率 + filter_radius: 中值滤波半径 + rms_mix_rate: RMS 混合比率 + protect: 保护参数 + speaker_id: 说话人 ID(多说话人模型可调) + f0_method: F0 提取方法 + demucs_model: Demucs 模型名称 + demucs_shifts: Demucs shifts 参数 + demucs_overlap: Demucs overlap 参数 + demucs_split: Demucs split 参数 + hubert_layer: HuBERT 输出层 + silence_gate: 是否启用静音门限 + silence_threshold_db: 静音阈值 (dB, 相对峰值) + silence_smoothing_ms: 门限平滑时长 (ms) + silence_min_duration_ms: 最短静音时长 (ms) + vocals_volume: 人声音量 (0-2) + accompaniment_volume: 伴奏音量 (0-2) + 
reverb_amount: 人声混响量 (0-1) + backing_mix: 原始人声混入比例 (0-1) + output_dir: 输出目录 (可选) + progress_callback: 进度回调 (message, current_step, total_steps) + + Returns: + dict: { + "cover": 最终翻唱路径, + "vocals": 原始人声路径, + "converted_vocals": 转换后人声路径, + "accompaniment": 伴奏路径 + } + """ + normalized_vc_pipeline_mode = str(vc_pipeline_mode or "current").strip().lower() + if normalized_vc_pipeline_mode not in {"current", "official"}: + normalized_vc_pipeline_mode = "current" + effective_official_mode = normalized_vc_pipeline_mode == "official" + effective_separator = "uvr5" if effective_official_mode else separator + effective_karaoke_separation = False if effective_official_mode else karaoke_separation + effective_karaoke_merge_backing = False if effective_official_mode else karaoke_merge_backing_into_accompaniment + effective_use_official = True if effective_official_mode else use_official + + total_steps = 5 if effective_karaoke_separation else 4 + step_karaoke = 2 if effective_karaoke_separation else None + step_convert = 3 if effective_karaoke_separation else 2 + step_mix = 4 if effective_karaoke_separation else 3 + step_finalize = 5 if effective_karaoke_separation else 4 + session_dir = self._get_session_dir() + + # 记录输入信息 + input_path = Path(input_audio) + input_size = input_path.stat().st_size if input_path.exists() else 0 + input_duration = _get_audio_duration(input_audio) + + log.separator() + log.info(f"开始翻唱处理: {input_path.name}") + log.detail(f"输入文件: {input_audio}") + log.detail(f"文件大小: {_format_size(input_size)}") + log.detail(f"音频时长: {_format_duration(input_duration)}") + log.detail(f"会话目录: {session_dir}") + log.separator() + # 记录参数配置 + log.config(f"RVC模型: {Path(model_path).name}") + log.config(f"索引文件: {Path(index_path).name if index_path else '无'}") + log.config(f"音调偏移: {pitch_shift} 半音") + log.config(f"F0提取方法: {f0_method}") + log.config(f"索引混合比率: {index_ratio}") + log.config(f"说话人ID: {speaker_id}") + log.config(f"VC管线模式: {normalized_vc_pipeline_mode}") + if effective_official_mode: + log.config("官方模式: 强制使用官方UVR5分离 + 官方VC,不使用Karaoke二次分离") + log.config(f"人声分离器: {effective_separator}") + if effective_separator == "uvr5": + log.config(f"UVR5模型: {uvr5_model or '自动选择'}") + log.config(f"UVR5激进度: {uvr5_agg}") + elif effective_separator == "roformer": + log.config(f"Roformer模型: {ROFORMER_DEFAULT_MODEL}") + else: + log.config(f"Demucs模型: {demucs_model}") + log.config(f"Demucs shifts: {demucs_shifts}") + log.config(f"人声音量: {vocals_volume}") + log.config(f"伴奏音量: {accompaniment_volume}") + log.config(f"混响量: {reverb_amount}") + log.separator() + + log.config(f"Karaoke分离: {'开启' if effective_karaoke_separation else '关闭'}") + if effective_karaoke_separation: + log.config(f"Karaoke模型: {karaoke_model}") + log.config( + "Karaoke和声混入伴奏: " + f"{'开启' if effective_karaoke_merge_backing else '关闭'}" + ) + elif effective_official_mode: + log.config("Karaoke分离: 官方模式下关闭") + + def report_progress(msg: str, step: int): + if progress_callback: + progress_callback(msg, step, total_steps) + log.step(step, total_steps, msg) + + try: + # ===== 步骤 1: 人声分离 ===== + report_progress("正在分离人声和伴奏...", 1) + + if effective_official_mode: + log.model("官方模式:使用内置官方UVR5进行人声分离") + uvr_temp = session_dir / "official_uvr5" + log.detail(f"官方UVR5临时目录: {uvr_temp}") + vocals_path, accompaniment_path = separate_uvr5_official_upstream( + input_audio, + uvr_temp, + uvr5_model, + agg=uvr5_agg, + fmt=uvr5_format, + ) + elif effective_use_official and effective_separator == "uvr5": + log.model("使用当前项目官方封装UVR5进行人声分离") + 
setup_official_env(Path(__file__).parent.parent) + uvr_temp = session_dir / "uvr5" + log.detail(f"UVR5临时目录: {uvr_temp}") + vocals_path, accompaniment_path = separate_uvr5( + input_audio, + uvr_temp, + uvr5_model, + agg=uvr5_agg, + fmt=uvr5_format, + ) + log.success("UVR5分离完成") + elif effective_separator == "roformer": + log.model("使用 Mel-Band Roformer 进行人声分离") + self._init_separator("roformer") + vocals_path, accompaniment_path = self.separator.separate( + input_audio, + str(session_dir) + ) + log.success("Mel-Band Roformer 分离完成") + else: + log.model(f"使用Demucs进行人声分离: {demucs_model}") + self._init_separator( + demucs_model, + shifts=demucs_shifts, + overlap=demucs_overlap, + split=demucs_split + ) + vocals_path, accompaniment_path = self.separator.separate( + input_audio, + str(session_dir) + ) + log.success("Demucs分离完成") + gc.collect() + empty_device_cache() + log.detail("已清理设备缓存") + + # ===== 步骤 1.5: Karaoke 分离(主唱/和声)===== + original_vocals_path = vocals_path + lead_vocals_path = None + backing_vocals_path = None + + if effective_karaoke_separation: + report_progress("正在分离主唱和和声...", step_karaoke) + lead_vocals_path, backing_vocals_path = self._separate_karaoke( + vocals_path=vocals_path, + session_dir=session_dir, + karaoke_model=karaoke_model, + ) + lead_size = Path(lead_vocals_path).stat().st_size if Path(lead_vocals_path).exists() else 0 + backing_size = Path(backing_vocals_path).stat().st_size if Path(backing_vocals_path).exists() else 0 + log.audio(f"主唱文件: {Path(lead_vocals_path).name} ({_format_size(lead_size)})") + log.audio(f"和声文件: {Path(backing_vocals_path).name} ({_format_size(backing_size)})") + vocals_path = lead_vocals_path + + normalized_vc_preprocess_mode = str(vc_preprocess_mode or "auto").strip().lower() + normalized_source_constraint_mode = str(source_constraint_mode or "auto").strip().lower() + available_uvr_deecho_model = self._get_available_uvr_deecho_model() + log.config(f"VC预处理模式: {normalized_vc_preprocess_mode}") + if normalized_vc_pipeline_mode == "current" and normalized_vc_preprocess_mode in {"auto", "uvr_deecho"}: + if available_uvr_deecho_model: + log.config(f"Mature DeEcho模型: {available_uvr_deecho_model}") + else: + log.config("Mature DeEcho模型: 未找到,将回退到主唱直通") + log.config(f"源约束模式: {normalized_source_constraint_mode}") + + vc_input_path = vocals_path + vc_preprocessed = False + if normalized_vc_pipeline_mode == "official": + self._last_vc_preprocess_mode = "direct" + log.detail("官方VC模式:跳过自定义VC预处理") + log.audio(f"官方VC输入: {Path(vc_input_path).name}") + else: + try: + prepared_path = self._prepare_vocals_for_vc(vocals_path, session_dir, preprocess_mode=normalized_vc_preprocess_mode) + vc_input_path = prepared_path + vc_preprocessed = True + log.audio(f"VC预处理输入: {Path(vc_input_path).name}") + except Exception as e: + log.warning(f"VC预处理失败,回退原始输入: {e}") + + report_progress("正在转换人声...", step_convert) + converted_vocals_path = str(session_dir / "converted_vocals.wav") + + log.model(f"加载RVC模型: {Path(model_path).name}") + log.detail(f"输入人声: {vc_input_path}") + log.detail(f"输出路径: {converted_vocals_path}") + if normalized_vc_pipeline_mode == "official" and not singing_repair: + log.detail("使用内置官方VC实现进行转换") + log.config(f"F0方法: {f0_method}, 音调: {pitch_shift}, 索引率: {index_ratio}") + log.config(f"滤波半径: {filter_radius}, RMS混合: {rms_mix_rate}, 保护: {protect}") + + convert_vocals_official_upstream( + vocals_path=vc_input_path, + output_path=converted_vocals_path, + model_path=model_path, + index_path=index_path, + f0_method=f0_method, + pitch_shift=pitch_shift, + 
index_rate=index_ratio, + filter_radius=filter_radius, + rms_mix_rate=rms_mix_rate, + protect=protect, + speaker_id=speaker_id, + ) + log.detail("内置官方模式已跳过自定义VC前后处理") + log.success("内置官方VC转换完成") + elif normalized_vc_pipeline_mode == "official" and singing_repair: + log.detail("使用官方兼容唱歌修复链进行转换") + log.config(f"F0方法: {f0_method}, 音调: {pitch_shift}, 索引率: {index_ratio}") + log.config(f"滤波半径: {filter_radius}, RMS混合: {rms_mix_rate}, 保护: {protect}") + log.config("唱歌修复: 开启(FP32 + 保守F0兜底 + F0稳定/限速)") + + convert_vocals_official( + vocals_path=vc_input_path, + output_path=converted_vocals_path, + model_path=model_path, + index_path=index_path, + f0_method=f0_method, + pitch_shift=pitch_shift, + index_rate=index_ratio, + filter_radius=filter_radius, + rms_mix_rate=rms_mix_rate, + protect=protect, + speaker_id=speaker_id, + repair_profile=True, + ) + try: + self._apply_silence_gate_official( + vocals_path=vc_input_path, + converted_path=converted_vocals_path, + f0_method=f0_method, + silence_threshold_db=-38.0, + silence_smoothing_ms=35.0, + silence_min_duration_ms=70.0, + protect=0.0, + ) + log.detail("唱歌修复: 已应用低能量静音清理") + except Exception as e: + log.warning(f"唱歌修复静音清理失败,保留原始转换结果: {e}") + + try: + self._apply_source_gap_suppression( + source_vocals_path=vc_input_path, + converted_vocals_path=converted_vocals_path, + ) + log.detail("唱歌修复: 已应用源静音区抑制") + except Exception as e: + log.warning(f"唱歌修复静音区抑制失败,保留当前结果: {e}") + log.success("官方兼容唱歌修复转换完成") + elif effective_use_official: + log.detail("使用当前项目官方封装VC进行转换") + log.config(f"F0方法: {f0_method}, 音调: {pitch_shift}, 索引率: {index_ratio}") + log.config(f"滤波半径: {filter_radius}, RMS混合: {rms_mix_rate}, 保护: {protect}") + + convert_vocals_official( + vocals_path=vc_input_path, + output_path=converted_vocals_path, + model_path=model_path, + index_path=index_path, + f0_method=f0_method, + pitch_shift=pitch_shift, + index_rate=index_ratio, + filter_radius=filter_radius, + rms_mix_rate=rms_mix_rate, + protect=protect, + speaker_id=speaker_id, + ) + if silence_gate: + log.detail("启用静音门限(当前项目官方封装VC后处理)") + self._apply_silence_gate_official( + vocals_path=vc_input_path, + converted_path=converted_vocals_path, + f0_method=f0_method, + silence_threshold_db=silence_threshold_db, + silence_smoothing_ms=silence_smoothing_ms, + silence_min_duration_ms=silence_min_duration_ms, + protect=protect + ) + normalized_source_constraint_mode = str(source_constraint_mode or "auto").strip().lower() + should_apply_source_constraint = self._should_apply_source_constraint( + vc_preprocessed=vc_preprocessed, + source_constraint_mode=normalized_source_constraint_mode, + ) + + if should_apply_source_constraint: + try: + self._constrain_converted_to_source( + source_vocals_path=vc_input_path, + converted_vocals_path=converted_vocals_path, + original_vocals_path=vocals_path, + ) + log.detail("Applied source-guided reconstruction to suppress echo/noise") + self._refine_source_constrained_output( + source_vocals_path=vc_input_path, + converted_vocals_path=converted_vocals_path, + source_constraint_mode=normalized_source_constraint_mode, + f0_method=f0_method, + ) + except Exception as e: + log.warning(f"Source-guided reconstruction failed, keeping raw conversion: {e}") + elif vc_preprocessed and normalized_source_constraint_mode == "off": + log.detail("Source constraint: off") + elif vc_preprocessed and normalized_source_constraint_mode == "auto": + try: + self._apply_source_gap_suppression( + source_vocals_path=vc_input_path, + converted_vocals_path=converted_vocals_path, + ) + log.detail("Source 
gap suppression: applied for mature/default route") + except Exception as e: + log.warning(f"Source gap suppression failed, keeping raw conversion: {e}") + elif vc_preprocessed: + log.detail("Skipping source-guided reconstruction for this preprocess mode") + else: + log.warning("VC preprocess unavailable, skipping source-guided reconstruction") + log.success("官方VC转换完成") + + # 如果使用了advanced dereverb,重新应用原始混响 + if hasattr(self, '_original_reverb_path') and self._original_reverb_path and Path(self._original_reverb_path).exists(): + log.detail("重新应用原始混响到转换后的干声...") + import librosa + import soundfile as sf + + converted_dry, sr = librosa.load(converted_vocals_path, sr=None, mono=True) + original_reverb, reverb_sr = librosa.load(self._original_reverb_path, sr=None, mono=True) + + if reverb_sr != sr: + original_reverb = librosa.resample(original_reverb, orig_sr=reverb_sr, target_sr=sr).astype(np.float32) + + # 重新应用混响(80%强度) + wet_signal = apply_reverb_to_converted(converted_dry, original_reverb, mix_ratio=0.8) + + # 保存带混响的版本 + sf.write(converted_vocals_path, wet_signal, sr) + log.detail(f"混响重应用完成: mix_ratio=0.8") + + else: + # 使用自定义VC管道进行转换 + log.detail("使用自定义VC管道进行转换") + self._init_rvc_pipeline() + self.rvc_pipeline.hubert_layer = hubert_layer + log.config(f"HuBERT层: {hubert_layer}") + + root_dir = Path(__file__).parent.parent + hubert_path = root_dir / "assets" / "hubert" / "hubert_base.pt" + rmvpe_path = root_dir / "assets" / "rmvpe" / "rmvpe.pt" + + if self.rvc_pipeline.hubert_model is None: + if hubert_path.exists(): + log.model(f"加载HuBERT模型: {hubert_path}") + self.rvc_pipeline.load_hubert(str(hubert_path)) + log.success("HuBERT模型加载完成") + else: + raise FileNotFoundError(f"HuBERT 模型未找到: {hubert_path}") + + if self.rvc_pipeline.f0_extractor is None: + if f0_method in ("rmvpe", "hybrid"): + if rmvpe_path.exists(): + log.model(f"加载RMVPE模型: {rmvpe_path}") + self.rvc_pipeline.load_f0_extractor(f0_method, str(rmvpe_path)) + log.success(f"{f0_method.upper()}模型加载完成") + else: + raise FileNotFoundError(f"RMVPE 模型未找到: {rmvpe_path}") + else: + log.model(f"加载F0提取器: {f0_method}") + self.rvc_pipeline.load_f0_extractor(f0_method, None) + + log.model(f"加载声音模型: {Path(model_path).name}") + self.rvc_pipeline.load_voice_model(model_path) + if index_path: + log.model(f"加载索引文件: {Path(index_path).name}") + self.rvc_pipeline.load_index(index_path) + + log.progress("开始人声转换...") + self.rvc_pipeline.convert( + audio_path=vc_input_path, + output_path=converted_vocals_path, + pitch_shift=pitch_shift, + index_ratio=index_ratio, + filter_radius=filter_radius, + rms_mix_rate=rms_mix_rate, + protect=protect, + speaker_id=speaker_id, + silence_gate=silence_gate, + silence_threshold_db=silence_threshold_db, + silence_smoothing_ms=silence_smoothing_ms, + silence_min_duration_ms=silence_min_duration_ms, + ) + normalized_source_constraint_mode = str(source_constraint_mode or "auto").strip().lower() + should_apply_source_constraint = self._should_apply_source_constraint( + vc_preprocessed=vc_preprocessed, + source_constraint_mode=normalized_source_constraint_mode, + ) + + if should_apply_source_constraint: + try: + self._constrain_converted_to_source( + source_vocals_path=vc_input_path, + converted_vocals_path=converted_vocals_path, + original_vocals_path=vocals_path, + ) + log.detail("Applied source-guided reconstruction to suppress echo/noise") + self._refine_source_constrained_output( + source_vocals_path=vc_input_path, + converted_vocals_path=converted_vocals_path, + source_constraint_mode=normalized_source_constraint_mode, + 
f0_method=f0_method, + ) + except Exception as e: + log.warning(f"Source-guided reconstruction failed, keeping raw conversion: {e}") + elif vc_preprocessed and normalized_source_constraint_mode == "off": + log.detail("Source constraint: off") + elif vc_preprocessed and normalized_source_constraint_mode == "auto": + try: + self._apply_source_gap_suppression( + source_vocals_path=vc_input_path, + converted_vocals_path=converted_vocals_path, + ) + log.detail("Source gap suppression: applied for mature/default route") + except Exception as e: + log.warning(f"Source gap suppression failed, keeping raw conversion: {e}") + elif vc_preprocessed: + log.detail("Skipping source-guided reconstruction for this preprocess mode") + else: + log.warning("VC preprocess unavailable, skipping source-guided reconstruction") + log.success("自定义VC转换完成") + + log.detail("释放RVC管道资源...") + self.rvc_pipeline.unload_all() + gc.collect() + empty_device_cache() + log.detail("已清理设备缓存") + + # 记录转换结果 + converted_size = Path(converted_vocals_path).stat().st_size if Path(converted_vocals_path).exists() else 0 + log.audio(f"转换后人声: {Path(converted_vocals_path).name} ({_format_size(converted_size)})") + + mix_vocals_path = converted_vocals_path + if backing_mix > 0: + try: + blended_path = str(session_dir / "converted_vocals_blend.wav") + mix_vocals_path = self._blend_backing_vocals( + converted_path=converted_vocals_path, + original_vocals_path=vocals_path, + mix_ratio=backing_mix, + output_path=blended_path + ) + log.detail(f"已混入原始人声: ratio={backing_mix:.2f}") + except Exception as e: + log.warning(f"混入原始人声失败,使用转换人声: {e}") + + if ( + effective_karaoke_separation + and effective_karaoke_merge_backing + and backing_vocals_path + ): + accompaniment_path = self._merge_backing_into_accompaniment( + backing_vocals_path=backing_vocals_path, + accompaniment_path=accompaniment_path, + session_dir=session_dir, + lead_vocals_path=lead_vocals_path, + ) + log.detail("已将和声混入伴奏轨道") + + # ===== 步骤 3: 混音 ===== + report_progress("正在混合人声和伴奏...", step_mix) + + cover_path = str(session_dir / "cover.wav") + log.detail(f"混音输出: {cover_path}") + log.config(f"人声音量: {vocals_volume}, 伴奏音量: {accompaniment_volume}, 混响: {reverb_amount}") + + mix_vocals_and_accompaniment( + vocals_path=mix_vocals_path, + accompaniment_path=accompaniment_path, + output_path=cover_path, + vocals_volume=vocals_volume, + accompaniment_volume=accompaniment_volume, + reverb_amount=reverb_amount + ) + + cover_size = Path(cover_path).stat().st_size if Path(cover_path).exists() else 0 + log.success(f"混音完成: {_format_size(cover_size)}") + + # ===== 步骤 4: 整理输出 ===== + report_progress("正在整理输出文件...", step_finalize) + + # 如果指定了输出目录,复制文件 + if output_dir: + output_path = Path(output_dir) + output_path.mkdir(parents=True, exist_ok=True) + log.detail(f"输出目录: {output_path}") + + input_name = Path(input_audio).stem + # Gradio 临时路径可能在 stem 里残留路径分隔符,只取最后一段 + if "/" in input_name or "\\" in input_name: + input_name = Path(input_name).name + # 去掉 Gradio 上传时追加的随机后缀(如 -0-100) + input_name = re.sub(r'-\d+-\d+$', '', input_name) + # 拼上角色名 + tag = f"_{model_display_name}" if model_display_name else "" + final_cover = str(output_path / f"{input_name}{tag}_cover.wav") + final_vocals = str(output_path / f"{input_name}_vocals.wav") + final_converted = str(output_path / f"{input_name}{tag}_converted.wav") + final_accompaniment = str(output_path / f"{input_name}_accompaniment.wav") + final_lead = str(output_path / f"{input_name}_lead_vocals.wav") + final_backing = str(output_path / 
f"{input_name}_backing_vocals.wav") + + log.detail(f"复制翻唱文件: {final_cover}") + shutil.copy(cover_path, final_cover) + log.detail(f"复制原始人声: {final_vocals}") + shutil.copy(original_vocals_path, final_vocals) + log.detail(f"复制转换人声: {final_converted}") + shutil.copy(converted_vocals_path, final_converted) + log.detail(f"复制伴奏文件: {final_accompaniment}") + shutil.copy(accompaniment_path, final_accompaniment) + + if effective_karaoke_separation and lead_vocals_path and backing_vocals_path: + log.detail(f"复制主唱文件: {final_lead}") + shutil.copy(lead_vocals_path, final_lead) + log.detail(f"复制和声文件: {final_backing}") + shutil.copy(backing_vocals_path, final_backing) + + # 完整保留本次会话所有中间文件(分离结果、主唱/和声、回灌前后文件等) + all_files_dir = output_path / f"{input_name}{tag}_all_files_{session_dir.name}" + log.detail(f"复制全部中间文件: {all_files_dir}") + shutil.copytree(session_dir, all_files_dir, dirs_exist_ok=True) + + result = { + "cover": final_cover, + "vocals": final_vocals, + "converted_vocals": final_converted, + "accompaniment": final_accompaniment, + "all_files_dir": str(all_files_dir), + } + if effective_karaoke_separation and lead_vocals_path and backing_vocals_path: + result["lead_vocals"] = final_lead + result["backing_vocals"] = final_backing + else: + result = { + "cover": cover_path, + "vocals": original_vocals_path, + "converted_vocals": converted_vocals_path, + "accompaniment": accompaniment_path, + "all_files_dir": str(session_dir), + } + if effective_karaoke_separation and lead_vocals_path and backing_vocals_path: + result["lead_vocals"] = lead_vocals_path + result["backing_vocals"] = backing_vocals_path + if karaoke_separation and lead_vocals_path and backing_vocals_path: + result["lead_vocals"] = lead_vocals_path + result["backing_vocals"] = backing_vocals_path + + log.separator() + report_progress("翻唱完成!", step_finalize) + log.success(f"最终输出: {result['cover']}") + log.separator() + return result + + except Exception as e: + import traceback + error_detail = traceback.format_exc() + log.separator() + log.error(f"处理失败: {e}") + log.error(f"详细错误:\n{error_detail}") + log.separator() + report_progress(f"处理失败: {e}", 0) + raise + + def cleanup_session(self, session_dir: str): + """清理会话临时文件""" + if os.path.exists(session_dir): + shutil.rmtree(session_dir) + + def cleanup_all(self): + """清理所有临时文件""" + if self.separator is not None: + self.separator.unload_model() + self.separator = None + if self.karaoke_separator is not None: + self.karaoke_separator.unload_model() + self.karaoke_separator = None + + if self.temp_dir.exists(): + shutil.rmtree(self.temp_dir) + self.temp_dir.mkdir(parents=True, exist_ok=True) + + +# 全局实例 +_cover_pipeline = None + + +def get_cover_pipeline(device: str = "cuda") -> CoverPipeline: + """获取翻唱流水线单例""" + global _cover_pipeline + if _cover_pipeline is None: + _cover_pipeline = CoverPipeline(device=device) + return _cover_pipeline diff --git a/infer/f0_extractor.py b/infer/f0_extractor.py new file mode 100644 index 0000000000000000000000000000000000000000..7aed32d30aa70ce7ea02c2c7a917cf07dfd818e1 --- /dev/null +++ b/infer/f0_extractor.py @@ -0,0 +1,265 @@ +# -*- coding: utf-8 -*- +""" +F0 (基频) 提取模块 - 支持多种提取方法 +""" +import numpy as np +import torch +from typing import Optional, Literal + +# F0 提取方法类型 +F0Method = Literal["rmvpe", "pm", "harvest", "crepe", "hybrid"] + + +class F0Extractor: + """F0 提取器基类""" + + def __init__(self, sample_rate: int = 16000, hop_length: int = 160): + self.sample_rate = sample_rate + self.hop_length = hop_length + self.f0_min = 50 + self.f0_max = 1100 + + def 
extract(self, audio: np.ndarray) -> np.ndarray: + """提取 F0,子类需实现此方法""" + raise NotImplementedError + + +class PMExtractor(F0Extractor): + """Parselmouth (Praat) F0 提取器 - 速度快""" + + def extract(self, audio: np.ndarray) -> np.ndarray: + import parselmouth + + time_step = self.hop_length / self.sample_rate + sound = parselmouth.Sound(audio, self.sample_rate) + + pitch = sound.to_pitch_ac( + time_step=time_step, + voicing_threshold=0.6, + pitch_floor=self.f0_min, + pitch_ceiling=self.f0_max + ) + + f0 = pitch.selected_array["frequency"] + f0[f0 == 0] = np.nan + + return f0 + + +class HarvestExtractor(F0Extractor): + """PyWorld Harvest F0 提取器 - 质量较好""" + + def extract(self, audio: np.ndarray) -> np.ndarray: + import pyworld + + audio = audio.astype(np.float64) + f0, _ = pyworld.harvest( + audio, + self.sample_rate, + f0_floor=self.f0_min, + f0_ceil=self.f0_max, + frame_period=self.hop_length / self.sample_rate * 1000 + ) + + return f0 + + +class CrepeExtractor(F0Extractor): + """TorchCrepe F0 提取器 - 深度学习方法""" + + def __init__(self, sample_rate: int = 16000, hop_length: int = 160, + device: str = "cuda"): + super().__init__(sample_rate, hop_length) + self.device = device + + def extract(self, audio: np.ndarray) -> np.ndarray: + import torchcrepe + + audio_tensor = torch.from_numpy(audio).float().unsqueeze(0) + audio_tensor = audio_tensor.to(self.device) + + f0, _ = torchcrepe.predict( + audio_tensor, + self.sample_rate, + self.hop_length, + self.f0_min, + self.f0_max, + model="full", + batch_size=512, + device=self.device, + return_periodicity=True + ) + + f0 = f0.squeeze(0).cpu().numpy() + return f0 + + +class RMVPEExtractor(F0Extractor): + """RMVPE F0 提取器 - 质量最高 (推荐)""" + + def __init__(self, model_path: str, sample_rate: int = 16000, + hop_length: int = 160, device: str = "cuda"): + super().__init__(sample_rate, hop_length) + self.device = device + self.model = None + self.model_path = model_path + + def load_model(self): + """加载 RMVPE 模型""" + if self.model is not None: + return + + from models.rmvpe import RMVPE + + self.model = RMVPE(self.model_path, device=self.device) + print(f"RMVPE 模型已加载: {self.device}") + + def extract(self, audio: np.ndarray) -> np.ndarray: + self.load_model() + + # RMVPE 需要 16kHz 输入 + f0 = self.model.infer_from_audio(audio, thred=0.01) + + return f0 + + +def get_f0_extractor(method: F0Method, device: str = "cuda", + rmvpe_path: str = None, crepe_threshold: float = 0.05) -> F0Extractor: + """ + 获取 F0 提取器实例 + + Args: + method: 提取方法 ("rmvpe", "pm", "harvest", "crepe", "hybrid") + device: 计算设备 + rmvpe_path: RMVPE 模型路径 (rmvpe/hybrid 方法需要) + crepe_threshold: CREPE置信度阈值 (仅hybrid方法使用) + + Returns: + F0Extractor: 提取器实例 + """ + if method == "rmvpe": + if rmvpe_path is None: + raise ValueError("RMVPE 方法需要指定模型路径") + return RMVPEExtractor(rmvpe_path, device=device) + elif method == "hybrid": + if rmvpe_path is None: + raise ValueError("Hybrid 方法需要指定RMVPE模型路径") + return HybridF0Extractor(rmvpe_path, device=device, crepe_threshold=crepe_threshold) + elif method == "pm": + return PMExtractor() + elif method == "harvest": + return HarvestExtractor() + elif method == "crepe": + return CrepeExtractor(device=device) + else: + raise ValueError(f"未知的 F0 提取方法: {method}") + + +class HybridF0Extractor(F0Extractor): + """混合F0提取器 - RMVPE主导 + CREPE高精度补充""" + + def __init__(self, rmvpe_path: str, sample_rate: int = 16000, + hop_length: int = 160, device: str = "cuda", + crepe_threshold: float = 0.05): + super().__init__(sample_rate, hop_length) + self.device = device + self.rmvpe = 
RMVPEExtractor(rmvpe_path, sample_rate, hop_length, device) + self.crepe = None # 延迟加载 + self.crepe_threshold = crepe_threshold + + def _load_crepe(self): + """延迟加载CREPE模型""" + if self.crepe is None: + try: + self.crepe = CrepeExtractor(self.sample_rate, self.hop_length, self.device) + except ImportError: + print("警告: torchcrepe未安装,混合F0将仅使用RMVPE") + self.crepe = False + + def extract(self, audio: np.ndarray) -> np.ndarray: + """ + 混合提取F0: + 1. 使用RMVPE作为主要方法(快速、稳定) + 2. 在RMVPE不稳定的区域使用CREPE补充(高精度) + """ + # 提取RMVPE F0 + f0_rmvpe = self.rmvpe.extract(audio) + + # 如果CREPE不可用,直接返回RMVPE结果 + self._load_crepe() + if self.crepe is False: + return f0_rmvpe + + # 提取CREPE F0和置信度 + import torchcrepe + audio_tensor = torch.from_numpy(audio).float().unsqueeze(0).to(self.device) + f0_crepe, confidence = torchcrepe.predict( + audio_tensor, + self.sample_rate, + self.hop_length, + self.f0_min, + self.f0_max, + model="full", + batch_size=512, + device=self.device, + return_periodicity=True + ) + f0_crepe = f0_crepe.squeeze(0).cpu().numpy() + confidence = confidence.squeeze(0).cpu().numpy() + + # 对齐长度 + min_len = min(len(f0_rmvpe), len(f0_crepe), len(confidence)) + f0_rmvpe = f0_rmvpe[:min_len] + f0_crepe = f0_crepe[:min_len] + confidence = confidence[:min_len] + + # 检测RMVPE不稳定区域 + # 1. F0跳变过大(超过3个半音) + f0_diff = np.abs(np.diff(f0_rmvpe, prepend=f0_rmvpe[0])) + semitone_diff = np.abs(12 * np.log2((f0_rmvpe + 1e-6) / (np.roll(f0_rmvpe, 1) + 1e-6))) + semitone_diff[0] = 0 + unstable_jump = semitone_diff > 3.0 + + # 2. CREPE置信度高但RMVPE给出F0=0 + unstable_unvoiced = (f0_rmvpe < 1e-3) & (confidence > self.crepe_threshold) + + # 3. RMVPE和CREPE差异过大(超过2个半音)且CREPE置信度高 + f0_ratio = (f0_crepe + 1e-6) / (f0_rmvpe + 1e-6) + semitone_gap = np.abs(12 * np.log2(f0_ratio)) + unstable_diverge = (semitone_gap > 2.0) & (confidence > self.crepe_threshold * 1.5) + + # 合并不稳定区域 + unstable_mask = unstable_jump | unstable_unvoiced | unstable_diverge + + # 扩展不稳定区域(前后各2帧)以平滑过渡 + kernel = np.ones(5, dtype=bool) + unstable_mask = np.convolve(unstable_mask, kernel, mode='same') + + # 混合F0:不稳定区域使用CREPE,其他区域使用RMVPE + f0_hybrid = f0_rmvpe.copy() + f0_hybrid[unstable_mask] = f0_crepe[unstable_mask] + + # 平滑过渡边界 + for i in range(1, len(f0_hybrid) - 1): + if unstable_mask[i] != unstable_mask[i-1]: + # 边界处使用加权平均 + w = 0.5 + f0_hybrid[i] = w * f0_rmvpe[i] + (1-w) * f0_crepe[i] + + return f0_hybrid + + +def shift_f0(f0: np.ndarray, semitones: float) -> np.ndarray: + """ + 音调偏移 + + Args: + f0: 原始 F0 + semitones: 偏移半音数 (正数升调,负数降调) + + Returns: + np.ndarray: 偏移后的 F0 + """ + factor = 2 ** (semitones / 12) + f0_shifted = f0 * factor + return f0_shifted diff --git a/infer/lib/audio.py b/infer/lib/audio.py new file mode 100644 index 0000000000000000000000000000000000000000..60ef07cdaa850e5aa6414db99d0d60d75942e3a3 --- /dev/null +++ b/infer/lib/audio.py @@ -0,0 +1,60 @@ +import platform, os +import ffmpeg +import numpy as np +import av +from io import BytesIO +import traceback +import re + + +def wav2(i, o, format): + inp = av.open(i, "rb") + if format == "m4a": + format = "mp4" + out = av.open(o, "wb", format=format) + if format == "ogg": + format = "libvorbis" + if format == "mp4": + format = "aac" + + ostream = out.add_stream(format) + + for frame in inp.decode(audio=0): + for p in ostream.encode(frame): + out.mux(p) + + for p in ostream.encode(None): + out.mux(p) + + out.close() + inp.close() + + +def load_audio(file, sr): + try: + # https://github.com/openai/whisper/blob/main/whisper/audio.py#L26 + # This launches a subprocess to decode audio while 
down-mixing and resampling as necessary. + # Requires the ffmpeg CLI and `ffmpeg-python` package to be installed. + file = clean_path(file) # 防止小白拷路径头尾带了空格和"和回车 + if os.path.exists(file) == False: + raise RuntimeError( + "You input a wrong audio path that does not exists, please fix it!" + ) + out, _ = ( + ffmpeg.input(file, threads=0) + .output("-", format="f32le", acodec="pcm_f32le", ac=1, ar=sr) + .run(cmd=["ffmpeg", "-nostdin"], capture_stdout=True, capture_stderr=True) + ) + except Exception as e: + traceback.print_exc() + raise RuntimeError(f"Failed to load audio: {e}") + + return np.frombuffer(out, np.float32).flatten() + + + +def clean_path(path_str): + if platform.system() == "Windows": + path_str = path_str.replace("/", "\\") + path_str = re.sub(r'[\u202a\u202b\u202c\u202d\u202e]', '', path_str) # 移除 Unicode 控制字符 + return path_str.strip(" ").strip('"').strip("\n").strip('"').strip(" ") diff --git a/infer/lib/infer_pack/attentions.py b/infer/lib/infer_pack/attentions.py new file mode 100644 index 0000000000000000000000000000000000000000..2cc745ae7d2e61ab260c6ba5b65379fb2262a240 --- /dev/null +++ b/infer/lib/infer_pack/attentions.py @@ -0,0 +1,459 @@ +import copy +import math +from typing import Optional + +import numpy as np +import torch +from torch import nn +from torch.nn import functional as F + +from infer.lib.infer_pack import commons, modules +from infer.lib.infer_pack.modules import LayerNorm + + +class Encoder(nn.Module): + def __init__( + self, + hidden_channels, + filter_channels, + n_heads, + n_layers, + kernel_size=1, + p_dropout=0.0, + window_size=10, + **kwargs + ): + super(Encoder, self).__init__() + self.hidden_channels = hidden_channels + self.filter_channels = filter_channels + self.n_heads = n_heads + self.n_layers = int(n_layers) + self.kernel_size = kernel_size + self.p_dropout = p_dropout + self.window_size = window_size + + self.drop = nn.Dropout(p_dropout) + self.attn_layers = nn.ModuleList() + self.norm_layers_1 = nn.ModuleList() + self.ffn_layers = nn.ModuleList() + self.norm_layers_2 = nn.ModuleList() + for i in range(self.n_layers): + self.attn_layers.append( + MultiHeadAttention( + hidden_channels, + hidden_channels, + n_heads, + p_dropout=p_dropout, + window_size=window_size, + ) + ) + self.norm_layers_1.append(LayerNorm(hidden_channels)) + self.ffn_layers.append( + FFN( + hidden_channels, + hidden_channels, + filter_channels, + kernel_size, + p_dropout=p_dropout, + ) + ) + self.norm_layers_2.append(LayerNorm(hidden_channels)) + + def forward(self, x, x_mask): + attn_mask = x_mask.unsqueeze(2) * x_mask.unsqueeze(-1) + x = x * x_mask + zippep = zip( + self.attn_layers, self.norm_layers_1, self.ffn_layers, self.norm_layers_2 + ) + for attn_layers, norm_layers_1, ffn_layers, norm_layers_2 in zippep: + y = attn_layers(x, x, attn_mask) + y = self.drop(y) + x = norm_layers_1(x + y) + + y = ffn_layers(x, x_mask) + y = self.drop(y) + x = norm_layers_2(x + y) + x = x * x_mask + return x + + +class Decoder(nn.Module): + def __init__( + self, + hidden_channels, + filter_channels, + n_heads, + n_layers, + kernel_size=1, + p_dropout=0.0, + proximal_bias=False, + proximal_init=True, + **kwargs + ): + super(Decoder, self).__init__() + self.hidden_channels = hidden_channels + self.filter_channels = filter_channels + self.n_heads = n_heads + self.n_layers = n_layers + self.kernel_size = kernel_size + self.p_dropout = p_dropout + self.proximal_bias = proximal_bias + self.proximal_init = proximal_init + + self.drop = nn.Dropout(p_dropout) + self.self_attn_layers = 
nn.ModuleList() + self.norm_layers_0 = nn.ModuleList() + self.encdec_attn_layers = nn.ModuleList() + self.norm_layers_1 = nn.ModuleList() + self.ffn_layers = nn.ModuleList() + self.norm_layers_2 = nn.ModuleList() + for i in range(self.n_layers): + self.self_attn_layers.append( + MultiHeadAttention( + hidden_channels, + hidden_channels, + n_heads, + p_dropout=p_dropout, + proximal_bias=proximal_bias, + proximal_init=proximal_init, + ) + ) + self.norm_layers_0.append(LayerNorm(hidden_channels)) + self.encdec_attn_layers.append( + MultiHeadAttention( + hidden_channels, hidden_channels, n_heads, p_dropout=p_dropout + ) + ) + self.norm_layers_1.append(LayerNorm(hidden_channels)) + self.ffn_layers.append( + FFN( + hidden_channels, + hidden_channels, + filter_channels, + kernel_size, + p_dropout=p_dropout, + causal=True, + ) + ) + self.norm_layers_2.append(LayerNorm(hidden_channels)) + + def forward(self, x, x_mask, h, h_mask): + """ + x: decoder input + h: encoder output + """ + self_attn_mask = commons.subsequent_mask(x_mask.size(2)).to( + device=x.device, dtype=x.dtype + ) + encdec_attn_mask = h_mask.unsqueeze(2) * x_mask.unsqueeze(-1) + x = x * x_mask + for i in range(self.n_layers): + y = self.self_attn_layers[i](x, x, self_attn_mask) + y = self.drop(y) + x = self.norm_layers_0[i](x + y) + + y = self.encdec_attn_layers[i](x, h, encdec_attn_mask) + y = self.drop(y) + x = self.norm_layers_1[i](x + y) + + y = self.ffn_layers[i](x, x_mask) + y = self.drop(y) + x = self.norm_layers_2[i](x + y) + x = x * x_mask + return x + + +class MultiHeadAttention(nn.Module): + def __init__( + self, + channels, + out_channels, + n_heads, + p_dropout=0.0, + window_size=None, + heads_share=True, + block_length=None, + proximal_bias=False, + proximal_init=False, + ): + super(MultiHeadAttention, self).__init__() + assert channels % n_heads == 0 + + self.channels = channels + self.out_channels = out_channels + self.n_heads = n_heads + self.p_dropout = p_dropout + self.window_size = window_size + self.heads_share = heads_share + self.block_length = block_length + self.proximal_bias = proximal_bias + self.proximal_init = proximal_init + self.attn = None + + self.k_channels = channels // n_heads + self.conv_q = nn.Conv1d(channels, channels, 1) + self.conv_k = nn.Conv1d(channels, channels, 1) + self.conv_v = nn.Conv1d(channels, channels, 1) + self.conv_o = nn.Conv1d(channels, out_channels, 1) + self.drop = nn.Dropout(p_dropout) + + if window_size is not None: + n_heads_rel = 1 if heads_share else n_heads + rel_stddev = self.k_channels**-0.5 + self.emb_rel_k = nn.Parameter( + torch.randn(n_heads_rel, window_size * 2 + 1, self.k_channels) + * rel_stddev + ) + self.emb_rel_v = nn.Parameter( + torch.randn(n_heads_rel, window_size * 2 + 1, self.k_channels) + * rel_stddev + ) + + nn.init.xavier_uniform_(self.conv_q.weight) + nn.init.xavier_uniform_(self.conv_k.weight) + nn.init.xavier_uniform_(self.conv_v.weight) + if proximal_init: + with torch.no_grad(): + self.conv_k.weight.copy_(self.conv_q.weight) + self.conv_k.bias.copy_(self.conv_q.bias) + + def forward( + self, x: torch.Tensor, c: torch.Tensor, attn_mask: Optional[torch.Tensor] = None + ): + q = self.conv_q(x) + k = self.conv_k(c) + v = self.conv_v(c) + + x, _ = self.attention(q, k, v, mask=attn_mask) + + x = self.conv_o(x) + return x + + def attention( + self, + query: torch.Tensor, + key: torch.Tensor, + value: torch.Tensor, + mask: Optional[torch.Tensor] = None, + ): + # reshape [b, d, t] -> [b, n_h, t, d_k] + b, d, t_s = key.size() + t_t = query.size(2) + query = 
query.view(b, self.n_heads, self.k_channels, t_t).transpose(2, 3) + key = key.view(b, self.n_heads, self.k_channels, t_s).transpose(2, 3) + value = value.view(b, self.n_heads, self.k_channels, t_s).transpose(2, 3) + + scores = torch.matmul(query / math.sqrt(self.k_channels), key.transpose(-2, -1)) + if self.window_size is not None: + assert ( + t_s == t_t + ), "Relative attention is only available for self-attention." + key_relative_embeddings = self._get_relative_embeddings(self.emb_rel_k, t_s) + rel_logits = self._matmul_with_relative_keys( + query / math.sqrt(self.k_channels), key_relative_embeddings + ) + scores_local = self._relative_position_to_absolute_position(rel_logits) + scores = scores + scores_local + if self.proximal_bias: + assert t_s == t_t, "Proximal bias is only available for self-attention." + scores = scores + self._attention_bias_proximal(t_s).to( + device=scores.device, dtype=scores.dtype + ) + if mask is not None: + scores = scores.masked_fill(mask == 0, -1e4) + if self.block_length is not None: + assert ( + t_s == t_t + ), "Local attention is only available for self-attention." + block_mask = ( + torch.ones_like(scores) + .triu(-self.block_length) + .tril(self.block_length) + ) + scores = scores.masked_fill(block_mask == 0, -1e4) + p_attn = F.softmax(scores, dim=-1) # [b, n_h, t_t, t_s] + p_attn = self.drop(p_attn) + output = torch.matmul(p_attn, value) + if self.window_size is not None: + relative_weights = self._absolute_position_to_relative_position(p_attn) + value_relative_embeddings = self._get_relative_embeddings( + self.emb_rel_v, t_s + ) + output = output + self._matmul_with_relative_values( + relative_weights, value_relative_embeddings + ) + output = ( + output.transpose(2, 3).contiguous().view(b, d, t_t) + ) # [b, n_h, t_t, d_k] -> [b, d, t_t] + return output, p_attn + + def _matmul_with_relative_values(self, x, y): + """ + x: [b, h, l, m] + y: [h or 1, m, d] + ret: [b, h, l, d] + """ + ret = torch.matmul(x, y.unsqueeze(0)) + return ret + + def _matmul_with_relative_keys(self, x, y): + """ + x: [b, h, l, d] + y: [h or 1, m, d] + ret: [b, h, l, m] + """ + ret = torch.matmul(x, y.unsqueeze(0).transpose(-2, -1)) + return ret + + def _get_relative_embeddings(self, relative_embeddings, length: int): + max_relative_position = 2 * self.window_size + 1 + # Pad first before slice to avoid using cond ops. + pad_length: int = max(length - (self.window_size + 1), 0) + slice_start_position = max((self.window_size + 1) - length, 0) + slice_end_position = slice_start_position + 2 * length - 1 + if pad_length > 0: + padded_relative_embeddings = F.pad( + relative_embeddings, + # commons.convert_pad_shape([[0, 0], [pad_length, pad_length], [0, 0]]), + [0, 0, pad_length, pad_length, 0, 0], + ) + else: + padded_relative_embeddings = relative_embeddings + used_relative_embeddings = padded_relative_embeddings[ + :, slice_start_position:slice_end_position + ] + return used_relative_embeddings + + def _relative_position_to_absolute_position(self, x): + """ + x: [b, h, l, 2*l-1] + ret: [b, h, l, l] + """ + batch, heads, length, _ = x.size() + # Concat columns of pad to shift from relative to absolute indexing. + x = F.pad( + x, + # commons.convert_pad_shape([[0, 0], [0, 0], [0, 0], [0, 1]]) + [0, 1, 0, 0, 0, 0, 0, 0], + ) + + # Concat extra elements so to add up to shape (len+1, 2*len-1). 
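+        # Shape walk-through for length = 3 (illustrative): the padded scores are
+        # [b, h, 3, 6]; flattening gives 18 elements, padding with length - 1 = 2
+        # zeros gives 20, which reshapes to [b, h, 4, 5]; slicing [:3, 2:] then
+        # yields the [b, h, 3, 3] absolute-position scores.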
+ x_flat = x.view([batch, heads, length * 2 * length]) + x_flat = F.pad( + x_flat, + # commons.convert_pad_shape([[0, 0], [0, 0], [0, int(length) - 1]]) + [0, int(length) - 1, 0, 0, 0, 0], + ) + + # Reshape and slice out the padded elements. + x_final = x_flat.view([batch, heads, length + 1, 2 * length - 1])[ + :, :, :length, length - 1 : + ] + return x_final + + def _absolute_position_to_relative_position(self, x): + """ + x: [b, h, l, l] + ret: [b, h, l, 2*l-1] + """ + batch, heads, length, _ = x.size() + # padd along column + x = F.pad( + x, + # commons.convert_pad_shape([[0, 0], [0, 0], [0, 0], [0, int(length) - 1]]) + [0, int(length) - 1, 0, 0, 0, 0, 0, 0], + ) + x_flat = x.view([batch, heads, int(length**2) + int(length * (length - 1))]) + # add 0's in the beginning that will skew the elements after reshape + x_flat = F.pad( + x_flat, + # commons.convert_pad_shape([[0, 0], [0, 0], [int(length), 0]]) + [length, 0, 0, 0, 0, 0], + ) + x_final = x_flat.view([batch, heads, length, 2 * length])[:, :, :, 1:] + return x_final + + def _attention_bias_proximal(self, length: int): + """Bias for self-attention to encourage attention to close positions. + Args: + length: an integer scalar. + Returns: + a Tensor with shape [1, 1, length, length] + """ + r = torch.arange(length, dtype=torch.float32) + diff = torch.unsqueeze(r, 0) - torch.unsqueeze(r, 1) + return torch.unsqueeze(torch.unsqueeze(-torch.log1p(torch.abs(diff)), 0), 0) + + +class FFN(nn.Module): + def __init__( + self, + in_channels, + out_channels, + filter_channels, + kernel_size, + p_dropout=0.0, + activation: str = None, + causal=False, + ): + super(FFN, self).__init__() + self.in_channels = in_channels + self.out_channels = out_channels + self.filter_channels = filter_channels + self.kernel_size = kernel_size + self.p_dropout = p_dropout + self.activation = activation + self.causal = causal + self.is_activation = True if activation == "gelu" else False + # if causal: + # self.padding = self._causal_padding + # else: + # self.padding = self._same_padding + + self.conv_1 = nn.Conv1d(in_channels, filter_channels, kernel_size) + self.conv_2 = nn.Conv1d(filter_channels, out_channels, kernel_size) + self.drop = nn.Dropout(p_dropout) + + def padding(self, x: torch.Tensor, x_mask: torch.Tensor) -> torch.Tensor: + if self.causal: + padding = self._causal_padding(x * x_mask) + else: + padding = self._same_padding(x * x_mask) + return padding + + def forward(self, x: torch.Tensor, x_mask: torch.Tensor): + x = self.conv_1(self.padding(x, x_mask)) + if self.is_activation: + x = x * torch.sigmoid(1.702 * x) + else: + x = torch.relu(x) + x = self.drop(x) + + x = self.conv_2(self.padding(x, x_mask)) + return x * x_mask + + def _causal_padding(self, x): + if self.kernel_size == 1: + return x + pad_l: int = self.kernel_size - 1 + pad_r: int = 0 + # padding = [[0, 0], [0, 0], [pad_l, pad_r]] + x = F.pad( + x, + # commons.convert_pad_shape(padding) + [pad_l, pad_r, 0, 0, 0, 0], + ) + return x + + def _same_padding(self, x): + if self.kernel_size == 1: + return x + pad_l: int = (self.kernel_size - 1) // 2 + pad_r: int = self.kernel_size // 2 + # padding = [[0, 0], [0, 0], [pad_l, pad_r]] + x = F.pad( + x, + # commons.convert_pad_shape(padding) + [pad_l, pad_r, 0, 0, 0, 0], + ) + return x diff --git a/infer/lib/infer_pack/attentions_onnx.py b/infer/lib/infer_pack/attentions_onnx.py new file mode 100644 index 0000000000000000000000000000000000000000..a32abc1b6ff69246eef2d6ceff126ae991dc83cf --- /dev/null +++ b/infer/lib/infer_pack/attentions_onnx.py @@ 
-0,0 +1,459 @@ +############################## Warning! ############################## +# # +# Onnx Export Not Support All Of Non-Torch Types # +# Include Python Built-in Types!!!!!!!!!!!!!!!!! # +# If You Want TO Change This File # +# Do Not Use All Of Non-Torch Types! # +# # +############################## Warning! ############################## +import copy +import math +from typing import Optional + +import numpy as np +import torch +from torch import nn +from torch.nn import functional as F + +from infer.lib.infer_pack import commons, modules +from infer.lib.infer_pack.modules import LayerNorm + + +class Encoder(nn.Module): + def __init__( + self, + hidden_channels, + filter_channels, + n_heads, + n_layers, + kernel_size=1, + p_dropout=0.0, + window_size=10, + **kwargs + ): + super(Encoder, self).__init__() + self.hidden_channels = hidden_channels + self.filter_channels = filter_channels + self.n_heads = n_heads + self.n_layers = int(n_layers) + self.kernel_size = kernel_size + self.p_dropout = p_dropout + self.window_size = window_size + + self.drop = nn.Dropout(p_dropout) + self.attn_layers = nn.ModuleList() + self.norm_layers_1 = nn.ModuleList() + self.ffn_layers = nn.ModuleList() + self.norm_layers_2 = nn.ModuleList() + for i in range(self.n_layers): + self.attn_layers.append( + MultiHeadAttention( + hidden_channels, + hidden_channels, + n_heads, + p_dropout=p_dropout, + window_size=window_size, + ) + ) + self.norm_layers_1.append(LayerNorm(hidden_channels)) + self.ffn_layers.append( + FFN( + hidden_channels, + hidden_channels, + filter_channels, + kernel_size, + p_dropout=p_dropout, + ) + ) + self.norm_layers_2.append(LayerNorm(hidden_channels)) + + def forward(self, x, x_mask): + attn_mask = x_mask.unsqueeze(2) * x_mask.unsqueeze(-1) + x = x * x_mask + zippep = zip( + self.attn_layers, self.norm_layers_1, self.ffn_layers, self.norm_layers_2 + ) + for attn_layers, norm_layers_1, ffn_layers, norm_layers_2 in zippep: + y = attn_layers(x, x, attn_mask) + y = self.drop(y) + x = norm_layers_1(x + y) + + y = ffn_layers(x, x_mask) + y = self.drop(y) + x = norm_layers_2(x + y) + x = x * x_mask + return x + + +class Decoder(nn.Module): + def __init__( + self, + hidden_channels, + filter_channels, + n_heads, + n_layers, + kernel_size=1, + p_dropout=0.0, + proximal_bias=False, + proximal_init=True, + **kwargs + ): + super(Decoder, self).__init__() + self.hidden_channels = hidden_channels + self.filter_channels = filter_channels + self.n_heads = n_heads + self.n_layers = n_layers + self.kernel_size = kernel_size + self.p_dropout = p_dropout + self.proximal_bias = proximal_bias + self.proximal_init = proximal_init + + self.drop = nn.Dropout(p_dropout) + self.self_attn_layers = nn.ModuleList() + self.norm_layers_0 = nn.ModuleList() + self.encdec_attn_layers = nn.ModuleList() + self.norm_layers_1 = nn.ModuleList() + self.ffn_layers = nn.ModuleList() + self.norm_layers_2 = nn.ModuleList() + for i in range(self.n_layers): + self.self_attn_layers.append( + MultiHeadAttention( + hidden_channels, + hidden_channels, + n_heads, + p_dropout=p_dropout, + proximal_bias=proximal_bias, + proximal_init=proximal_init, + ) + ) + self.norm_layers_0.append(LayerNorm(hidden_channels)) + self.encdec_attn_layers.append( + MultiHeadAttention( + hidden_channels, hidden_channels, n_heads, p_dropout=p_dropout + ) + ) + self.norm_layers_1.append(LayerNorm(hidden_channels)) + self.ffn_layers.append( + FFN( + hidden_channels, + hidden_channels, + filter_channels, + kernel_size, + p_dropout=p_dropout, + causal=True, + ) + 
) + self.norm_layers_2.append(LayerNorm(hidden_channels)) + + def forward(self, x, x_mask, h, h_mask): + """ + x: decoder input + h: encoder output + """ + self_attn_mask = commons.subsequent_mask(x_mask.size(2)).to( + device=x.device, dtype=x.dtype + ) + encdec_attn_mask = h_mask.unsqueeze(2) * x_mask.unsqueeze(-1) + x = x * x_mask + for i in range(self.n_layers): + y = self.self_attn_layers[i](x, x, self_attn_mask) + y = self.drop(y) + x = self.norm_layers_0[i](x + y) + + y = self.encdec_attn_layers[i](x, h, encdec_attn_mask) + y = self.drop(y) + x = self.norm_layers_1[i](x + y) + + y = self.ffn_layers[i](x, x_mask) + y = self.drop(y) + x = self.norm_layers_2[i](x + y) + x = x * x_mask + return x + + +class MultiHeadAttention(nn.Module): + def __init__( + self, + channels, + out_channels, + n_heads, + p_dropout=0.0, + window_size=None, + heads_share=True, + block_length=None, + proximal_bias=False, + proximal_init=False, + ): + super(MultiHeadAttention, self).__init__() + assert channels % n_heads == 0 + + self.channels = channels + self.out_channels = out_channels + self.n_heads = n_heads + self.p_dropout = p_dropout + self.window_size = window_size + self.heads_share = heads_share + self.block_length = block_length + self.proximal_bias = proximal_bias + self.proximal_init = proximal_init + self.attn = None + + self.k_channels = channels // n_heads + self.conv_q = nn.Conv1d(channels, channels, 1) + self.conv_k = nn.Conv1d(channels, channels, 1) + self.conv_v = nn.Conv1d(channels, channels, 1) + self.conv_o = nn.Conv1d(channels, out_channels, 1) + self.drop = nn.Dropout(p_dropout) + + if window_size is not None: + n_heads_rel = 1 if heads_share else n_heads + rel_stddev = self.k_channels**-0.5 + self.emb_rel_k = nn.Parameter( + torch.randn(n_heads_rel, window_size * 2 + 1, self.k_channels) + * rel_stddev + ) + self.emb_rel_v = nn.Parameter( + torch.randn(n_heads_rel, window_size * 2 + 1, self.k_channels) + * rel_stddev + ) + + nn.init.xavier_uniform_(self.conv_q.weight) + nn.init.xavier_uniform_(self.conv_k.weight) + nn.init.xavier_uniform_(self.conv_v.weight) + if proximal_init: + with torch.no_grad(): + self.conv_k.weight.copy_(self.conv_q.weight) + self.conv_k.bias.copy_(self.conv_q.bias) + + def forward( + self, x: torch.Tensor, c: torch.Tensor, attn_mask: Optional[torch.Tensor] = None + ): + q = self.conv_q(x) + k = self.conv_k(c) + v = self.conv_v(c) + + x, _ = self.attention(q, k, v, mask=attn_mask) + + x = self.conv_o(x) + return x + + def attention( + self, + query: torch.Tensor, + key: torch.Tensor, + value: torch.Tensor, + mask: Optional[torch.Tensor] = None, + ): + # reshape [b, d, t] -> [b, n_h, t, d_k] + b, d, t_s = key.size() + t_t = query.size(2) + query = query.view(b, self.n_heads, self.k_channels, t_t).transpose(2, 3) + key = key.view(b, self.n_heads, self.k_channels, t_s).transpose(2, 3) + value = value.view(b, self.n_heads, self.k_channels, t_s).transpose(2, 3) + + scores = torch.matmul(query / math.sqrt(self.k_channels), key.transpose(-2, -1)) + if self.window_size is not None: + key_relative_embeddings = self._get_relative_embeddings(self.emb_rel_k, t_s) + rel_logits = self._matmul_with_relative_keys( + query / math.sqrt(self.k_channels), key_relative_embeddings + ) + scores_local = self._relative_position_to_absolute_position(rel_logits) + scores = scores + scores_local + if self.proximal_bias: + assert t_s == t_t, "Proximal bias is only available for self-attention." 
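+            # The proximal bias adds -log(1 + |i - j|) to each score (see
+            # _attention_bias_proximal below), softly favouring nearby frames.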
+ scores = scores + self._attention_bias_proximal(t_s).to( + device=scores.device, dtype=scores.dtype + ) + if mask is not None: + scores = scores.masked_fill(mask == 0, -1e4) + if self.block_length is not None: + assert ( + t_s == t_t + ), "Local attention is only available for self-attention." + block_mask = ( + torch.ones_like(scores) + .triu(-self.block_length) + .tril(self.block_length) + ) + scores = scores.masked_fill(block_mask == 0, -1e4) + p_attn = F.softmax(scores, dim=-1) # [b, n_h, t_t, t_s] + p_attn = self.drop(p_attn) + output = torch.matmul(p_attn, value) + if self.window_size is not None: + relative_weights = self._absolute_position_to_relative_position(p_attn) + value_relative_embeddings = self._get_relative_embeddings( + self.emb_rel_v, t_s + ) + output = output + self._matmul_with_relative_values( + relative_weights, value_relative_embeddings + ) + output = ( + output.transpose(2, 3).contiguous().view(b, d, t_t) + ) # [b, n_h, t_t, d_k] -> [b, d, t_t] + return output, p_attn + + def _matmul_with_relative_values(self, x, y): + """ + x: [b, h, l, m] + y: [h or 1, m, d] + ret: [b, h, l, d] + """ + ret = torch.matmul(x, y.unsqueeze(0)) + return ret + + def _matmul_with_relative_keys(self, x, y): + """ + x: [b, h, l, d] + y: [h or 1, m, d] + ret: [b, h, l, m] + """ + ret = torch.matmul(x, y.unsqueeze(0).transpose(-2, -1)) + return ret + + def _get_relative_embeddings(self, relative_embeddings, length): + max_relative_position = 2 * self.window_size + 1 + # Pad first before slice to avoid using cond ops. + + pad_length = torch.clamp(length - (self.window_size + 1), min=0) + slice_start_position = torch.clamp((self.window_size + 1) - length, min=0) + slice_end_position = slice_start_position + 2 * length - 1 + padded_relative_embeddings = F.pad( + relative_embeddings, + # commons.convert_pad_shape([[0, 0], [pad_length, pad_length], [0, 0]]), + [0, 0, pad_length, pad_length, 0, 0], + ) + used_relative_embeddings = padded_relative_embeddings[ + :, slice_start_position:slice_end_position + ] + return used_relative_embeddings + + def _relative_position_to_absolute_position(self, x): + """ + x: [b, h, l, 2*l-1] + ret: [b, h, l, l] + """ + batch, heads, length, _ = x.size() + # Concat columns of pad to shift from relative to absolute indexing. + x = F.pad( + x, + # commons.convert_pad_shape([[0, 0], [0, 0], [0, 0], [0, 1]]) + [0, 1, 0, 0, 0, 0, 0, 0], + ) + + # Concat extra elements so to add up to shape (len+1, 2*len-1). + x_flat = x.view([batch, heads, length * 2 * length]) + x_flat = F.pad( + x_flat, + [0, length - 1, 0, 0, 0, 0], + ) + + # Reshape and slice out the padded elements. + x_final = x_flat.view([batch, heads, length + 1, 2 * length - 1])[ + :, :, :length, length - 1 : + ] + return x_final + + def _absolute_position_to_relative_position(self, x): + """ + x: [b, h, l, l] + ret: [b, h, l, 2*l-1] + """ + batch, heads, length, _ = x.size() + # padd along column + x = F.pad( + x, + [0, length - 1, 0, 0, 0, 0, 0, 0], + ) + x_flat = x.view([batch, heads, length*length + length * (length - 1)]) + # add 0's in the beginning that will skew the elements after reshape + x_flat = F.pad( + x_flat, + [length, 0, 0, 0, 0, 0], + ) + x_final = x_flat.view([batch, heads, length, 2 * length])[:, :, :, 1:] + return x_final + + def _attention_bias_proximal(self, length): + """Bias for self-attention to encourage attention to close positions. + Args: + length: an integer scalar. 
+ Returns: + a Tensor with shape [1, 1, length, length] + """ + r = torch.arange(length, dtype=torch.float32) + diff = torch.unsqueeze(r, 0) - torch.unsqueeze(r, 1) + return torch.unsqueeze(torch.unsqueeze(-torch.log1p(torch.abs(diff)), 0), 0) + + +class FFN(nn.Module): + def __init__( + self, + in_channels, + out_channels, + filter_channels, + kernel_size, + p_dropout=0.0, + activation: str = None, + causal=False, + ): + super(FFN, self).__init__() + self.in_channels = in_channels + self.out_channels = out_channels + self.filter_channels = filter_channels + self.kernel_size = kernel_size + self.p_dropout = p_dropout + self.activation = activation + self.causal = causal + self.is_activation = True if activation == "gelu" else False + # if causal: + # self.padding = self._causal_padding + # else: + # self.padding = self._same_padding + + self.conv_1 = nn.Conv1d(in_channels, filter_channels, kernel_size) + self.conv_2 = nn.Conv1d(filter_channels, out_channels, kernel_size) + self.drop = nn.Dropout(p_dropout) + + def padding(self, x: torch.Tensor, x_mask: torch.Tensor) -> torch.Tensor: + if self.causal: + padding = self._causal_padding(x * x_mask) + else: + padding = self._same_padding(x * x_mask) + return padding + + def forward(self, x: torch.Tensor, x_mask: torch.Tensor): + x = self.conv_1(self.padding(x, x_mask)) + if self.is_activation: + x = x * torch.sigmoid(1.702 * x) + else: + x = torch.relu(x) + x = self.drop(x) + + x = self.conv_2(self.padding(x, x_mask)) + return x * x_mask + + def _causal_padding(self, x): + if self.kernel_size == 1: + return x + pad_l = self.kernel_size - 1 + pad_r = 0 + # padding = [[0, 0], [0, 0], [pad_l, pad_r]] + x = F.pad( + x, + # commons.convert_pad_shape(padding) + [pad_l, pad_r, 0, 0, 0, 0], + ) + return x + + def _same_padding(self, x): + if self.kernel_size == 1: + return x + pad_l = (self.kernel_size - 1) // 2 + pad_r = self.kernel_size // 2 + # padding = [[0, 0], [0, 0], [pad_l, pad_r]] + x = F.pad( + x, + # commons.convert_pad_shape(padding) + [pad_l, pad_r, 0, 0, 0, 0], + ) + return x diff --git a/infer/lib/infer_pack/commons.py b/infer/lib/infer_pack/commons.py new file mode 100644 index 0000000000000000000000000000000000000000..4ec6c244e228647b125429f62b8c9fddbe40eba9 --- /dev/null +++ b/infer/lib/infer_pack/commons.py @@ -0,0 +1,172 @@ +from typing import List, Optional +import math + +import numpy as np +import torch +from torch import nn +from torch.nn import functional as F + + +def init_weights(m, mean=0.0, std=0.01): + classname = m.__class__.__name__ + if classname.find("Conv") != -1: + m.weight.data.normal_(mean, std) + + +def get_padding(kernel_size, dilation=1): + return int((kernel_size * dilation - dilation) / 2) + + +# def convert_pad_shape(pad_shape): +# l = pad_shape[::-1] +# pad_shape = [item for sublist in l for item in sublist] +# return pad_shape + + +def kl_divergence(m_p, logs_p, m_q, logs_q): + """KL(P||Q)""" + kl = (logs_q - logs_p) - 0.5 + kl += ( + 0.5 * (torch.exp(2.0 * logs_p) + ((m_p - m_q) ** 2)) * torch.exp(-2.0 * logs_q) + ) + return kl + + +def rand_gumbel(shape): + """Sample from the Gumbel distribution, protect from overflows.""" + uniform_samples = torch.rand(shape) * 0.99998 + 0.00001 + return -torch.log(-torch.log(uniform_samples)) + + +def rand_gumbel_like(x): + g = rand_gumbel(x.size()).to(dtype=x.dtype, device=x.device) + return g + + +def slice_segments(x, ids_str, segment_size=4): + ret = torch.zeros_like(x[:, :, :segment_size]) + for i in range(x.size(0)): + idx_str = ids_str[i] + idx_end = idx_str + 
segment_size + ret[i] = x[i, :, idx_str:idx_end] + return ret + + +def slice_segments2(x, ids_str, segment_size=4): + ret = torch.zeros_like(x[:, :segment_size]) + for i in range(x.size(0)): + idx_str = ids_str[i] + idx_end = idx_str + segment_size + ret[i] = x[i, idx_str:idx_end] + return ret + + +def rand_slice_segments(x, x_lengths=None, segment_size=4): + b, d, t = x.size() + if x_lengths is None: + x_lengths = t + ids_str_max = x_lengths - segment_size + 1 + ids_str = (torch.rand([b]).to(device=x.device) * ids_str_max).to(dtype=torch.long) + ret = slice_segments(x, ids_str, segment_size) + return ret, ids_str + + +def get_timing_signal_1d(length, channels, min_timescale=1.0, max_timescale=1.0e4): + position = torch.arange(length, dtype=torch.float) + num_timescales = channels // 2 + log_timescale_increment = math.log(float(max_timescale) / float(min_timescale)) / ( + num_timescales - 1 + ) + inv_timescales = min_timescale * torch.exp( + torch.arange(num_timescales, dtype=torch.float) * -log_timescale_increment + ) + scaled_time = position.unsqueeze(0) * inv_timescales.unsqueeze(1) + signal = torch.cat([torch.sin(scaled_time), torch.cos(scaled_time)], 0) + signal = F.pad(signal, [0, 0, 0, channels % 2]) + signal = signal.view(1, channels, length) + return signal + + +def add_timing_signal_1d(x, min_timescale=1.0, max_timescale=1.0e4): + b, channels, length = x.size() + signal = get_timing_signal_1d(length, channels, min_timescale, max_timescale) + return x + signal.to(dtype=x.dtype, device=x.device) + + +def cat_timing_signal_1d(x, min_timescale=1.0, max_timescale=1.0e4, axis=1): + b, channels, length = x.size() + signal = get_timing_signal_1d(length, channels, min_timescale, max_timescale) + return torch.cat([x, signal.to(dtype=x.dtype, device=x.device)], axis) + + +def subsequent_mask(length): + mask = torch.tril(torch.ones(length, length)).unsqueeze(0).unsqueeze(0) + return mask + + +@torch.jit.script +def fused_add_tanh_sigmoid_multiply(input_a, input_b, n_channels): + n_channels_int = n_channels[0] + in_act = input_a + input_b + t_act = torch.tanh(in_act[:, :n_channels_int, :]) + s_act = torch.sigmoid(in_act[:, n_channels_int:, :]) + acts = t_act * s_act + return acts + + +# def convert_pad_shape(pad_shape): +# l = pad_shape[::-1] +# pad_shape = [item for sublist in l for item in sublist] +# return pad_shape + + +def convert_pad_shape(pad_shape: List[List[int]]) -> List[int]: + return torch.tensor(pad_shape).flip(0).reshape(-1).int().tolist() + + +def shift_1d(x): + x = F.pad(x, convert_pad_shape([[0, 0], [0, 0], [1, 0]]))[:, :, :-1] + return x + + +def sequence_mask(length: torch.Tensor, max_length: Optional[int] = None): + if max_length is None: + max_length = length.max() + x = torch.arange(max_length, dtype=length.dtype, device=length.device) + return x.unsqueeze(0) < length.unsqueeze(1) + + +def generate_path(duration, mask): + """ + duration: [b, 1, t_x] + mask: [b, 1, t_y, t_x] + """ + device = duration.device + + b, _, t_y, t_x = mask.shape + cum_duration = torch.cumsum(duration, -1) + + cum_duration_flat = cum_duration.view(b * t_x) + path = sequence_mask(cum_duration_flat, t_y).to(mask.dtype) + path = path.view(b, t_x, t_y) + path = path - F.pad(path, convert_pad_shape([[0, 0], [1, 0], [0, 0]]))[:, :-1] + path = path.unsqueeze(1).transpose(2, 3) * mask + return path + + +def clip_grad_value_(parameters, clip_value, norm_type=2): + if isinstance(parameters, torch.Tensor): + parameters = [parameters] + parameters = list(filter(lambda p: p.grad is not None, parameters)) + 
norm_type = float(norm_type) + if clip_value is not None: + clip_value = float(clip_value) + + total_norm = 0 + for p in parameters: + param_norm = p.grad.data.norm(norm_type) + total_norm += param_norm.item() ** norm_type + if clip_value is not None: + p.grad.data.clamp_(min=-clip_value, max=clip_value) + total_norm = total_norm ** (1.0 / norm_type) + return total_norm diff --git a/infer/lib/infer_pack/models.py b/infer/lib/infer_pack/models.py new file mode 100644 index 0000000000000000000000000000000000000000..e104549be20b519839fe2e9c6723803293215f55 --- /dev/null +++ b/infer/lib/infer_pack/models.py @@ -0,0 +1,1223 @@ +import math +import logging +from typing import Optional + +logger = logging.getLogger(__name__) + +import numpy as np +import torch +from torch import nn +from torch.nn import AvgPool1d, Conv1d, Conv2d, ConvTranspose1d +from torch.nn import functional as F +from torch.nn.utils import remove_weight_norm, spectral_norm, weight_norm +from infer.lib.infer_pack import attentions, commons, modules +from infer.lib.infer_pack.commons import get_padding, init_weights + +has_xpu = bool(hasattr(torch, "xpu") and torch.xpu.is_available()) + + +class TextEncoder(nn.Module): + def __init__( + self, + in_channels, + out_channels, + hidden_channels, + filter_channels, + n_heads, + n_layers, + kernel_size, + p_dropout, + f0=True, + ): + super(TextEncoder, self).__init__() + self.out_channels = out_channels + self.hidden_channels = hidden_channels + self.filter_channels = filter_channels + self.n_heads = n_heads + self.n_layers = n_layers + self.kernel_size = kernel_size + self.p_dropout = float(p_dropout) + self.emb_phone = nn.Linear(in_channels, hidden_channels) + self.lrelu = nn.LeakyReLU(0.1, inplace=True) + if f0 == True: + self.emb_pitch = nn.Embedding(256, hidden_channels) # pitch 256 + self.encoder = attentions.Encoder( + hidden_channels, + filter_channels, + n_heads, + n_layers, + kernel_size, + float(p_dropout), + ) + self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1) + + def forward( + self, + phone: torch.Tensor, + pitch: torch.Tensor, + lengths: torch.Tensor, + skip_head: Optional[torch.Tensor] = None, + ): + if pitch is None: + x = self.emb_phone(phone) + else: + x = self.emb_phone(phone) + self.emb_pitch(pitch) + x = x * math.sqrt(self.hidden_channels) # [b, t, h] + x = self.lrelu(x) + x = torch.transpose(x, 1, -1) # [b, h, t] + x_mask = torch.unsqueeze(commons.sequence_mask(lengths, x.size(2)), 1).to( + x.dtype + ) + x = self.encoder(x * x_mask, x_mask) + if skip_head is not None: + assert isinstance(skip_head, torch.Tensor) + head = int(skip_head.item()) + x = x[:, :, head:] + x_mask = x_mask[:, :, head:] + stats = self.proj(x) * x_mask + m, logs = torch.split(stats, self.out_channels, dim=1) + return m, logs, x_mask + + +class ResidualCouplingBlock(nn.Module): + def __init__( + self, + channels, + hidden_channels, + kernel_size, + dilation_rate, + n_layers, + n_flows=4, + gin_channels=0, + ): + super(ResidualCouplingBlock, self).__init__() + self.channels = channels + self.hidden_channels = hidden_channels + self.kernel_size = kernel_size + self.dilation_rate = dilation_rate + self.n_layers = n_layers + self.n_flows = n_flows + self.gin_channels = gin_channels + + self.flows = nn.ModuleList() + for i in range(n_flows): + self.flows.append( + modules.ResidualCouplingLayer( + channels, + hidden_channels, + kernel_size, + dilation_rate, + n_layers, + gin_channels=gin_channels, + mean_only=True, + ) + ) + self.flows.append(modules.Flip()) + + def forward( + self, + 
x: torch.Tensor, + x_mask: torch.Tensor, + g: Optional[torch.Tensor] = None, + reverse: bool = False, + ): + if not reverse: + for flow in self.flows: + x, _ = flow(x, x_mask, g=g, reverse=reverse) + else: + for flow in self.flows[::-1]: + x, _ = flow.forward(x, x_mask, g=g, reverse=reverse) + return x + + def remove_weight_norm(self): + for i in range(self.n_flows): + self.flows[i * 2].remove_weight_norm() + + def __prepare_scriptable__(self): + for i in range(self.n_flows): + for hook in self.flows[i * 2]._forward_pre_hooks.values(): + if ( + hook.__module__ == "torch.nn.utils.weight_norm" + and hook.__class__.__name__ == "WeightNorm" + ): + torch.nn.utils.remove_weight_norm(self.flows[i * 2]) + + return self + + +class PosteriorEncoder(nn.Module): + def __init__( + self, + in_channels, + out_channels, + hidden_channels, + kernel_size, + dilation_rate, + n_layers, + gin_channels=0, + ): + super(PosteriorEncoder, self).__init__() + self.in_channels = in_channels + self.out_channels = out_channels + self.hidden_channels = hidden_channels + self.kernel_size = kernel_size + self.dilation_rate = dilation_rate + self.n_layers = n_layers + self.gin_channels = gin_channels + + self.pre = nn.Conv1d(in_channels, hidden_channels, 1) + self.enc = modules.WN( + hidden_channels, + kernel_size, + dilation_rate, + n_layers, + gin_channels=gin_channels, + ) + self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1) + + def forward( + self, x: torch.Tensor, x_lengths: torch.Tensor, g: Optional[torch.Tensor] = None + ): + x_mask = torch.unsqueeze(commons.sequence_mask(x_lengths, x.size(2)), 1).to( + x.dtype + ) + x = self.pre(x) * x_mask + x = self.enc(x, x_mask, g=g) + stats = self.proj(x) * x_mask + m, logs = torch.split(stats, self.out_channels, dim=1) + z = (m + torch.randn_like(m) * torch.exp(logs)) * x_mask + return z, m, logs, x_mask + + def remove_weight_norm(self): + self.enc.remove_weight_norm() + + def __prepare_scriptable__(self): + for hook in self.enc._forward_pre_hooks.values(): + if ( + hook.__module__ == "torch.nn.utils.weight_norm" + and hook.__class__.__name__ == "WeightNorm" + ): + torch.nn.utils.remove_weight_norm(self.enc) + return self + + +class Generator(torch.nn.Module): + def __init__( + self, + initial_channel, + resblock, + resblock_kernel_sizes, + resblock_dilation_sizes, + upsample_rates, + upsample_initial_channel, + upsample_kernel_sizes, + gin_channels=0, + ): + super(Generator, self).__init__() + self.num_kernels = len(resblock_kernel_sizes) + self.num_upsamples = len(upsample_rates) + self.conv_pre = Conv1d( + initial_channel, upsample_initial_channel, 7, 1, padding=3 + ) + resblock = modules.ResBlock1 if resblock == "1" else modules.ResBlock2 + + self.ups = nn.ModuleList() + for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)): + self.ups.append( + weight_norm( + ConvTranspose1d( + upsample_initial_channel // (2**i), + upsample_initial_channel // (2 ** (i + 1)), + k, + u, + padding=(k - u) // 2, + ) + ) + ) + + self.resblocks = nn.ModuleList() + for i in range(len(self.ups)): + ch = upsample_initial_channel // (2 ** (i + 1)) + for j, (k, d) in enumerate( + zip(resblock_kernel_sizes, resblock_dilation_sizes) + ): + self.resblocks.append(resblock(ch, k, d)) + + self.conv_post = Conv1d(ch, 1, 7, 1, padding=3, bias=False) + self.ups.apply(init_weights) + + if gin_channels != 0: + self.cond = nn.Conv1d(gin_channels, upsample_initial_channel, 1) + + def forward( + self, + x: torch.Tensor, + g: Optional[torch.Tensor] = None, + n_res: Optional[torch.Tensor] 
= None, + ): + if n_res is not None: + assert isinstance(n_res, torch.Tensor) + n = int(n_res.item()) + if n != x.shape[-1]: + x = F.interpolate(x, size=n, mode="linear") + x = self.conv_pre(x) + if g is not None: + x = x + self.cond(g) + + for i in range(self.num_upsamples): + x = F.leaky_relu(x, modules.LRELU_SLOPE) + x = self.ups[i](x) + xs = None + for j in range(self.num_kernels): + if xs is None: + xs = self.resblocks[i * self.num_kernels + j](x) + else: + xs += self.resblocks[i * self.num_kernels + j](x) + x = xs / self.num_kernels + x = F.leaky_relu(x) + x = self.conv_post(x) + x = torch.tanh(x) + + return x + + def __prepare_scriptable__(self): + for l in self.ups: + for hook in l._forward_pre_hooks.values(): + # The hook we want to remove is an instance of WeightNorm class, so + # normally we would do `if isinstance(...)` but this class is not accessible + # because of shadowing, so we check the module name directly. + # https://github.com/pytorch/pytorch/blob/be0ca00c5ce260eb5bcec3237357f7a30cc08983/torch/nn/utils/__init__.py#L3 + if ( + hook.__module__ == "torch.nn.utils.weight_norm" + and hook.__class__.__name__ == "WeightNorm" + ): + torch.nn.utils.remove_weight_norm(l) + + for l in self.resblocks: + for hook in l._forward_pre_hooks.values(): + if ( + hook.__module__ == "torch.nn.utils.weight_norm" + and hook.__class__.__name__ == "WeightNorm" + ): + torch.nn.utils.remove_weight_norm(l) + return self + + def remove_weight_norm(self): + for l in self.ups: + remove_weight_norm(l) + for l in self.resblocks: + l.remove_weight_norm() + + +class SineGen(torch.nn.Module): + """Definition of sine generator + SineGen(samp_rate, harmonic_num = 0, + sine_amp = 0.1, noise_std = 0.003, + voiced_threshold = 0, + flag_for_pulse=False) + samp_rate: sampling rate in Hz + harmonic_num: number of harmonic overtones (default 0) + sine_amp: amplitude of sine-wavefrom (default 0.1) + noise_std: std of Gaussian noise (default 0.003) + voiced_thoreshold: F0 threshold for U/V classification (default 0) + flag_for_pulse: this SinGen is used inside PulseGen (default False) + Note: when flag_for_pulse is True, the first time step of a voiced + segment is always sin(torch.pi) or cos(0) + """ + + def __init__( + self, + samp_rate, + harmonic_num=0, + sine_amp=0.1, + noise_std=0.003, + voiced_threshold=0, + flag_for_pulse=False, + ): + super(SineGen, self).__init__() + self.sine_amp = sine_amp + self.noise_std = noise_std + self.harmonic_num = harmonic_num + self.dim = self.harmonic_num + 1 + self.sampling_rate = samp_rate + self.voiced_threshold = voiced_threshold + + def _f02uv(self, f0): + # generate uv signal + uv = torch.ones_like(f0) + uv = uv * (f0 > self.voiced_threshold) + if uv.device.type == "privateuseone": # for DirectML + uv = uv.float() + return uv + + def _f02sine(self, f0, upp): + """ f0: (batchsize, length, dim) + where dim indicates fundamental tone and overtones + """ + a = torch.arange(1, upp + 1, dtype=f0.dtype, device=f0.device) + rad = f0 / self.sampling_rate * a + rad2 = torch.fmod(rad[:, :-1, -1:].float() + 0.5, 1.0) - 0.5 + rad_acc = rad2.cumsum(dim=1).fmod(1.0).to(f0) + rad += F.pad(rad_acc, (0, 0, 1, 0), mode='constant') + rad = rad.reshape(f0.shape[0], -1, 1) + b = torch.arange(1, self.dim + 1, dtype=f0.dtype, device=f0.device).reshape(1, 1, -1) + rad *= b + rand_ini = torch.rand(1, 1, self.dim, device=f0.device) + rand_ini[..., 0] = 0 + rad += rand_ini + sines = torch.sin(2 * np.pi * rad) + return sines + + def forward(self, f0: torch.Tensor, upp: int): + """sine_tensor, uv = 
forward(f0) + input F0: tensor(batchsize=1, length, dim=1) + f0 for unvoiced steps should be 0 + output sine_tensor: tensor(batchsize=1, length, dim) + output uv: tensor(batchsize=1, length, 1) + """ + with torch.no_grad(): + f0 = f0.unsqueeze(-1) + sine_waves = self._f02sine(f0, upp) * self.sine_amp + uv = self._f02uv(f0) + uv = F.interpolate( + uv.transpose(2, 1), scale_factor=float(upp), mode="nearest" + ).transpose(2, 1) + noise_amp = uv * self.noise_std + (1 - uv) * self.sine_amp / 3 + noise = noise_amp * torch.randn_like(sine_waves) + sine_waves = sine_waves * uv + noise + return sine_waves, uv, noise + + +class SourceModuleHnNSF(torch.nn.Module): + """SourceModule for hn-nsf + SourceModule(sampling_rate, harmonic_num=0, sine_amp=0.1, + add_noise_std=0.003, voiced_threshod=0) + sampling_rate: sampling_rate in Hz + harmonic_num: number of harmonic above F0 (default: 0) + sine_amp: amplitude of sine source signal (default: 0.1) + add_noise_std: std of additive Gaussian noise (default: 0.003) + note that amplitude of noise in unvoiced is decided + by sine_amp + voiced_threshold: threhold to set U/V given F0 (default: 0) + Sine_source, noise_source = SourceModuleHnNSF(F0_sampled) + F0_sampled (batchsize, length, 1) + Sine_source (batchsize, length, 1) + noise_source (batchsize, length 1) + uv (batchsize, length, 1) + """ + + def __init__( + self, + sampling_rate, + harmonic_num=0, + sine_amp=0.1, + add_noise_std=0.003, + voiced_threshod=0, + is_half=True, + ): + super(SourceModuleHnNSF, self).__init__() + + self.sine_amp = sine_amp + self.noise_std = add_noise_std + self.is_half = is_half + # to produce sine waveforms + self.l_sin_gen = SineGen( + sampling_rate, harmonic_num, sine_amp, add_noise_std, voiced_threshod + ) + + # to merge source harmonics into a single excitation + self.l_linear = torch.nn.Linear(harmonic_num + 1, 1) + self.l_tanh = torch.nn.Tanh() + # self.ddtype:int = -1 + + def forward(self, x: torch.Tensor, upp: int = 1): + # if self.ddtype ==-1: + # self.ddtype = self.l_linear.weight.dtype + sine_wavs, uv, _ = self.l_sin_gen(x, upp) + # print(x.dtype,sine_wavs.dtype,self.l_linear.weight.dtype) + # if self.is_half: + # sine_wavs = sine_wavs.half() + # sine_merge = self.l_tanh(self.l_linear(sine_wavs.to(x))) + # print(sine_wavs.dtype,self.ddtype) + # if sine_wavs.dtype != self.l_linear.weight.dtype: + sine_wavs = sine_wavs.to(dtype=self.l_linear.weight.dtype) + sine_merge = self.l_tanh(self.l_linear(sine_wavs)) + return sine_merge, None, None # noise, uv + + +class GeneratorNSF(torch.nn.Module): + def __init__( + self, + initial_channel, + resblock, + resblock_kernel_sizes, + resblock_dilation_sizes, + upsample_rates, + upsample_initial_channel, + upsample_kernel_sizes, + gin_channels, + sr, + is_half=False, + ): + super(GeneratorNSF, self).__init__() + self.num_kernels = len(resblock_kernel_sizes) + self.num_upsamples = len(upsample_rates) + + self.f0_upsamp = torch.nn.Upsample(scale_factor=math.prod(upsample_rates)) + self.m_source = SourceModuleHnNSF( + sampling_rate=sr, harmonic_num=0, is_half=is_half + ) + self.noise_convs = nn.ModuleList() + self.conv_pre = Conv1d( + initial_channel, upsample_initial_channel, 7, 1, padding=3 + ) + resblock = modules.ResBlock1 if resblock == "1" else modules.ResBlock2 + + self.ups = nn.ModuleList() + for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)): + c_cur = upsample_initial_channel // (2 ** (i + 1)) + self.ups.append( + weight_norm( + ConvTranspose1d( + upsample_initial_channel // (2**i), + 
upsample_initial_channel // (2 ** (i + 1)), + k, + u, + padding=(k - u) // 2, + ) + ) + ) + if i + 1 < len(upsample_rates): + stride_f0 = math.prod(upsample_rates[i + 1 :]) + self.noise_convs.append( + Conv1d( + 1, + c_cur, + kernel_size=stride_f0 * 2, + stride=stride_f0, + padding=stride_f0 // 2, + ) + ) + else: + self.noise_convs.append(Conv1d(1, c_cur, kernel_size=1)) + + self.resblocks = nn.ModuleList() + for i in range(len(self.ups)): + ch = upsample_initial_channel // (2 ** (i + 1)) + for j, (k, d) in enumerate( + zip(resblock_kernel_sizes, resblock_dilation_sizes) + ): + self.resblocks.append(resblock(ch, k, d)) + + self.conv_post = Conv1d(ch, 1, 7, 1, padding=3, bias=False) + self.ups.apply(init_weights) + + if gin_channels != 0: + self.cond = nn.Conv1d(gin_channels, upsample_initial_channel, 1) + + self.upp = math.prod(upsample_rates) + + self.lrelu_slope = modules.LRELU_SLOPE + + def forward( + self, + x, + f0, + g: Optional[torch.Tensor] = None, + n_res: Optional[torch.Tensor] = None, + ): + har_source, noi_source, uv = self.m_source(f0, self.upp) + har_source = har_source.transpose(1, 2) + if n_res is not None: + assert isinstance(n_res, torch.Tensor) + n = int(n_res.item()) + if n * self.upp != har_source.shape[-1]: + har_source = F.interpolate(har_source, size=n * self.upp, mode="linear") + if n != x.shape[-1]: + x = F.interpolate(x, size=n, mode="linear") + x = self.conv_pre(x) + if g is not None: + x = x + self.cond(g) + # torch.jit.script() does not support direct indexing of torch modules + # That's why I wrote this + for i, (ups, noise_convs) in enumerate(zip(self.ups, self.noise_convs)): + if i < self.num_upsamples: + x = F.leaky_relu(x, self.lrelu_slope) + x = ups(x) + x_source = noise_convs(har_source) + x = x + x_source + xs: Optional[torch.Tensor] = None + l = [i * self.num_kernels + j for j in range(self.num_kernels)] + for j, resblock in enumerate(self.resblocks): + if j in l: + if xs is None: + xs = resblock(x) + else: + xs += resblock(x) + # This assertion cannot be ignored! \ + # If ignored, it will cause torch.jit.script() compilation errors + assert isinstance(xs, torch.Tensor) + x = xs / self.num_kernels + x = F.leaky_relu(x) + x = self.conv_post(x) + x = torch.tanh(x) + + return x + + def remove_weight_norm(self): + for l in self.ups: + remove_weight_norm(l) + for l in self.resblocks: + l.remove_weight_norm() + + def __prepare_scriptable__(self): + for l in self.ups: + for hook in l._forward_pre_hooks.values(): + # The hook we want to remove is an instance of WeightNorm class, so + # normally we would do `if isinstance(...)` but this class is not accessible + # because of shadowing, so we check the module name directly. 
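+ # (torch.nn.utils.weight_norm registers a forward pre-hook whose class is named
+ # "WeightNorm" and splits each weight into weight_g/weight_v; removing the hook
+ # folds them back into a single weight tensor, so scripting sees a plain layer.)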
+ # https://github.com/pytorch/pytorch/blob/be0ca00c5ce260eb5bcec3237357f7a30cc08983/torch/nn/utils/__init__.py#L3 + if ( + hook.__module__ == "torch.nn.utils.weight_norm" + and hook.__class__.__name__ == "WeightNorm" + ): + torch.nn.utils.remove_weight_norm(l) + for l in self.resblocks: + for hook in self.resblocks._forward_pre_hooks.values(): + if ( + hook.__module__ == "torch.nn.utils.weight_norm" + and hook.__class__.__name__ == "WeightNorm" + ): + torch.nn.utils.remove_weight_norm(l) + return self + + +sr2sr = { + "32k": 32000, + "40k": 40000, + "48k": 48000, +} + + +class SynthesizerTrnMs256NSFsid(nn.Module): + def __init__( + self, + spec_channels, + segment_size, + inter_channels, + hidden_channels, + filter_channels, + n_heads, + n_layers, + kernel_size, + p_dropout, + resblock, + resblock_kernel_sizes, + resblock_dilation_sizes, + upsample_rates, + upsample_initial_channel, + upsample_kernel_sizes, + spk_embed_dim, + gin_channels, + sr, + **kwargs + ): + super(SynthesizerTrnMs256NSFsid, self).__init__() + if isinstance(sr, str): + sr = sr2sr[sr] + self.spec_channels = spec_channels + self.inter_channels = inter_channels + self.hidden_channels = hidden_channels + self.filter_channels = filter_channels + self.n_heads = n_heads + self.n_layers = n_layers + self.kernel_size = kernel_size + self.p_dropout = float(p_dropout) + self.resblock = resblock + self.resblock_kernel_sizes = resblock_kernel_sizes + self.resblock_dilation_sizes = resblock_dilation_sizes + self.upsample_rates = upsample_rates + self.upsample_initial_channel = upsample_initial_channel + self.upsample_kernel_sizes = upsample_kernel_sizes + self.segment_size = segment_size + self.gin_channels = gin_channels + # self.hop_length = hop_length# + self.spk_embed_dim = spk_embed_dim + self.enc_p = TextEncoder( + 256, + inter_channels, + hidden_channels, + filter_channels, + n_heads, + n_layers, + kernel_size, + float(p_dropout), + ) + self.dec = GeneratorNSF( + inter_channels, + resblock, + resblock_kernel_sizes, + resblock_dilation_sizes, + upsample_rates, + upsample_initial_channel, + upsample_kernel_sizes, + gin_channels=gin_channels, + sr=sr, + is_half=kwargs["is_half"], + ) + self.enc_q = PosteriorEncoder( + spec_channels, + inter_channels, + hidden_channels, + 5, + 1, + 16, + gin_channels=gin_channels, + ) + self.flow = ResidualCouplingBlock( + inter_channels, hidden_channels, 5, 1, 3, gin_channels=gin_channels + ) + self.emb_g = nn.Embedding(self.spk_embed_dim, gin_channels) + logger.debug( + "gin_channels: " + + str(gin_channels) + + ", self.spk_embed_dim: " + + str(self.spk_embed_dim) + ) + + def remove_weight_norm(self): + self.dec.remove_weight_norm() + self.flow.remove_weight_norm() + if hasattr(self, "enc_q"): + self.enc_q.remove_weight_norm() + + def __prepare_scriptable__(self): + for hook in self.dec._forward_pre_hooks.values(): + # The hook we want to remove is an instance of WeightNorm class, so + # normally we would do `if isinstance(...)` but this class is not accessible + # because of shadowing, so we check the module name directly. 
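+ # (The same hook check is repeated for self.flow and self.enc_q below, mirroring
+ # remove_weight_norm(), so no WeightNorm pre-hooks remain before scripting.)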
+ # https://github.com/pytorch/pytorch/blob/be0ca00c5ce260eb5bcec3237357f7a30cc08983/torch/nn/utils/__init__.py#L3 + if ( + hook.__module__ == "torch.nn.utils.weight_norm" + and hook.__class__.__name__ == "WeightNorm" + ): + torch.nn.utils.remove_weight_norm(self.dec) + for hook in self.flow._forward_pre_hooks.values(): + if ( + hook.__module__ == "torch.nn.utils.weight_norm" + and hook.__class__.__name__ == "WeightNorm" + ): + torch.nn.utils.remove_weight_norm(self.flow) + if hasattr(self, "enc_q"): + for hook in self.enc_q._forward_pre_hooks.values(): + if ( + hook.__module__ == "torch.nn.utils.weight_norm" + and hook.__class__.__name__ == "WeightNorm" + ): + torch.nn.utils.remove_weight_norm(self.enc_q) + return self + + @torch.jit.ignore + def forward( + self, + phone: torch.Tensor, + phone_lengths: torch.Tensor, + pitch: torch.Tensor, + pitchf: torch.Tensor, + y: torch.Tensor, + y_lengths: torch.Tensor, + ds: Optional[torch.Tensor] = None, + ): # 这里ds是id,[bs,1] + # print(1,pitch.shape)#[bs,t] + g = self.emb_g(ds).unsqueeze(-1) # [b, 256, 1]##1是t,广播的 + m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths) + z, m_q, logs_q, y_mask = self.enc_q(y, y_lengths, g=g) + z_p = self.flow(z, y_mask, g=g) + z_slice, ids_slice = commons.rand_slice_segments( + z, y_lengths, self.segment_size + ) + # print(-1,pitchf.shape,ids_slice,self.segment_size,self.hop_length,self.segment_size//self.hop_length) + pitchf = commons.slice_segments2(pitchf, ids_slice, self.segment_size) + # print(-2,pitchf.shape,z_slice.shape) + o = self.dec(z_slice, pitchf, g=g) + return o, ids_slice, x_mask, y_mask, (z, z_p, m_p, logs_p, m_q, logs_q) + + @torch.jit.export + def infer( + self, + phone: torch.Tensor, + phone_lengths: torch.Tensor, + pitch: torch.Tensor, + nsff0: torch.Tensor, + sid: torch.Tensor, + skip_head: Optional[torch.Tensor] = None, + return_length: Optional[torch.Tensor] = None, + return_length2: Optional[torch.Tensor] = None, + ): + g = self.emb_g(sid).unsqueeze(-1) + if skip_head is not None and return_length is not None: + assert isinstance(skip_head, torch.Tensor) + assert isinstance(return_length, torch.Tensor) + head = int(skip_head.item()) + length = int(return_length.item()) + flow_head = torch.clamp(skip_head - 24, min=0) + dec_head = head - int(flow_head.item()) + m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths, flow_head) + z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66666) * x_mask + z = self.flow(z_p, x_mask, g=g, reverse=True) + z = z[:, :, dec_head : dec_head + length] + x_mask = x_mask[:, :, dec_head : dec_head + length] + nsff0 = nsff0[:, head : head + length] + else: + m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths) + z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66666) * x_mask + z = self.flow(z_p, x_mask, g=g, reverse=True) + o = self.dec(z * x_mask, nsff0, g=g, n_res=return_length2) + return o, x_mask, (z, z_p, m_p, logs_p) + + +class SynthesizerTrnMs768NSFsid(SynthesizerTrnMs256NSFsid): + def __init__( + self, + spec_channels, + segment_size, + inter_channels, + hidden_channels, + filter_channels, + n_heads, + n_layers, + kernel_size, + p_dropout, + resblock, + resblock_kernel_sizes, + resblock_dilation_sizes, + upsample_rates, + upsample_initial_channel, + upsample_kernel_sizes, + spk_embed_dim, + gin_channels, + sr, + **kwargs + ): + super(SynthesizerTrnMs768NSFsid, self).__init__( + spec_channels, + segment_size, + inter_channels, + hidden_channels, + filter_channels, + n_heads, + n_layers, + kernel_size, + p_dropout, + 
resblock, + resblock_kernel_sizes, + resblock_dilation_sizes, + upsample_rates, + upsample_initial_channel, + upsample_kernel_sizes, + spk_embed_dim, + gin_channels, + sr, + **kwargs + ) + del self.enc_p + self.enc_p = TextEncoder( + 768, + inter_channels, + hidden_channels, + filter_channels, + n_heads, + n_layers, + kernel_size, + float(p_dropout), + ) + + +class SynthesizerTrnMs256NSFsid_nono(nn.Module): + def __init__( + self, + spec_channels, + segment_size, + inter_channels, + hidden_channels, + filter_channels, + n_heads, + n_layers, + kernel_size, + p_dropout, + resblock, + resblock_kernel_sizes, + resblock_dilation_sizes, + upsample_rates, + upsample_initial_channel, + upsample_kernel_sizes, + spk_embed_dim, + gin_channels, + sr=None, + **kwargs + ): + super(SynthesizerTrnMs256NSFsid_nono, self).__init__() + self.spec_channels = spec_channels + self.inter_channels = inter_channels + self.hidden_channels = hidden_channels + self.filter_channels = filter_channels + self.n_heads = n_heads + self.n_layers = n_layers + self.kernel_size = kernel_size + self.p_dropout = float(p_dropout) + self.resblock = resblock + self.resblock_kernel_sizes = resblock_kernel_sizes + self.resblock_dilation_sizes = resblock_dilation_sizes + self.upsample_rates = upsample_rates + self.upsample_initial_channel = upsample_initial_channel + self.upsample_kernel_sizes = upsample_kernel_sizes + self.segment_size = segment_size + self.gin_channels = gin_channels + # self.hop_length = hop_length# + self.spk_embed_dim = spk_embed_dim + self.enc_p = TextEncoder( + 256, + inter_channels, + hidden_channels, + filter_channels, + n_heads, + n_layers, + kernel_size, + float(p_dropout), + f0=False, + ) + self.dec = Generator( + inter_channels, + resblock, + resblock_kernel_sizes, + resblock_dilation_sizes, + upsample_rates, + upsample_initial_channel, + upsample_kernel_sizes, + gin_channels=gin_channels, + ) + self.enc_q = PosteriorEncoder( + spec_channels, + inter_channels, + hidden_channels, + 5, + 1, + 16, + gin_channels=gin_channels, + ) + self.flow = ResidualCouplingBlock( + inter_channels, hidden_channels, 5, 1, 3, gin_channels=gin_channels + ) + self.emb_g = nn.Embedding(self.spk_embed_dim, gin_channels) + logger.debug( + "gin_channels: " + + str(gin_channels) + + ", self.spk_embed_dim: " + + str(self.spk_embed_dim) + ) + + def remove_weight_norm(self): + self.dec.remove_weight_norm() + self.flow.remove_weight_norm() + if hasattr(self, "enc_q"): + self.enc_q.remove_weight_norm() + + def __prepare_scriptable__(self): + for hook in self.dec._forward_pre_hooks.values(): + # The hook we want to remove is an instance of WeightNorm class, so + # normally we would do `if isinstance(...)` but this class is not accessible + # because of shadowing, so we check the module name directly. 
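+ # (Identical cleanup to the f0 variant above; here self.dec is the plain Generator
+ # rather than GeneratorNSF, but the WeightNorm hook handling is the same.)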
+ # https://github.com/pytorch/pytorch/blob/be0ca00c5ce260eb5bcec3237357f7a30cc08983/torch/nn/utils/__init__.py#L3 + if ( + hook.__module__ == "torch.nn.utils.weight_norm" + and hook.__class__.__name__ == "WeightNorm" + ): + torch.nn.utils.remove_weight_norm(self.dec) + for hook in self.flow._forward_pre_hooks.values(): + if ( + hook.__module__ == "torch.nn.utils.weight_norm" + and hook.__class__.__name__ == "WeightNorm" + ): + torch.nn.utils.remove_weight_norm(self.flow) + if hasattr(self, "enc_q"): + for hook in self.enc_q._forward_pre_hooks.values(): + if ( + hook.__module__ == "torch.nn.utils.weight_norm" + and hook.__class__.__name__ == "WeightNorm" + ): + torch.nn.utils.remove_weight_norm(self.enc_q) + return self + + @torch.jit.ignore + def forward(self, phone, phone_lengths, y, y_lengths, ds): # 这里ds是id,[bs,1] + g = self.emb_g(ds).unsqueeze(-1) # [b, 256, 1]##1是t,广播的 + m_p, logs_p, x_mask = self.enc_p(phone, None, phone_lengths) + z, m_q, logs_q, y_mask = self.enc_q(y, y_lengths, g=g) + z_p = self.flow(z, y_mask, g=g) + z_slice, ids_slice = commons.rand_slice_segments( + z, y_lengths, self.segment_size + ) + o = self.dec(z_slice, g=g) + return o, ids_slice, x_mask, y_mask, (z, z_p, m_p, logs_p, m_q, logs_q) + + @torch.jit.export + def infer( + self, + phone: torch.Tensor, + phone_lengths: torch.Tensor, + sid: torch.Tensor, + skip_head: Optional[torch.Tensor] = None, + return_length: Optional[torch.Tensor] = None, + return_length2: Optional[torch.Tensor] = None, + ): + g = self.emb_g(sid).unsqueeze(-1) + if skip_head is not None and return_length is not None: + assert isinstance(skip_head, torch.Tensor) + assert isinstance(return_length, torch.Tensor) + head = int(skip_head.item()) + length = int(return_length.item()) + flow_head = torch.clamp(skip_head - 24, min=0) + dec_head = head - int(flow_head.item()) + m_p, logs_p, x_mask = self.enc_p(phone, None, phone_lengths, flow_head) + z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66666) * x_mask + z = self.flow(z_p, x_mask, g=g, reverse=True) + z = z[:, :, dec_head : dec_head + length] + x_mask = x_mask[:, :, dec_head : dec_head + length] + else: + m_p, logs_p, x_mask = self.enc_p(phone, None, phone_lengths) + z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66666) * x_mask + z = self.flow(z_p, x_mask, g=g, reverse=True) + o = self.dec(z * x_mask, g=g, n_res=return_length2) + return o, x_mask, (z, z_p, m_p, logs_p) + + +class SynthesizerTrnMs768NSFsid_nono(SynthesizerTrnMs256NSFsid_nono): + def __init__( + self, + spec_channels, + segment_size, + inter_channels, + hidden_channels, + filter_channels, + n_heads, + n_layers, + kernel_size, + p_dropout, + resblock, + resblock_kernel_sizes, + resblock_dilation_sizes, + upsample_rates, + upsample_initial_channel, + upsample_kernel_sizes, + spk_embed_dim, + gin_channels, + sr=None, + **kwargs + ): + super(SynthesizerTrnMs768NSFsid_nono, self).__init__( + spec_channels, + segment_size, + inter_channels, + hidden_channels, + filter_channels, + n_heads, + n_layers, + kernel_size, + p_dropout, + resblock, + resblock_kernel_sizes, + resblock_dilation_sizes, + upsample_rates, + upsample_initial_channel, + upsample_kernel_sizes, + spk_embed_dim, + gin_channels, + sr, + **kwargs + ) + del self.enc_p + self.enc_p = TextEncoder( + 768, + inter_channels, + hidden_channels, + filter_channels, + n_heads, + n_layers, + kernel_size, + float(p_dropout), + f0=False, + ) + + +class MultiPeriodDiscriminator(torch.nn.Module): + def __init__(self, use_spectral_norm=False): + 
super(MultiPeriodDiscriminator, self).__init__() + periods = [2, 3, 5, 7, 11, 17] + # periods = [3, 5, 7, 11, 17, 23, 37] + + discs = [DiscriminatorS(use_spectral_norm=use_spectral_norm)] + discs = discs + [ + DiscriminatorP(i, use_spectral_norm=use_spectral_norm) for i in periods + ] + self.discriminators = nn.ModuleList(discs) + + def forward(self, y, y_hat): + y_d_rs = [] # + y_d_gs = [] + fmap_rs = [] + fmap_gs = [] + for i, d in enumerate(self.discriminators): + y_d_r, fmap_r = d(y) + y_d_g, fmap_g = d(y_hat) + # for j in range(len(fmap_r)): + # print(i,j,y.shape,y_hat.shape,fmap_r[j].shape,fmap_g[j].shape) + y_d_rs.append(y_d_r) + y_d_gs.append(y_d_g) + fmap_rs.append(fmap_r) + fmap_gs.append(fmap_g) + + return y_d_rs, y_d_gs, fmap_rs, fmap_gs + + +class MultiPeriodDiscriminatorV2(torch.nn.Module): + def __init__(self, use_spectral_norm=False): + super(MultiPeriodDiscriminatorV2, self).__init__() + # periods = [2, 3, 5, 7, 11, 17] + periods = [2, 3, 5, 7, 11, 17, 23, 37] + + discs = [DiscriminatorS(use_spectral_norm=use_spectral_norm)] + discs = discs + [ + DiscriminatorP(i, use_spectral_norm=use_spectral_norm) for i in periods + ] + self.discriminators = nn.ModuleList(discs) + + def forward(self, y, y_hat): + y_d_rs = [] # + y_d_gs = [] + fmap_rs = [] + fmap_gs = [] + for i, d in enumerate(self.discriminators): + y_d_r, fmap_r = d(y) + y_d_g, fmap_g = d(y_hat) + # for j in range(len(fmap_r)): + # print(i,j,y.shape,y_hat.shape,fmap_r[j].shape,fmap_g[j].shape) + y_d_rs.append(y_d_r) + y_d_gs.append(y_d_g) + fmap_rs.append(fmap_r) + fmap_gs.append(fmap_g) + + return y_d_rs, y_d_gs, fmap_rs, fmap_gs + + +class DiscriminatorS(torch.nn.Module): + def __init__(self, use_spectral_norm=False): + super(DiscriminatorS, self).__init__() + norm_f = weight_norm if use_spectral_norm == False else spectral_norm + self.convs = nn.ModuleList( + [ + norm_f(Conv1d(1, 16, 15, 1, padding=7)), + norm_f(Conv1d(16, 64, 41, 4, groups=4, padding=20)), + norm_f(Conv1d(64, 256, 41, 4, groups=16, padding=20)), + norm_f(Conv1d(256, 1024, 41, 4, groups=64, padding=20)), + norm_f(Conv1d(1024, 1024, 41, 4, groups=256, padding=20)), + norm_f(Conv1d(1024, 1024, 5, 1, padding=2)), + ] + ) + self.conv_post = norm_f(Conv1d(1024, 1, 3, 1, padding=1)) + + def forward(self, x): + fmap = [] + + for l in self.convs: + x = l(x) + x = F.leaky_relu(x, modules.LRELU_SLOPE) + fmap.append(x) + x = self.conv_post(x) + fmap.append(x) + x = torch.flatten(x, 1, -1) + + return x, fmap + + +class DiscriminatorP(torch.nn.Module): + def __init__(self, period, kernel_size=5, stride=3, use_spectral_norm=False): + super(DiscriminatorP, self).__init__() + self.period = period + self.use_spectral_norm = use_spectral_norm + norm_f = weight_norm if use_spectral_norm == False else spectral_norm + self.convs = nn.ModuleList( + [ + norm_f( + Conv2d( + 1, + 32, + (kernel_size, 1), + (stride, 1), + padding=(get_padding(kernel_size, 1), 0), + ) + ), + norm_f( + Conv2d( + 32, + 128, + (kernel_size, 1), + (stride, 1), + padding=(get_padding(kernel_size, 1), 0), + ) + ), + norm_f( + Conv2d( + 128, + 512, + (kernel_size, 1), + (stride, 1), + padding=(get_padding(kernel_size, 1), 0), + ) + ), + norm_f( + Conv2d( + 512, + 1024, + (kernel_size, 1), + (stride, 1), + padding=(get_padding(kernel_size, 1), 0), + ) + ), + norm_f( + Conv2d( + 1024, + 1024, + (kernel_size, 1), + 1, + padding=(get_padding(kernel_size, 1), 0), + ) + ), + ] + ) + self.conv_post = norm_f(Conv2d(1024, 1, (3, 1), 1, padding=(1, 0))) + + def forward(self, x): + fmap = [] + + # 1d to 2d + 
b, c, t = x.shape + if t % self.period != 0: # pad first + n_pad = self.period - (t % self.period) + if has_xpu and x.dtype == torch.bfloat16: + x = F.pad(x.to(dtype=torch.float16), (0, n_pad), "reflect").to( + dtype=torch.bfloat16 + ) + else: + x = F.pad(x, (0, n_pad), "reflect") + t = t + n_pad + x = x.view(b, c, t // self.period, self.period) + + for l in self.convs: + x = l(x) + x = F.leaky_relu(x, modules.LRELU_SLOPE) + fmap.append(x) + x = self.conv_post(x) + fmap.append(x) + x = torch.flatten(x, 1, -1) + + return x, fmap diff --git a/infer/lib/infer_pack/models_onnx.py b/infer/lib/infer_pack/models_onnx.py new file mode 100644 index 0000000000000000000000000000000000000000..4668de2a842d7c8fd0eabda4dae3aedfc9966746 --- /dev/null +++ b/infer/lib/infer_pack/models_onnx.py @@ -0,0 +1,818 @@ +############################## Warning! ############################## +# # +# Onnx Export Not Support All Of Non-Torch Types # +# Include Python Built-in Types!!!!!!!!!!!!!!!!! # +# If You Want TO Change This File # +# Do Not Use All Of Non-Torch Types! # +# # +############################## Warning! ############################## + +import math +import logging + +logger = logging.getLogger(__name__) + +import numpy as np +import torch +from torch import nn +from torch.nn import AvgPool1d, Conv1d, Conv2d, ConvTranspose1d +from torch.nn import functional as F +from torch.nn.utils import remove_weight_norm, spectral_norm, weight_norm + +from infer.lib.infer_pack import commons, modules +import infer.lib.infer_pack.attentions_onnx as attentions +from infer.lib.infer_pack.commons import get_padding, init_weights + + +class TextEncoder256(nn.Module): + def __init__( + self, + out_channels, + hidden_channels, + filter_channels, + n_heads, + n_layers, + kernel_size, + p_dropout, + f0=True, + ): + super().__init__() + self.out_channels = out_channels + self.hidden_channels = hidden_channels + self.filter_channels = filter_channels + self.n_heads = n_heads + self.n_layers = n_layers + self.kernel_size = kernel_size + self.p_dropout = p_dropout + self.emb_phone = nn.Linear(256, hidden_channels) + self.lrelu = nn.LeakyReLU(0.1, inplace=True) + if f0 == True: + self.emb_pitch = nn.Embedding(256, hidden_channels) # pitch 256 + self.encoder = attentions.Encoder( + hidden_channels, filter_channels, n_heads, n_layers, kernel_size, p_dropout + ) + self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1) + + def forward(self, phone, pitch, lengths): + if pitch == None: + x = self.emb_phone(phone) + else: + x = self.emb_phone(phone) + self.emb_pitch(pitch) + x = x * math.sqrt(self.hidden_channels) # [b, t, h] + x = self.lrelu(x) + x = torch.transpose(x, 1, -1) # [b, h, t] + x_mask = torch.unsqueeze(commons.sequence_mask(lengths, x.size(2)), 1).to( + x.dtype + ) + x = self.encoder(x * x_mask, x_mask) + stats = self.proj(x) * x_mask + + m, logs = torch.split(stats, self.out_channels, dim=1) + return m, logs, x_mask + + +class TextEncoder768(nn.Module): + def __init__( + self, + out_channels, + hidden_channels, + filter_channels, + n_heads, + n_layers, + kernel_size, + p_dropout, + f0=True, + ): + super().__init__() + self.out_channels = out_channels + self.hidden_channels = hidden_channels + self.filter_channels = filter_channels + self.n_heads = n_heads + self.n_layers = n_layers + self.kernel_size = kernel_size + self.p_dropout = p_dropout + self.emb_phone = nn.Linear(768, hidden_channels) + self.lrelu = nn.LeakyReLU(0.1, inplace=True) + if f0 == True: + self.emb_pitch = nn.Embedding(256, hidden_channels) # pitch 
256 + self.encoder = attentions.Encoder( + hidden_channels, filter_channels, n_heads, n_layers, kernel_size, p_dropout + ) + self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1) + + def forward(self, phone, pitch, lengths): + if pitch == None: + x = self.emb_phone(phone) + else: + x = self.emb_phone(phone) + self.emb_pitch(pitch) + x = x * math.sqrt(self.hidden_channels) # [b, t, h] + x = self.lrelu(x) + x = torch.transpose(x, 1, -1) # [b, h, t] + x_mask = torch.unsqueeze(commons.sequence_mask(lengths, x.size(2)), 1).to( + x.dtype + ) + x = self.encoder(x * x_mask, x_mask) + stats = self.proj(x) * x_mask + + m, logs = torch.split(stats, self.out_channels, dim=1) + return m, logs, x_mask + + +class ResidualCouplingBlock(nn.Module): + def __init__( + self, + channels, + hidden_channels, + kernel_size, + dilation_rate, + n_layers, + n_flows=4, + gin_channels=0, + ): + super().__init__() + self.channels = channels + self.hidden_channels = hidden_channels + self.kernel_size = kernel_size + self.dilation_rate = dilation_rate + self.n_layers = n_layers + self.n_flows = n_flows + self.gin_channels = gin_channels + + self.flows = nn.ModuleList() + for i in range(n_flows): + self.flows.append( + modules.ResidualCouplingLayer( + channels, + hidden_channels, + kernel_size, + dilation_rate, + n_layers, + gin_channels=gin_channels, + mean_only=True, + ) + ) + self.flows.append(modules.Flip()) + + def forward(self, x, x_mask, g=None, reverse=False): + if not reverse: + for flow in self.flows: + x, _ = flow(x, x_mask, g=g, reverse=reverse) + else: + for flow in reversed(self.flows): + x, _ = flow(x, x_mask, g=g, reverse=reverse) + return x + + def remove_weight_norm(self): + for i in range(self.n_flows): + self.flows[i * 2].remove_weight_norm() + + +class PosteriorEncoder(nn.Module): + def __init__( + self, + in_channels, + out_channels, + hidden_channels, + kernel_size, + dilation_rate, + n_layers, + gin_channels=0, + ): + super().__init__() + self.in_channels = in_channels + self.out_channels = out_channels + self.hidden_channels = hidden_channels + self.kernel_size = kernel_size + self.dilation_rate = dilation_rate + self.n_layers = n_layers + self.gin_channels = gin_channels + + self.pre = nn.Conv1d(in_channels, hidden_channels, 1) + self.enc = modules.WN( + hidden_channels, + kernel_size, + dilation_rate, + n_layers, + gin_channels=gin_channels, + ) + self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1) + + def forward(self, x, x_lengths, g=None): + x_mask = torch.unsqueeze(commons.sequence_mask(x_lengths, x.size(2)), 1).to( + x.dtype + ) + x = self.pre(x) * x_mask + x = self.enc(x, x_mask, g=g) + stats = self.proj(x) * x_mask + m, logs = torch.split(stats, self.out_channels, dim=1) + z = (m + torch.randn_like(m) * torch.exp(logs)) * x_mask + return z, m, logs, x_mask + + def remove_weight_norm(self): + self.enc.remove_weight_norm() + + +class Generator(torch.nn.Module): + def __init__( + self, + initial_channel, + resblock, + resblock_kernel_sizes, + resblock_dilation_sizes, + upsample_rates, + upsample_initial_channel, + upsample_kernel_sizes, + gin_channels=0, + ): + super(Generator, self).__init__() + self.num_kernels = len(resblock_kernel_sizes) + self.num_upsamples = len(upsample_rates) + self.conv_pre = Conv1d( + initial_channel, upsample_initial_channel, 7, 1, padding=3 + ) + resblock = modules.ResBlock1 if resblock == "1" else modules.ResBlock2 + + self.ups = nn.ModuleList() + for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)): + self.ups.append( + 
weight_norm( + ConvTranspose1d( + upsample_initial_channel // (2**i), + upsample_initial_channel // (2 ** (i + 1)), + k, + u, + padding=(k - u) // 2, + ) + ) + ) + + self.resblocks = nn.ModuleList() + for i in range(len(self.ups)): + ch = upsample_initial_channel // (2 ** (i + 1)) + for j, (k, d) in enumerate( + zip(resblock_kernel_sizes, resblock_dilation_sizes) + ): + self.resblocks.append(resblock(ch, k, d)) + + self.conv_post = Conv1d(ch, 1, 7, 1, padding=3, bias=False) + self.ups.apply(init_weights) + + if gin_channels != 0: + self.cond = nn.Conv1d(gin_channels, upsample_initial_channel, 1) + + def forward(self, x, g=None): + x = self.conv_pre(x) + if g is not None: + x = x + self.cond(g) + + for i in range(self.num_upsamples): + x = F.leaky_relu(x, modules.LRELU_SLOPE) + x = self.ups[i](x) + xs = None + for j in range(self.num_kernels): + if xs is None: + xs = self.resblocks[i * self.num_kernels + j](x) + else: + xs += self.resblocks[i * self.num_kernels + j](x) + x = xs / self.num_kernels + x = F.leaky_relu(x) + x = self.conv_post(x) + x = torch.tanh(x) + + return x + + def remove_weight_norm(self): + for l in self.ups: + remove_weight_norm(l) + for l in self.resblocks: + l.remove_weight_norm() + + +class SineGen(torch.nn.Module): + """Definition of sine generator + SineGen(samp_rate, harmonic_num = 0, + sine_amp = 0.1, noise_std = 0.003, + voiced_threshold = 0, + flag_for_pulse=False) + samp_rate: sampling rate in Hz + harmonic_num: number of harmonic overtones (default 0) + sine_amp: amplitude of sine-wavefrom (default 0.1) + noise_std: std of Gaussian noise (default 0.003) + voiced_thoreshold: F0 threshold for U/V classification (default 0) + flag_for_pulse: this SinGen is used inside PulseGen (default False) + Note: when flag_for_pulse is True, the first time step of a voiced + segment is always sin(np.pi) or cos(0) + """ + + def __init__( + self, + samp_rate, + harmonic_num=0, + sine_amp=0.1, + noise_std=0.003, + voiced_threshold=0, + flag_for_pulse=False, + ): + super(SineGen, self).__init__() + self.sine_amp = sine_amp + self.noise_std = noise_std + self.harmonic_num = harmonic_num + self.dim = self.harmonic_num + 1 + self.sampling_rate = samp_rate + self.voiced_threshold = voiced_threshold + + def _f02uv(self, f0): + # generate uv signal + uv = torch.ones_like(f0) + uv = uv * (f0 > self.voiced_threshold) + if uv.device.type == "privateuseone": # for DirectML + uv = uv.float() + return uv + + def _f02sine(self, f0, upp): + """ f0: (batchsize, length, dim) + where dim indicates fundamental tone and overtones + """ + a = torch.arange(1, upp + 1, dtype=f0.dtype, device=f0.device) + rad = f0 / self.sampling_rate * a + rad2 = torch.fmod(rad[:, :-1, -1:].float() + 0.5, 1.0) - 0.5 + rad_acc = rad2.cumsum(dim=1).fmod(1.0).to(f0) + rad += F.pad(rad_acc, (0, 0, 1, 0), mode='constant') + rad = rad.reshape(f0.shape[0], -1, 1) + b = torch.arange(1, self.dim + 1, dtype=f0.dtype, device=f0.device).reshape(1, 1, -1) + rad *= b + rand_ini = torch.rand(1, 1, self.dim, device=f0.device) + rand_ini[..., 0] = 0 + rad += rand_ini + sines = torch.sin(2 * np.pi * rad) + return sines + + def forward(self, f0: torch.Tensor, upp: int): + """sine_tensor, uv = forward(f0) + input F0: tensor(batchsize=1, length, dim=1) + f0 for unvoiced steps should be 0 + output sine_tensor: tensor(batchsize=1, length, dim) + output uv: tensor(batchsize=1, length, 1) + """ + with torch.no_grad(): + f0 = f0.unsqueeze(-1) + sine_waves = self._f02sine(f0, upp) * self.sine_amp + uv = self._f02uv(f0) + uv = F.interpolate( + 
uv.transpose(2, 1), scale_factor=float(upp), mode="nearest" + ).transpose(2, 1) + noise_amp = uv * self.noise_std + (1 - uv) * self.sine_amp / 3 + noise = noise_amp * torch.randn_like(sine_waves) + sine_waves = sine_waves * uv + noise + return sine_waves, uv, noise + + +class SourceModuleHnNSF(torch.nn.Module): + """SourceModule for hn-nsf + SourceModule(sampling_rate, harmonic_num=0, sine_amp=0.1, + add_noise_std=0.003, voiced_threshod=0) + sampling_rate: sampling_rate in Hz + harmonic_num: number of harmonic above F0 (default: 0) + sine_amp: amplitude of sine source signal (default: 0.1) + add_noise_std: std of additive Gaussian noise (default: 0.003) + note that amplitude of noise in unvoiced is decided + by sine_amp + voiced_threshold: threhold to set U/V given F0 (default: 0) + Sine_source, noise_source = SourceModuleHnNSF(F0_sampled) + F0_sampled (batchsize, length, 1) + Sine_source (batchsize, length, 1) + noise_source (batchsize, length 1) + uv (batchsize, length, 1) + """ + + def __init__( + self, + sampling_rate, + harmonic_num=0, + sine_amp=0.1, + add_noise_std=0.003, + voiced_threshod=0, + is_half=True, + ): + super(SourceModuleHnNSF, self).__init__() + + self.sine_amp = sine_amp + self.noise_std = add_noise_std + self.is_half = is_half + # to produce sine waveforms + self.l_sin_gen = SineGen( + sampling_rate, harmonic_num, sine_amp, add_noise_std, voiced_threshod + ) + + # to merge source harmonics into a single excitation + self.l_linear = torch.nn.Linear(harmonic_num + 1, 1) + self.l_tanh = torch.nn.Tanh() + + def forward(self, x, upp=None): + sine_wavs, uv, _ = self.l_sin_gen(x, upp) + if self.is_half: + sine_wavs = sine_wavs.half() + sine_merge = self.l_tanh(self.l_linear(sine_wavs)) + return sine_merge, None, None # noise, uv + + +class GeneratorNSF(torch.nn.Module): + def __init__( + self, + initial_channel, + resblock, + resblock_kernel_sizes, + resblock_dilation_sizes, + upsample_rates, + upsample_initial_channel, + upsample_kernel_sizes, + gin_channels, + sr, + is_half=False, + ): + super(GeneratorNSF, self).__init__() + self.num_kernels = len(resblock_kernel_sizes) + self.num_upsamples = len(upsample_rates) + + self.f0_upsamp = torch.nn.Upsample(scale_factor=np.prod(upsample_rates)) + self.m_source = SourceModuleHnNSF( + sampling_rate=sr, harmonic_num=0, is_half=is_half + ) + self.noise_convs = nn.ModuleList() + self.conv_pre = Conv1d( + initial_channel, upsample_initial_channel, 7, 1, padding=3 + ) + resblock = modules.ResBlock1 if resblock == "1" else modules.ResBlock2 + + self.ups = nn.ModuleList() + for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)): + c_cur = upsample_initial_channel // (2 ** (i + 1)) + self.ups.append( + weight_norm( + ConvTranspose1d( + upsample_initial_channel // (2**i), + upsample_initial_channel // (2 ** (i + 1)), + k, + u, + padding=(k - u) // 2, + ) + ) + ) + if i + 1 < len(upsample_rates): + stride_f0 = np.prod(upsample_rates[i + 1 :]) + self.noise_convs.append( + Conv1d( + 1, + c_cur, + kernel_size=stride_f0 * 2, + stride=stride_f0, + padding=stride_f0 // 2, + ) + ) + else: + self.noise_convs.append(Conv1d(1, c_cur, kernel_size=1)) + + self.resblocks = nn.ModuleList() + for i in range(len(self.ups)): + ch = upsample_initial_channel // (2 ** (i + 1)) + for j, (k, d) in enumerate( + zip(resblock_kernel_sizes, resblock_dilation_sizes) + ): + self.resblocks.append(resblock(ch, k, d)) + + self.conv_post = Conv1d(ch, 1, 7, 1, padding=3, bias=False) + self.ups.apply(init_weights) + + if gin_channels != 0: + self.cond = 
nn.Conv1d(gin_channels, upsample_initial_channel, 1) + + self.upp = np.prod(upsample_rates) + + def forward(self, x, f0, g=None): + har_source, noi_source, uv = self.m_source(f0, self.upp) + har_source = har_source.transpose(1, 2) + x = self.conv_pre(x) + if g is not None: + x = x + self.cond(g) + + for i in range(self.num_upsamples): + x = F.leaky_relu(x, modules.LRELU_SLOPE) + x = self.ups[i](x) + x_source = self.noise_convs[i](har_source) + x = x + x_source + xs = None + for j in range(self.num_kernels): + if xs is None: + xs = self.resblocks[i * self.num_kernels + j](x) + else: + xs += self.resblocks[i * self.num_kernels + j](x) + x = xs / self.num_kernels + x = F.leaky_relu(x) + x = self.conv_post(x) + x = torch.tanh(x) + return x + + def remove_weight_norm(self): + for l in self.ups: + remove_weight_norm(l) + for l in self.resblocks: + l.remove_weight_norm() + + +sr2sr = { + "32k": 32000, + "40k": 40000, + "48k": 48000, +} + + +class SynthesizerTrnMsNSFsidM(nn.Module): + def __init__( + self, + spec_channels, + segment_size, + inter_channels, + hidden_channels, + filter_channels, + n_heads, + n_layers, + kernel_size, + p_dropout, + resblock, + resblock_kernel_sizes, + resblock_dilation_sizes, + upsample_rates, + upsample_initial_channel, + upsample_kernel_sizes, + spk_embed_dim, + gin_channels, + sr, + version, + **kwargs, + ): + super().__init__() + if type(sr) == type("strr"): + sr = sr2sr[sr] + self.spec_channels = spec_channels + self.inter_channels = inter_channels + self.hidden_channels = hidden_channels + self.filter_channels = filter_channels + self.n_heads = n_heads + self.n_layers = n_layers + self.kernel_size = kernel_size + self.p_dropout = p_dropout + self.resblock = resblock + self.resblock_kernel_sizes = resblock_kernel_sizes + self.resblock_dilation_sizes = resblock_dilation_sizes + self.upsample_rates = upsample_rates + self.upsample_initial_channel = upsample_initial_channel + self.upsample_kernel_sizes = upsample_kernel_sizes + self.segment_size = segment_size + self.gin_channels = gin_channels + # self.hop_length = hop_length# + self.spk_embed_dim = spk_embed_dim + if version == "v1": + self.enc_p = TextEncoder256( + inter_channels, + hidden_channels, + filter_channels, + n_heads, + n_layers, + kernel_size, + p_dropout, + ) + else: + self.enc_p = TextEncoder768( + inter_channels, + hidden_channels, + filter_channels, + n_heads, + n_layers, + kernel_size, + p_dropout, + ) + self.dec = GeneratorNSF( + inter_channels, + resblock, + resblock_kernel_sizes, + resblock_dilation_sizes, + upsample_rates, + upsample_initial_channel, + upsample_kernel_sizes, + gin_channels=gin_channels, + sr=sr, + is_half=kwargs["is_half"], + ) + self.enc_q = PosteriorEncoder( + spec_channels, + inter_channels, + hidden_channels, + 5, + 1, + 16, + gin_channels=gin_channels, + ) + self.flow = ResidualCouplingBlock( + inter_channels, hidden_channels, 5, 1, 3, gin_channels=gin_channels + ) + self.emb_g = nn.Embedding(self.spk_embed_dim, gin_channels) + self.speaker_map = None + logger.debug( + f"gin_channels: {gin_channels}, self.spk_embed_dim: {self.spk_embed_dim}" + ) + + def remove_weight_norm(self): + self.dec.remove_weight_norm() + self.flow.remove_weight_norm() + self.enc_q.remove_weight_norm() + + def construct_spkmixmap(self, n_speaker): + self.speaker_map = torch.zeros((n_speaker, 1, 1, self.gin_channels)) + for i in range(n_speaker): + self.speaker_map[i] = self.emb_g(torch.LongTensor([[i]])) + self.speaker_map = self.speaker_map.unsqueeze(0) + + def forward(self, phone, 
phone_lengths, pitch, nsff0, g, rnd, max_len=None): + if self.speaker_map is not None: # [N, S] * [S, B, 1, H] + g = g.reshape((g.shape[0], g.shape[1], 1, 1, 1)) # [N, S, B, 1, 1] + g = g * self.speaker_map # [N, S, B, 1, H] + g = torch.sum(g, dim=1) # [N, 1, B, 1, H] + g = g.transpose(0, -1).transpose(0, -2).squeeze(0) # [B, H, N] + else: + g = g.unsqueeze(0) + g = self.emb_g(g).transpose(1, 2) + + m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths) + z_p = (m_p + torch.exp(logs_p) * rnd) * x_mask + z = self.flow(z_p, x_mask, g=g, reverse=True) + o = self.dec((z * x_mask)[:, :, :max_len], nsff0, g=g) + return o + + +class MultiPeriodDiscriminator(torch.nn.Module): + def __init__(self, use_spectral_norm=False): + super(MultiPeriodDiscriminator, self).__init__() + periods = [2, 3, 5, 7, 11, 17] + # periods = [3, 5, 7, 11, 17, 23, 37] + + discs = [DiscriminatorS(use_spectral_norm=use_spectral_norm)] + discs = discs + [ + DiscriminatorP(i, use_spectral_norm=use_spectral_norm) for i in periods + ] + self.discriminators = nn.ModuleList(discs) + + def forward(self, y, y_hat): + y_d_rs = [] # + y_d_gs = [] + fmap_rs = [] + fmap_gs = [] + for i, d in enumerate(self.discriminators): + y_d_r, fmap_r = d(y) + y_d_g, fmap_g = d(y_hat) + # for j in range(len(fmap_r)): + # print(i,j,y.shape,y_hat.shape,fmap_r[j].shape,fmap_g[j].shape) + y_d_rs.append(y_d_r) + y_d_gs.append(y_d_g) + fmap_rs.append(fmap_r) + fmap_gs.append(fmap_g) + + return y_d_rs, y_d_gs, fmap_rs, fmap_gs + + +class MultiPeriodDiscriminatorV2(torch.nn.Module): + def __init__(self, use_spectral_norm=False): + super(MultiPeriodDiscriminatorV2, self).__init__() + # periods = [2, 3, 5, 7, 11, 17] + periods = [2, 3, 5, 7, 11, 17, 23, 37] + + discs = [DiscriminatorS(use_spectral_norm=use_spectral_norm)] + discs = discs + [ + DiscriminatorP(i, use_spectral_norm=use_spectral_norm) for i in periods + ] + self.discriminators = nn.ModuleList(discs) + + def forward(self, y, y_hat): + y_d_rs = [] # + y_d_gs = [] + fmap_rs = [] + fmap_gs = [] + for i, d in enumerate(self.discriminators): + y_d_r, fmap_r = d(y) + y_d_g, fmap_g = d(y_hat) + # for j in range(len(fmap_r)): + # print(i,j,y.shape,y_hat.shape,fmap_r[j].shape,fmap_g[j].shape) + y_d_rs.append(y_d_r) + y_d_gs.append(y_d_g) + fmap_rs.append(fmap_r) + fmap_gs.append(fmap_g) + + return y_d_rs, y_d_gs, fmap_rs, fmap_gs + + +class DiscriminatorS(torch.nn.Module): + def __init__(self, use_spectral_norm=False): + super(DiscriminatorS, self).__init__() + norm_f = weight_norm if use_spectral_norm == False else spectral_norm + self.convs = nn.ModuleList( + [ + norm_f(Conv1d(1, 16, 15, 1, padding=7)), + norm_f(Conv1d(16, 64, 41, 4, groups=4, padding=20)), + norm_f(Conv1d(64, 256, 41, 4, groups=16, padding=20)), + norm_f(Conv1d(256, 1024, 41, 4, groups=64, padding=20)), + norm_f(Conv1d(1024, 1024, 41, 4, groups=256, padding=20)), + norm_f(Conv1d(1024, 1024, 5, 1, padding=2)), + ] + ) + self.conv_post = norm_f(Conv1d(1024, 1, 3, 1, padding=1)) + + def forward(self, x): + fmap = [] + + for l in self.convs: + x = l(x) + x = F.leaky_relu(x, modules.LRELU_SLOPE) + fmap.append(x) + x = self.conv_post(x) + fmap.append(x) + x = torch.flatten(x, 1, -1) + + return x, fmap + + +class DiscriminatorP(torch.nn.Module): + def __init__(self, period, kernel_size=5, stride=3, use_spectral_norm=False): + super(DiscriminatorP, self).__init__() + self.period = period + self.use_spectral_norm = use_spectral_norm + norm_f = weight_norm if use_spectral_norm == False else spectral_norm + self.convs = nn.ModuleList( + [ 
+ norm_f( + Conv2d( + 1, + 32, + (kernel_size, 1), + (stride, 1), + padding=(get_padding(kernel_size, 1), 0), + ) + ), + norm_f( + Conv2d( + 32, + 128, + (kernel_size, 1), + (stride, 1), + padding=(get_padding(kernel_size, 1), 0), + ) + ), + norm_f( + Conv2d( + 128, + 512, + (kernel_size, 1), + (stride, 1), + padding=(get_padding(kernel_size, 1), 0), + ) + ), + norm_f( + Conv2d( + 512, + 1024, + (kernel_size, 1), + (stride, 1), + padding=(get_padding(kernel_size, 1), 0), + ) + ), + norm_f( + Conv2d( + 1024, + 1024, + (kernel_size, 1), + 1, + padding=(get_padding(kernel_size, 1), 0), + ) + ), + ] + ) + self.conv_post = norm_f(Conv2d(1024, 1, (3, 1), 1, padding=(1, 0))) + + def forward(self, x): + fmap = [] + + # 1d to 2d + b, c, t = x.shape + if t % self.period != 0: # pad first + n_pad = self.period - (t % self.period) + x = F.pad(x, (0, n_pad), "reflect") + t = t + n_pad + x = x.view(b, c, t // self.period, self.period) + + for l in self.convs: + x = l(x) + x = F.leaky_relu(x, modules.LRELU_SLOPE) + fmap.append(x) + x = self.conv_post(x) + fmap.append(x) + x = torch.flatten(x, 1, -1) + + return x, fmap diff --git a/infer/lib/infer_pack/modules.py b/infer/lib/infer_pack/modules.py new file mode 100644 index 0000000000000000000000000000000000000000..51aeaf0799819c59714aeed0c8b6a3f8b2872f36 --- /dev/null +++ b/infer/lib/infer_pack/modules.py @@ -0,0 +1,615 @@ +import copy +import math +from typing import Optional, Tuple + +import numpy as np +import scipy +import torch +from torch import nn +from torch.nn import AvgPool1d, Conv1d, Conv2d, ConvTranspose1d +from torch.nn import functional as F +from torch.nn.utils import remove_weight_norm, weight_norm + +from infer.lib.infer_pack import commons +from infer.lib.infer_pack.commons import get_padding, init_weights +from infer.lib.infer_pack.transforms import piecewise_rational_quadratic_transform + +LRELU_SLOPE = 0.1 + + +class LayerNorm(nn.Module): + def __init__(self, channels, eps=1e-5): + super(LayerNorm, self).__init__() + self.channels = channels + self.eps = eps + + self.gamma = nn.Parameter(torch.ones(channels)) + self.beta = nn.Parameter(torch.zeros(channels)) + + def forward(self, x): + x = x.transpose(1, -1) + x = F.layer_norm(x, (self.channels,), self.gamma, self.beta, self.eps) + return x.transpose(1, -1) + + +class ConvReluNorm(nn.Module): + def __init__( + self, + in_channels, + hidden_channels, + out_channels, + kernel_size, + n_layers, + p_dropout, + ): + super(ConvReluNorm, self).__init__() + self.in_channels = in_channels + self.hidden_channels = hidden_channels + self.out_channels = out_channels + self.kernel_size = kernel_size + self.n_layers = n_layers + self.p_dropout = float(p_dropout) + assert n_layers > 1, "Number of layers should be larger than 0." 
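# Illustrative sketch (not part of the patch) of DiscriminatorP's 1D -> 2D fold above:
# the waveform is reflect-padded so its length is a multiple of the period, then viewed
# as (batch, channels, frames, period), so the (kernel_size, 1) 2D convolutions compare
# samples that are exactly one period apart.
import torch
import torch.nn.functional as F

period = 5
x = torch.randn(1, 1, 16003)                 # (b, c, t) raw waveform
b, c, t = x.shape
if t % period != 0:                          # pad first, as in DiscriminatorP.forward
    n_pad = period - (t % period)
    x = F.pad(x, (0, n_pad), "reflect")
    t = t + n_pad
x = x.view(b, c, t // period, period)
print(x.shape)                               # torch.Size([1, 1, 3201, 5])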
+ + self.conv_layers = nn.ModuleList() + self.norm_layers = nn.ModuleList() + self.conv_layers.append( + nn.Conv1d( + in_channels, hidden_channels, kernel_size, padding=kernel_size // 2 + ) + ) + self.norm_layers.append(LayerNorm(hidden_channels)) + self.relu_drop = nn.Sequential(nn.ReLU(), nn.Dropout(float(p_dropout))) + for _ in range(n_layers - 1): + self.conv_layers.append( + nn.Conv1d( + hidden_channels, + hidden_channels, + kernel_size, + padding=kernel_size // 2, + ) + ) + self.norm_layers.append(LayerNorm(hidden_channels)) + self.proj = nn.Conv1d(hidden_channels, out_channels, 1) + self.proj.weight.data.zero_() + self.proj.bias.data.zero_() + + def forward(self, x, x_mask): + x_org = x + for i in range(self.n_layers): + x = self.conv_layers[i](x * x_mask) + x = self.norm_layers[i](x) + x = self.relu_drop(x) + x = x_org + self.proj(x) + return x * x_mask + + +class DDSConv(nn.Module): + """ + Dialted and Depth-Separable Convolution + """ + + def __init__(self, channels, kernel_size, n_layers, p_dropout=0.0): + super(DDSConv, self).__init__() + self.channels = channels + self.kernel_size = kernel_size + self.n_layers = n_layers + self.p_dropout = float(p_dropout) + + self.drop = nn.Dropout(float(p_dropout)) + self.convs_sep = nn.ModuleList() + self.convs_1x1 = nn.ModuleList() + self.norms_1 = nn.ModuleList() + self.norms_2 = nn.ModuleList() + for i in range(n_layers): + dilation = kernel_size**i + padding = (kernel_size * dilation - dilation) // 2 + self.convs_sep.append( + nn.Conv1d( + channels, + channels, + kernel_size, + groups=channels, + dilation=dilation, + padding=padding, + ) + ) + self.convs_1x1.append(nn.Conv1d(channels, channels, 1)) + self.norms_1.append(LayerNorm(channels)) + self.norms_2.append(LayerNorm(channels)) + + def forward(self, x, x_mask, g: Optional[torch.Tensor] = None): + if g is not None: + x = x + g + for i in range(self.n_layers): + y = self.convs_sep[i](x * x_mask) + y = self.norms_1[i](y) + y = F.gelu(y) + y = self.convs_1x1[i](y) + y = self.norms_2[i](y) + y = F.gelu(y) + y = self.drop(y) + x = x + y + return x * x_mask + + +class WN(torch.nn.Module): + def __init__( + self, + hidden_channels, + kernel_size, + dilation_rate, + n_layers, + gin_channels=0, + p_dropout=0, + ): + super(WN, self).__init__() + assert kernel_size % 2 == 1 + self.hidden_channels = hidden_channels + self.kernel_size = (kernel_size,) + self.dilation_rate = dilation_rate + self.n_layers = n_layers + self.gin_channels = gin_channels + self.p_dropout = float(p_dropout) + + self.in_layers = torch.nn.ModuleList() + self.res_skip_layers = torch.nn.ModuleList() + self.drop = nn.Dropout(float(p_dropout)) + + if gin_channels != 0: + cond_layer = torch.nn.Conv1d( + gin_channels, 2 * hidden_channels * n_layers, 1 + ) + self.cond_layer = torch.nn.utils.weight_norm(cond_layer, name="weight") + + for i in range(n_layers): + dilation = dilation_rate**i + padding = int((kernel_size * dilation - dilation) / 2) + in_layer = torch.nn.Conv1d( + hidden_channels, + 2 * hidden_channels, + kernel_size, + dilation=dilation, + padding=padding, + ) + in_layer = torch.nn.utils.weight_norm(in_layer, name="weight") + self.in_layers.append(in_layer) + + # last one is not necessary + if i < n_layers - 1: + res_skip_channels = 2 * hidden_channels + else: + res_skip_channels = hidden_channels + + res_skip_layer = torch.nn.Conv1d(hidden_channels, res_skip_channels, 1) + res_skip_layer = torch.nn.utils.weight_norm(res_skip_layer, name="weight") + self.res_skip_layers.append(res_skip_layer) + + def forward( + 
self, x: torch.Tensor, x_mask: torch.Tensor, g: Optional[torch.Tensor] = None + ): + output = torch.zeros_like(x) + n_channels_tensor = torch.IntTensor([self.hidden_channels]) + + if g is not None: + g = self.cond_layer(g) + + for i, (in_layer, res_skip_layer) in enumerate( + zip(self.in_layers, self.res_skip_layers) + ): + x_in = in_layer(x) + if g is not None: + cond_offset = i * 2 * self.hidden_channels + g_l = g[:, cond_offset : cond_offset + 2 * self.hidden_channels, :] + else: + g_l = torch.zeros_like(x_in) + + acts = commons.fused_add_tanh_sigmoid_multiply(x_in, g_l, n_channels_tensor) + acts = self.drop(acts) + + res_skip_acts = res_skip_layer(acts) + if i < self.n_layers - 1: + res_acts = res_skip_acts[:, : self.hidden_channels, :] + x = (x + res_acts) * x_mask + output = output + res_skip_acts[:, self.hidden_channels :, :] + else: + output = output + res_skip_acts + return output * x_mask + + def remove_weight_norm(self): + if self.gin_channels != 0: + torch.nn.utils.remove_weight_norm(self.cond_layer) + for l in self.in_layers: + torch.nn.utils.remove_weight_norm(l) + for l in self.res_skip_layers: + torch.nn.utils.remove_weight_norm(l) + + def __prepare_scriptable__(self): + if self.gin_channels != 0: + for hook in self.cond_layer._forward_pre_hooks.values(): + if ( + hook.__module__ == "torch.nn.utils.weight_norm" + and hook.__class__.__name__ == "WeightNorm" + ): + torch.nn.utils.remove_weight_norm(self.cond_layer) + for l in self.in_layers: + for hook in l._forward_pre_hooks.values(): + if ( + hook.__module__ == "torch.nn.utils.weight_norm" + and hook.__class__.__name__ == "WeightNorm" + ): + torch.nn.utils.remove_weight_norm(l) + for l in self.res_skip_layers: + for hook in l._forward_pre_hooks.values(): + if ( + hook.__module__ == "torch.nn.utils.weight_norm" + and hook.__class__.__name__ == "WeightNorm" + ): + torch.nn.utils.remove_weight_norm(l) + return self + + +class ResBlock1(torch.nn.Module): + def __init__(self, channels, kernel_size=3, dilation=(1, 3, 5)): + super(ResBlock1, self).__init__() + self.convs1 = nn.ModuleList( + [ + weight_norm( + Conv1d( + channels, + channels, + kernel_size, + 1, + dilation=dilation[0], + padding=get_padding(kernel_size, dilation[0]), + ) + ), + weight_norm( + Conv1d( + channels, + channels, + kernel_size, + 1, + dilation=dilation[1], + padding=get_padding(kernel_size, dilation[1]), + ) + ), + weight_norm( + Conv1d( + channels, + channels, + kernel_size, + 1, + dilation=dilation[2], + padding=get_padding(kernel_size, dilation[2]), + ) + ), + ] + ) + self.convs1.apply(init_weights) + + self.convs2 = nn.ModuleList( + [ + weight_norm( + Conv1d( + channels, + channels, + kernel_size, + 1, + dilation=1, + padding=get_padding(kernel_size, 1), + ) + ), + weight_norm( + Conv1d( + channels, + channels, + kernel_size, + 1, + dilation=1, + padding=get_padding(kernel_size, 1), + ) + ), + weight_norm( + Conv1d( + channels, + channels, + kernel_size, + 1, + dilation=1, + padding=get_padding(kernel_size, 1), + ) + ), + ] + ) + self.convs2.apply(init_weights) + self.lrelu_slope = LRELU_SLOPE + + def forward(self, x: torch.Tensor, x_mask: Optional[torch.Tensor] = None): + for c1, c2 in zip(self.convs1, self.convs2): + xt = F.leaky_relu(x, self.lrelu_slope) + if x_mask is not None: + xt = xt * x_mask + xt = c1(xt) + xt = F.leaky_relu(xt, self.lrelu_slope) + if x_mask is not None: + xt = xt * x_mask + xt = c2(xt) + x = xt + x + if x_mask is not None: + x = x * x_mask + return x + + def remove_weight_norm(self): + for l in self.convs1: + 
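# Usage sketch (not part of the patch) for the WN block above, the non-causal WaveNet
# used inside the coupling layers.  Each layer pads its dilated conv with
# (kernel_size * dilation - dilation) // 2 so the sequence length is preserved, and
# commons.fused_add_tanh_sigmoid_multiply (defined elsewhere in infer_pack, assumed here
# to be the usual tanh-times-sigmoid gate) merges the conv output with the conditioning.
import torch
from infer.lib.infer_pack.modules import WN

wn = WN(hidden_channels=16, kernel_size=3, dilation_rate=2, n_layers=4)
x = torch.randn(2, 16, 40)
x_mask = torch.ones(2, 1, 40)
print(wn(x, x_mask).shape)                   # torch.Size([2, 16, 40]) - summed skip connections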
remove_weight_norm(l) + for l in self.convs2: + remove_weight_norm(l) + + def __prepare_scriptable__(self): + for l in self.convs1: + for hook in l._forward_pre_hooks.values(): + if ( + hook.__module__ == "torch.nn.utils.weight_norm" + and hook.__class__.__name__ == "WeightNorm" + ): + torch.nn.utils.remove_weight_norm(l) + for l in self.convs2: + for hook in l._forward_pre_hooks.values(): + if ( + hook.__module__ == "torch.nn.utils.weight_norm" + and hook.__class__.__name__ == "WeightNorm" + ): + torch.nn.utils.remove_weight_norm(l) + return self + + +class ResBlock2(torch.nn.Module): + def __init__(self, channels, kernel_size=3, dilation=(1, 3)): + super(ResBlock2, self).__init__() + self.convs = nn.ModuleList( + [ + weight_norm( + Conv1d( + channels, + channels, + kernel_size, + 1, + dilation=dilation[0], + padding=get_padding(kernel_size, dilation[0]), + ) + ), + weight_norm( + Conv1d( + channels, + channels, + kernel_size, + 1, + dilation=dilation[1], + padding=get_padding(kernel_size, dilation[1]), + ) + ), + ] + ) + self.convs.apply(init_weights) + self.lrelu_slope = LRELU_SLOPE + + def forward(self, x, x_mask: Optional[torch.Tensor] = None): + for c in self.convs: + xt = F.leaky_relu(x, self.lrelu_slope) + if x_mask is not None: + xt = xt * x_mask + xt = c(xt) + x = xt + x + if x_mask is not None: + x = x * x_mask + return x + + def remove_weight_norm(self): + for l in self.convs: + remove_weight_norm(l) + + def __prepare_scriptable__(self): + for l in self.convs: + for hook in l._forward_pre_hooks.values(): + if ( + hook.__module__ == "torch.nn.utils.weight_norm" + and hook.__class__.__name__ == "WeightNorm" + ): + torch.nn.utils.remove_weight_norm(l) + return self + + +class Log(nn.Module): + def forward( + self, + x: torch.Tensor, + x_mask: torch.Tensor, + g: Optional[torch.Tensor] = None, + reverse: bool = False, + ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]: + if not reverse: + y = torch.log(torch.clamp_min(x, 1e-5)) * x_mask + logdet = torch.sum(-y, [1, 2]) + return y, logdet + else: + x = torch.exp(x) * x_mask + return x + + +class Flip(nn.Module): + # torch.jit.script() Compiled functions \ + # can't take variable number of arguments or \ + # use keyword-only arguments with defaults + def forward( + self, + x: torch.Tensor, + x_mask: torch.Tensor, + g: Optional[torch.Tensor] = None, + reverse: bool = False, + ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]: + x = torch.flip(x, [1]) + if not reverse: + logdet = torch.zeros(x.size(0)).to(dtype=x.dtype, device=x.device) + return x, logdet + else: + return x, torch.zeros([1], device=x.device) + + +class ElementwiseAffine(nn.Module): + def __init__(self, channels): + super(ElementwiseAffine, self).__init__() + self.channels = channels + self.m = nn.Parameter(torch.zeros(channels, 1)) + self.logs = nn.Parameter(torch.zeros(channels, 1)) + + def forward(self, x, x_mask, reverse=False, **kwargs): + if not reverse: + y = self.m + torch.exp(self.logs) * x + y = y * x_mask + logdet = torch.sum(self.logs * x_mask, [1, 2]) + return y, logdet + else: + x = (x - self.m) * torch.exp(-self.logs) * x_mask + return x + + +class ResidualCouplingLayer(nn.Module): + def __init__( + self, + channels, + hidden_channels, + kernel_size, + dilation_rate, + n_layers, + p_dropout=0, + gin_channels=0, + mean_only=False, + ): + assert channels % 2 == 0, "channels should be divisible by 2" + super(ResidualCouplingLayer, self).__init__() + self.channels = channels + self.hidden_channels = hidden_channels + self.kernel_size = kernel_size + 
self.dilation_rate = dilation_rate + self.n_layers = n_layers + self.half_channels = channels // 2 + self.mean_only = mean_only + + self.pre = nn.Conv1d(self.half_channels, hidden_channels, 1) + self.enc = WN( + hidden_channels, + kernel_size, + dilation_rate, + n_layers, + p_dropout=float(p_dropout), + gin_channels=gin_channels, + ) + self.post = nn.Conv1d(hidden_channels, self.half_channels * (2 - mean_only), 1) + self.post.weight.data.zero_() + self.post.bias.data.zero_() + + def forward( + self, + x: torch.Tensor, + x_mask: torch.Tensor, + g: Optional[torch.Tensor] = None, + reverse: bool = False, + ): + x0, x1 = torch.split(x, [self.half_channels] * 2, 1) + h = self.pre(x0) * x_mask + h = self.enc(h, x_mask, g=g) + stats = self.post(h) * x_mask + if not self.mean_only: + m, logs = torch.split(stats, [self.half_channels] * 2, 1) + else: + m = stats + logs = torch.zeros_like(m) + + if not reverse: + x1 = m + x1 * torch.exp(logs) * x_mask + x = torch.cat([x0, x1], 1) + logdet = torch.sum(logs, [1, 2]) + return x, logdet + else: + x1 = (x1 - m) * torch.exp(-logs) * x_mask + x = torch.cat([x0, x1], 1) + return x, torch.zeros([1]) + + def remove_weight_norm(self): + self.enc.remove_weight_norm() + + def __prepare_scriptable__(self): + for hook in self.enc._forward_pre_hooks.values(): + if ( + hook.__module__ == "torch.nn.utils.weight_norm" + and hook.__class__.__name__ == "WeightNorm" + ): + torch.nn.utils.remove_weight_norm(self.enc) + return self + + +class ConvFlow(nn.Module): + def __init__( + self, + in_channels, + filter_channels, + kernel_size, + n_layers, + num_bins=10, + tail_bound=5.0, + ): + super(ConvFlow, self).__init__() + self.in_channels = in_channels + self.filter_channels = filter_channels + self.kernel_size = kernel_size + self.n_layers = n_layers + self.num_bins = num_bins + self.tail_bound = tail_bound + self.half_channels = in_channels // 2 + + self.pre = nn.Conv1d(self.half_channels, filter_channels, 1) + self.convs = DDSConv(filter_channels, kernel_size, n_layers, p_dropout=0.0) + self.proj = nn.Conv1d( + filter_channels, self.half_channels * (num_bins * 3 - 1), 1 + ) + self.proj.weight.data.zero_() + self.proj.bias.data.zero_() + + def forward( + self, + x: torch.Tensor, + x_mask: torch.Tensor, + g: Optional[torch.Tensor] = None, + reverse=False, + ): + x0, x1 = torch.split(x, [self.half_channels] * 2, 1) + h = self.pre(x0) + h = self.convs(h, x_mask, g=g) + h = self.proj(h) * x_mask + + b, c, t = x0.shape + h = h.reshape(b, c, -1, t).permute(0, 1, 3, 2) # [b, cx?, t] -> [b, c, t, ?] 
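# Sketch (not part of the patch) of the affine coupling layer above: the first half of
# the channels is left untouched and parameterizes a shift (and optionally a scale) for
# the second half, so the reverse pass undoes the transform exactly.  post is
# zero-initialized, so a freshly built layer starts out as the identity; its weights are
# perturbed here only to make the round trip non-trivial.  Toy shapes, assuming this
# diff's package layout.
import torch
from infer.lib.infer_pack.modules import ResidualCouplingLayer

torch.manual_seed(0)
layer = ResidualCouplingLayer(channels=4, hidden_channels=16, kernel_size=3,
                              dilation_rate=1, n_layers=2, mean_only=True)
torch.nn.init.normal_(layer.post.weight, std=0.1)

x = torch.randn(2, 4, 30)
x_mask = torch.ones(2, 1, 30)
y, _ = layer(x, x_mask)
x_rec, _ = layer(y, x_mask, reverse=True)
print(float(torch.max(torch.abs(x - x_rec))))   # ~0: the flow is exactly invertible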
+ + unnormalized_widths = h[..., : self.num_bins] / math.sqrt(self.filter_channels) + unnormalized_heights = h[..., self.num_bins : 2 * self.num_bins] / math.sqrt( + self.filter_channels + ) + unnormalized_derivatives = h[..., 2 * self.num_bins :] + + x1, logabsdet = piecewise_rational_quadratic_transform( + x1, + unnormalized_widths, + unnormalized_heights, + unnormalized_derivatives, + inverse=reverse, + tails="linear", + tail_bound=self.tail_bound, + ) + + x = torch.cat([x0, x1], 1) * x_mask + logdet = torch.sum(logabsdet * x_mask, [1, 2]) + if not reverse: + return x, logdet + else: + return x diff --git a/infer/lib/infer_pack/modules/F0Predictor/DioF0Predictor.py b/infer/lib/infer_pack/modules/F0Predictor/DioF0Predictor.py new file mode 100644 index 0000000000000000000000000000000000000000..e69a603440709fc7dc60e92079addbfa490778fd --- /dev/null +++ b/infer/lib/infer_pack/modules/F0Predictor/DioF0Predictor.py @@ -0,0 +1,91 @@ +import numpy as np +import pyworld + +from infer.lib.infer_pack.modules.F0Predictor.F0Predictor import F0Predictor + + +class DioF0Predictor(F0Predictor): + def __init__(self, hop_length=512, f0_min=50, f0_max=1100, sampling_rate=44100): + self.hop_length = hop_length + self.f0_min = f0_min + self.f0_max = f0_max + self.sampling_rate = sampling_rate + + def interpolate_f0(self, f0): + """ + 对F0进行插值处理 + """ + + data = np.reshape(f0, (f0.size, 1)) + + vuv_vector = np.zeros((data.size, 1), dtype=np.float32) + vuv_vector[data > 0.0] = 1.0 + vuv_vector[data <= 0.0] = 0.0 + + ip_data = data + + frame_number = data.size + last_value = 0.0 + for i in range(frame_number): + if data[i] <= 0.0: + j = i + 1 + for j in range(i + 1, frame_number): + if data[j] > 0.0: + break + if j < frame_number - 1: + if last_value > 0.0: + step = (data[j] - data[i - 1]) / float(j - i) + for k in range(i, j): + ip_data[k] = data[i - 1] + step * (k - i + 1) + else: + for k in range(i, j): + ip_data[k] = data[j] + else: + for k in range(i, frame_number): + ip_data[k] = last_value + else: + ip_data[i] = data[i] # 这里可能存在一个没有必要的拷贝 + last_value = data[i] + + return ip_data[:, 0], vuv_vector[:, 0] + + def resize_f0(self, x, target_len): + source = np.array(x) + source[source < 0.001] = np.nan + target = np.interp( + np.arange(0, len(source) * target_len, len(source)) / target_len, + np.arange(0, len(source)), + source, + ) + res = np.nan_to_num(target) + return res + + def compute_f0(self, wav, p_len=None): + if p_len is None: + p_len = wav.shape[0] // self.hop_length + f0, t = pyworld.dio( + wav.astype(np.double), + fs=self.sampling_rate, + f0_floor=self.f0_min, + f0_ceil=self.f0_max, + frame_period=1000 * self.hop_length / self.sampling_rate, + ) + f0 = pyworld.stonemask(wav.astype(np.double), f0, t, self.sampling_rate) + for index, pitch in enumerate(f0): + f0[index] = round(pitch, 1) + return self.interpolate_f0(self.resize_f0(f0, p_len))[0] + + def compute_f0_uv(self, wav, p_len=None): + if p_len is None: + p_len = wav.shape[0] // self.hop_length + f0, t = pyworld.dio( + wav.astype(np.double), + fs=self.sampling_rate, + f0_floor=self.f0_min, + f0_ceil=self.f0_max, + frame_period=1000 * self.hop_length / self.sampling_rate, + ) + f0 = pyworld.stonemask(wav.astype(np.double), f0, t, self.sampling_rate) + for index, pitch in enumerate(f0): + f0[index] = round(pitch, 1) + return self.interpolate_f0(self.resize_f0(f0, p_len)) diff --git a/infer/lib/infer_pack/modules/F0Predictor/F0Predictor.py b/infer/lib/infer_pack/modules/F0Predictor/F0Predictor.py new file mode 100644 index 
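# Usage sketch (not part of the patch) for DioF0Predictor above: estimate the F0 of a
# synthetic 220 Hz tone.  Assumes pyworld is installed; the frame count follows
# len(wav) // hop_length, as in compute_f0.
import numpy as np
from infer.lib.infer_pack.modules.F0Predictor.DioF0Predictor import DioF0Predictor

sr, hop = 16000, 160
t = np.arange(sr * 2) / sr                               # 2 seconds
wav = 0.5 * np.sin(2 * np.pi * 220.0 * t).astype(np.float32)

predictor = DioF0Predictor(hop_length=hop, f0_min=50, f0_max=1100, sampling_rate=sr)
f0 = predictor.compute_f0(wav)
print(f0.shape, float(np.median(f0[f0 > 0])))            # (200,), roughly 220 Hz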
0000000000000000000000000000000000000000..0d81b05eef25f0ebeead80bb9baaaef695823b19 --- /dev/null +++ b/infer/lib/infer_pack/modules/F0Predictor/F0Predictor.py @@ -0,0 +1,16 @@ +class F0Predictor(object): + def compute_f0(self, wav, p_len): + """ + input: wav:[signal_length] + p_len:int + output: f0:[signal_length//hop_length] + """ + pass + + def compute_f0_uv(self, wav, p_len): + """ + input: wav:[signal_length] + p_len:int + output: f0:[signal_length//hop_length],uv:[signal_length//hop_length] + """ + pass diff --git a/infer/lib/infer_pack/modules/F0Predictor/HarvestF0Predictor.py b/infer/lib/infer_pack/modules/F0Predictor/HarvestF0Predictor.py new file mode 100644 index 0000000000000000000000000000000000000000..2b13917ce07455e87b076ac4f3cfabab2e443f8e --- /dev/null +++ b/infer/lib/infer_pack/modules/F0Predictor/HarvestF0Predictor.py @@ -0,0 +1,87 @@ +import numpy as np +import pyworld + +from infer.lib.infer_pack.modules.F0Predictor.F0Predictor import F0Predictor + + +class HarvestF0Predictor(F0Predictor): + def __init__(self, hop_length=512, f0_min=50, f0_max=1100, sampling_rate=44100): + self.hop_length = hop_length + self.f0_min = f0_min + self.f0_max = f0_max + self.sampling_rate = sampling_rate + + def interpolate_f0(self, f0): + """ + 对F0进行插值处理 + """ + + data = np.reshape(f0, (f0.size, 1)) + + vuv_vector = np.zeros((data.size, 1), dtype=np.float32) + vuv_vector[data > 0.0] = 1.0 + vuv_vector[data <= 0.0] = 0.0 + + ip_data = data + + frame_number = data.size + last_value = 0.0 + for i in range(frame_number): + if data[i] <= 0.0: + j = i + 1 + for j in range(i + 1, frame_number): + if data[j] > 0.0: + break + if j < frame_number - 1: + if last_value > 0.0: + step = (data[j] - data[i - 1]) / float(j - i) + for k in range(i, j): + ip_data[k] = data[i - 1] + step * (k - i + 1) + else: + for k in range(i, j): + ip_data[k] = data[j] + else: + for k in range(i, frame_number): + ip_data[k] = last_value + else: + ip_data[i] = data[i] # 这里可能存在一个没有必要的拷贝 + last_value = data[i] + + return ip_data[:, 0], vuv_vector[:, 0] + + def resize_f0(self, x, target_len): + source = np.array(x) + source[source < 0.001] = np.nan + target = np.interp( + np.arange(0, len(source) * target_len, len(source)) / target_len, + np.arange(0, len(source)), + source, + ) + res = np.nan_to_num(target) + return res + + def compute_f0(self, wav, p_len=None): + if p_len is None: + p_len = wav.shape[0] // self.hop_length + f0, t = pyworld.harvest( + wav.astype(np.double), + fs=self.sampling_rate, + f0_ceil=self.f0_max, + f0_floor=self.f0_min, + frame_period=1000 * self.hop_length / self.sampling_rate, + ) + f0 = pyworld.stonemask(wav.astype(np.double), f0, t, self.fs) + return self.interpolate_f0(self.resize_f0(f0, p_len))[0] + + def compute_f0_uv(self, wav, p_len=None): + if p_len is None: + p_len = wav.shape[0] // self.hop_length + f0, t = pyworld.harvest( + wav.astype(np.double), + fs=self.sampling_rate, + f0_floor=self.f0_min, + f0_ceil=self.f0_max, + frame_period=1000 * self.hop_length / self.sampling_rate, + ) + f0 = pyworld.stonemask(wav.astype(np.double), f0, t, self.sampling_rate) + return self.interpolate_f0(self.resize_f0(f0, p_len)) diff --git a/infer/lib/infer_pack/modules/F0Predictor/PMF0Predictor.py b/infer/lib/infer_pack/modules/F0Predictor/PMF0Predictor.py new file mode 100644 index 0000000000000000000000000000000000000000..957ec467ec808dc9fe78a2f4a863771b45c7ad4e --- /dev/null +++ b/infer/lib/infer_pack/modules/F0Predictor/PMF0Predictor.py @@ -0,0 +1,98 @@ +import numpy as np +import parselmouth + +from 
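# Usage sketch (not part of the patch) for HarvestF0Predictor above, via compute_f0_uv,
# which also returns a per-frame voiced/unvoiced flag.  Assumes pyworld is installed.
# Note that compute_f0 above passes self.fs to pyworld.stonemask, an attribute that is
# never set in __init__; compute_f0_uv uses self.sampling_rate instead.
import numpy as np
from infer.lib.infer_pack.modules.F0Predictor.HarvestF0Predictor import HarvestF0Predictor

sr, hop = 16000, 160
t = np.arange(sr) / sr
wav = 0.5 * np.sin(2 * np.pi * 110.0 * t).astype(np.float32)

predictor = HarvestF0Predictor(hop_length=hop, sampling_rate=sr)
f0, uv = predictor.compute_f0_uv(wav)
print(f0.shape, uv.shape, float(np.median(f0[uv > 0.5])))   # (100,), (100,), roughly 110 Hz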
infer.lib.infer_pack.modules.F0Predictor.F0Predictor import F0Predictor + + +class PMF0Predictor(F0Predictor): + def __init__(self, hop_length=512, f0_min=50, f0_max=1100, sampling_rate=44100): + self.hop_length = hop_length + self.f0_min = f0_min + self.f0_max = f0_max + self.sampling_rate = sampling_rate + + def interpolate_f0(self, f0): + """ + 对F0进行插值处理 + """ + + data = np.reshape(f0, (f0.size, 1)) + + vuv_vector = np.zeros((data.size, 1), dtype=np.float32) + vuv_vector[data > 0.0] = 1.0 + vuv_vector[data <= 0.0] = 0.0 + + ip_data = data + + frame_number = data.size + last_value = 0.0 + for i in range(frame_number): + if data[i] <= 0.0: + j = i + 1 + for j in range(i + 1, frame_number): + if data[j] > 0.0: + break + if j < frame_number - 1: + if last_value > 0.0: + step = (data[j] - data[i - 1]) / float(j - i) + for k in range(i, j): + ip_data[k] = data[i - 1] + step * (k - i + 1) + else: + for k in range(i, j): + ip_data[k] = data[j] + else: + for k in range(i, frame_number): + ip_data[k] = last_value + else: + ip_data[i] = data[i] # 这里可能存在一个没有必要的拷贝 + last_value = data[i] + + return ip_data[:, 0], vuv_vector[:, 0] + + def compute_f0(self, wav, p_len=None): + x = wav + if p_len is None: + p_len = x.shape[0] // self.hop_length + else: + assert abs(p_len - x.shape[0] // self.hop_length) < 4, "pad length error" + time_step = self.hop_length / self.sampling_rate * 1000 + f0 = ( + parselmouth.Sound(x, self.sampling_rate) + .to_pitch_ac( + time_step=time_step / 1000, + voicing_threshold=0.6, + pitch_floor=self.f0_min, + pitch_ceiling=self.f0_max, + ) + .selected_array["frequency"] + ) + + pad_size = (p_len - len(f0) + 1) // 2 + if pad_size > 0 or p_len - len(f0) - pad_size > 0: + f0 = np.pad(f0, [[pad_size, p_len - len(f0) - pad_size]], mode="constant") + f0, uv = self.interpolate_f0(f0) + return f0 + + def compute_f0_uv(self, wav, p_len=None): + x = wav + if p_len is None: + p_len = x.shape[0] // self.hop_length + else: + assert abs(p_len - x.shape[0] // self.hop_length) < 4, "pad length error" + time_step = self.hop_length / self.sampling_rate * 1000 + f0 = ( + parselmouth.Sound(x, self.sampling_rate) + .to_pitch_ac( + time_step=time_step / 1000, + voicing_threshold=0.6, + pitch_floor=self.f0_min, + pitch_ceiling=self.f0_max, + ) + .selected_array["frequency"] + ) + + pad_size = (p_len - len(f0) + 1) // 2 + if pad_size > 0 or p_len - len(f0) - pad_size > 0: + f0 = np.pad(f0, [[pad_size, p_len - len(f0) - pad_size]], mode="constant") + f0, uv = self.interpolate_f0(f0) + return f0, uv diff --git a/infer/lib/infer_pack/modules/F0Predictor/__init__.py b/infer/lib/infer_pack/modules/F0Predictor/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/infer/lib/infer_pack/onnx_inference.py b/infer/lib/infer_pack/onnx_inference.py new file mode 100644 index 0000000000000000000000000000000000000000..3d8328b2ac337a365e2ebc3e6ae767761e3e17c5 --- /dev/null +++ b/infer/lib/infer_pack/onnx_inference.py @@ -0,0 +1,149 @@ +import librosa +import numpy as np +import onnxruntime +import soundfile + +import logging + +logger = logging.getLogger(__name__) + + +class ContentVec: + def __init__(self, vec_path="pretrained/vec-768-layer-12.onnx", device=None): + logger.info("Load model(s) from {}".format(vec_path)) + if device == "cpu" or device is None: + providers = ["CPUExecutionProvider"] + elif device == "cuda": + providers = ["CUDAExecutionProvider", "CPUExecutionProvider"] + elif device == "dml": + providers = 
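# Sketch (not part of the patch) of what interpolate_f0, duplicated across the three
# predictors above, does with an F0 track that has unvoiced gaps: the leading gap is
# back-filled with the next voiced value, interior gaps ramp linearly between the
# surrounding voiced frames, and the trailing gap holds the last voiced value, while a
# separate voiced/unvoiced vector reflects the original track.  Assumes parselmouth is
# installed, since the module imports it; toy values for illustration only.
import numpy as np
from infer.lib.infer_pack.modules.F0Predictor.PMF0Predictor import PMF0Predictor

predictor = PMF0Predictor()
f0 = np.array([0.0, 0.0, 200.0, 0.0, 0.0, 0.0, 220.0, 0.0], dtype=np.float32)
filled, vuv = predictor.interpolate_f0(f0)
print(vuv)      # [0. 0. 1. 0. 0. 0. 1. 0.]
print(filled)   # [200. 200. 200. ~206.7 ~213.3 220. 220. 220.]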
["DmlExecutionProvider"] + else: + raise RuntimeError("Unsportted Device") + self.model = onnxruntime.InferenceSession(vec_path, providers=providers) + + def __call__(self, wav): + return self.forward(wav) + + def forward(self, wav): + feats = wav + if feats.ndim == 2: # double channels + feats = feats.mean(-1) + assert feats.ndim == 1, feats.ndim + feats = np.expand_dims(np.expand_dims(feats, 0), 0) + onnx_input = {self.model.get_inputs()[0].name: feats} + logits = self.model.run(None, onnx_input)[0] + return logits.transpose(0, 2, 1) + + +def get_f0_predictor(f0_predictor, hop_length, sampling_rate, **kargs): + if f0_predictor == "pm": + from lib.infer_pack.modules.F0Predictor.PMF0Predictor import PMF0Predictor + + f0_predictor_object = PMF0Predictor( + hop_length=hop_length, sampling_rate=sampling_rate + ) + elif f0_predictor == "harvest": + from lib.infer_pack.modules.F0Predictor.HarvestF0Predictor import ( + HarvestF0Predictor, + ) + + f0_predictor_object = HarvestF0Predictor( + hop_length=hop_length, sampling_rate=sampling_rate + ) + elif f0_predictor == "dio": + from lib.infer_pack.modules.F0Predictor.DioF0Predictor import DioF0Predictor + + f0_predictor_object = DioF0Predictor( + hop_length=hop_length, sampling_rate=sampling_rate + ) + else: + raise Exception("Unknown f0 predictor") + return f0_predictor_object + + +class OnnxRVC: + def __init__( + self, + model_path, + sr=40000, + hop_size=512, + vec_path="vec-768-layer-12", + device="cpu", + ): + vec_path = f"pretrained/{vec_path}.onnx" + self.vec_model = ContentVec(vec_path, device) + if device == "cpu" or device is None: + providers = ["CPUExecutionProvider"] + elif device == "cuda": + providers = ["CUDAExecutionProvider", "CPUExecutionProvider"] + elif device == "dml": + providers = ["DmlExecutionProvider"] + else: + raise RuntimeError("Unsportted Device") + self.model = onnxruntime.InferenceSession(model_path, providers=providers) + self.sampling_rate = sr + self.hop_size = hop_size + + def forward(self, hubert, hubert_length, pitch, pitchf, ds, rnd): + onnx_input = { + self.model.get_inputs()[0].name: hubert, + self.model.get_inputs()[1].name: hubert_length, + self.model.get_inputs()[2].name: pitch, + self.model.get_inputs()[3].name: pitchf, + self.model.get_inputs()[4].name: ds, + self.model.get_inputs()[5].name: rnd, + } + return (self.model.run(None, onnx_input)[0] * 32767).astype(np.int16) + + def inference( + self, + raw_path, + sid, + f0_method="dio", + f0_up_key=0, + pad_time=0.5, + cr_threshold=0.02, + ): + f0_min = 50 + f0_max = 1100 + f0_mel_min = 1127 * np.log(1 + f0_min / 700) + f0_mel_max = 1127 * np.log(1 + f0_max / 700) + f0_predictor = get_f0_predictor( + f0_method, + hop_length=self.hop_size, + sampling_rate=self.sampling_rate, + threshold=cr_threshold, + ) + wav, sr = librosa.load(raw_path, sr=self.sampling_rate) + org_length = len(wav) + if org_length / sr > 50.0: + raise RuntimeError("Reached Max Length") + + wav16k = librosa.resample(wav, orig_sr=self.sampling_rate, target_sr=16000) + wav16k = wav16k + + hubert = self.vec_model(wav16k) + hubert = np.repeat(hubert, 2, axis=2).transpose(0, 2, 1).astype(np.float32) + hubert_length = hubert.shape[1] + + pitchf = f0_predictor.compute_f0(wav, hubert_length) + pitchf = pitchf * 2 ** (f0_up_key / 12) + pitch = pitchf.copy() + f0_mel = 1127 * np.log(1 + pitch / 700) + f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - f0_mel_min) * 254 / ( + f0_mel_max - f0_mel_min + ) + 1 + f0_mel[f0_mel <= 1] = 1 + f0_mel[f0_mel > 255] = 255 + pitch = np.rint(f0_mel).astype(np.int64) + 
+ pitchf = pitchf.reshape(1, len(pitchf)).astype(np.float32) + pitch = pitch.reshape(1, len(pitch)) + ds = np.array([sid]).astype(np.int64) + + rnd = np.random.randn(1, 192, hubert_length).astype(np.float32) + hubert_length = np.array([hubert_length]).astype(np.int64) + + out_wav = self.forward(hubert, hubert_length, pitch, pitchf, ds, rnd).squeeze() + out_wav = np.pad(out_wav, (0, 2 * self.hop_size), "constant") + return out_wav[0:org_length] diff --git a/infer/lib/infer_pack/transforms.py b/infer/lib/infer_pack/transforms.py new file mode 100644 index 0000000000000000000000000000000000000000..6d07b3b12cee87869440feb1496dd634d334e96f --- /dev/null +++ b/infer/lib/infer_pack/transforms.py @@ -0,0 +1,207 @@ +import numpy as np +import torch +from torch.nn import functional as F + +DEFAULT_MIN_BIN_WIDTH = 1e-3 +DEFAULT_MIN_BIN_HEIGHT = 1e-3 +DEFAULT_MIN_DERIVATIVE = 1e-3 + + +def piecewise_rational_quadratic_transform( + inputs, + unnormalized_widths, + unnormalized_heights, + unnormalized_derivatives, + inverse=False, + tails=None, + tail_bound=1.0, + min_bin_width=DEFAULT_MIN_BIN_WIDTH, + min_bin_height=DEFAULT_MIN_BIN_HEIGHT, + min_derivative=DEFAULT_MIN_DERIVATIVE, +): + if tails is None: + spline_fn = rational_quadratic_spline + spline_kwargs = {} + else: + spline_fn = unconstrained_rational_quadratic_spline + spline_kwargs = {"tails": tails, "tail_bound": tail_bound} + + outputs, logabsdet = spline_fn( + inputs=inputs, + unnormalized_widths=unnormalized_widths, + unnormalized_heights=unnormalized_heights, + unnormalized_derivatives=unnormalized_derivatives, + inverse=inverse, + min_bin_width=min_bin_width, + min_bin_height=min_bin_height, + min_derivative=min_derivative, + **spline_kwargs + ) + return outputs, logabsdet + + +def searchsorted(bin_locations, inputs, eps=1e-6): + bin_locations[..., -1] += eps + return torch.sum(inputs[..., None] >= bin_locations, dim=-1) - 1 + + +def unconstrained_rational_quadratic_spline( + inputs, + unnormalized_widths, + unnormalized_heights, + unnormalized_derivatives, + inverse=False, + tails="linear", + tail_bound=1.0, + min_bin_width=DEFAULT_MIN_BIN_WIDTH, + min_bin_height=DEFAULT_MIN_BIN_HEIGHT, + min_derivative=DEFAULT_MIN_DERIVATIVE, +): + inside_interval_mask = (inputs >= -tail_bound) & (inputs <= tail_bound) + outside_interval_mask = ~inside_interval_mask + + outputs = torch.zeros_like(inputs) + logabsdet = torch.zeros_like(inputs) + + if tails == "linear": + unnormalized_derivatives = F.pad(unnormalized_derivatives, pad=(1, 1)) + constant = np.log(np.exp(1 - min_derivative) - 1) + unnormalized_derivatives[..., 0] = constant + unnormalized_derivatives[..., -1] = constant + + outputs[outside_interval_mask] = inputs[outside_interval_mask] + logabsdet[outside_interval_mask] = 0 + else: + raise RuntimeError("{} tails are not implemented.".format(tails)) + + ( + outputs[inside_interval_mask], + logabsdet[inside_interval_mask], + ) = rational_quadratic_spline( + inputs=inputs[inside_interval_mask], + unnormalized_widths=unnormalized_widths[inside_interval_mask, :], + unnormalized_heights=unnormalized_heights[inside_interval_mask, :], + unnormalized_derivatives=unnormalized_derivatives[inside_interval_mask, :], + inverse=inverse, + left=-tail_bound, + right=tail_bound, + bottom=-tail_bound, + top=tail_bound, + min_bin_width=min_bin_width, + min_bin_height=min_bin_height, + min_derivative=min_derivative, + ) + + return outputs, logabsdet + + +def rational_quadratic_spline( + inputs, + unnormalized_widths, + unnormalized_heights, + 
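# Round-trip sketch (not part of the patch) for the piecewise rational-quadratic spline
# above, assuming this diff's package layout.  With "linear" tails, values outside
# tail_bound pass through unchanged; inside, the monotone spline is exactly invertible
# and the forward and inverse log-determinants cancel.  With num_bins bins, the
# unnormalized derivatives carry num_bins - 1 values, matching ConvFlow's projection.
import torch
from infer.lib.infer_pack.transforms import piecewise_rational_quadratic_transform

torch.manual_seed(0)
num_bins = 10
x = torch.randn(4, 2, 50) * 3.0
w = torch.randn(4, 2, 50, num_bins)          # unnormalized widths
h = torch.randn(4, 2, 50, num_bins)          # unnormalized heights
d = torch.randn(4, 2, 50, num_bins - 1)      # unnormalized derivatives

y, logdet = piecewise_rational_quadratic_transform(
    x, w, h, d, inverse=False, tails="linear", tail_bound=5.0
)
x_rec, inv_logdet = piecewise_rational_quadratic_transform(
    y, w, h, d, inverse=True, tails="linear", tail_bound=5.0
)
print(float(torch.max(torch.abs(x - x_rec))))          # ~0: inverse recovers the input
print(float(torch.max(torch.abs(logdet + inv_logdet))))  # ~0: log-determinants cancel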
unnormalized_derivatives, + inverse=False, + left=0.0, + right=1.0, + bottom=0.0, + top=1.0, + min_bin_width=DEFAULT_MIN_BIN_WIDTH, + min_bin_height=DEFAULT_MIN_BIN_HEIGHT, + min_derivative=DEFAULT_MIN_DERIVATIVE, +): + if torch.min(inputs) < left or torch.max(inputs) > right: + raise ValueError("Input to a transform is not within its domain") + + num_bins = unnormalized_widths.shape[-1] + + if min_bin_width * num_bins > 1.0: + raise ValueError("Minimal bin width too large for the number of bins") + if min_bin_height * num_bins > 1.0: + raise ValueError("Minimal bin height too large for the number of bins") + + widths = F.softmax(unnormalized_widths, dim=-1) + widths = min_bin_width + (1 - min_bin_width * num_bins) * widths + cumwidths = torch.cumsum(widths, dim=-1) + cumwidths = F.pad(cumwidths, pad=(1, 0), mode="constant", value=0.0) + cumwidths = (right - left) * cumwidths + left + cumwidths[..., 0] = left + cumwidths[..., -1] = right + widths = cumwidths[..., 1:] - cumwidths[..., :-1] + + derivatives = min_derivative + F.softplus(unnormalized_derivatives) + + heights = F.softmax(unnormalized_heights, dim=-1) + heights = min_bin_height + (1 - min_bin_height * num_bins) * heights + cumheights = torch.cumsum(heights, dim=-1) + cumheights = F.pad(cumheights, pad=(1, 0), mode="constant", value=0.0) + cumheights = (top - bottom) * cumheights + bottom + cumheights[..., 0] = bottom + cumheights[..., -1] = top + heights = cumheights[..., 1:] - cumheights[..., :-1] + + if inverse: + bin_idx = searchsorted(cumheights, inputs)[..., None] + else: + bin_idx = searchsorted(cumwidths, inputs)[..., None] + + input_cumwidths = cumwidths.gather(-1, bin_idx)[..., 0] + input_bin_widths = widths.gather(-1, bin_idx)[..., 0] + + input_cumheights = cumheights.gather(-1, bin_idx)[..., 0] + delta = heights / widths + input_delta = delta.gather(-1, bin_idx)[..., 0] + + input_derivatives = derivatives.gather(-1, bin_idx)[..., 0] + input_derivatives_plus_one = derivatives[..., 1:].gather(-1, bin_idx)[..., 0] + + input_heights = heights.gather(-1, bin_idx)[..., 0] + + if inverse: + a = (inputs - input_cumheights) * ( + input_derivatives + input_derivatives_plus_one - 2 * input_delta + ) + input_heights * (input_delta - input_derivatives) + b = input_heights * input_derivatives - (inputs - input_cumheights) * ( + input_derivatives + input_derivatives_plus_one - 2 * input_delta + ) + c = -input_delta * (inputs - input_cumheights) + + discriminant = b.pow(2) - 4 * a * c + assert (discriminant >= 0).all() + + root = (2 * c) / (-b - torch.sqrt(discriminant)) + outputs = root * input_bin_widths + input_cumwidths + + theta_one_minus_theta = root * (1 - root) + denominator = input_delta + ( + (input_derivatives + input_derivatives_plus_one - 2 * input_delta) + * theta_one_minus_theta + ) + derivative_numerator = input_delta.pow(2) * ( + input_derivatives_plus_one * root.pow(2) + + 2 * input_delta * theta_one_minus_theta + + input_derivatives * (1 - root).pow(2) + ) + logabsdet = torch.log(derivative_numerator) - 2 * torch.log(denominator) + + return outputs, -logabsdet + else: + theta = (inputs - input_cumwidths) / input_bin_widths + theta_one_minus_theta = theta * (1 - theta) + + numerator = input_heights * ( + input_delta * theta.pow(2) + input_derivatives * theta_one_minus_theta + ) + denominator = input_delta + ( + (input_derivatives + input_derivatives_plus_one - 2 * input_delta) + * theta_one_minus_theta + ) + outputs = input_cumheights + numerator / denominator + + derivative_numerator = input_delta.pow(2) * ( + 
input_derivatives_plus_one * theta.pow(2) + + 2 * input_delta * theta_one_minus_theta + + input_derivatives * (1 - theta).pow(2) + ) + logabsdet = torch.log(derivative_numerator) - 2 * torch.log(denominator) + + return outputs, logabsdet diff --git a/infer/lib/jit/__init__.py b/infer/lib/jit/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..aac6597268d7ea6e41819016d394843f09bdc7c2 --- /dev/null +++ b/infer/lib/jit/__init__.py @@ -0,0 +1,163 @@ +from io import BytesIO +import pickle +import time +import torch +from tqdm import tqdm +from collections import OrderedDict + + +def load_inputs(path, device, is_half=False): + parm = torch.load(path, map_location=torch.device("cpu"), weights_only=False) + for key in parm.keys(): + parm[key] = parm[key].to(device) + if is_half and parm[key].dtype == torch.float32: + parm[key] = parm[key].half() + elif not is_half and parm[key].dtype == torch.float16: + parm[key] = parm[key].float() + return parm + + +def benchmark( + model, inputs_path, device=torch.device("cpu"), epoch=1000, is_half=False +): + parm = load_inputs(inputs_path, device, is_half) + total_ts = 0.0 + bar = tqdm(range(epoch)) + for i in bar: + start_time = time.perf_counter() + o = model(**parm) + total_ts += time.perf_counter() - start_time + print(f"num_epoch: {epoch} | avg time(ms): {(total_ts*1000)/epoch}") + + +def jit_warm_up(model, inputs_path, device=torch.device("cpu"), epoch=5, is_half=False): + benchmark(model, inputs_path, device, epoch=epoch, is_half=is_half) + + +def to_jit_model( + model_path, + model_type: str, + mode: str = "trace", + inputs_path: str = None, + device=torch.device("cpu"), + is_half=False, +): + model = None + if model_type.lower() == "synthesizer": + from .get_synthesizer import get_synthesizer + + model, _ = get_synthesizer(model_path, device) + model.forward = model.infer + elif model_type.lower() == "rmvpe": + from .get_rmvpe import get_rmvpe + + model = get_rmvpe(model_path, device) + elif model_type.lower() == "hubert": + from .get_hubert import get_hubert_model + + model = get_hubert_model(model_path, device) + model.forward = model.infer + else: + raise ValueError(f"No model type named {model_type}") + model = model.eval() + model = model.half() if is_half else model.float() + if mode == "trace": + assert not inputs_path + inputs = load_inputs(inputs_path, device, is_half) + model_jit = torch.jit.trace(model, example_kwarg_inputs=inputs) + elif mode == "script": + model_jit = torch.jit.script(model) + model_jit.to(device) + model_jit = model_jit.half() if is_half else model_jit.float() + # model = model.half() if is_half else model.float() + return (model, model_jit) + + +def export( + model: torch.nn.Module, + mode: str = "trace", + inputs: dict = None, + device=torch.device("cpu"), + is_half: bool = False, +) -> dict: + model = model.half() if is_half else model.float() + model.eval() + if mode == "trace": + assert inputs is not None + model_jit = torch.jit.trace(model, example_kwarg_inputs=inputs) + elif mode == "script": + model_jit = torch.jit.script(model) + model_jit.to(device) + model_jit = model_jit.half() if is_half else model_jit.float() + buffer = BytesIO() + # model_jit=model_jit.cpu() + torch.jit.save(model_jit, buffer) + del model_jit + cpt = OrderedDict() + cpt["model"] = buffer.getvalue() + cpt["is_half"] = is_half + return cpt + + +def load(path: str): + with open(path, "rb") as f: + return pickle.load(f) + + +def save(ckpt: dict, save_path: str): + with open(save_path, "wb") as f: + 
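# Usage sketch (not part of the patch) for the export/save/load helpers above: script a
# tiny module, pickle the checkpoint dict, then restore the TorchScript module from the
# stored bytes.  The Scale module and file name are made up for illustration.
import torch
from io import BytesIO
from infer.lib import jit

class Scale(torch.nn.Module):
    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return 2.0 * x

cpt = jit.export(Scale(), mode="script", device=torch.device("cpu"), is_half=False)
jit.save(cpt, "scale.jit")

restored = jit.load("scale.jit")
model = torch.jit.load(BytesIO(restored["model"]))
print(model(torch.ones(3)))                  # tensor([2., 2., 2.])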
pickle.dump(ckpt, f) + + +def rmvpe_jit_export( + model_path: str, + mode: str = "script", + inputs_path: str = None, + save_path: str = None, + device=torch.device("cpu"), + is_half=False, +): + if not save_path: + save_path = model_path.rstrip(".pth") + save_path += ".half.jit" if is_half else ".jit" + if "cuda" in str(device) and ":" not in str(device): + device = torch.device("cuda:0") + from .get_rmvpe import get_rmvpe + + model = get_rmvpe(model_path, device) + inputs = None + if mode == "trace": + inputs = load_inputs(inputs_path, device, is_half) + ckpt = export(model, mode, inputs, device, is_half) + ckpt["device"] = str(device) + save(ckpt, save_path) + return ckpt + + +def synthesizer_jit_export( + model_path: str, + mode: str = "script", + inputs_path: str = None, + save_path: str = None, + device=torch.device("cpu"), + is_half=False, +): + if not save_path: + save_path = model_path.rstrip(".pth") + save_path += ".half.jit" if is_half else ".jit" + if "cuda" in str(device) and ":" not in str(device): + device = torch.device("cuda:0") + from .get_synthesizer import get_synthesizer + + model, cpt = get_synthesizer(model_path, device) + assert isinstance(cpt, dict) + model.forward = model.infer + inputs = None + if mode == "trace": + inputs = load_inputs(inputs_path, device, is_half) + ckpt = export(model, mode, inputs, device, is_half) + cpt.pop("weight") + cpt["model"] = ckpt["model"] + cpt["device"] = device + save(cpt, save_path) + return cpt diff --git a/infer/lib/jit/get_hubert.py b/infer/lib/jit/get_hubert.py new file mode 100644 index 0000000000000000000000000000000000000000..d236f4f134aa087b9e92a8c3f53e168428bc4ca8 --- /dev/null +++ b/infer/lib/jit/get_hubert.py @@ -0,0 +1,342 @@ +import math +import random +from typing import Optional, Tuple +from fairseq.checkpoint_utils import load_model_ensemble_and_task +import numpy as np +import torch +import torch.nn.functional as F + +# from fairseq.data.data_utils import compute_mask_indices +from fairseq.utils import index_put + + +# @torch.jit.script +def pad_to_multiple(x, multiple, dim=-1, value=0): + # Inspired from https://github.com/lucidrains/local-attention/blob/master/local_attention/local_attention.py#L41 + if x is None: + return None, 0 + tsz = x.size(dim) + m = tsz / multiple + remainder = math.ceil(m) * multiple - tsz + if int(tsz % multiple) == 0: + return x, 0 + pad_offset = (0,) * (-1 - dim) * 2 + + return F.pad(x, (*pad_offset, 0, remainder), value=value), remainder + + +def extract_features( + self, + x, + padding_mask=None, + tgt_layer=None, + min_layer=0, +): + if padding_mask is not None: + x = index_put(x, padding_mask, 0) + + x_conv = self.pos_conv(x.transpose(1, 2)) + x_conv = x_conv.transpose(1, 2) + x = x + x_conv + + if not self.layer_norm_first: + x = self.layer_norm(x) + + # pad to the sequence length dimension + x, pad_length = pad_to_multiple(x, self.required_seq_len_multiple, dim=-2, value=0) + if pad_length > 0 and padding_mask is None: + padding_mask = x.new_zeros((x.size(0), x.size(1)), dtype=torch.bool) + padding_mask[:, -pad_length:] = True + else: + padding_mask, _ = pad_to_multiple( + padding_mask, self.required_seq_len_multiple, dim=-1, value=True + ) + x = F.dropout(x, p=self.dropout, training=self.training) + + # B x T x C -> T x B x C + x = x.transpose(0, 1) + + layer_results = [] + r = None + for i, layer in enumerate(self.layers): + dropout_probability = np.random.random() if self.layerdrop > 0 else 1 + if not self.training or (dropout_probability > self.layerdrop): + x, (z, lr) = 
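# Sketch (not part of the patch) of pad_to_multiple above: the encoder pads the chosen
# axis up to the next multiple and reports how much padding was added, so it can be
# stripped again after the transformer layers.  Importing the module requires fairseq,
# since get_hubert.py imports it at load time; toy shapes for illustration.
import torch
from infer.lib.jit.get_hubert import pad_to_multiple

x = torch.randn(2, 49, 8)                    # (batch, time, channels)
padded, remainder = pad_to_multiple(x, multiple=16, dim=-2, value=0)
print(padded.shape, remainder)               # torch.Size([2, 64, 8]) 15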
layer( + x, self_attn_padding_mask=padding_mask, need_weights=False + ) + if i >= min_layer: + layer_results.append((x, z, lr)) + if i == tgt_layer: + r = x + break + + if r is not None: + x = r + + # T x B x C -> B x T x C + x = x.transpose(0, 1) + + # undo paddding + if pad_length > 0: + x = x[:, :-pad_length] + + def undo_pad(a, b, c): + return ( + a[:-pad_length], + b[:-pad_length] if b is not None else b, + c[:-pad_length], + ) + + layer_results = [undo_pad(*u) for u in layer_results] + + return x, layer_results + + +def compute_mask_indices( + shape: Tuple[int, int], + padding_mask: Optional[torch.Tensor], + mask_prob: float, + mask_length: int, + mask_type: str = "static", + mask_other: float = 0.0, + min_masks: int = 0, + no_overlap: bool = False, + min_space: int = 0, + require_same_masks: bool = True, + mask_dropout: float = 0.0, +) -> torch.Tensor: + """ + Computes random mask spans for a given shape + + Args: + shape: the the shape for which to compute masks. + should be of size 2 where first element is batch size and 2nd is timesteps + padding_mask: optional padding mask of the same size as shape, which will prevent masking padded elements + mask_prob: probability for each token to be chosen as start of the span to be masked. this will be multiplied by + number of timesteps divided by length of mask span to mask approximately this percentage of all elements. + however due to overlaps, the actual number will be smaller (unless no_overlap is True) + mask_type: how to compute mask lengths + static = fixed size + uniform = sample from uniform distribution [mask_other, mask_length*2] + normal = sample from normal distribution with mean mask_length and stdev mask_other. mask is min 1 element + poisson = sample from possion distribution with lambda = mask length + min_masks: minimum number of masked spans + no_overlap: if false, will switch to an alternative recursive algorithm that prevents spans from overlapping + min_space: only used if no_overlap is True, this is how many elements to keep unmasked between spans + require_same_masks: if true, will randomly drop out masks until same amount of masks remains in each sample + mask_dropout: randomly dropout this percentage of masks in each example + """ + + bsz, all_sz = shape + mask = torch.full((bsz, all_sz), False) + + all_num_mask = int( + # add a random number for probabilistic rounding + mask_prob * all_sz / float(mask_length) + + torch.rand([1]).item() + ) + + all_num_mask = max(min_masks, all_num_mask) + + mask_idcs = [] + for i in range(bsz): + if padding_mask is not None: + sz = all_sz - padding_mask[i].long().sum().item() + num_mask = int(mask_prob * sz / float(mask_length) + np.random.rand()) + num_mask = max(min_masks, num_mask) + else: + sz = all_sz + num_mask = all_num_mask + + if mask_type == "static": + lengths = torch.full([num_mask], mask_length) + elif mask_type == "uniform": + lengths = torch.randint(mask_other, mask_length * 2 + 1, size=[num_mask]) + elif mask_type == "normal": + lengths = torch.normal(mask_length, mask_other, size=[num_mask]) + lengths = [max(1, int(round(x))) for x in lengths] + else: + raise Exception("unknown mask selection " + mask_type) + + if sum(lengths) == 0: + lengths[0] = min(mask_length, sz - 1) + + if no_overlap: + mask_idc = [] + + def arrange(s, e, length, keep_length): + span_start = torch.randint(low=s, high=e - length, size=[1]).item() + mask_idc.extend(span_start + i for i in range(length)) + + new_parts = [] + if span_start - s - min_space >= keep_length: + new_parts.append((s, 
span_start - min_space + 1)) + if e - span_start - length - min_space > keep_length: + new_parts.append((span_start + length + min_space, e)) + return new_parts + + parts = [(0, sz)] + min_length = min(lengths) + for length in sorted(lengths, reverse=True): + t = [e - s if e - s >= length + min_space else 0 for s, e in parts] + lens = torch.asarray(t, dtype=torch.int) + l_sum = torch.sum(lens) + if l_sum == 0: + break + probs = lens / torch.sum(lens) + c = torch.multinomial(probs.float(), len(parts)).item() + s, e = parts.pop(c) + parts.extend(arrange(s, e, length, min_length)) + mask_idc = torch.asarray(mask_idc) + else: + min_len = min(lengths) + if sz - min_len <= num_mask: + min_len = sz - num_mask - 1 + mask_idc = torch.asarray( + random.sample([i for i in range(sz - min_len)], num_mask) + ) + mask_idc = torch.asarray( + [ + mask_idc[j] + offset + for j in range(len(mask_idc)) + for offset in range(lengths[j]) + ] + ) + + mask_idcs.append(torch.unique(mask_idc[mask_idc < sz])) + + min_len = min([len(m) for m in mask_idcs]) + for i, mask_idc in enumerate(mask_idcs): + if isinstance(mask_idc, torch.Tensor): + mask_idc = torch.asarray(mask_idc, dtype=torch.float) + if len(mask_idc) > min_len and require_same_masks: + mask_idc = torch.asarray( + random.sample([i for i in range(mask_idc)], min_len) + ) + if mask_dropout > 0: + num_holes = int(round(len(mask_idc) * mask_dropout)) + mask_idc = torch.asarray( + random.sample([i for i in range(mask_idc)], len(mask_idc) - num_holes) + ) + + mask[i, mask_idc.int()] = True + + return mask + + +def apply_mask(self, x, padding_mask, target_list): + B, T, C = x.shape + torch.zeros_like(x) + if self.mask_prob > 0: + mask_indices = compute_mask_indices( + (B, T), + padding_mask, + self.mask_prob, + self.mask_length, + self.mask_selection, + self.mask_other, + min_masks=2, + no_overlap=self.no_mask_overlap, + min_space=self.mask_min_space, + ) + mask_indices = mask_indices.to(x.device) + x[mask_indices] = self.mask_emb + else: + mask_indices = None + + if self.mask_channel_prob > 0: + mask_channel_indices = compute_mask_indices( + (B, C), + None, + self.mask_channel_prob, + self.mask_channel_length, + self.mask_channel_selection, + self.mask_channel_other, + no_overlap=self.no_mask_channel_overlap, + min_space=self.mask_channel_min_space, + ) + mask_channel_indices = ( + mask_channel_indices.to(x.device).unsqueeze(1).expand(-1, T, -1) + ) + x[mask_channel_indices] = 0 + + return x, mask_indices + + +def get_hubert_model( + model_path="assets/hubert/hubert_base.pt", device=torch.device("cpu") +): + models, _, _ = load_model_ensemble_and_task( + [model_path], + suffix="", + ) + hubert_model = models[0] + hubert_model = hubert_model.to(device) + + def _apply_mask(x, padding_mask, target_list): + return apply_mask(hubert_model, x, padding_mask, target_list) + + hubert_model.apply_mask = _apply_mask + + def _extract_features( + x, + padding_mask=None, + tgt_layer=None, + min_layer=0, + ): + return extract_features( + hubert_model.encoder, + x, + padding_mask=padding_mask, + tgt_layer=tgt_layer, + min_layer=min_layer, + ) + + hubert_model.encoder.extract_features = _extract_features + + hubert_model._forward = hubert_model.forward + + def hubert_extract_features( + self, + source: torch.Tensor, + padding_mask: Optional[torch.Tensor] = None, + mask: bool = False, + ret_conv: bool = False, + output_layer: Optional[int] = None, + ) -> Tuple[torch.Tensor, torch.Tensor]: + res = self._forward( + source, + padding_mask=padding_mask, + mask=mask, + 
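# Usage sketch (not part of the patch) for get_hubert_model / infer above.  It assumes
# fairseq and the assets/hubert/hubert_base.pt checkpoint are available.  For
# output_layer == 9 the features go through final_proj (typically 256-dimensional for
# the checkpoint RVC v1 models use); other layers return the raw 768-d encoder output.
import torch
from infer.lib.jit.get_hubert import get_hubert_model

device = torch.device("cpu")
hubert = get_hubert_model("assets/hubert/hubert_base.pt", device)

wav16k = torch.randn(1, 16000, device=device)               # stand-in for 1 s of 16 kHz audio
padding_mask = torch.zeros_like(wav16k, dtype=torch.bool)   # nothing is padding
feats = hubert.infer(wav16k, padding_mask, torch.tensor(9))
print(feats.shape)                                          # roughly (1, 49, 256)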
features_only=True, + output_layer=output_layer, + ) + feature = res["features"] if ret_conv else res["x"] + return feature, res["padding_mask"] + + def _hubert_extract_features( + source: torch.Tensor, + padding_mask: Optional[torch.Tensor] = None, + mask: bool = False, + ret_conv: bool = False, + output_layer: Optional[int] = None, + ) -> Tuple[torch.Tensor, torch.Tensor]: + return hubert_extract_features( + hubert_model, source, padding_mask, mask, ret_conv, output_layer + ) + + hubert_model.extract_features = _hubert_extract_features + + def infer(source, padding_mask, output_layer: torch.Tensor): + output_layer = output_layer.item() + logits = hubert_model.extract_features( + source=source, padding_mask=padding_mask, output_layer=output_layer + ) + feats = hubert_model.final_proj(logits[0]) if output_layer == 9 else logits[0] + return feats + + hubert_model.infer = infer + # hubert_model.forward=infer + # hubert_model.forward + + return hubert_model diff --git a/infer/lib/jit/get_rmvpe.py b/infer/lib/jit/get_rmvpe.py new file mode 100644 index 0000000000000000000000000000000000000000..d6fb6b813f235519a13539edb3e8f88600ddb649 --- /dev/null +++ b/infer/lib/jit/get_rmvpe.py @@ -0,0 +1,12 @@ +import torch + + +def get_rmvpe(model_path="assets/rmvpe/rmvpe.pt", device=torch.device("cpu")): + from infer.lib.rmvpe import E2E + + model = E2E(4, 1, (2, 2)) + ckpt = torch.load(model_path, map_location=device, weights_only=False) + model.load_state_dict(ckpt) + model.eval() + model = model.to(device) + return model diff --git a/infer/lib/jit/get_synthesizer.py b/infer/lib/jit/get_synthesizer.py new file mode 100644 index 0000000000000000000000000000000000000000..d0e37e36670473b1aa20fe169a33fb696d865539 --- /dev/null +++ b/infer/lib/jit/get_synthesizer.py @@ -0,0 +1,38 @@ +import torch + + +def get_synthesizer(pth_path, device=torch.device("cpu")): + from infer.lib.infer_pack.models import ( + SynthesizerTrnMs256NSFsid, + SynthesizerTrnMs256NSFsid_nono, + SynthesizerTrnMs768NSFsid, + SynthesizerTrnMs768NSFsid_nono, + ) + + cpt = torch.load(pth_path, map_location=torch.device("cpu"), weights_only=False) + # tgt_sr = cpt["config"][-1] + cpt["config"][-3] = cpt["weight"]["emb_g.weight"].shape[0] + if_f0 = cpt.get("f0", 1) + version = cpt.get("version", "v1") + if version == "v1": + if if_f0 == 1: + net_g = SynthesizerTrnMs256NSFsid(*cpt["config"], is_half=False) + else: + net_g = SynthesizerTrnMs256NSFsid_nono(*cpt["config"]) + elif version == "v2": + if if_f0 == 1: + net_g = SynthesizerTrnMs768NSFsid(*cpt["config"], is_half=False) + else: + net_g = SynthesizerTrnMs768NSFsid_nono(*cpt["config"]) + del net_g.enc_q + # net_g.forward = net_g.infer + # ckpt = {} + # ckpt["config"] = cpt["config"] + # ckpt["f0"] = if_f0 + # ckpt["version"] = version + # ckpt["info"] = cpt.get("info", "0epoch") + net_g.load_state_dict(cpt["weight"], strict=False) + net_g = net_g.float() + net_g.eval().to(device) + net_g.remove_weight_norm() + return net_g, cpt diff --git a/infer/lib/rmvpe.py b/infer/lib/rmvpe.py new file mode 100644 index 0000000000000000000000000000000000000000..6c4082079475a4c3a15ecf33f3c56270099dbf12 --- /dev/null +++ b/infer/lib/rmvpe.py @@ -0,0 +1,670 @@ +from io import BytesIO +import os +from typing import List, Optional, Tuple +import numpy as np +import torch + +from infer.lib import jit + +try: + # Fix "Torch not compiled with CUDA enabled" + import intel_extension_for_pytorch as ipex # pylint: disable=import-error, unused-import + + if torch.xpu.is_available(): + from infer.modules.ipex import 
ipex_init + + ipex_init() +except Exception: # pylint: disable=broad-exception-caught + pass +import torch.nn as nn +import torch.nn.functional as F +from librosa.util import normalize, pad_center, tiny +from scipy.signal import get_window + +import logging + +logger = logging.getLogger(__name__) + + +class STFT(torch.nn.Module): + def __init__( + self, filter_length=1024, hop_length=512, win_length=None, window="hann" + ): + """ + This module implements an STFT using 1D convolution and 1D transpose convolutions. + This is a bit tricky so there are some cases that probably won't work as working + out the same sizes before and after in all overlap add setups is tough. Right now, + this code should work with hop lengths that are half the filter length (50% overlap + between frames). + + Keyword Arguments: + filter_length {int} -- Length of filters used (default: {1024}) + hop_length {int} -- Hop length of STFT (restrict to 50% overlap between frames) (default: {512}) + win_length {[type]} -- Length of the window function applied to each frame (if not specified, it + equals the filter length). (default: {None}) + window {str} -- Type of window to use (options are bartlett, hann, hamming, blackman, blackmanharris) + (default: {'hann'}) + """ + super(STFT, self).__init__() + self.filter_length = filter_length + self.hop_length = hop_length + self.win_length = win_length if win_length else filter_length + self.window = window + self.forward_transform = None + self.pad_amount = int(self.filter_length / 2) + fourier_basis = np.fft.fft(np.eye(self.filter_length)) + + cutoff = int((self.filter_length / 2 + 1)) + fourier_basis = np.vstack( + [np.real(fourier_basis[:cutoff, :]), np.imag(fourier_basis[:cutoff, :])] + ) + forward_basis = torch.FloatTensor(fourier_basis) + inverse_basis = torch.FloatTensor(np.linalg.pinv(fourier_basis)) + + assert filter_length >= self.win_length + # get window and zero center pad it to filter_length + fft_window = get_window(window, self.win_length, fftbins=True) + fft_window = pad_center(fft_window, size=filter_length) + fft_window = torch.from_numpy(fft_window).float() + + # window the bases + forward_basis *= fft_window + inverse_basis = (inverse_basis.T * fft_window).T + + self.register_buffer("forward_basis", forward_basis.float()) + self.register_buffer("inverse_basis", inverse_basis.float()) + self.register_buffer("fft_window", fft_window.float()) + + def transform(self, input_data, return_phase=False): + """Take input data (audio) to STFT domain. 
+ + Arguments: + input_data {tensor} -- Tensor of floats, with shape (num_batch, num_samples) + + Returns: + magnitude {tensor} -- Magnitude of STFT with shape (num_batch, + num_frequencies, num_frames) + phase {tensor} -- Phase of STFT with shape (num_batch, + num_frequencies, num_frames) + """ + input_data = F.pad( + input_data, + (self.pad_amount, self.pad_amount), + mode="reflect", + ) + forward_transform = input_data.unfold( + 1, self.filter_length, self.hop_length + ).permute(0, 2, 1) + forward_transform = torch.matmul(self.forward_basis, forward_transform) + cutoff = int((self.filter_length / 2) + 1) + real_part = forward_transform[:, :cutoff, :] + imag_part = forward_transform[:, cutoff:, :] + magnitude = torch.sqrt(real_part**2 + imag_part**2) + if return_phase: + phase = torch.atan2(imag_part.data, real_part.data) + return magnitude, phase + else: + return magnitude + + def inverse(self, magnitude, phase): + """Call the inverse STFT (iSTFT), given magnitude and phase tensors produced + by the ```transform``` function. + + Arguments: + magnitude {tensor} -- Magnitude of STFT with shape (num_batch, + num_frequencies, num_frames) + phase {tensor} -- Phase of STFT with shape (num_batch, + num_frequencies, num_frames) + + Returns: + inverse_transform {tensor} -- Reconstructed audio given magnitude and phase. Of + shape (num_batch, num_samples) + """ + cat = torch.cat( + [magnitude * torch.cos(phase), magnitude * torch.sin(phase)], dim=1 + ) + fold = torch.nn.Fold( + output_size=(1, (cat.size(-1) - 1) * self.hop_length + self.filter_length), + kernel_size=(1, self.filter_length), + stride=(1, self.hop_length), + ) + inverse_transform = torch.matmul(self.inverse_basis, cat) + inverse_transform = fold(inverse_transform)[ + :, 0, 0, self.pad_amount : -self.pad_amount + ] + window_square_sum = ( + self.fft_window.pow(2).repeat(cat.size(-1), 1).T.unsqueeze(0) + ) + window_square_sum = fold(window_square_sum)[ + :, 0, 0, self.pad_amount : -self.pad_amount + ] + inverse_transform /= window_square_sum + return inverse_transform + + def forward(self, input_data): + """Take input data (audio) to STFT domain and then back to audio. + + Arguments: + input_data {tensor} -- Tensor of floats, with shape (num_batch, num_samples) + + Returns: + reconstruction {tensor} -- Reconstructed audio given magnitude and phase. 
Of + shape (num_batch, num_samples) + """ + self.magnitude, self.phase = self.transform(input_data, return_phase=True) + reconstruction = self.inverse(self.magnitude, self.phase) + return reconstruction + + +from time import time as ttime + + +class BiGRU(nn.Module): + def __init__(self, input_features, hidden_features, num_layers): + super(BiGRU, self).__init__() + self.gru = nn.GRU( + input_features, + hidden_features, + num_layers=num_layers, + batch_first=True, + bidirectional=True, + ) + + def forward(self, x): + return self.gru(x)[0] + + +class ConvBlockRes(nn.Module): + def __init__(self, in_channels, out_channels, momentum=0.01): + super(ConvBlockRes, self).__init__() + self.conv = nn.Sequential( + nn.Conv2d( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=(3, 3), + stride=(1, 1), + padding=(1, 1), + bias=False, + ), + nn.BatchNorm2d(out_channels, momentum=momentum), + nn.ReLU(), + nn.Conv2d( + in_channels=out_channels, + out_channels=out_channels, + kernel_size=(3, 3), + stride=(1, 1), + padding=(1, 1), + bias=False, + ), + nn.BatchNorm2d(out_channels, momentum=momentum), + nn.ReLU(), + ) + # self.shortcut:Optional[nn.Module] = None + if in_channels != out_channels: + self.shortcut = nn.Conv2d(in_channels, out_channels, (1, 1)) + + def forward(self, x: torch.Tensor): + if not hasattr(self, "shortcut"): + return self.conv(x) + x + else: + return self.conv(x) + self.shortcut(x) + + +class Encoder(nn.Module): + def __init__( + self, + in_channels, + in_size, + n_encoders, + kernel_size, + n_blocks, + out_channels=16, + momentum=0.01, + ): + super(Encoder, self).__init__() + self.n_encoders = n_encoders + self.bn = nn.BatchNorm2d(in_channels, momentum=momentum) + self.layers = nn.ModuleList() + self.latent_channels = [] + for i in range(self.n_encoders): + self.layers.append( + ResEncoderBlock( + in_channels, out_channels, kernel_size, n_blocks, momentum=momentum + ) + ) + self.latent_channels.append([out_channels, in_size]) + in_channels = out_channels + out_channels *= 2 + in_size //= 2 + self.out_size = in_size + self.out_channel = out_channels + + def forward(self, x: torch.Tensor): + concat_tensors: List[torch.Tensor] = [] + x = self.bn(x) + for i, layer in enumerate(self.layers): + t, x = layer(x) + concat_tensors.append(t) + return x, concat_tensors + + +class ResEncoderBlock(nn.Module): + def __init__( + self, in_channels, out_channels, kernel_size, n_blocks=1, momentum=0.01 + ): + super(ResEncoderBlock, self).__init__() + self.n_blocks = n_blocks + self.conv = nn.ModuleList() + self.conv.append(ConvBlockRes(in_channels, out_channels, momentum)) + for i in range(n_blocks - 1): + self.conv.append(ConvBlockRes(out_channels, out_channels, momentum)) + self.kernel_size = kernel_size + if self.kernel_size is not None: + self.pool = nn.AvgPool2d(kernel_size=kernel_size) + + def forward(self, x): + for i, conv in enumerate(self.conv): + x = conv(x) + if self.kernel_size is not None: + return x, self.pool(x) + else: + return x + + +class Intermediate(nn.Module): # + def __init__(self, in_channels, out_channels, n_inters, n_blocks, momentum=0.01): + super(Intermediate, self).__init__() + self.n_inters = n_inters + self.layers = nn.ModuleList() + self.layers.append( + ResEncoderBlock(in_channels, out_channels, None, n_blocks, momentum) + ) + for i in range(self.n_inters - 1): + self.layers.append( + ResEncoderBlock(out_channels, out_channels, None, n_blocks, momentum) + ) + + def forward(self, x): + for i, layer in enumerate(self.layers): + x = layer(x) + return x 
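(Illustrative aside, not part of the patch: the blocks above make up the encoder half of RMVPE's DeepUnet. A minimal shape check, assuming the default E2E configuration of five encoder stages with n_blocks=4 and this module importable as infer.lib.rmvpe, shows how every stage halves both spatial axes while doubling the channels, which is also why mel2hidden() further down pads the frame axis to a multiple of 32 = 2**5.)

import torch
from infer.lib.rmvpe import Encoder

# mel enters E2E.forward laid out as [batch, 1, frames, n_mels=128]
enc = Encoder(in_channels=1, in_size=128, n_encoders=5, kernel_size=(2, 2), n_blocks=4)
mel = torch.randn(1, 1, 32, 128)
out, skips = enc(mel)
print(out.shape)                    # torch.Size([1, 256, 1, 4])
print([s.shape[1] for s in skips])  # [16, 32, 64, 128, 256]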
+ + +class ResDecoderBlock(nn.Module): + def __init__(self, in_channels, out_channels, stride, n_blocks=1, momentum=0.01): + super(ResDecoderBlock, self).__init__() + out_padding = (0, 1) if stride == (1, 2) else (1, 1) + self.n_blocks = n_blocks + self.conv1 = nn.Sequential( + nn.ConvTranspose2d( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=(3, 3), + stride=stride, + padding=(1, 1), + output_padding=out_padding, + bias=False, + ), + nn.BatchNorm2d(out_channels, momentum=momentum), + nn.ReLU(), + ) + self.conv2 = nn.ModuleList() + self.conv2.append(ConvBlockRes(out_channels * 2, out_channels, momentum)) + for i in range(n_blocks - 1): + self.conv2.append(ConvBlockRes(out_channels, out_channels, momentum)) + + def forward(self, x, concat_tensor): + x = self.conv1(x) + x = torch.cat((x, concat_tensor), dim=1) + for i, conv2 in enumerate(self.conv2): + x = conv2(x) + return x + + +class Decoder(nn.Module): + def __init__(self, in_channels, n_decoders, stride, n_blocks, momentum=0.01): + super(Decoder, self).__init__() + self.layers = nn.ModuleList() + self.n_decoders = n_decoders + for i in range(self.n_decoders): + out_channels = in_channels // 2 + self.layers.append( + ResDecoderBlock(in_channels, out_channels, stride, n_blocks, momentum) + ) + in_channels = out_channels + + def forward(self, x: torch.Tensor, concat_tensors: List[torch.Tensor]): + for i, layer in enumerate(self.layers): + x = layer(x, concat_tensors[-1 - i]) + return x + + +class DeepUnet(nn.Module): + def __init__( + self, + kernel_size, + n_blocks, + en_de_layers=5, + inter_layers=4, + in_channels=1, + en_out_channels=16, + ): + super(DeepUnet, self).__init__() + self.encoder = Encoder( + in_channels, 128, en_de_layers, kernel_size, n_blocks, en_out_channels + ) + self.intermediate = Intermediate( + self.encoder.out_channel // 2, + self.encoder.out_channel, + inter_layers, + n_blocks, + ) + self.decoder = Decoder( + self.encoder.out_channel, en_de_layers, kernel_size, n_blocks + ) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + x, concat_tensors = self.encoder(x) + x = self.intermediate(x) + x = self.decoder(x, concat_tensors) + return x + + +class E2E(nn.Module): + def __init__( + self, + n_blocks, + n_gru, + kernel_size, + en_de_layers=5, + inter_layers=4, + in_channels=1, + en_out_channels=16, + ): + super(E2E, self).__init__() + self.unet = DeepUnet( + kernel_size, + n_blocks, + en_de_layers, + inter_layers, + in_channels, + en_out_channels, + ) + self.cnn = nn.Conv2d(en_out_channels, 3, (3, 3), padding=(1, 1)) + if n_gru: + self.fc = nn.Sequential( + BiGRU(3 * 128, 256, n_gru), + nn.Linear(512, 360), + nn.Dropout(0.25), + nn.Sigmoid(), + ) + else: + self.fc = nn.Sequential( + nn.Linear(3 * nn.N_MELS, nn.N_CLASS), nn.Dropout(0.25), nn.Sigmoid() + ) + + def forward(self, mel): + # print(mel.shape) + mel = mel.transpose(-1, -2).unsqueeze(1) + x = self.cnn(self.unet(mel)).transpose(1, 2).flatten(-2) + x = self.fc(x) + # print(x.shape) + return x + + +from librosa.filters import mel + + +class MelSpectrogram(torch.nn.Module): + def __init__( + self, + is_half, + n_mel_channels, + sampling_rate, + win_length, + hop_length, + n_fft=None, + mel_fmin=0, + mel_fmax=None, + clamp=1e-5, + ): + super().__init__() + n_fft = win_length if n_fft is None else n_fft + self.hann_window = {} + mel_basis = mel( + sr=sampling_rate, + n_fft=n_fft, + n_mels=n_mel_channels, + fmin=mel_fmin, + fmax=mel_fmax, + htk=True, + ) + mel_basis = torch.from_numpy(mel_basis).float() + self.register_buffer("mel_basis", 
mel_basis) + self.n_fft = win_length if n_fft is None else n_fft + self.hop_length = hop_length + self.win_length = win_length + self.sampling_rate = sampling_rate + self.n_mel_channels = n_mel_channels + self.clamp = clamp + self.is_half = is_half + + def forward(self, audio, keyshift=0, speed=1, center=True): + factor = 2 ** (keyshift / 12) + n_fft_new = int(np.round(self.n_fft * factor)) + win_length_new = int(np.round(self.win_length * factor)) + hop_length_new = int(np.round(self.hop_length * speed)) + keyshift_key = str(keyshift) + "_" + str(audio.device) + if keyshift_key not in self.hann_window: + self.hann_window[keyshift_key] = torch.hann_window(win_length_new).to( + audio.device + ) + if "privateuseone" in str(audio.device): + if not hasattr(self, "stft"): + self.stft = STFT( + filter_length=n_fft_new, + hop_length=hop_length_new, + win_length=win_length_new, + window="hann", + ).to(audio.device) + magnitude = self.stft.transform(audio) + else: + fft = torch.stft( + audio, + n_fft=n_fft_new, + hop_length=hop_length_new, + win_length=win_length_new, + window=self.hann_window[keyshift_key], + center=center, + return_complex=True, + ) + magnitude = torch.sqrt(fft.real.pow(2) + fft.imag.pow(2)) + if keyshift != 0: + size = self.n_fft // 2 + 1 + resize = magnitude.size(1) + if resize < size: + magnitude = F.pad(magnitude, (0, 0, 0, size - resize)) + magnitude = magnitude[:, :size, :] * self.win_length / win_length_new + mel_output = torch.matmul(self.mel_basis, magnitude) + if self.is_half == True: + mel_output = mel_output.half() + log_mel_spec = torch.log(torch.clamp(mel_output, min=self.clamp)) + return log_mel_spec + + +class RMVPE: + def __init__(self, model_path: str, is_half, device=None, use_jit=False): + self.resample_kernel = {} + self.resample_kernel = {} + self.is_half = is_half + if device is None: + device = "cuda:0" if torch.cuda.is_available() else "cpu" + self.device = device + self.mel_extractor = MelSpectrogram( + is_half, 128, 16000, 1024, 160, None, 30, 8000 + ).to(device) + if "privateuseone" in str(device): + import onnxruntime as ort + + ort_session = ort.InferenceSession( + "%s/rmvpe.onnx" % os.environ["rmvpe_root"], + providers=["DmlExecutionProvider"], + ) + self.model = ort_session + else: + if str(self.device) == "cuda": + self.device = torch.device("cuda:0") + + def get_jit_model(): + jit_model_path = model_path.rstrip(".pth") + jit_model_path += ".half.jit" if is_half else ".jit" + reload = False + if os.path.exists(jit_model_path): + ckpt = jit.load(jit_model_path) + model_device = ckpt["device"] + if model_device != str(self.device): + reload = True + else: + reload = True + + if reload: + ckpt = jit.rmvpe_jit_export( + model_path=model_path, + mode="script", + inputs_path=None, + save_path=jit_model_path, + device=device, + is_half=is_half, + ) + model = torch.jit.load(BytesIO(ckpt["model"]), map_location=device) + return model + + def get_default_model(): + model = E2E(4, 1, (2, 2)) + ckpt = torch.load(model_path, map_location="cpu", weights_only=False) + model.load_state_dict(ckpt) + model.eval() + if is_half: + model = model.half() + else: + model = model.float() + return model + + if use_jit: + if is_half and "cpu" in str(self.device): + logger.warning( + "Use default rmvpe model. 
\ + Jit is not supported on the CPU for half floating point" + ) + self.model = get_default_model() + else: + self.model = get_jit_model() + else: + self.model = get_default_model() + + self.model = self.model.to(device) + cents_mapping = 20 * np.arange(360) + 1997.3794084376191 + self.cents_mapping = np.pad(cents_mapping, (4, 4)) # 368 + + def mel2hidden(self, mel): + with torch.no_grad(): + n_frames = mel.shape[-1] + n_pad = 32 * ((n_frames - 1) // 32 + 1) - n_frames + if n_pad > 0: + mel = F.pad(mel, (0, n_pad), mode="constant") + if "privateuseone" in str(self.device): + onnx_input_name = self.model.get_inputs()[0].name + onnx_outputs_names = self.model.get_outputs()[0].name + hidden = self.model.run( + [onnx_outputs_names], + input_feed={onnx_input_name: mel.cpu().numpy()}, + )[0] + else: + mel = mel.half() if self.is_half else mel.float() + hidden = self.model(mel) + return hidden[:, :n_frames] + + def decode(self, hidden, thred=0.03): + cents_pred = self.to_local_average_cents(hidden, thred=thred) + f0 = 10 * (2 ** (cents_pred / 1200)) + f0[f0 == 10] = 0 + # f0 = np.array([10 * (2 ** (cent_pred / 1200)) if cent_pred else 0 for cent_pred in cents_pred]) + return f0 + + def infer_from_audio(self, audio, thred=0.03): + # torch.cuda.synchronize() + # t0 = ttime() + if not torch.is_tensor(audio): + audio = torch.from_numpy(audio) + mel = self.mel_extractor( + audio.float().to(self.device).unsqueeze(0), center=True + ) + # print(123123123,mel.device.type) + # torch.cuda.synchronize() + # t1 = ttime() + hidden = self.mel2hidden(mel) + # torch.cuda.synchronize() + # t2 = ttime() + # print(234234,hidden.device.type) + if "privateuseone" not in str(self.device): + hidden = hidden.squeeze(0).cpu().numpy() + else: + hidden = hidden[0] + if self.is_half == True: + hidden = hidden.astype("float32") + + f0 = self.decode(hidden, thred=thred) + # torch.cuda.synchronize() + # t3 = ttime() + # print("hmvpe:%s\t%s\t%s\t%s"%(t1-t0,t2-t1,t3-t2,t3-t0)) + return f0 + + def to_local_average_cents(self, salience, thred=0.05): + # t0 = ttime() + center = np.argmax(salience, axis=1) # 帧长#index + salience = np.pad(salience, ((0, 0), (4, 4))) # 帧长,368 + # t1 = ttime() + center += 4 + todo_salience = [] + todo_cents_mapping = [] + starts = center - 4 + ends = center + 5 + for idx in range(salience.shape[0]): + todo_salience.append(salience[:, starts[idx] : ends[idx]][idx]) + todo_cents_mapping.append(self.cents_mapping[starts[idx] : ends[idx]]) + # t2 = ttime() + todo_salience = np.array(todo_salience) # 帧长,9 + todo_cents_mapping = np.array(todo_cents_mapping) # 帧长,9 + product_sum = np.sum(todo_salience * todo_cents_mapping, 1) + weight_sum = np.sum(todo_salience, 1) # 帧长 + devided = product_sum / weight_sum # 帧长 + # t3 = ttime() + maxx = np.max(salience, axis=1) # 帧长 + devided[maxx <= thred] = 0 + # t4 = ttime() + # print("decode:%s\t%s\t%s\t%s" % (t1 - t0, t2 - t1, t3 - t2, t4 - t3)) + return devided + + +if __name__ == "__main__": + import librosa + import soundfile as sf + + audio, sampling_rate = sf.read(r"C:\Users\liujing04\Desktop\Z\冬之花clip1.wav") + if len(audio.shape) > 1: + audio = librosa.to_mono(audio.transpose(1, 0)) + audio_bak = audio.copy() + if sampling_rate != 16000: + audio = librosa.resample(audio, orig_sr=sampling_rate, target_sr=16000) + model_path = r"D:\BaiduNetdiskDownload\RVC-beta-v2-0727AMD_realtime\rmvpe.pt" + thred = 0.03 # 0.01 + device = "cuda" if torch.cuda.is_available() else "cpu" + rmvpe = RMVPE(model_path, is_half=False, device=device) + t0 = ttime() + f0 = 
rmvpe.infer_from_audio(audio, thred=thred) + # f0 = rmvpe.infer_from_audio(audio, thred=thred) + # f0 = rmvpe.infer_from_audio(audio, thred=thred) + # f0 = rmvpe.infer_from_audio(audio, thred=thred) + # f0 = rmvpe.infer_from_audio(audio, thred=thred) + t1 = ttime() + logger.info("%s %.2f", f0.shape, t1 - t0) diff --git a/infer/lib/rtrvc.py b/infer/lib/rtrvc.py new file mode 100644 index 0000000000000000000000000000000000000000..8da568c7a3af141177ab0799718af81ecf01ac76 --- /dev/null +++ b/infer/lib/rtrvc.py @@ -0,0 +1,461 @@ +from io import BytesIO +import os +import sys +import traceback +from infer.lib import jit +from infer.lib.jit.get_synthesizer import get_synthesizer +from time import time as ttime +import fairseq +import faiss +import numpy as np +import parselmouth +import pyworld +import scipy.signal as signal +import torch +import torch.nn as nn +import torch.nn.functional as F +import torchcrepe +from torchaudio.transforms import Resample + +now_dir = os.getcwd() +sys.path.append(now_dir) +from multiprocessing import Manager as M + +from configs.config import Config + +# config = Config() + +mm = M() + + +def printt(strr, *args): + if len(args) == 0: + print(strr) + else: + print(strr % args) + + +# config.device=torch.device("cpu")########强制cpu测试 +# config.is_half=False########强制cpu测试 +class RVC: + def __init__( + self, + key, + formant, + pth_path, + index_path, + index_rate, + n_cpu, + inp_q, + opt_q, + config: Config, + last_rvc=None, + ) -> None: + """ + 初始化 + """ + try: + if config.dml == True: + + def forward_dml(ctx, x, scale): + ctx.scale = scale + res = x.clone().detach() + return res + + fairseq.modules.grad_multiply.GradMultiply.forward = forward_dml + # global config + self.config = config + self.inp_q = inp_q + self.opt_q = opt_q + # device="cpu"########强制cpu测试 + self.device = config.device + self.f0_up_key = key + self.formant_shift = formant + self.f0_min = 50 + self.f0_max = 1100 + self.f0_mel_min = 1127 * np.log(1 + self.f0_min / 700) + self.f0_mel_max = 1127 * np.log(1 + self.f0_max / 700) + self.n_cpu = n_cpu + self.use_jit = self.config.use_jit + self.is_half = config.is_half + + if index_rate != 0: + self.index = faiss.read_index(index_path) + self.big_npy = self.index.reconstruct_n(0, self.index.ntotal) + printt("Index search enabled") + self.pth_path: str = pth_path + self.index_path = index_path + self.index_rate = index_rate + self.cache_pitch: torch.Tensor = torch.zeros( + 1024, device=self.device, dtype=torch.long + ) + self.cache_pitchf = torch.zeros( + 1024, device=self.device, dtype=torch.float32 + ) + + self.resample_kernel = {} + + if last_rvc is None: + models, _, _ = fairseq.checkpoint_utils.load_model_ensemble_and_task( + ["assets/hubert/hubert_base.pt"], + suffix="", + ) + hubert_model = models[0] + hubert_model = hubert_model.to(self.device) + if self.is_half: + hubert_model = hubert_model.half() + else: + hubert_model = hubert_model.float() + hubert_model.eval() + self.model = hubert_model + else: + self.model = last_rvc.model + + self.net_g: nn.Module = None + + def set_default_model(): + self.net_g, cpt = get_synthesizer(self.pth_path, self.device) + self.tgt_sr = cpt["config"][-1] + cpt["config"][-3] = cpt["weight"]["emb_g.weight"].shape[0] + self.if_f0 = cpt.get("f0", 1) + self.version = cpt.get("version", "v1") + if self.is_half: + self.net_g = self.net_g.half() + else: + self.net_g = self.net_g.float() + + def set_jit_model(): + jit_pth_path = self.pth_path.rstrip(".pth") + jit_pth_path += ".half.jit" if self.is_half else ".jit" + 
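+ # Re-use a cached TorchScript export when it exists and was built for this device; otherwise fall through to jit.synthesizer_jit_export below.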
reload = False + if str(self.device) == "cuda": + self.device = torch.device("cuda:0") + if os.path.exists(jit_pth_path): + cpt = jit.load(jit_pth_path) + model_device = cpt["device"] + if model_device != str(self.device): + reload = True + else: + reload = True + + if reload: + cpt = jit.synthesizer_jit_export( + self.pth_path, + "script", + None, + device=self.device, + is_half=self.is_half, + ) + + self.tgt_sr = cpt["config"][-1] + self.if_f0 = cpt.get("f0", 1) + self.version = cpt.get("version", "v1") + self.net_g = torch.jit.load( + BytesIO(cpt["model"]), map_location=self.device + ) + self.net_g.infer = self.net_g.forward + self.net_g.eval().to(self.device) + + def set_synthesizer(): + if self.use_jit and not config.dml: + if self.is_half and "cpu" in str(self.device): + printt( + "Use default Synthesizer model. \ + Jit is not supported on the CPU for half floating point" + ) + set_default_model() + else: + set_jit_model() + else: + set_default_model() + + if last_rvc is None or last_rvc.pth_path != self.pth_path: + set_synthesizer() + else: + self.tgt_sr = last_rvc.tgt_sr + self.if_f0 = last_rvc.if_f0 + self.version = last_rvc.version + self.is_half = last_rvc.is_half + if last_rvc.use_jit != self.use_jit: + set_synthesizer() + else: + self.net_g = last_rvc.net_g + + if last_rvc is not None and hasattr(last_rvc, "model_rmvpe"): + self.model_rmvpe = last_rvc.model_rmvpe + if last_rvc is not None and hasattr(last_rvc, "model_fcpe"): + self.device_fcpe = last_rvc.device_fcpe + self.model_fcpe = last_rvc.model_fcpe + except: + printt(traceback.format_exc()) + + def change_key(self, new_key): + self.f0_up_key = new_key + + def change_formant(self, new_formant): + self.formant_shift = new_formant + + def change_index_rate(self, new_index_rate): + if new_index_rate != 0 and self.index_rate == 0: + self.index = faiss.read_index(self.index_path) + self.big_npy = self.index.reconstruct_n(0, self.index.ntotal) + printt("Index search enabled") + self.index_rate = new_index_rate + + def get_f0_post(self, f0): + if not torch.is_tensor(f0): + f0 = torch.from_numpy(f0) + f0 = f0.float().to(self.device).squeeze() + f0_mel = 1127 * torch.log(1 + f0 / 700) + f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - self.f0_mel_min) * 254 / ( + self.f0_mel_max - self.f0_mel_min + ) + 1 + f0_mel[f0_mel <= 1] = 1 + f0_mel[f0_mel > 255] = 255 + f0_coarse = torch.round(f0_mel).long() + return f0_coarse, f0 + + def get_f0(self, x, f0_up_key, n_cpu, method="harvest"): + n_cpu = int(n_cpu) + if method == "crepe": + return self.get_f0_crepe(x, f0_up_key) + if method == "rmvpe": + return self.get_f0_rmvpe(x, f0_up_key) + if method == "fcpe": + return self.get_f0_fcpe(x, f0_up_key) + x = x.cpu().numpy() + if method == "pm": + p_len = x.shape[0] // 160 + 1 + f0_min = 65 + l_pad = int(np.ceil(1.5 / f0_min * 16000)) + r_pad = l_pad + 1 + s = parselmouth.Sound(np.pad(x, (l_pad, r_pad)), 16000).to_pitch_ac( + time_step=0.01, + voicing_threshold=0.6, + pitch_floor=f0_min, + pitch_ceiling=1100, + ) + assert np.abs(s.t1 - 1.5 / f0_min) < 0.001 + f0 = s.selected_array["frequency"] + if len(f0) < p_len: + f0 = np.pad(f0, (0, p_len - len(f0))) + f0 = f0[:p_len] + f0 *= pow(2, f0_up_key / 12) + return self.get_f0_post(f0) + if n_cpu == 1: + f0, t = pyworld.harvest( + x.astype(np.double), + fs=16000, + f0_ceil=1100, + f0_floor=50, + frame_period=10, + ) + f0 = signal.medfilt(f0, 3) + f0 *= pow(2, f0_up_key / 12) + return self.get_f0_post(f0) + f0bak = np.zeros(x.shape[0] // 160 + 1, dtype=np.float64) + length = len(x) + part_length = 160 * 
((length // 160 - 1) // n_cpu + 1) + n_cpu = (length // 160 - 1) // (part_length // 160) + 1 + ts = ttime() + res_f0 = mm.dict() + for idx in range(n_cpu): + tail = part_length * (idx + 1) + 320 + if idx == 0: + self.inp_q.put((idx, x[:tail], res_f0, n_cpu, ts)) + else: + self.inp_q.put( + (idx, x[part_length * idx - 320 : tail], res_f0, n_cpu, ts) + ) + while 1: + res_ts = self.opt_q.get() + if res_ts == ts: + break + f0s = [i[1] for i in sorted(res_f0.items(), key=lambda x: x[0])] + for idx, f0 in enumerate(f0s): + if idx == 0: + f0 = f0[:-3] + elif idx != n_cpu - 1: + f0 = f0[2:-3] + else: + f0 = f0[2:] + f0bak[part_length * idx // 160 : part_length * idx // 160 + f0.shape[0]] = ( + f0 + ) + f0bak = signal.medfilt(f0bak, 3) + f0bak *= pow(2, f0_up_key / 12) + return self.get_f0_post(f0bak) + + def get_f0_crepe(self, x, f0_up_key): + if "privateuseone" in str( + self.device + ): ###不支持dml,cpu又太慢用不成,拿fcpe顶替 + return self.get_f0(x, f0_up_key, 1, "fcpe") + # printt("using crepe,device:%s"%self.device) + f0, pd = torchcrepe.predict( + x.unsqueeze(0).float(), + 16000, + 160, + self.f0_min, + self.f0_max, + "full", + batch_size=512, + # device=self.device if self.device.type!="privateuseone" else "cpu",###crepe不用半精度全部是全精度所以不愁###cpu延迟高到没法用 + device=self.device, + return_periodicity=True, + ) + pd = torchcrepe.filter.median(pd, 3) + f0 = torchcrepe.filter.mean(f0, 3) + f0[pd < 0.1] = 0 + f0 *= pow(2, f0_up_key / 12) + return self.get_f0_post(f0) + + def get_f0_rmvpe(self, x, f0_up_key): + if hasattr(self, "model_rmvpe") == False: + from infer.lib.rmvpe import RMVPE + + printt("Loading rmvpe model") + self.model_rmvpe = RMVPE( + "assets/rmvpe/rmvpe.pt", + is_half=self.is_half, + device=self.device, + use_jit=self.config.use_jit, + ) + f0 = self.model_rmvpe.infer_from_audio(x, thred=0.03) + f0 *= pow(2, f0_up_key / 12) + return self.get_f0_post(f0) + + def get_f0_fcpe(self, x, f0_up_key): + if hasattr(self, "model_fcpe") == False: + from torchfcpe import spawn_bundled_infer_model + + printt("Loading fcpe model") + if "privateuseone" in str(self.device): + self.device_fcpe = "cpu" + else: + self.device_fcpe = self.device + self.model_fcpe = spawn_bundled_infer_model(self.device_fcpe) + f0 = self.model_fcpe.infer( + x.to(self.device_fcpe).unsqueeze(0).float(), + sr=16000, + decoder_mode="local_argmax", + threshold=0.006, + ) + f0 *= pow(2, f0_up_key / 12) + return self.get_f0_post(f0) + + def infer( + self, + input_wav: torch.Tensor, + block_frame_16k, + skip_head, + return_length, + f0method, + ) -> np.ndarray: + t1 = ttime() + with torch.no_grad(): + if self.config.is_half: + feats = input_wav.half().view(1, -1) + else: + feats = input_wav.float().view(1, -1) + padding_mask = torch.BoolTensor(feats.shape).to(self.device).fill_(False) + inputs = { + "source": feats, + "padding_mask": padding_mask, + "output_layer": 9 if self.version == "v1" else 12, + } + logits = self.model.extract_features(**inputs) + feats = ( + self.model.final_proj(logits[0]) if self.version == "v1" else logits[0] + ) + feats = torch.cat((feats, feats[:, -1:, :]), 1) + t2 = ttime() + try: + if hasattr(self, "index") and self.index_rate != 0: + npy = feats[0][skip_head // 2 :].cpu().numpy().astype("float32") + score, ix = self.index.search(npy, k=8) + if (ix >= 0).all(): + weight = np.square(1 / score) + weight /= weight.sum(axis=1, keepdims=True) + npy = np.sum( + self.big_npy[ix] * np.expand_dims(weight, axis=2), axis=1 + ) + if self.config.is_half: + npy = npy.astype("float16") + feats[0][skip_head // 2 :] = ( + 
torch.from_numpy(npy).unsqueeze(0).to(self.device) + * self.index_rate + + (1 - self.index_rate) * feats[0][skip_head // 2 :] + ) + else: + printt( + "Invalid index. You MUST use added_xxxx.index but not trained_xxxx.index!" + ) + else: + printt("Index search FAILED or disabled") + except: + traceback.print_exc() + printt("Index search FAILED") + t3 = ttime() + p_len = input_wav.shape[0] // 160 + factor = pow(2, self.formant_shift / 12) + return_length2 = int(np.ceil(return_length * factor)) + if self.if_f0 == 1: + f0_extractor_frame = block_frame_16k + 800 + if f0method == "rmvpe": + f0_extractor_frame = 5120 * ((f0_extractor_frame - 1) // 5120 + 1) - 160 + pitch, pitchf = self.get_f0( + input_wav[-f0_extractor_frame:], self.f0_up_key - self.formant_shift, self.n_cpu, f0method + ) + shift = block_frame_16k // 160 + self.cache_pitch[:-shift] = self.cache_pitch[shift:].clone() + self.cache_pitchf[:-shift] = self.cache_pitchf[shift:].clone() + self.cache_pitch[4 - pitch.shape[0] :] = pitch[3:-1] + self.cache_pitchf[4 - pitch.shape[0] :] = pitchf[3:-1] + cache_pitch = self.cache_pitch[None, -p_len:] + cache_pitchf = self.cache_pitchf[None, -p_len:] * return_length2 / return_length + t4 = ttime() + feats = F.interpolate(feats.permute(0, 2, 1), scale_factor=2).permute(0, 2, 1) + feats = feats[:, :p_len, :] + p_len = torch.LongTensor([p_len]).to(self.device) + sid = torch.LongTensor([0]).to(self.device) + skip_head = torch.LongTensor([skip_head]) + return_length2 = torch.LongTensor([return_length2]) + return_length = torch.LongTensor([return_length]) + with torch.no_grad(): + if self.if_f0 == 1: + infered_audio, _, _ = self.net_g.infer( + feats, + p_len, + cache_pitch, + cache_pitchf, + sid, + skip_head, + return_length, + return_length2, + ) + else: + infered_audio, _, _ = self.net_g.infer( + feats, p_len, sid, skip_head, return_length, return_length2 + ) + infered_audio = infered_audio.squeeze(1).float() + upp_res = int(np.floor(factor * self.tgt_sr // 100)) + if upp_res != self.tgt_sr // 100: + if upp_res not in self.resample_kernel: + self.resample_kernel[upp_res] = Resample( + orig_freq=upp_res, + new_freq=self.tgt_sr // 100, + dtype=torch.float32, + ).to(self.device) + infered_audio = self.resample_kernel[upp_res]( + infered_audio[:, : return_length * upp_res] + ) + t5 = ttime() + printt( + "Spent time: fea = %.3fs, index = %.3fs, f0 = %.3fs, model = %.3fs", + t2 - t1, + t3 - t2, + t4 - t3, + t5 - t4, + ) + return infered_audio.squeeze() diff --git a/infer/lib/slicer2.py b/infer/lib/slicer2.py new file mode 100644 index 0000000000000000000000000000000000000000..7d9d16db55e30c5c732f7fd32a234af026097e13 --- /dev/null +++ b/infer/lib/slicer2.py @@ -0,0 +1,260 @@ +import numpy as np + + +# This function is obtained from librosa. 
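+# (Re-implemented with numpy stride tricks after librosa.feature.rms, so the Slicer class itself carries no librosa dependency.)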
+def get_rms( + y, + frame_length=2048, + hop_length=512, + pad_mode="constant", +): + padding = (int(frame_length // 2), int(frame_length // 2)) + y = np.pad(y, padding, mode=pad_mode) + + axis = -1 + # put our new within-frame axis at the end for now + out_strides = y.strides + tuple([y.strides[axis]]) + # Reduce the shape on the framing axis + x_shape_trimmed = list(y.shape) + x_shape_trimmed[axis] -= frame_length - 1 + out_shape = tuple(x_shape_trimmed) + tuple([frame_length]) + xw = np.lib.stride_tricks.as_strided(y, shape=out_shape, strides=out_strides) + if axis < 0: + target_axis = axis - 1 + else: + target_axis = axis + 1 + xw = np.moveaxis(xw, -1, target_axis) + # Downsample along the target axis + slices = [slice(None)] * xw.ndim + slices[axis] = slice(0, None, hop_length) + x = xw[tuple(slices)] + + # Calculate power + power = np.mean(np.abs(x) ** 2, axis=-2, keepdims=True) + + return np.sqrt(power) + + +class Slicer: + def __init__( + self, + sr: int, + threshold: float = -40.0, + min_length: int = 5000, + min_interval: int = 300, + hop_size: int = 20, + max_sil_kept: int = 5000, + ): + if not min_length >= min_interval >= hop_size: + raise ValueError( + "The following condition must be satisfied: min_length >= min_interval >= hop_size" + ) + if not max_sil_kept >= hop_size: + raise ValueError( + "The following condition must be satisfied: max_sil_kept >= hop_size" + ) + min_interval = sr * min_interval / 1000 + self.threshold = 10 ** (threshold / 20.0) + self.hop_size = round(sr * hop_size / 1000) + self.win_size = min(round(min_interval), 4 * self.hop_size) + self.min_length = round(sr * min_length / 1000 / self.hop_size) + self.min_interval = round(min_interval / self.hop_size) + self.max_sil_kept = round(sr * max_sil_kept / 1000 / self.hop_size) + + def _apply_slice(self, waveform, begin, end): + if len(waveform.shape) > 1: + return waveform[ + :, begin * self.hop_size : min(waveform.shape[1], end * self.hop_size) + ] + else: + return waveform[ + begin * self.hop_size : min(waveform.shape[0], end * self.hop_size) + ] + + # @timeit + def slice(self, waveform): + if len(waveform.shape) > 1: + samples = waveform.mean(axis=0) + else: + samples = waveform + if samples.shape[0] <= self.min_length: + return [waveform] + rms_list = get_rms( + y=samples, frame_length=self.win_size, hop_length=self.hop_size + ).squeeze(0) + sil_tags = [] + silence_start = None + clip_start = 0 + for i, rms in enumerate(rms_list): + # Keep looping while frame is silent. + if rms < self.threshold: + # Record start of silent frames. + if silence_start is None: + silence_start = i + continue + # Keep looping while frame is not silent and silence start has not been recorded. + if silence_start is None: + continue + # Clear recorded silence start if interval is not enough or clip is too short + is_leading_silence = silence_start == 0 and i > self.max_sil_kept + need_slice_middle = ( + i - silence_start >= self.min_interval + and i - clip_start >= self.min_length + ) + if not is_leading_silence and not need_slice_middle: + silence_start = None + continue + # Need slicing. Record the range of silent frames to be removed. 
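+ # Three branches by length of the silent run: a short run (<= max_sil_kept) is kept whole and only split at its quietest frame;
+ # a medium run (<= 2 * max_sil_kept) and a long run both have their middle removed, cutting at quiet frames near each edge
+ # so that roughly max_sil_kept silent frames remain on either side of the cut.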
+ if i - silence_start <= self.max_sil_kept: + pos = rms_list[silence_start : i + 1].argmin() + silence_start + if silence_start == 0: + sil_tags.append((0, pos)) + else: + sil_tags.append((pos, pos)) + clip_start = pos + elif i - silence_start <= self.max_sil_kept * 2: + pos = rms_list[ + i - self.max_sil_kept : silence_start + self.max_sil_kept + 1 + ].argmin() + pos += i - self.max_sil_kept + pos_l = ( + rms_list[ + silence_start : silence_start + self.max_sil_kept + 1 + ].argmin() + + silence_start + ) + pos_r = ( + rms_list[i - self.max_sil_kept : i + 1].argmin() + + i + - self.max_sil_kept + ) + if silence_start == 0: + sil_tags.append((0, pos_r)) + clip_start = pos_r + else: + sil_tags.append((min(pos_l, pos), max(pos_r, pos))) + clip_start = max(pos_r, pos) + else: + pos_l = ( + rms_list[ + silence_start : silence_start + self.max_sil_kept + 1 + ].argmin() + + silence_start + ) + pos_r = ( + rms_list[i - self.max_sil_kept : i + 1].argmin() + + i + - self.max_sil_kept + ) + if silence_start == 0: + sil_tags.append((0, pos_r)) + else: + sil_tags.append((pos_l, pos_r)) + clip_start = pos_r + silence_start = None + # Deal with trailing silence. + total_frames = rms_list.shape[0] + if ( + silence_start is not None + and total_frames - silence_start >= self.min_interval + ): + silence_end = min(total_frames, silence_start + self.max_sil_kept) + pos = rms_list[silence_start : silence_end + 1].argmin() + silence_start + sil_tags.append((pos, total_frames + 1)) + # Apply and return slices. + if len(sil_tags) == 0: + return [waveform] + else: + chunks = [] + if sil_tags[0][0] > 0: + chunks.append(self._apply_slice(waveform, 0, sil_tags[0][0])) + for i in range(len(sil_tags) - 1): + chunks.append( + self._apply_slice(waveform, sil_tags[i][1], sil_tags[i + 1][0]) + ) + if sil_tags[-1][1] < total_frames: + chunks.append( + self._apply_slice(waveform, sil_tags[-1][1], total_frames) + ) + return chunks + + +def main(): + import os.path + from argparse import ArgumentParser + + import librosa + import soundfile + + parser = ArgumentParser() + parser.add_argument("audio", type=str, help="The audio to be sliced") + parser.add_argument( + "--out", type=str, help="Output directory of the sliced audio clips" + ) + parser.add_argument( + "--db_thresh", + type=float, + required=False, + default=-40, + help="The dB threshold for silence detection", + ) + parser.add_argument( + "--min_length", + type=int, + required=False, + default=5000, + help="The minimum milliseconds required for each sliced audio clip", + ) + parser.add_argument( + "--min_interval", + type=int, + required=False, + default=300, + help="The minimum milliseconds for a silence part to be sliced", + ) + parser.add_argument( + "--hop_size", + type=int, + required=False, + default=10, + help="Frame length in milliseconds", + ) + parser.add_argument( + "--max_sil_kept", + type=int, + required=False, + default=500, + help="The maximum silence length kept around the sliced clip, presented in milliseconds", + ) + args = parser.parse_args() + out = args.out + if out is None: + out = os.path.dirname(os.path.abspath(args.audio)) + audio, sr = librosa.load(args.audio, sr=None, mono=False) + slicer = Slicer( + sr=sr, + threshold=args.db_thresh, + min_length=args.min_length, + min_interval=args.min_interval, + hop_size=args.hop_size, + max_sil_kept=args.max_sil_kept, + ) + chunks = slicer.slice(audio) + if not os.path.exists(out): + os.makedirs(out) + for i, chunk in enumerate(chunks): + if len(chunk.shape) > 1: + chunk = chunk.T + soundfile.write( + 
os.path.join( + out, + f"%s_%d.wav" + % (os.path.basename(args.audio).rsplit(".", maxsplit=1)[0], i), + ), + chunk, + sr, + ) + + +if __name__ == "__main__": + main() diff --git a/infer/lib/train/data_utils.py b/infer/lib/train/data_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..9c15bb3c7124612a34217489ff04a6db6614bde0 --- /dev/null +++ b/infer/lib/train/data_utils.py @@ -0,0 +1,517 @@ +import os +import traceback +import logging + +logger = logging.getLogger(__name__) + +import numpy as np +import torch +import torch.utils.data + +from infer.lib.train.mel_processing import spectrogram_torch +from infer.lib.train.utils import load_filepaths_and_text, load_wav_to_torch + + +class TextAudioLoaderMultiNSFsid(torch.utils.data.Dataset): + """ + 1) loads audio, text pairs + 2) normalizes text and converts them to sequences of integers + 3) computes spectrograms from audio files. + """ + + def __init__(self, audiopaths_and_text, hparams): + self.audiopaths_and_text = load_filepaths_and_text(audiopaths_and_text) + self.max_wav_value = hparams.max_wav_value + self.sampling_rate = hparams.sampling_rate + self.filter_length = hparams.filter_length + self.hop_length = hparams.hop_length + self.win_length = hparams.win_length + self.sampling_rate = hparams.sampling_rate + self.min_text_len = getattr(hparams, "min_text_len", 1) + self.max_text_len = getattr(hparams, "max_text_len", 5000) + self._filter() + + def _filter(self): + """ + Filter text & store spec lengths + """ + # Store spectrogram lengths for Bucketing + # wav_length ~= file_size / (wav_channels * Bytes per dim) = file_size / (1 * 2) + # spec_length = wav_length // hop_length + audiopaths_and_text_new = [] + lengths = [] + for audiopath, text, pitch, pitchf, dv in self.audiopaths_and_text: + if self.min_text_len <= len(text) and len(text) <= self.max_text_len: + audiopaths_and_text_new.append([audiopath, text, pitch, pitchf, dv]) + lengths.append(os.path.getsize(audiopath) // (3 * self.hop_length)) + self.audiopaths_and_text = audiopaths_and_text_new + self.lengths = lengths + + def get_sid(self, sid): + sid = torch.LongTensor([int(sid)]) + return sid + + def get_audio_text_pair(self, audiopath_and_text): + # separate filename and text + file = audiopath_and_text[0] + phone = audiopath_and_text[1] + pitch = audiopath_and_text[2] + pitchf = audiopath_and_text[3] + dv = audiopath_and_text[4] + + phone, pitch, pitchf = self.get_labels(phone, pitch, pitchf) + spec, wav = self.get_audio(file) + dv = self.get_sid(dv) + + len_phone = phone.size()[0] + len_spec = spec.size()[-1] + # print(123,phone.shape,pitch.shape,spec.shape) + if len_phone != len_spec: + len_min = min(len_phone, len_spec) + # amor + len_wav = len_min * self.hop_length + + spec = spec[:, :len_min] + wav = wav[:, :len_wav] + + phone = phone[:len_min, :] + pitch = pitch[:len_min] + pitchf = pitchf[:len_min] + + return (spec, wav, phone, pitch, pitchf, dv) + + def get_labels(self, phone, pitch, pitchf): + phone = np.load(phone) + phone = np.repeat(phone, 2, axis=0) + pitch = np.load(pitch) + pitchf = np.load(pitchf) + n_num = min(phone.shape[0], 900) # DistributedBucketSampler + # print(234,phone.shape,pitch.shape) + phone = phone[:n_num, :] + pitch = pitch[:n_num] + pitchf = pitchf[:n_num] + phone = torch.FloatTensor(phone) + pitch = torch.LongTensor(pitch) + pitchf = torch.FloatTensor(pitchf) + return phone, pitch, pitchf + + def get_audio(self, filename): + audio, sampling_rate = load_wav_to_torch(filename) + if sampling_rate != self.sampling_rate: 
+ raise ValueError( + "{} SR doesn't match target {} SR".format( + sampling_rate, self.sampling_rate + ) + ) + audio_norm = audio + # audio_norm = audio / self.max_wav_value + # audio_norm = audio / np.abs(audio).max() + + audio_norm = audio_norm.unsqueeze(0) + spec_filename = filename.replace(".wav", ".spec.pt") + if os.path.exists(spec_filename): + try: + spec = torch.load(spec_filename, weights_only=False) + except: + logger.warning("%s %s", spec_filename, traceback.format_exc()) + spec = spectrogram_torch( + audio_norm, + self.filter_length, + self.sampling_rate, + self.hop_length, + self.win_length, + center=False, + ) + spec = torch.squeeze(spec, 0) + torch.save(spec, spec_filename, _use_new_zipfile_serialization=False) + else: + spec = spectrogram_torch( + audio_norm, + self.filter_length, + self.sampling_rate, + self.hop_length, + self.win_length, + center=False, + ) + spec = torch.squeeze(spec, 0) + torch.save(spec, spec_filename, _use_new_zipfile_serialization=False) + return spec, audio_norm + + def __getitem__(self, index): + return self.get_audio_text_pair(self.audiopaths_and_text[index]) + + def __len__(self): + return len(self.audiopaths_and_text) + + +class TextAudioCollateMultiNSFsid: + """Zero-pads model inputs and targets""" + + def __init__(self, return_ids=False): + self.return_ids = return_ids + + def __call__(self, batch): + """Collate's training batch from normalized text and aduio + PARAMS + ------ + batch: [text_normalized, spec_normalized, wav_normalized] + """ + # Right zero-pad all one-hot text sequences to max input length + _, ids_sorted_decreasing = torch.sort( + torch.LongTensor([x[0].size(1) for x in batch]), dim=0, descending=True + ) + + max_spec_len = max([x[0].size(1) for x in batch]) + max_wave_len = max([x[1].size(1) for x in batch]) + spec_lengths = torch.LongTensor(len(batch)) + wave_lengths = torch.LongTensor(len(batch)) + spec_padded = torch.FloatTensor(len(batch), batch[0][0].size(0), max_spec_len) + wave_padded = torch.FloatTensor(len(batch), 1, max_wave_len) + spec_padded.zero_() + wave_padded.zero_() + + max_phone_len = max([x[2].size(0) for x in batch]) + phone_lengths = torch.LongTensor(len(batch)) + phone_padded = torch.FloatTensor( + len(batch), max_phone_len, batch[0][2].shape[1] + ) # (spec, wav, phone, pitch) + pitch_padded = torch.LongTensor(len(batch), max_phone_len) + pitchf_padded = torch.FloatTensor(len(batch), max_phone_len) + phone_padded.zero_() + pitch_padded.zero_() + pitchf_padded.zero_() + # dv = torch.FloatTensor(len(batch), 256)#gin=256 + sid = torch.LongTensor(len(batch)) + + for i in range(len(ids_sorted_decreasing)): + row = batch[ids_sorted_decreasing[i]] + + spec = row[0] + spec_padded[i, :, : spec.size(1)] = spec + spec_lengths[i] = spec.size(1) + + wave = row[1] + wave_padded[i, :, : wave.size(1)] = wave + wave_lengths[i] = wave.size(1) + + phone = row[2] + phone_padded[i, : phone.size(0), :] = phone + phone_lengths[i] = phone.size(0) + + pitch = row[3] + pitch_padded[i, : pitch.size(0)] = pitch + pitchf = row[4] + pitchf_padded[i, : pitchf.size(0)] = pitchf + + # dv[i] = row[5] + sid[i] = row[5] + + return ( + phone_padded, + phone_lengths, + pitch_padded, + pitchf_padded, + spec_padded, + spec_lengths, + wave_padded, + wave_lengths, + # dv + sid, + ) + + +class TextAudioLoader(torch.utils.data.Dataset): + """ + 1) loads audio, text pairs + 2) normalizes text and converts them to sequences of integers + 3) computes spectrograms from audio files. 
+ """ + + def __init__(self, audiopaths_and_text, hparams): + self.audiopaths_and_text = load_filepaths_and_text(audiopaths_and_text) + self.max_wav_value = hparams.max_wav_value + self.sampling_rate = hparams.sampling_rate + self.filter_length = hparams.filter_length + self.hop_length = hparams.hop_length + self.win_length = hparams.win_length + self.sampling_rate = hparams.sampling_rate + self.min_text_len = getattr(hparams, "min_text_len", 1) + self.max_text_len = getattr(hparams, "max_text_len", 5000) + self._filter() + + def _filter(self): + """ + Filter text & store spec lengths + """ + # Store spectrogram lengths for Bucketing + # wav_length ~= file_size / (wav_channels * Bytes per dim) = file_size / (1 * 2) + # spec_length = wav_length // hop_length + audiopaths_and_text_new = [] + lengths = [] + for audiopath, text, dv in self.audiopaths_and_text: + if self.min_text_len <= len(text) and len(text) <= self.max_text_len: + audiopaths_and_text_new.append([audiopath, text, dv]) + lengths.append(os.path.getsize(audiopath) // (3 * self.hop_length)) + self.audiopaths_and_text = audiopaths_and_text_new + self.lengths = lengths + + def get_sid(self, sid): + sid = torch.LongTensor([int(sid)]) + return sid + + def get_audio_text_pair(self, audiopath_and_text): + # separate filename and text + file = audiopath_and_text[0] + phone = audiopath_and_text[1] + dv = audiopath_and_text[2] + + phone = self.get_labels(phone) + spec, wav = self.get_audio(file) + dv = self.get_sid(dv) + + len_phone = phone.size()[0] + len_spec = spec.size()[-1] + if len_phone != len_spec: + len_min = min(len_phone, len_spec) + len_wav = len_min * self.hop_length + spec = spec[:, :len_min] + wav = wav[:, :len_wav] + phone = phone[:len_min, :] + return (spec, wav, phone, dv) + + def get_labels(self, phone): + phone = np.load(phone) + phone = np.repeat(phone, 2, axis=0) + n_num = min(phone.shape[0], 900) # DistributedBucketSampler + phone = phone[:n_num, :] + phone = torch.FloatTensor(phone) + return phone + + def get_audio(self, filename): + audio, sampling_rate = load_wav_to_torch(filename) + if sampling_rate != self.sampling_rate: + raise ValueError( + "{} SR doesn't match target {} SR".format( + sampling_rate, self.sampling_rate + ) + ) + audio_norm = audio + # audio_norm = audio / self.max_wav_value + # audio_norm = audio / np.abs(audio).max() + + audio_norm = audio_norm.unsqueeze(0) + spec_filename = filename.replace(".wav", ".spec.pt") + if os.path.exists(spec_filename): + try: + spec = torch.load(spec_filename, weights_only=False) + except: + logger.warning("%s %s", spec_filename, traceback.format_exc()) + spec = spectrogram_torch( + audio_norm, + self.filter_length, + self.sampling_rate, + self.hop_length, + self.win_length, + center=False, + ) + spec = torch.squeeze(spec, 0) + torch.save(spec, spec_filename, _use_new_zipfile_serialization=False) + else: + spec = spectrogram_torch( + audio_norm, + self.filter_length, + self.sampling_rate, + self.hop_length, + self.win_length, + center=False, + ) + spec = torch.squeeze(spec, 0) + torch.save(spec, spec_filename, _use_new_zipfile_serialization=False) + return spec, audio_norm + + def __getitem__(self, index): + return self.get_audio_text_pair(self.audiopaths_and_text[index]) + + def __len__(self): + return len(self.audiopaths_and_text) + + +class TextAudioCollate: + """Zero-pads model inputs and targets""" + + def __init__(self, return_ids=False): + self.return_ids = return_ids + + def __call__(self, batch): + """Collate's training batch from normalized text and aduio 
+ PARAMS + ------ + batch: [text_normalized, spec_normalized, wav_normalized] + """ + # Right zero-pad all one-hot text sequences to max input length + _, ids_sorted_decreasing = torch.sort( + torch.LongTensor([x[0].size(1) for x in batch]), dim=0, descending=True + ) + + max_spec_len = max([x[0].size(1) for x in batch]) + max_wave_len = max([x[1].size(1) for x in batch]) + spec_lengths = torch.LongTensor(len(batch)) + wave_lengths = torch.LongTensor(len(batch)) + spec_padded = torch.FloatTensor(len(batch), batch[0][0].size(0), max_spec_len) + wave_padded = torch.FloatTensor(len(batch), 1, max_wave_len) + spec_padded.zero_() + wave_padded.zero_() + + max_phone_len = max([x[2].size(0) for x in batch]) + phone_lengths = torch.LongTensor(len(batch)) + phone_padded = torch.FloatTensor( + len(batch), max_phone_len, batch[0][2].shape[1] + ) + phone_padded.zero_() + sid = torch.LongTensor(len(batch)) + + for i in range(len(ids_sorted_decreasing)): + row = batch[ids_sorted_decreasing[i]] + + spec = row[0] + spec_padded[i, :, : spec.size(1)] = spec + spec_lengths[i] = spec.size(1) + + wave = row[1] + wave_padded[i, :, : wave.size(1)] = wave + wave_lengths[i] = wave.size(1) + + phone = row[2] + phone_padded[i, : phone.size(0), :] = phone + phone_lengths[i] = phone.size(0) + + sid[i] = row[3] + + return ( + phone_padded, + phone_lengths, + spec_padded, + spec_lengths, + wave_padded, + wave_lengths, + sid, + ) + + +class DistributedBucketSampler(torch.utils.data.distributed.DistributedSampler): + """ + Maintain similar input lengths in a batch. + Length groups are specified by boundaries. + Ex) boundaries = [b1, b2, b3] -> any batch is included either {x | b1 < length(x) <=b2} or {x | b2 < length(x) <= b3}. + + It removes samples which are not included in the boundaries. + Ex) boundaries = [b1, b2, b3] -> any x s.t. length(x) <= b1 or length(x) > b3 are discarded. 
+ """ + + def __init__( + self, + dataset, + batch_size, + boundaries, + num_replicas=None, + rank=None, + shuffle=True, + ): + super().__init__(dataset, num_replicas=num_replicas, rank=rank, shuffle=shuffle) + self.lengths = dataset.lengths + self.batch_size = batch_size + self.boundaries = boundaries + + self.buckets, self.num_samples_per_bucket = self._create_buckets() + self.total_size = sum(self.num_samples_per_bucket) + self.num_samples = self.total_size // self.num_replicas + + def _create_buckets(self): + buckets = [[] for _ in range(len(self.boundaries) - 1)] + for i in range(len(self.lengths)): + length = self.lengths[i] + idx_bucket = self._bisect(length) + if idx_bucket != -1: + buckets[idx_bucket].append(i) + + for i in range(len(buckets) - 1, -1, -1): # + if len(buckets[i]) == 0: + buckets.pop(i) + self.boundaries.pop(i + 1) + + num_samples_per_bucket = [] + for i in range(len(buckets)): + len_bucket = len(buckets[i]) + total_batch_size = self.num_replicas * self.batch_size + rem = ( + total_batch_size - (len_bucket % total_batch_size) + ) % total_batch_size + num_samples_per_bucket.append(len_bucket + rem) + return buckets, num_samples_per_bucket + + def __iter__(self): + # deterministically shuffle based on epoch + g = torch.Generator() + g.manual_seed(self.epoch) + + indices = [] + if self.shuffle: + for bucket in self.buckets: + indices.append(torch.randperm(len(bucket), generator=g).tolist()) + else: + for bucket in self.buckets: + indices.append(list(range(len(bucket)))) + + batches = [] + for i in range(len(self.buckets)): + bucket = self.buckets[i] + len_bucket = len(bucket) + ids_bucket = indices[i] + num_samples_bucket = self.num_samples_per_bucket[i] + + # add extra samples to make it evenly divisible + rem = num_samples_bucket - len_bucket + ids_bucket = ( + ids_bucket + + ids_bucket * (rem // len_bucket) + + ids_bucket[: (rem % len_bucket)] + ) + + # subsample + ids_bucket = ids_bucket[self.rank :: self.num_replicas] + + # batching + for j in range(len(ids_bucket) // self.batch_size): + batch = [ + bucket[idx] + for idx in ids_bucket[ + j * self.batch_size : (j + 1) * self.batch_size + ] + ] + batches.append(batch) + + if self.shuffle: + batch_ids = torch.randperm(len(batches), generator=g).tolist() + batches = [batches[i] for i in batch_ids] + self.batches = batches + + assert len(self.batches) * self.batch_size == self.num_samples + return iter(self.batches) + + def _bisect(self, x, lo=0, hi=None): + if hi is None: + hi = len(self.boundaries) - 1 + + if hi > lo: + mid = (hi + lo) // 2 + if self.boundaries[mid] < x and x <= self.boundaries[mid + 1]: + return mid + elif x <= self.boundaries[mid]: + return self._bisect(x, lo, mid) + else: + return self._bisect(x, mid + 1, hi) + else: + return -1 + + def __len__(self): + return self.num_samples // self.batch_size diff --git a/infer/lib/train/losses.py b/infer/lib/train/losses.py new file mode 100644 index 0000000000000000000000000000000000000000..aa7bd81cf596884a8b33e802ae49254d7810a860 --- /dev/null +++ b/infer/lib/train/losses.py @@ -0,0 +1,58 @@ +import torch + + +def feature_loss(fmap_r, fmap_g): + loss = 0 + for dr, dg in zip(fmap_r, fmap_g): + for rl, gl in zip(dr, dg): + rl = rl.float().detach() + gl = gl.float() + loss += torch.mean(torch.abs(rl - gl)) + + return loss * 2 + + +def discriminator_loss(disc_real_outputs, disc_generated_outputs): + loss = 0 + r_losses = [] + g_losses = [] + for dr, dg in zip(disc_real_outputs, disc_generated_outputs): + dr = dr.float() + dg = dg.float() + r_loss = torch.mean((1 
- dr) ** 2) + g_loss = torch.mean(dg**2) + loss += r_loss + g_loss + r_losses.append(r_loss.item()) + g_losses.append(g_loss.item()) + + return loss, r_losses, g_losses + + +def generator_loss(disc_outputs): + loss = 0 + gen_losses = [] + for dg in disc_outputs: + dg = dg.float() + l = torch.mean((1 - dg) ** 2) + gen_losses.append(l) + loss += l + + return loss, gen_losses + + +def kl_loss(z_p, logs_q, m_p, logs_p, z_mask): + """ + z_p, logs_q: [b, h, t_t] + m_p, logs_p: [b, h, t_t] + """ + z_p = z_p.float() + logs_q = logs_q.float() + m_p = m_p.float() + logs_p = logs_p.float() + z_mask = z_mask.float() + + kl = logs_p - logs_q - 0.5 + kl += 0.5 * ((z_p - m_p) ** 2) * torch.exp(-2.0 * logs_p) + kl = torch.sum(kl * z_mask) + l = kl / torch.sum(z_mask) + return l diff --git a/infer/lib/train/mel_processing.py b/infer/lib/train/mel_processing.py new file mode 100644 index 0000000000000000000000000000000000000000..3751f1eab1ea8137088f2f7d7c8294190403b4ce --- /dev/null +++ b/infer/lib/train/mel_processing.py @@ -0,0 +1,127 @@ +import torch +import torch.utils.data +from librosa.filters import mel as librosa_mel_fn +import logging + +logger = logging.getLogger(__name__) + +MAX_WAV_VALUE = 32768.0 + + +def dynamic_range_compression_torch(x, C=1, clip_val=1e-5): + """ + PARAMS + ------ + C: compression factor + """ + return torch.log(torch.clamp(x, min=clip_val) * C) + + +def dynamic_range_decompression_torch(x, C=1): + """ + PARAMS + ------ + C: compression factor used to compress + """ + return torch.exp(x) / C + + +def spectral_normalize_torch(magnitudes): + return dynamic_range_compression_torch(magnitudes) + + +def spectral_de_normalize_torch(magnitudes): + return dynamic_range_decompression_torch(magnitudes) + + +# Reusable banks +mel_basis = {} +hann_window = {} + + +def spectrogram_torch(y, n_fft, sampling_rate, hop_size, win_size, center=False): + """Convert waveform into Linear-frequency Linear-amplitude spectrogram. 
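+ The Hann window is created once per (win_size, dtype, device) combination and cached in the module-level hann_window dict.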
+ + Args: + y :: (B, T) - Audio waveforms + n_fft + sampling_rate + hop_size + win_size + center + Returns: + :: (B, Freq, Frame) - Linear-frequency Linear-amplitude spectrogram + """ + + # Window - Cache if needed + global hann_window + dtype_device = str(y.dtype) + "_" + str(y.device) + wnsize_dtype_device = str(win_size) + "_" + dtype_device + if wnsize_dtype_device not in hann_window: + hann_window[wnsize_dtype_device] = torch.hann_window(win_size).to( + dtype=y.dtype, device=y.device + ) + + # Padding + y = torch.nn.functional.pad( + y.unsqueeze(1), + (int((n_fft - hop_size) / 2), int((n_fft - hop_size) / 2)), + mode="reflect", + ) + y = y.squeeze(1) + + # Complex Spectrogram :: (B, T) -> (B, Freq, Frame, RealComplex=2) + spec = torch.stft( + y, + n_fft, + hop_length=hop_size, + win_length=win_size, + window=hann_window[wnsize_dtype_device], + center=center, + pad_mode="reflect", + normalized=False, + onesided=True, + return_complex=True, + ) + + # Linear-frequency Linear-amplitude spectrogram :: (B, Freq, Frame, RealComplex=2) -> (B, Freq, Frame) + spec = torch.sqrt(spec.real.pow(2) + spec.imag.pow(2) + 1e-6) + return spec + + +def spec_to_mel_torch(spec, n_fft, num_mels, sampling_rate, fmin, fmax): + # MelBasis - Cache if needed + global mel_basis + dtype_device = str(spec.dtype) + "_" + str(spec.device) + fmax_dtype_device = str(fmax) + "_" + dtype_device + if fmax_dtype_device not in mel_basis: + mel = librosa_mel_fn( + sr=sampling_rate, n_fft=n_fft, n_mels=num_mels, fmin=fmin, fmax=fmax + ) + mel_basis[fmax_dtype_device] = torch.from_numpy(mel).to( + dtype=spec.dtype, device=spec.device + ) + + # Mel-frequency Log-amplitude spectrogram :: (B, Freq=num_mels, Frame) + melspec = torch.matmul(mel_basis[fmax_dtype_device], spec) + melspec = spectral_normalize_torch(melspec) + return melspec + + +def mel_spectrogram_torch( + y, n_fft, num_mels, sampling_rate, hop_size, win_size, fmin, fmax, center=False +): + """Convert waveform into Mel-frequency Log-amplitude spectrogram. 
+ + Args: + y :: (B, T) - Waveforms + Returns: + melspec :: (B, Freq, Frame) - Mel-frequency Log-amplitude spectrogram + """ + # Linear-frequency Linear-amplitude spectrogram :: (B, T) -> (B, Freq, Frame) + spec = spectrogram_torch(y, n_fft, sampling_rate, hop_size, win_size, center) + + # Mel-frequency Log-amplitude spectrogram :: (B, Freq, Frame) -> (B, Freq=num_mels, Frame) + melspec = spec_to_mel_torch(spec, n_fft, num_mels, sampling_rate, fmin, fmax) + + return melspec diff --git a/infer/lib/train/process_ckpt.py b/infer/lib/train/process_ckpt.py new file mode 100644 index 0000000000000000000000000000000000000000..632dd784bfc083df396fe8490758fa0c1dc1aa55 --- /dev/null +++ b/infer/lib/train/process_ckpt.py @@ -0,0 +1,261 @@ +import os +import sys +import traceback +from collections import OrderedDict + +import torch + +from i18n.i18n import I18nAuto + +i18n = I18nAuto() + + +def savee(ckpt, sr, if_f0, name, epoch, version, hps): + try: + opt = OrderedDict() + opt["weight"] = {} + for key in ckpt.keys(): + if "enc_q" in key: + continue + opt["weight"][key] = ckpt[key].half() + opt["config"] = [ + hps.data.filter_length // 2 + 1, + 32, + hps.model.inter_channels, + hps.model.hidden_channels, + hps.model.filter_channels, + hps.model.n_heads, + hps.model.n_layers, + hps.model.kernel_size, + hps.model.p_dropout, + hps.model.resblock, + hps.model.resblock_kernel_sizes, + hps.model.resblock_dilation_sizes, + hps.model.upsample_rates, + hps.model.upsample_initial_channel, + hps.model.upsample_kernel_sizes, + hps.model.spk_embed_dim, + hps.model.gin_channels, + hps.data.sampling_rate, + ] + opt["info"] = "%sepoch" % epoch + opt["sr"] = sr + opt["f0"] = if_f0 + opt["version"] = version + torch.save(opt, "assets/weights/%s.pth" % name) + return "Success." + except: + return traceback.format_exc() + + +def show_info(path): + try: + a = torch.load(path, map_location="cpu", weights_only=False) + return "模型信息:%s\n采样率:%s\n模型是否输入音高引导:%s\n版本:%s" % ( + a.get("info", "None"), + a.get("sr", "None"), + a.get("f0", "None"), + a.get("version", "None"), + ) + except: + return traceback.format_exc() + + +def extract_small_model(path, name, sr, if_f0, info, version): + try: + ckpt = torch.load(path, map_location="cpu", weights_only=False) + if "model" in ckpt: + ckpt = ckpt["model"] + opt = OrderedDict() + opt["weight"] = {} + for key in ckpt.keys(): + if "enc_q" in key: + continue + opt["weight"][key] = ckpt[key].half() + if sr == "40k": + opt["config"] = [ + 1025, + 32, + 192, + 192, + 768, + 2, + 6, + 3, + 0, + "1", + [3, 7, 11], + [[1, 3, 5], [1, 3, 5], [1, 3, 5]], + [10, 10, 2, 2], + 512, + [16, 16, 4, 4], + 109, + 256, + 40000, + ] + elif sr == "48k": + if version == "v1": + opt["config"] = [ + 1025, + 32, + 192, + 192, + 768, + 2, + 6, + 3, + 0, + "1", + [3, 7, 11], + [[1, 3, 5], [1, 3, 5], [1, 3, 5]], + [10, 6, 2, 2, 2], + 512, + [16, 16, 4, 4, 4], + 109, + 256, + 48000, + ] + else: + opt["config"] = [ + 1025, + 32, + 192, + 192, + 768, + 2, + 6, + 3, + 0, + "1", + [3, 7, 11], + [[1, 3, 5], [1, 3, 5], [1, 3, 5]], + [12, 10, 2, 2], + 512, + [24, 20, 4, 4], + 109, + 256, + 48000, + ] + elif sr == "32k": + if version == "v1": + opt["config"] = [ + 513, + 32, + 192, + 192, + 768, + 2, + 6, + 3, + 0, + "1", + [3, 7, 11], + [[1, 3, 5], [1, 3, 5], [1, 3, 5]], + [10, 4, 2, 2, 2], + 512, + [16, 16, 4, 4, 4], + 109, + 256, + 32000, + ] + else: + opt["config"] = [ + 513, + 32, + 192, + 192, + 768, + 2, + 6, + 3, + 0, + "1", + [3, 7, 11], + [[1, 3, 5], [1, 3, 5], [1, 3, 5]], + [10, 8, 2, 2], + 512, + [20, 16, 4, 4], 
+ 109, + 256, + 32000, + ] + if info == "": + info = "Extracted model." + opt["info"] = info + opt["version"] = version + opt["sr"] = sr + opt["f0"] = int(if_f0) + torch.save(opt, "assets/weights/%s.pth" % name) + return "Success." + except: + return traceback.format_exc() + + +def change_info(path, info, name): + try: + ckpt = torch.load(path, map_location="cpu", weights_only=False) + ckpt["info"] = info + if name == "": + name = os.path.basename(path) + torch.save(ckpt, "assets/weights/%s" % name) + return "Success." + except: + return traceback.format_exc() + + +def merge(path1, path2, alpha1, sr, f0, info, name, version): + try: + + def extract(ckpt): + a = ckpt["model"] + opt = OrderedDict() + opt["weight"] = {} + for key in a.keys(): + if "enc_q" in key: + continue + opt["weight"][key] = a[key] + return opt + + ckpt1 = torch.load(path1, map_location="cpu", weights_only=False) + ckpt2 = torch.load(path2, map_location="cpu", weights_only=False) + cfg = ckpt1["config"] + if "model" in ckpt1: + ckpt1 = extract(ckpt1) + else: + ckpt1 = ckpt1["weight"] + if "model" in ckpt2: + ckpt2 = extract(ckpt2) + else: + ckpt2 = ckpt2["weight"] + if sorted(list(ckpt1.keys())) != sorted(list(ckpt2.keys())): + return "Fail to merge the models. The model architectures are not the same." + opt = OrderedDict() + opt["weight"] = {} + for key in ckpt1.keys(): + # try: + if key == "emb_g.weight" and ckpt1[key].shape != ckpt2[key].shape: + min_shape0 = min(ckpt1[key].shape[0], ckpt2[key].shape[0]) + opt["weight"][key] = ( + alpha1 * (ckpt1[key][:min_shape0].float()) + + (1 - alpha1) * (ckpt2[key][:min_shape0].float()) + ).half() + else: + opt["weight"][key] = ( + alpha1 * (ckpt1[key].float()) + (1 - alpha1) * (ckpt2[key].float()) + ).half() + # except: + # pdb.set_trace() + opt["config"] = cfg + """ + if(sr=="40k"):opt["config"] = [1025, 32, 192, 192, 768, 2, 6, 3, 0, "1", [3, 7, 11], [[1, 3, 5], [1, 3, 5], [1, 3, 5]], [10, 10, 2, 2], 512, [16, 16, 4, 4,4], 109, 256, 40000] + elif(sr=="48k"):opt["config"] = [1025, 32, 192, 192, 768, 2, 6, 3, 0, "1", [3, 7, 11], [[1, 3, 5], [1, 3, 5], [1, 3, 5]], [10,6,2,2,2], 512, [16, 16, 4, 4], 109, 256, 48000] + elif(sr=="32k"):opt["config"] = [513, 32, 192, 192, 768, 2, 6, 3, 0, "1", [3, 7, 11], [[1, 3, 5], [1, 3, 5], [1, 3, 5]], [10, 4, 2, 2, 2], 512, [16, 16, 4, 4,4], 109, 256, 32000] + """ + opt["sr"] = sr + opt["f0"] = 1 if f0 == i18n("是") else 0 + opt["version"] = version + opt["info"] = info + torch.save(opt, "assets/weights/%s.pth" % name) + return "Success." 
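+        # Any failure above (missing keys, mismatched architectures, I/O errors) is
+        # caught below and returned to the caller as a traceback string instead of raising.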
+ except: + return traceback.format_exc() diff --git a/infer/lib/train/utils.py b/infer/lib/train/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..5037d3ff1375ecf6a20961ad66056f12dbe6b161 --- /dev/null +++ b/infer/lib/train/utils.py @@ -0,0 +1,483 @@ +import argparse +import glob +import json +import logging +import os +import subprocess +import sys +import shutil + +import numpy as np +import torch +from scipy.io.wavfile import read + +MATPLOTLIB_FLAG = False + +logging.basicConfig(stream=sys.stdout, level=logging.DEBUG) +logger = logging + + +def load_checkpoint_d(checkpoint_path, combd, sbd, optimizer=None, load_opt=1): + assert os.path.isfile(checkpoint_path) + checkpoint_dict = torch.load(checkpoint_path, map_location="cpu", weights_only=False) + + ################## + def go(model, bkey): + saved_state_dict = checkpoint_dict[bkey] + if hasattr(model, "module"): + state_dict = model.module.state_dict() + else: + state_dict = model.state_dict() + new_state_dict = {} + for k, v in state_dict.items(): # 模型需要的shape + try: + new_state_dict[k] = saved_state_dict[k] + if saved_state_dict[k].shape != state_dict[k].shape: + logger.warning( + "shape-%s-mismatch. need: %s, get: %s", + k, + state_dict[k].shape, + saved_state_dict[k].shape, + ) # + raise KeyError + except: + # logger.info(traceback.format_exc()) + logger.info("%s is not in the checkpoint", k) # pretrain缺失的 + new_state_dict[k] = v # 模型自带的随机值 + if hasattr(model, "module"): + model.module.load_state_dict(new_state_dict, strict=False) + else: + model.load_state_dict(new_state_dict, strict=False) + return model + + go(combd, "combd") + model = go(sbd, "sbd") + ############# + logger.info("Loaded model weights") + + iteration = checkpoint_dict["iteration"] + learning_rate = checkpoint_dict["learning_rate"] + if ( + optimizer is not None and load_opt == 1 + ): ###加载不了,如果是空的的话,重新初始化,可能还会影响lr时间表的更新,因此在train文件最外围catch + # try: + optimizer.load_state_dict(checkpoint_dict["optimizer"]) + # except: + # traceback.print_exc() + logger.info("Loaded checkpoint '{}' (epoch {})".format(checkpoint_path, iteration)) + return model, optimizer, learning_rate, iteration + + +# def load_checkpoint(checkpoint_path, model, optimizer=None): +# assert os.path.isfile(checkpoint_path) +# checkpoint_dict = torch.load(checkpoint_path, map_location='cpu') +# iteration = checkpoint_dict['iteration'] +# learning_rate = checkpoint_dict['learning_rate'] +# if optimizer is not None: +# optimizer.load_state_dict(checkpoint_dict['optimizer']) +# # print(1111) +# saved_state_dict = checkpoint_dict['model'] +# # print(1111) +# +# if hasattr(model, 'module'): +# state_dict = model.module.state_dict() +# else: +# state_dict = model.state_dict() +# new_state_dict= {} +# for k, v in state_dict.items(): +# try: +# new_state_dict[k] = saved_state_dict[k] +# except: +# logger.info("%s is not in the checkpoint" % k) +# new_state_dict[k] = v +# if hasattr(model, 'module'): +# model.module.load_state_dict(new_state_dict) +# else: +# model.load_state_dict(new_state_dict) +# logger.info("Loaded checkpoint '{}' (epoch {})" .format( +# checkpoint_path, iteration)) +# return model, optimizer, learning_rate, iteration +def load_checkpoint(checkpoint_path, model, optimizer=None, load_opt=1): + assert os.path.isfile(checkpoint_path) + checkpoint_dict = torch.load(checkpoint_path, map_location="cpu", weights_only=False) + + saved_state_dict = checkpoint_dict["model"] + if hasattr(model, "module"): + state_dict = model.module.state_dict() + else: + state_dict = 
model.state_dict() + new_state_dict = {} + for k, v in state_dict.items(): # 模型需要的shape + try: + new_state_dict[k] = saved_state_dict[k] + if saved_state_dict[k].shape != state_dict[k].shape: + logger.warning( + "shape-%s-mismatch|need-%s|get-%s", + k, + state_dict[k].shape, + saved_state_dict[k].shape, + ) # + raise KeyError + except: + # logger.info(traceback.format_exc()) + logger.info("%s is not in the checkpoint", k) # pretrain缺失的 + new_state_dict[k] = v # 模型自带的随机值 + if hasattr(model, "module"): + model.module.load_state_dict(new_state_dict, strict=False) + else: + model.load_state_dict(new_state_dict, strict=False) + logger.info("Loaded model weights") + + iteration = checkpoint_dict["iteration"] + learning_rate = checkpoint_dict["learning_rate"] + if ( + optimizer is not None and load_opt == 1 + ): ###加载不了,如果是空的的话,重新初始化,可能还会影响lr时间表的更新,因此在train文件最外围catch + # try: + optimizer.load_state_dict(checkpoint_dict["optimizer"]) + # except: + # traceback.print_exc() + logger.info("Loaded checkpoint '{}' (epoch {})".format(checkpoint_path, iteration)) + return model, optimizer, learning_rate, iteration + + +def save_checkpoint(model, optimizer, learning_rate, iteration, checkpoint_path): + logger.info( + "Saving model and optimizer state at epoch {} to {}".format( + iteration, checkpoint_path + ) + ) + if hasattr(model, "module"): + state_dict = model.module.state_dict() + else: + state_dict = model.state_dict() + torch.save( + { + "model": state_dict, + "iteration": iteration, + "optimizer": optimizer.state_dict(), + "learning_rate": learning_rate, + }, + checkpoint_path, + ) + + +def save_checkpoint_d(combd, sbd, optimizer, learning_rate, iteration, checkpoint_path): + logger.info( + "Saving model and optimizer state at epoch {} to {}".format( + iteration, checkpoint_path + ) + ) + if hasattr(combd, "module"): + state_dict_combd = combd.module.state_dict() + else: + state_dict_combd = combd.state_dict() + if hasattr(sbd, "module"): + state_dict_sbd = sbd.module.state_dict() + else: + state_dict_sbd = sbd.state_dict() + torch.save( + { + "combd": state_dict_combd, + "sbd": state_dict_sbd, + "iteration": iteration, + "optimizer": optimizer.state_dict(), + "learning_rate": learning_rate, + }, + checkpoint_path, + ) + + +def summarize( + writer, + global_step, + scalars={}, + histograms={}, + images={}, + audios={}, + audio_sampling_rate=22050, +): + for k, v in scalars.items(): + writer.add_scalar(k, v, global_step) + for k, v in histograms.items(): + writer.add_histogram(k, v, global_step) + for k, v in images.items(): + writer.add_image(k, v, global_step, dataformats="HWC") + for k, v in audios.items(): + writer.add_audio(k, v, global_step, audio_sampling_rate) + + +def latest_checkpoint_path(dir_path, regex="G_*.pth"): + f_list = glob.glob(os.path.join(dir_path, regex)) + f_list.sort(key=lambda f: int("".join(filter(str.isdigit, f)))) + x = f_list[-1] + logger.debug(x) + return x + + +def plot_spectrogram_to_numpy(spectrogram): + global MATPLOTLIB_FLAG + if not MATPLOTLIB_FLAG: + import matplotlib + + matplotlib.use("Agg") + MATPLOTLIB_FLAG = True + mpl_logger = logging.getLogger("matplotlib") + mpl_logger.setLevel(logging.WARNING) + import matplotlib.pylab as plt + import numpy as np + + fig, ax = plt.subplots(figsize=(10, 2)) + im = ax.imshow(spectrogram, aspect="auto", origin="lower", interpolation="none") + plt.colorbar(im, ax=ax) + plt.xlabel("Frames") + plt.ylabel("Channels") + plt.tight_layout() + + fig.canvas.draw() + data = np.fromstring(fig.canvas.tostring_rgb(), dtype=np.uint8, 
sep="") + data = data.reshape(fig.canvas.get_width_height()[::-1] + (3,)) + plt.close() + return data + + +def plot_alignment_to_numpy(alignment, info=None): + global MATPLOTLIB_FLAG + if not MATPLOTLIB_FLAG: + import matplotlib + + matplotlib.use("Agg") + MATPLOTLIB_FLAG = True + mpl_logger = logging.getLogger("matplotlib") + mpl_logger.setLevel(logging.WARNING) + import matplotlib.pylab as plt + import numpy as np + + fig, ax = plt.subplots(figsize=(6, 4)) + im = ax.imshow( + alignment.transpose(), aspect="auto", origin="lower", interpolation="none" + ) + fig.colorbar(im, ax=ax) + xlabel = "Decoder timestep" + if info is not None: + xlabel += "\n\n" + info + plt.xlabel(xlabel) + plt.ylabel("Encoder timestep") + plt.tight_layout() + + fig.canvas.draw() + data = np.fromstring(fig.canvas.tostring_rgb(), dtype=np.uint8, sep="") + data = data.reshape(fig.canvas.get_width_height()[::-1] + (3,)) + plt.close() + return data + + +def load_wav_to_torch(full_path): + sampling_rate, data = read(full_path) + return torch.FloatTensor(data.astype(np.float32)), sampling_rate + + +def load_filepaths_and_text(filename, split="|"): + try: + with open(filename, encoding="utf-8") as f: + filepaths_and_text = [line.strip().split(split) for line in f] + except UnicodeDecodeError: + with open(filename) as f: + filepaths_and_text = [line.strip().split(split) for line in f] + + return filepaths_and_text + + +def get_hparams(init=True): + """ + todo: + 结尾七人组: + 保存频率、总epoch done + bs done + pretrainG、pretrainD done + 卡号:os.en["CUDA_VISIBLE_DEVICES"] done + if_latest done + 模型:if_f0 done + 采样率:自动选择config done + 是否缓存数据集进GPU:if_cache_data_in_gpu done + + -m: + 自动决定training_files路径,改掉train_nsf_load_pretrain.py里的hps.data.training_files done + -c不要了 + """ + parser = argparse.ArgumentParser() + parser.add_argument( + "-se", + "--save_every_epoch", + type=int, + required=True, + help="checkpoint save frequency (epoch)", + ) + parser.add_argument( + "-te", "--total_epoch", type=int, required=True, help="total_epoch" + ) + parser.add_argument( + "-pg", "--pretrainG", type=str, default="", help="Pretrained Generator path" + ) + parser.add_argument( + "-pd", "--pretrainD", type=str, default="", help="Pretrained Discriminator path" + ) + parser.add_argument("-g", "--gpus", type=str, default="0", help="split by -") + parser.add_argument( + "-bs", "--batch_size", type=int, required=True, help="batch size" + ) + parser.add_argument( + "-e", "--experiment_dir", type=str, required=True, help="experiment dir" + ) # -m + parser.add_argument( + "-sr", "--sample_rate", type=str, required=True, help="sample rate, 32k/40k/48k" + ) + parser.add_argument( + "-sw", + "--save_every_weights", + type=str, + default="0", + help="save the extracted model in weights directory when saving checkpoints", + ) + parser.add_argument( + "-v", "--version", type=str, required=True, help="model version" + ) + parser.add_argument( + "-f0", + "--if_f0", + type=int, + required=True, + help="use f0 as one of the inputs of the model, 1 or 0", + ) + parser.add_argument( + "-l", + "--if_latest", + type=int, + required=True, + help="if only save the latest G/D pth file, 1 or 0", + ) + parser.add_argument( + "-c", + "--if_cache_data_in_gpu", + type=int, + required=True, + help="if caching the dataset in GPU memory, 1 or 0", + ) + + args = parser.parse_args() + name = args.experiment_dir + experiment_dir = os.path.join("./logs", args.experiment_dir) + + config_save_path = os.path.join(experiment_dir, "config.json") + with open(config_save_path, "r") as f: + config = 
json.load(f) + + hparams = HParams(**config) + hparams.model_dir = hparams.experiment_dir = experiment_dir + hparams.save_every_epoch = args.save_every_epoch + hparams.name = name + hparams.total_epoch = args.total_epoch + hparams.pretrainG = args.pretrainG + hparams.pretrainD = args.pretrainD + hparams.version = args.version + hparams.gpus = args.gpus + hparams.train.batch_size = args.batch_size + hparams.sample_rate = args.sample_rate + hparams.if_f0 = args.if_f0 + hparams.if_latest = args.if_latest + hparams.save_every_weights = args.save_every_weights + hparams.if_cache_data_in_gpu = args.if_cache_data_in_gpu + hparams.data.training_files = "%s/filelist.txt" % experiment_dir + return hparams + + +def get_hparams_from_dir(model_dir): + config_save_path = os.path.join(model_dir, "config.json") + with open(config_save_path, "r") as f: + data = f.read() + config = json.loads(data) + + hparams = HParams(**config) + hparams.model_dir = model_dir + return hparams + + +def get_hparams_from_file(config_path): + with open(config_path, "r") as f: + data = f.read() + config = json.loads(data) + + hparams = HParams(**config) + return hparams + + +def check_git_hash(model_dir): + source_dir = os.path.dirname(os.path.realpath(__file__)) + if not os.path.exists(os.path.join(source_dir, ".git")): + logger.warning( + "{} is not a git repository, therefore hash value comparison will be ignored.".format( + source_dir + ) + ) + return + + cur_hash = subprocess.getoutput("git rev-parse HEAD") + + path = os.path.join(model_dir, "githash") + if os.path.exists(path): + saved_hash = open(path).read() + if saved_hash != cur_hash: + logger.warning( + "git hash values are different. {}(saved) != {}(current)".format( + saved_hash[:8], cur_hash[:8] + ) + ) + else: + open(path, "w").write(cur_hash) + + +def get_logger(model_dir, filename="train.log"): + global logger + logger = logging.getLogger(os.path.basename(model_dir)) + logger.setLevel(logging.DEBUG) + + formatter = logging.Formatter("%(asctime)s\t%(name)s\t%(levelname)s\t%(message)s") + if not os.path.exists(model_dir): + os.makedirs(model_dir) + h = logging.FileHandler(os.path.join(model_dir, filename)) + h.setLevel(logging.DEBUG) + h.setFormatter(formatter) + logger.addHandler(h) + return logger + + +class HParams: + def __init__(self, **kwargs): + for k, v in kwargs.items(): + if type(v) == dict: + v = HParams(**v) + self[k] = v + + def keys(self): + return self.__dict__.keys() + + def items(self): + return self.__dict__.items() + + def values(self): + return self.__dict__.values() + + def __len__(self): + return len(self.__dict__) + + def __getitem__(self, key): + return getattr(self, key) + + def __setitem__(self, key, value): + return setattr(self, key, value) + + def __contains__(self, key): + return key in self.__dict__ + + def __repr__(self): + return self.__dict__.__repr__() diff --git a/infer/lib/uvr5_pack/lib_v5/dataset.py b/infer/lib/uvr5_pack/lib_v5/dataset.py new file mode 100644 index 0000000000000000000000000000000000000000..831a83cdfcec8009d0d77a71ac8fd0eadc77b926 --- /dev/null +++ b/infer/lib/uvr5_pack/lib_v5/dataset.py @@ -0,0 +1,183 @@ +import os +import random + +import numpy as np +import torch +import torch.utils.data +from tqdm import tqdm + +from . 
import spec_utils + + +class VocalRemoverValidationSet(torch.utils.data.Dataset): + def __init__(self, patch_list): + self.patch_list = patch_list + + def __len__(self): + return len(self.patch_list) + + def __getitem__(self, idx): + path = self.patch_list[idx] + data = np.load(path) + + X, y = data["X"], data["y"] + + X_mag = np.abs(X) + y_mag = np.abs(y) + + return X_mag, y_mag + + +def make_pair(mix_dir, inst_dir): + input_exts = [".wav", ".m4a", ".mp3", ".mp4", ".flac"] + + X_list = sorted( + [ + os.path.join(mix_dir, fname) + for fname in os.listdir(mix_dir) + if os.path.splitext(fname)[1] in input_exts + ] + ) + y_list = sorted( + [ + os.path.join(inst_dir, fname) + for fname in os.listdir(inst_dir) + if os.path.splitext(fname)[1] in input_exts + ] + ) + + filelist = list(zip(X_list, y_list)) + + return filelist + + +def train_val_split(dataset_dir, split_mode, val_rate, val_filelist): + if split_mode == "random": + filelist = make_pair( + os.path.join(dataset_dir, "mixtures"), + os.path.join(dataset_dir, "instruments"), + ) + + random.shuffle(filelist) + + if len(val_filelist) == 0: + val_size = int(len(filelist) * val_rate) + train_filelist = filelist[:-val_size] + val_filelist = filelist[-val_size:] + else: + train_filelist = [ + pair for pair in filelist if list(pair) not in val_filelist + ] + elif split_mode == "subdirs": + if len(val_filelist) != 0: + raise ValueError( + "The `val_filelist` option is not available in `subdirs` mode" + ) + + train_filelist = make_pair( + os.path.join(dataset_dir, "training/mixtures"), + os.path.join(dataset_dir, "training/instruments"), + ) + + val_filelist = make_pair( + os.path.join(dataset_dir, "validation/mixtures"), + os.path.join(dataset_dir, "validation/instruments"), + ) + + return train_filelist, val_filelist + + +def augment(X, y, reduction_rate, reduction_mask, mixup_rate, mixup_alpha): + perm = np.random.permutation(len(X)) + for i, idx in enumerate(tqdm(perm)): + if np.random.uniform() < reduction_rate: + y[idx] = spec_utils.reduce_vocal_aggressively( + X[idx], y[idx], reduction_mask + ) + + if np.random.uniform() < 0.5: + # swap channel + X[idx] = X[idx, ::-1] + y[idx] = y[idx, ::-1] + if np.random.uniform() < 0.02: + # mono + X[idx] = X[idx].mean(axis=0, keepdims=True) + y[idx] = y[idx].mean(axis=0, keepdims=True) + if np.random.uniform() < 0.02: + # inst + X[idx] = y[idx] + + if np.random.uniform() < mixup_rate and i < len(perm) - 1: + lam = np.random.beta(mixup_alpha, mixup_alpha) + X[idx] = lam * X[idx] + (1 - lam) * X[perm[i + 1]] + y[idx] = lam * y[idx] + (1 - lam) * y[perm[i + 1]] + + return X, y + + +def make_padding(width, cropsize, offset): + left = offset + roi_size = cropsize - left * 2 + if roi_size == 0: + roi_size = cropsize + right = roi_size - (width % roi_size) + left + + return left, right, roi_size + + +def make_training_set(filelist, cropsize, patches, sr, hop_length, n_fft, offset): + len_dataset = patches * len(filelist) + + X_dataset = np.zeros((len_dataset, 2, n_fft // 2 + 1, cropsize), dtype=np.complex64) + y_dataset = np.zeros((len_dataset, 2, n_fft // 2 + 1, cropsize), dtype=np.complex64) + + for i, (X_path, y_path) in enumerate(tqdm(filelist)): + X, y = spec_utils.cache_or_load(X_path, y_path, sr, hop_length, n_fft) + coef = np.max([np.abs(X).max(), np.abs(y).max()]) + X, y = X / coef, y / coef + + l, r, roi_size = make_padding(X.shape[2], cropsize, offset) + X_pad = np.pad(X, ((0, 0), (0, 0), (l, r)), mode="constant") + y_pad = np.pad(y, ((0, 0), (0, 0), (l, r)), mode="constant") + + starts = 
np.random.randint(0, X_pad.shape[2] - cropsize, patches) + ends = starts + cropsize + for j in range(patches): + idx = i * patches + j + X_dataset[idx] = X_pad[:, :, starts[j] : ends[j]] + y_dataset[idx] = y_pad[:, :, starts[j] : ends[j]] + + return X_dataset, y_dataset + + +def make_validation_set(filelist, cropsize, sr, hop_length, n_fft, offset): + patch_list = [] + patch_dir = "cs{}_sr{}_hl{}_nf{}_of{}".format( + cropsize, sr, hop_length, n_fft, offset + ) + os.makedirs(patch_dir, exist_ok=True) + + for i, (X_path, y_path) in enumerate(tqdm(filelist)): + basename = os.path.splitext(os.path.basename(X_path))[0] + + X, y = spec_utils.cache_or_load(X_path, y_path, sr, hop_length, n_fft) + coef = np.max([np.abs(X).max(), np.abs(y).max()]) + X, y = X / coef, y / coef + + l, r, roi_size = make_padding(X.shape[2], cropsize, offset) + X_pad = np.pad(X, ((0, 0), (0, 0), (l, r)), mode="constant") + y_pad = np.pad(y, ((0, 0), (0, 0), (l, r)), mode="constant") + + len_dataset = int(np.ceil(X.shape[2] / roi_size)) + for j in range(len_dataset): + outpath = os.path.join(patch_dir, "{}_p{}.npz".format(basename, j)) + start = j * roi_size + if not os.path.exists(outpath): + np.savez( + outpath, + X=X_pad[:, :, start : start + cropsize], + y=y_pad[:, :, start : start + cropsize], + ) + patch_list.append(outpath) + + return VocalRemoverValidationSet(patch_list) diff --git a/infer/lib/uvr5_pack/lib_v5/layers.py b/infer/lib/uvr5_pack/lib_v5/layers.py new file mode 100644 index 0000000000000000000000000000000000000000..3cc26678f461c4cb5f7fb6c5f088934aa5ffe18e --- /dev/null +++ b/infer/lib/uvr5_pack/lib_v5/layers.py @@ -0,0 +1,118 @@ +import torch +import torch.nn.functional as F +from torch import nn + +from . import spec_utils + + +class Conv2DBNActiv(nn.Module): + def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU): + super(Conv2DBNActiv, self).__init__() + self.conv = nn.Sequential( + nn.Conv2d( + nin, + nout, + kernel_size=ksize, + stride=stride, + padding=pad, + dilation=dilation, + bias=False, + ), + nn.BatchNorm2d(nout), + activ(), + ) + + def __call__(self, x): + return self.conv(x) + + +class SeperableConv2DBNActiv(nn.Module): + def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU): + super(SeperableConv2DBNActiv, self).__init__() + self.conv = nn.Sequential( + nn.Conv2d( + nin, + nin, + kernel_size=ksize, + stride=stride, + padding=pad, + dilation=dilation, + groups=nin, + bias=False, + ), + nn.Conv2d(nin, nout, kernel_size=1, bias=False), + nn.BatchNorm2d(nout), + activ(), + ) + + def __call__(self, x): + return self.conv(x) + + +class Encoder(nn.Module): + def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.LeakyReLU): + super(Encoder, self).__init__() + self.conv1 = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ) + self.conv2 = Conv2DBNActiv(nout, nout, ksize, stride, pad, activ=activ) + + def __call__(self, x): + skip = self.conv1(x) + h = self.conv2(skip) + + return h, skip + + +class Decoder(nn.Module): + def __init__( + self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False + ): + super(Decoder, self).__init__() + self.conv = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ) + self.dropout = nn.Dropout2d(0.1) if dropout else None + + def __call__(self, x, skip=None): + x = F.interpolate(x, scale_factor=2, mode="bilinear", align_corners=True) + if skip is not None: + skip = spec_utils.crop_center(skip, x) + x = torch.cat([x, skip], dim=1) + h = self.conv(x) + + if self.dropout is not None: + h 
= self.dropout(h) + + return h + + +class ASPPModule(nn.Module): + def __init__(self, nin, nout, dilations=(4, 8, 16), activ=nn.ReLU): + super(ASPPModule, self).__init__() + self.conv1 = nn.Sequential( + nn.AdaptiveAvgPool2d((1, None)), + Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ), + ) + self.conv2 = Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ) + self.conv3 = SeperableConv2DBNActiv( + nin, nin, 3, 1, dilations[0], dilations[0], activ=activ + ) + self.conv4 = SeperableConv2DBNActiv( + nin, nin, 3, 1, dilations[1], dilations[1], activ=activ + ) + self.conv5 = SeperableConv2DBNActiv( + nin, nin, 3, 1, dilations[2], dilations[2], activ=activ + ) + self.bottleneck = nn.Sequential( + Conv2DBNActiv(nin * 5, nout, 1, 1, 0, activ=activ), nn.Dropout2d(0.1) + ) + + def forward(self, x): + _, _, h, w = x.size() + feat1 = F.interpolate( + self.conv1(x), size=(h, w), mode="bilinear", align_corners=True + ) + feat2 = self.conv2(x) + feat3 = self.conv3(x) + feat4 = self.conv4(x) + feat5 = self.conv5(x) + out = torch.cat((feat1, feat2, feat3, feat4, feat5), dim=1) + bottle = self.bottleneck(out) + return bottle diff --git a/infer/lib/uvr5_pack/lib_v5/layers_123812KB .py b/infer/lib/uvr5_pack/lib_v5/layers_123812KB .py new file mode 100644 index 0000000000000000000000000000000000000000..3cc26678f461c4cb5f7fb6c5f088934aa5ffe18e --- /dev/null +++ b/infer/lib/uvr5_pack/lib_v5/layers_123812KB .py @@ -0,0 +1,118 @@ +import torch +import torch.nn.functional as F +from torch import nn + +from . import spec_utils + + +class Conv2DBNActiv(nn.Module): + def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU): + super(Conv2DBNActiv, self).__init__() + self.conv = nn.Sequential( + nn.Conv2d( + nin, + nout, + kernel_size=ksize, + stride=stride, + padding=pad, + dilation=dilation, + bias=False, + ), + nn.BatchNorm2d(nout), + activ(), + ) + + def __call__(self, x): + return self.conv(x) + + +class SeperableConv2DBNActiv(nn.Module): + def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU): + super(SeperableConv2DBNActiv, self).__init__() + self.conv = nn.Sequential( + nn.Conv2d( + nin, + nin, + kernel_size=ksize, + stride=stride, + padding=pad, + dilation=dilation, + groups=nin, + bias=False, + ), + nn.Conv2d(nin, nout, kernel_size=1, bias=False), + nn.BatchNorm2d(nout), + activ(), + ) + + def __call__(self, x): + return self.conv(x) + + +class Encoder(nn.Module): + def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.LeakyReLU): + super(Encoder, self).__init__() + self.conv1 = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ) + self.conv2 = Conv2DBNActiv(nout, nout, ksize, stride, pad, activ=activ) + + def __call__(self, x): + skip = self.conv1(x) + h = self.conv2(skip) + + return h, skip + + +class Decoder(nn.Module): + def __init__( + self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False + ): + super(Decoder, self).__init__() + self.conv = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ) + self.dropout = nn.Dropout2d(0.1) if dropout else None + + def __call__(self, x, skip=None): + x = F.interpolate(x, scale_factor=2, mode="bilinear", align_corners=True) + if skip is not None: + skip = spec_utils.crop_center(skip, x) + x = torch.cat([x, skip], dim=1) + h = self.conv(x) + + if self.dropout is not None: + h = self.dropout(h) + + return h + + +class ASPPModule(nn.Module): + def __init__(self, nin, nout, dilations=(4, 8, 16), activ=nn.ReLU): + super(ASPPModule, self).__init__() + self.conv1 = nn.Sequential( + 
nn.AdaptiveAvgPool2d((1, None)), + Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ), + ) + self.conv2 = Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ) + self.conv3 = SeperableConv2DBNActiv( + nin, nin, 3, 1, dilations[0], dilations[0], activ=activ + ) + self.conv4 = SeperableConv2DBNActiv( + nin, nin, 3, 1, dilations[1], dilations[1], activ=activ + ) + self.conv5 = SeperableConv2DBNActiv( + nin, nin, 3, 1, dilations[2], dilations[2], activ=activ + ) + self.bottleneck = nn.Sequential( + Conv2DBNActiv(nin * 5, nout, 1, 1, 0, activ=activ), nn.Dropout2d(0.1) + ) + + def forward(self, x): + _, _, h, w = x.size() + feat1 = F.interpolate( + self.conv1(x), size=(h, w), mode="bilinear", align_corners=True + ) + feat2 = self.conv2(x) + feat3 = self.conv3(x) + feat4 = self.conv4(x) + feat5 = self.conv5(x) + out = torch.cat((feat1, feat2, feat3, feat4, feat5), dim=1) + bottle = self.bottleneck(out) + return bottle diff --git a/infer/lib/uvr5_pack/lib_v5/layers_123821KB.py b/infer/lib/uvr5_pack/lib_v5/layers_123821KB.py new file mode 100644 index 0000000000000000000000000000000000000000..3cc26678f461c4cb5f7fb6c5f088934aa5ffe18e --- /dev/null +++ b/infer/lib/uvr5_pack/lib_v5/layers_123821KB.py @@ -0,0 +1,118 @@ +import torch +import torch.nn.functional as F +from torch import nn + +from . import spec_utils + + +class Conv2DBNActiv(nn.Module): + def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU): + super(Conv2DBNActiv, self).__init__() + self.conv = nn.Sequential( + nn.Conv2d( + nin, + nout, + kernel_size=ksize, + stride=stride, + padding=pad, + dilation=dilation, + bias=False, + ), + nn.BatchNorm2d(nout), + activ(), + ) + + def __call__(self, x): + return self.conv(x) + + +class SeperableConv2DBNActiv(nn.Module): + def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU): + super(SeperableConv2DBNActiv, self).__init__() + self.conv = nn.Sequential( + nn.Conv2d( + nin, + nin, + kernel_size=ksize, + stride=stride, + padding=pad, + dilation=dilation, + groups=nin, + bias=False, + ), + nn.Conv2d(nin, nout, kernel_size=1, bias=False), + nn.BatchNorm2d(nout), + activ(), + ) + + def __call__(self, x): + return self.conv(x) + + +class Encoder(nn.Module): + def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.LeakyReLU): + super(Encoder, self).__init__() + self.conv1 = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ) + self.conv2 = Conv2DBNActiv(nout, nout, ksize, stride, pad, activ=activ) + + def __call__(self, x): + skip = self.conv1(x) + h = self.conv2(skip) + + return h, skip + + +class Decoder(nn.Module): + def __init__( + self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False + ): + super(Decoder, self).__init__() + self.conv = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ) + self.dropout = nn.Dropout2d(0.1) if dropout else None + + def __call__(self, x, skip=None): + x = F.interpolate(x, scale_factor=2, mode="bilinear", align_corners=True) + if skip is not None: + skip = spec_utils.crop_center(skip, x) + x = torch.cat([x, skip], dim=1) + h = self.conv(x) + + if self.dropout is not None: + h = self.dropout(h) + + return h + + +class ASPPModule(nn.Module): + def __init__(self, nin, nout, dilations=(4, 8, 16), activ=nn.ReLU): + super(ASPPModule, self).__init__() + self.conv1 = nn.Sequential( + nn.AdaptiveAvgPool2d((1, None)), + Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ), + ) + self.conv2 = Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ) + self.conv3 = SeperableConv2DBNActiv( + nin, nin, 3, 1, dilations[0], 
dilations[0], activ=activ + ) + self.conv4 = SeperableConv2DBNActiv( + nin, nin, 3, 1, dilations[1], dilations[1], activ=activ + ) + self.conv5 = SeperableConv2DBNActiv( + nin, nin, 3, 1, dilations[2], dilations[2], activ=activ + ) + self.bottleneck = nn.Sequential( + Conv2DBNActiv(nin * 5, nout, 1, 1, 0, activ=activ), nn.Dropout2d(0.1) + ) + + def forward(self, x): + _, _, h, w = x.size() + feat1 = F.interpolate( + self.conv1(x), size=(h, w), mode="bilinear", align_corners=True + ) + feat2 = self.conv2(x) + feat3 = self.conv3(x) + feat4 = self.conv4(x) + feat5 = self.conv5(x) + out = torch.cat((feat1, feat2, feat3, feat4, feat5), dim=1) + bottle = self.bottleneck(out) + return bottle diff --git a/infer/lib/uvr5_pack/lib_v5/layers_33966KB.py b/infer/lib/uvr5_pack/lib_v5/layers_33966KB.py new file mode 100644 index 0000000000000000000000000000000000000000..50c214e1aa8202b0fb4466e52e317cb2ac5bb1e7 --- /dev/null +++ b/infer/lib/uvr5_pack/lib_v5/layers_33966KB.py @@ -0,0 +1,126 @@ +import torch +import torch.nn.functional as F +from torch import nn + +from . import spec_utils + + +class Conv2DBNActiv(nn.Module): + def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU): + super(Conv2DBNActiv, self).__init__() + self.conv = nn.Sequential( + nn.Conv2d( + nin, + nout, + kernel_size=ksize, + stride=stride, + padding=pad, + dilation=dilation, + bias=False, + ), + nn.BatchNorm2d(nout), + activ(), + ) + + def __call__(self, x): + return self.conv(x) + + +class SeperableConv2DBNActiv(nn.Module): + def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU): + super(SeperableConv2DBNActiv, self).__init__() + self.conv = nn.Sequential( + nn.Conv2d( + nin, + nin, + kernel_size=ksize, + stride=stride, + padding=pad, + dilation=dilation, + groups=nin, + bias=False, + ), + nn.Conv2d(nin, nout, kernel_size=1, bias=False), + nn.BatchNorm2d(nout), + activ(), + ) + + def __call__(self, x): + return self.conv(x) + + +class Encoder(nn.Module): + def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.LeakyReLU): + super(Encoder, self).__init__() + self.conv1 = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ) + self.conv2 = Conv2DBNActiv(nout, nout, ksize, stride, pad, activ=activ) + + def __call__(self, x): + skip = self.conv1(x) + h = self.conv2(skip) + + return h, skip + + +class Decoder(nn.Module): + def __init__( + self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False + ): + super(Decoder, self).__init__() + self.conv = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ) + self.dropout = nn.Dropout2d(0.1) if dropout else None + + def __call__(self, x, skip=None): + x = F.interpolate(x, scale_factor=2, mode="bilinear", align_corners=True) + if skip is not None: + skip = spec_utils.crop_center(skip, x) + x = torch.cat([x, skip], dim=1) + h = self.conv(x) + + if self.dropout is not None: + h = self.dropout(h) + + return h + + +class ASPPModule(nn.Module): + def __init__(self, nin, nout, dilations=(4, 8, 16, 32, 64), activ=nn.ReLU): + super(ASPPModule, self).__init__() + self.conv1 = nn.Sequential( + nn.AdaptiveAvgPool2d((1, None)), + Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ), + ) + self.conv2 = Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ) + self.conv3 = SeperableConv2DBNActiv( + nin, nin, 3, 1, dilations[0], dilations[0], activ=activ + ) + self.conv4 = SeperableConv2DBNActiv( + nin, nin, 3, 1, dilations[1], dilations[1], activ=activ + ) + self.conv5 = SeperableConv2DBNActiv( + nin, nin, 3, 1, dilations[2], dilations[2], 
activ=activ + ) + self.conv6 = SeperableConv2DBNActiv( + nin, nin, 3, 1, dilations[2], dilations[2], activ=activ + ) + self.conv7 = SeperableConv2DBNActiv( + nin, nin, 3, 1, dilations[2], dilations[2], activ=activ + ) + self.bottleneck = nn.Sequential( + Conv2DBNActiv(nin * 7, nout, 1, 1, 0, activ=activ), nn.Dropout2d(0.1) + ) + + def forward(self, x): + _, _, h, w = x.size() + feat1 = F.interpolate( + self.conv1(x), size=(h, w), mode="bilinear", align_corners=True + ) + feat2 = self.conv2(x) + feat3 = self.conv3(x) + feat4 = self.conv4(x) + feat5 = self.conv5(x) + feat6 = self.conv6(x) + feat7 = self.conv7(x) + out = torch.cat((feat1, feat2, feat3, feat4, feat5, feat6, feat7), dim=1) + bottle = self.bottleneck(out) + return bottle diff --git a/infer/lib/uvr5_pack/lib_v5/layers_537227KB.py b/infer/lib/uvr5_pack/lib_v5/layers_537227KB.py new file mode 100644 index 0000000000000000000000000000000000000000..50c214e1aa8202b0fb4466e52e317cb2ac5bb1e7 --- /dev/null +++ b/infer/lib/uvr5_pack/lib_v5/layers_537227KB.py @@ -0,0 +1,126 @@ +import torch +import torch.nn.functional as F +from torch import nn + +from . import spec_utils + + +class Conv2DBNActiv(nn.Module): + def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU): + super(Conv2DBNActiv, self).__init__() + self.conv = nn.Sequential( + nn.Conv2d( + nin, + nout, + kernel_size=ksize, + stride=stride, + padding=pad, + dilation=dilation, + bias=False, + ), + nn.BatchNorm2d(nout), + activ(), + ) + + def __call__(self, x): + return self.conv(x) + + +class SeperableConv2DBNActiv(nn.Module): + def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU): + super(SeperableConv2DBNActiv, self).__init__() + self.conv = nn.Sequential( + nn.Conv2d( + nin, + nin, + kernel_size=ksize, + stride=stride, + padding=pad, + dilation=dilation, + groups=nin, + bias=False, + ), + nn.Conv2d(nin, nout, kernel_size=1, bias=False), + nn.BatchNorm2d(nout), + activ(), + ) + + def __call__(self, x): + return self.conv(x) + + +class Encoder(nn.Module): + def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.LeakyReLU): + super(Encoder, self).__init__() + self.conv1 = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ) + self.conv2 = Conv2DBNActiv(nout, nout, ksize, stride, pad, activ=activ) + + def __call__(self, x): + skip = self.conv1(x) + h = self.conv2(skip) + + return h, skip + + +class Decoder(nn.Module): + def __init__( + self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False + ): + super(Decoder, self).__init__() + self.conv = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ) + self.dropout = nn.Dropout2d(0.1) if dropout else None + + def __call__(self, x, skip=None): + x = F.interpolate(x, scale_factor=2, mode="bilinear", align_corners=True) + if skip is not None: + skip = spec_utils.crop_center(skip, x) + x = torch.cat([x, skip], dim=1) + h = self.conv(x) + + if self.dropout is not None: + h = self.dropout(h) + + return h + + +class ASPPModule(nn.Module): + def __init__(self, nin, nout, dilations=(4, 8, 16, 32, 64), activ=nn.ReLU): + super(ASPPModule, self).__init__() + self.conv1 = nn.Sequential( + nn.AdaptiveAvgPool2d((1, None)), + Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ), + ) + self.conv2 = Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ) + self.conv3 = SeperableConv2DBNActiv( + nin, nin, 3, 1, dilations[0], dilations[0], activ=activ + ) + self.conv4 = SeperableConv2DBNActiv( + nin, nin, 3, 1, dilations[1], dilations[1], activ=activ + ) + self.conv5 = SeperableConv2DBNActiv( + 
nin, nin, 3, 1, dilations[2], dilations[2], activ=activ + ) + self.conv6 = SeperableConv2DBNActiv( + nin, nin, 3, 1, dilations[2], dilations[2], activ=activ + ) + self.conv7 = SeperableConv2DBNActiv( + nin, nin, 3, 1, dilations[2], dilations[2], activ=activ + ) + self.bottleneck = nn.Sequential( + Conv2DBNActiv(nin * 7, nout, 1, 1, 0, activ=activ), nn.Dropout2d(0.1) + ) + + def forward(self, x): + _, _, h, w = x.size() + feat1 = F.interpolate( + self.conv1(x), size=(h, w), mode="bilinear", align_corners=True + ) + feat2 = self.conv2(x) + feat3 = self.conv3(x) + feat4 = self.conv4(x) + feat5 = self.conv5(x) + feat6 = self.conv6(x) + feat7 = self.conv7(x) + out = torch.cat((feat1, feat2, feat3, feat4, feat5, feat6, feat7), dim=1) + bottle = self.bottleneck(out) + return bottle diff --git a/infer/lib/uvr5_pack/lib_v5/layers_537238KB.py b/infer/lib/uvr5_pack/lib_v5/layers_537238KB.py new file mode 100644 index 0000000000000000000000000000000000000000..50c214e1aa8202b0fb4466e52e317cb2ac5bb1e7 --- /dev/null +++ b/infer/lib/uvr5_pack/lib_v5/layers_537238KB.py @@ -0,0 +1,126 @@ +import torch +import torch.nn.functional as F +from torch import nn + +from . import spec_utils + + +class Conv2DBNActiv(nn.Module): + def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU): + super(Conv2DBNActiv, self).__init__() + self.conv = nn.Sequential( + nn.Conv2d( + nin, + nout, + kernel_size=ksize, + stride=stride, + padding=pad, + dilation=dilation, + bias=False, + ), + nn.BatchNorm2d(nout), + activ(), + ) + + def __call__(self, x): + return self.conv(x) + + +class SeperableConv2DBNActiv(nn.Module): + def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU): + super(SeperableConv2DBNActiv, self).__init__() + self.conv = nn.Sequential( + nn.Conv2d( + nin, + nin, + kernel_size=ksize, + stride=stride, + padding=pad, + dilation=dilation, + groups=nin, + bias=False, + ), + nn.Conv2d(nin, nout, kernel_size=1, bias=False), + nn.BatchNorm2d(nout), + activ(), + ) + + def __call__(self, x): + return self.conv(x) + + +class Encoder(nn.Module): + def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.LeakyReLU): + super(Encoder, self).__init__() + self.conv1 = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ) + self.conv2 = Conv2DBNActiv(nout, nout, ksize, stride, pad, activ=activ) + + def __call__(self, x): + skip = self.conv1(x) + h = self.conv2(skip) + + return h, skip + + +class Decoder(nn.Module): + def __init__( + self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False + ): + super(Decoder, self).__init__() + self.conv = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ) + self.dropout = nn.Dropout2d(0.1) if dropout else None + + def __call__(self, x, skip=None): + x = F.interpolate(x, scale_factor=2, mode="bilinear", align_corners=True) + if skip is not None: + skip = spec_utils.crop_center(skip, x) + x = torch.cat([x, skip], dim=1) + h = self.conv(x) + + if self.dropout is not None: + h = self.dropout(h) + + return h + + +class ASPPModule(nn.Module): + def __init__(self, nin, nout, dilations=(4, 8, 16, 32, 64), activ=nn.ReLU): + super(ASPPModule, self).__init__() + self.conv1 = nn.Sequential( + nn.AdaptiveAvgPool2d((1, None)), + Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ), + ) + self.conv2 = Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ) + self.conv3 = SeperableConv2DBNActiv( + nin, nin, 3, 1, dilations[0], dilations[0], activ=activ + ) + self.conv4 = SeperableConv2DBNActiv( + nin, nin, 3, 1, dilations[1], dilations[1], activ=activ 
+ ) + self.conv5 = SeperableConv2DBNActiv( + nin, nin, 3, 1, dilations[2], dilations[2], activ=activ + ) + self.conv6 = SeperableConv2DBNActiv( + nin, nin, 3, 1, dilations[2], dilations[2], activ=activ + ) + self.conv7 = SeperableConv2DBNActiv( + nin, nin, 3, 1, dilations[2], dilations[2], activ=activ + ) + self.bottleneck = nn.Sequential( + Conv2DBNActiv(nin * 7, nout, 1, 1, 0, activ=activ), nn.Dropout2d(0.1) + ) + + def forward(self, x): + _, _, h, w = x.size() + feat1 = F.interpolate( + self.conv1(x), size=(h, w), mode="bilinear", align_corners=True + ) + feat2 = self.conv2(x) + feat3 = self.conv3(x) + feat4 = self.conv4(x) + feat5 = self.conv5(x) + feat6 = self.conv6(x) + feat7 = self.conv7(x) + out = torch.cat((feat1, feat2, feat3, feat4, feat5, feat6, feat7), dim=1) + bottle = self.bottleneck(out) + return bottle diff --git a/infer/lib/uvr5_pack/lib_v5/layers_new.py b/infer/lib/uvr5_pack/lib_v5/layers_new.py new file mode 100644 index 0000000000000000000000000000000000000000..83321555e0b488e2b6a08417ef4687bd186590c1 --- /dev/null +++ b/infer/lib/uvr5_pack/lib_v5/layers_new.py @@ -0,0 +1,125 @@ +import torch +import torch.nn.functional as F +from torch import nn + +from . import spec_utils + + +class Conv2DBNActiv(nn.Module): + def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU): + super(Conv2DBNActiv, self).__init__() + self.conv = nn.Sequential( + nn.Conv2d( + nin, + nout, + kernel_size=ksize, + stride=stride, + padding=pad, + dilation=dilation, + bias=False, + ), + nn.BatchNorm2d(nout), + activ(), + ) + + def __call__(self, x): + return self.conv(x) + + +class Encoder(nn.Module): + def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.LeakyReLU): + super(Encoder, self).__init__() + self.conv1 = Conv2DBNActiv(nin, nout, ksize, stride, pad, activ=activ) + self.conv2 = Conv2DBNActiv(nout, nout, ksize, 1, pad, activ=activ) + + def __call__(self, x): + h = self.conv1(x) + h = self.conv2(h) + + return h + + +class Decoder(nn.Module): + def __init__( + self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False + ): + super(Decoder, self).__init__() + self.conv1 = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ) + # self.conv2 = Conv2DBNActiv(nout, nout, ksize, 1, pad, activ=activ) + self.dropout = nn.Dropout2d(0.1) if dropout else None + + def __call__(self, x, skip=None): + x = F.interpolate(x, scale_factor=2, mode="bilinear", align_corners=True) + + if skip is not None: + skip = spec_utils.crop_center(skip, x) + x = torch.cat([x, skip], dim=1) + + h = self.conv1(x) + # h = self.conv2(h) + + if self.dropout is not None: + h = self.dropout(h) + + return h + + +class ASPPModule(nn.Module): + def __init__(self, nin, nout, dilations=(4, 8, 12), activ=nn.ReLU, dropout=False): + super(ASPPModule, self).__init__() + self.conv1 = nn.Sequential( + nn.AdaptiveAvgPool2d((1, None)), + Conv2DBNActiv(nin, nout, 1, 1, 0, activ=activ), + ) + self.conv2 = Conv2DBNActiv(nin, nout, 1, 1, 0, activ=activ) + self.conv3 = Conv2DBNActiv( + nin, nout, 3, 1, dilations[0], dilations[0], activ=activ + ) + self.conv4 = Conv2DBNActiv( + nin, nout, 3, 1, dilations[1], dilations[1], activ=activ + ) + self.conv5 = Conv2DBNActiv( + nin, nout, 3, 1, dilations[2], dilations[2], activ=activ + ) + self.bottleneck = Conv2DBNActiv(nout * 5, nout, 1, 1, 0, activ=activ) + self.dropout = nn.Dropout2d(0.1) if dropout else None + + def forward(self, x): + _, _, h, w = x.size() + feat1 = F.interpolate( + self.conv1(x), size=(h, w), mode="bilinear", align_corners=True + ) + 
feat2 = self.conv2(x) + feat3 = self.conv3(x) + feat4 = self.conv4(x) + feat5 = self.conv5(x) + out = torch.cat((feat1, feat2, feat3, feat4, feat5), dim=1) + out = self.bottleneck(out) + + if self.dropout is not None: + out = self.dropout(out) + + return out + + +class LSTMModule(nn.Module): + def __init__(self, nin_conv, nin_lstm, nout_lstm): + super(LSTMModule, self).__init__() + self.conv = Conv2DBNActiv(nin_conv, 1, 1, 1, 0) + self.lstm = nn.LSTM( + input_size=nin_lstm, hidden_size=nout_lstm // 2, bidirectional=True + ) + self.dense = nn.Sequential( + nn.Linear(nout_lstm, nin_lstm), nn.BatchNorm1d(nin_lstm), nn.ReLU() + ) + + def forward(self, x): + N, _, nbins, nframes = x.size() + h = self.conv(x)[:, 0] # N, nbins, nframes + h = h.permute(2, 0, 1) # nframes, N, nbins + h, _ = self.lstm(h) + h = self.dense(h.reshape(-1, h.size()[-1])) # nframes * N, nbins + h = h.reshape(nframes, N, 1, nbins) + h = h.permute(1, 2, 3, 0) + + return h diff --git a/infer/lib/uvr5_pack/lib_v5/model_param_init.py b/infer/lib/uvr5_pack/lib_v5/model_param_init.py new file mode 100644 index 0000000000000000000000000000000000000000..3a886051ad37dcd8b7be29ff9443294c85f6add0 --- /dev/null +++ b/infer/lib/uvr5_pack/lib_v5/model_param_init.py @@ -0,0 +1,69 @@ +import json +import os +import pathlib + +default_param = {} +default_param["bins"] = 768 +default_param["unstable_bins"] = 9 # training only +default_param["reduction_bins"] = 762 # training only +default_param["sr"] = 44100 +default_param["pre_filter_start"] = 757 +default_param["pre_filter_stop"] = 768 +default_param["band"] = {} + + +default_param["band"][1] = { + "sr": 11025, + "hl": 128, + "n_fft": 960, + "crop_start": 0, + "crop_stop": 245, + "lpf_start": 61, # inference only + "res_type": "polyphase", +} + +default_param["band"][2] = { + "sr": 44100, + "hl": 512, + "n_fft": 1536, + "crop_start": 24, + "crop_stop": 547, + "hpf_start": 81, # inference only + "res_type": "sinc_best", +} + + +def int_keys(d): + r = {} + for k, v in d: + if k.isdigit(): + k = int(k) + r[k] = v + return r + + +class ModelParameters(object): + def __init__(self, config_path=""): + if ".pth" == pathlib.Path(config_path).suffix: + import zipfile + + with zipfile.ZipFile(config_path, "r") as zip: + self.param = json.loads( + zip.read("param.json"), object_pairs_hook=int_keys + ) + elif ".json" == pathlib.Path(config_path).suffix: + with open(config_path, "r") as f: + self.param = json.loads(f.read(), object_pairs_hook=int_keys) + else: + self.param = default_param + + for k in [ + "mid_side", + "mid_side_b", + "mid_side_b2", + "stereo_w", + "stereo_n", + "reverse", + ]: + if not k in self.param: + self.param[k] = False diff --git a/infer/lib/uvr5_pack/lib_v5/modelparams/1band_sr16000_hl512.json b/infer/lib/uvr5_pack/lib_v5/modelparams/1band_sr16000_hl512.json new file mode 100644 index 0000000000000000000000000000000000000000..da097dc2277d18511bf66487f847fb745d93731f --- /dev/null +++ b/infer/lib/uvr5_pack/lib_v5/modelparams/1band_sr16000_hl512.json @@ -0,0 +1,19 @@ +{ + "bins": 1024, + "unstable_bins": 0, + "reduction_bins": 0, + "band": { + "1": { + "sr": 16000, + "hl": 512, + "n_fft": 2048, + "crop_start": 0, + "crop_stop": 1024, + "hpf_start": -1, + "res_type": "sinc_best" + } + }, + "sr": 16000, + "pre_filter_start": 1023, + "pre_filter_stop": 1024 +} \ No newline at end of file diff --git a/infer/lib/uvr5_pack/lib_v5/modelparams/1band_sr32000_hl512.json b/infer/lib/uvr5_pack/lib_v5/modelparams/1band_sr32000_hl512.json new file mode 100644 index 
0000000000000000000000000000000000000000..ac49901adccf0838b0792ea4da517f4b2093e168 --- /dev/null +++ b/infer/lib/uvr5_pack/lib_v5/modelparams/1band_sr32000_hl512.json @@ -0,0 +1,19 @@ +{ + "bins": 1024, + "unstable_bins": 0, + "reduction_bins": 0, + "band": { + "1": { + "sr": 32000, + "hl": 512, + "n_fft": 2048, + "crop_start": 0, + "crop_stop": 1024, + "hpf_start": -1, + "res_type": "kaiser_fast" + } + }, + "sr": 32000, + "pre_filter_start": 1000, + "pre_filter_stop": 1021 +} \ No newline at end of file diff --git a/infer/lib/uvr5_pack/lib_v5/modelparams/1band_sr33075_hl384.json b/infer/lib/uvr5_pack/lib_v5/modelparams/1band_sr33075_hl384.json new file mode 100644 index 0000000000000000000000000000000000000000..1f5d9d9e175aeb2e82b9d64d168176808d44f521 --- /dev/null +++ b/infer/lib/uvr5_pack/lib_v5/modelparams/1band_sr33075_hl384.json @@ -0,0 +1,19 @@ +{ + "bins": 1024, + "unstable_bins": 0, + "reduction_bins": 0, + "band": { + "1": { + "sr": 33075, + "hl": 384, + "n_fft": 2048, + "crop_start": 0, + "crop_stop": 1024, + "hpf_start": -1, + "res_type": "sinc_best" + } + }, + "sr": 33075, + "pre_filter_start": 1000, + "pre_filter_stop": 1021 +} \ No newline at end of file diff --git a/infer/lib/uvr5_pack/lib_v5/modelparams/1band_sr44100_hl1024.json b/infer/lib/uvr5_pack/lib_v5/modelparams/1band_sr44100_hl1024.json new file mode 100644 index 0000000000000000000000000000000000000000..7cc7a4282f2530b8957ec6791355d6a2f98ad76d --- /dev/null +++ b/infer/lib/uvr5_pack/lib_v5/modelparams/1band_sr44100_hl1024.json @@ -0,0 +1,19 @@ +{ + "bins": 1024, + "unstable_bins": 0, + "reduction_bins": 0, + "band": { + "1": { + "sr": 44100, + "hl": 1024, + "n_fft": 2048, + "crop_start": 0, + "crop_stop": 1024, + "hpf_start": -1, + "res_type": "sinc_best" + } + }, + "sr": 44100, + "pre_filter_start": 1023, + "pre_filter_stop": 1024 +} \ No newline at end of file diff --git a/infer/lib/uvr5_pack/lib_v5/modelparams/1band_sr44100_hl256.json b/infer/lib/uvr5_pack/lib_v5/modelparams/1band_sr44100_hl256.json new file mode 100644 index 0000000000000000000000000000000000000000..c0b8c2c0f82b67a92aaff92822abded210c2a6a9 --- /dev/null +++ b/infer/lib/uvr5_pack/lib_v5/modelparams/1band_sr44100_hl256.json @@ -0,0 +1,19 @@ +{ + "bins": 256, + "unstable_bins": 0, + "reduction_bins": 0, + "band": { + "1": { + "sr": 44100, + "hl": 256, + "n_fft": 512, + "crop_start": 0, + "crop_stop": 256, + "hpf_start": -1, + "res_type": "sinc_best" + } + }, + "sr": 44100, + "pre_filter_start": 256, + "pre_filter_stop": 256 +} \ No newline at end of file diff --git a/infer/lib/uvr5_pack/lib_v5/modelparams/1band_sr44100_hl512.json b/infer/lib/uvr5_pack/lib_v5/modelparams/1band_sr44100_hl512.json new file mode 100644 index 0000000000000000000000000000000000000000..7cd0ed370c815803f4cede8aa7cf2f36d77e2a7a --- /dev/null +++ b/infer/lib/uvr5_pack/lib_v5/modelparams/1band_sr44100_hl512.json @@ -0,0 +1,19 @@ +{ + "bins": 1024, + "unstable_bins": 0, + "reduction_bins": 0, + "band": { + "1": { + "sr": 44100, + "hl": 512, + "n_fft": 2048, + "crop_start": 0, + "crop_stop": 1024, + "hpf_start": -1, + "res_type": "sinc_best" + } + }, + "sr": 44100, + "pre_filter_start": 1023, + "pre_filter_stop": 1024 +} \ No newline at end of file diff --git a/infer/lib/uvr5_pack/lib_v5/modelparams/1band_sr44100_hl512_cut.json b/infer/lib/uvr5_pack/lib_v5/modelparams/1band_sr44100_hl512_cut.json new file mode 100644 index 0000000000000000000000000000000000000000..2663ce443f3f2b3416029080356ad564281a991b --- /dev/null +++ 
b/infer/lib/uvr5_pack/lib_v5/modelparams/1band_sr44100_hl512_cut.json @@ -0,0 +1,19 @@ +{ + "bins": 1024, + "unstable_bins": 0, + "reduction_bins": 0, + "band": { + "1": { + "sr": 44100, + "hl": 512, + "n_fft": 2048, + "crop_start": 0, + "crop_stop": 700, + "hpf_start": -1, + "res_type": "sinc_best" + } + }, + "sr": 44100, + "pre_filter_start": 1023, + "pre_filter_stop": 700 +} \ No newline at end of file diff --git a/infer/lib/uvr5_pack/lib_v5/modelparams/2band_32000.json b/infer/lib/uvr5_pack/lib_v5/modelparams/2band_32000.json new file mode 100644 index 0000000000000000000000000000000000000000..f537c435f1786c3eca8e082ee21dc0ce0eba817c --- /dev/null +++ b/infer/lib/uvr5_pack/lib_v5/modelparams/2band_32000.json @@ -0,0 +1,30 @@ +{ + "bins": 768, + "unstable_bins": 7, + "reduction_bins": 705, + "band": { + "1": { + "sr": 6000, + "hl": 66, + "n_fft": 512, + "crop_start": 0, + "crop_stop": 240, + "lpf_start": 60, + "lpf_stop": 118, + "res_type": "sinc_fastest" + }, + "2": { + "sr": 32000, + "hl": 352, + "n_fft": 1024, + "crop_start": 22, + "crop_stop": 505, + "hpf_start": 44, + "hpf_stop": 23, + "res_type": "sinc_medium" + } + }, + "sr": 32000, + "pre_filter_start": 710, + "pre_filter_stop": 731 +} diff --git a/infer/lib/uvr5_pack/lib_v5/modelparams/2band_44100_lofi.json b/infer/lib/uvr5_pack/lib_v5/modelparams/2band_44100_lofi.json new file mode 100644 index 0000000000000000000000000000000000000000..7d4d6f3ccfaa85af9e0e54b23ef1f7d3de1613d3 --- /dev/null +++ b/infer/lib/uvr5_pack/lib_v5/modelparams/2band_44100_lofi.json @@ -0,0 +1,30 @@ +{ + "bins": 512, + "unstable_bins": 7, + "reduction_bins": 510, + "band": { + "1": { + "sr": 11025, + "hl": 160, + "n_fft": 768, + "crop_start": 0, + "crop_stop": 192, + "lpf_start": 41, + "lpf_stop": 139, + "res_type": "sinc_fastest" + }, + "2": { + "sr": 44100, + "hl": 640, + "n_fft": 1024, + "crop_start": 10, + "crop_stop": 320, + "hpf_start": 47, + "hpf_stop": 15, + "res_type": "sinc_medium" + } + }, + "sr": 44100, + "pre_filter_start": 510, + "pre_filter_stop": 512 +} diff --git a/infer/lib/uvr5_pack/lib_v5/modelparams/2band_48000.json b/infer/lib/uvr5_pack/lib_v5/modelparams/2band_48000.json new file mode 100644 index 0000000000000000000000000000000000000000..be075f52e4a8ddba952cb2fc608b29e089e7f9f9 --- /dev/null +++ b/infer/lib/uvr5_pack/lib_v5/modelparams/2band_48000.json @@ -0,0 +1,30 @@ +{ + "bins": 768, + "unstable_bins": 7, + "reduction_bins": 705, + "band": { + "1": { + "sr": 6000, + "hl": 66, + "n_fft": 512, + "crop_start": 0, + "crop_stop": 240, + "lpf_start": 60, + "lpf_stop": 240, + "res_type": "sinc_fastest" + }, + "2": { + "sr": 48000, + "hl": 528, + "n_fft": 1536, + "crop_start": 22, + "crop_stop": 505, + "hpf_start": 82, + "hpf_stop": 22, + "res_type": "sinc_medium" + } + }, + "sr": 48000, + "pre_filter_start": 710, + "pre_filter_stop": 731 +} \ No newline at end of file diff --git a/infer/lib/uvr5_pack/lib_v5/modelparams/3band_44100.json b/infer/lib/uvr5_pack/lib_v5/modelparams/3band_44100.json new file mode 100644 index 0000000000000000000000000000000000000000..d99e23986cf7e68be023e3cf382b5d131409095d --- /dev/null +++ b/infer/lib/uvr5_pack/lib_v5/modelparams/3band_44100.json @@ -0,0 +1,42 @@ +{ + "bins": 768, + "unstable_bins": 5, + "reduction_bins": 733, + "band": { + "1": { + "sr": 11025, + "hl": 128, + "n_fft": 768, + "crop_start": 0, + "crop_stop": 278, + "lpf_start": 28, + "lpf_stop": 140, + "res_type": "polyphase" + }, + "2": { + "sr": 22050, + "hl": 256, + "n_fft": 768, + "crop_start": 14, + "crop_stop": 322, + "hpf_start": 70, 
+ "hpf_stop": 14, + "lpf_start": 283, + "lpf_stop": 314, + "res_type": "polyphase" + }, + "3": { + "sr": 44100, + "hl": 512, + "n_fft": 768, + "crop_start": 131, + "crop_stop": 313, + "hpf_start": 154, + "hpf_stop": 141, + "res_type": "sinc_medium" + } + }, + "sr": 44100, + "pre_filter_start": 757, + "pre_filter_stop": 768 +} diff --git a/infer/lib/uvr5_pack/lib_v5/modelparams/3band_44100_mid.json b/infer/lib/uvr5_pack/lib_v5/modelparams/3band_44100_mid.json new file mode 100644 index 0000000000000000000000000000000000000000..fc2c487dd52d91beb32d69bc36ad8e3b6124978b --- /dev/null +++ b/infer/lib/uvr5_pack/lib_v5/modelparams/3band_44100_mid.json @@ -0,0 +1,43 @@ +{ + "mid_side": true, + "bins": 768, + "unstable_bins": 5, + "reduction_bins": 733, + "band": { + "1": { + "sr": 11025, + "hl": 128, + "n_fft": 768, + "crop_start": 0, + "crop_stop": 278, + "lpf_start": 28, + "lpf_stop": 140, + "res_type": "polyphase" + }, + "2": { + "sr": 22050, + "hl": 256, + "n_fft": 768, + "crop_start": 14, + "crop_stop": 322, + "hpf_start": 70, + "hpf_stop": 14, + "lpf_start": 283, + "lpf_stop": 314, + "res_type": "polyphase" + }, + "3": { + "sr": 44100, + "hl": 512, + "n_fft": 768, + "crop_start": 131, + "crop_stop": 313, + "hpf_start": 154, + "hpf_stop": 141, + "res_type": "sinc_medium" + } + }, + "sr": 44100, + "pre_filter_start": 757, + "pre_filter_stop": 768 +} diff --git a/infer/lib/uvr5_pack/lib_v5/modelparams/3band_44100_msb2.json b/infer/lib/uvr5_pack/lib_v5/modelparams/3band_44100_msb2.json new file mode 100644 index 0000000000000000000000000000000000000000..33b0877c2e964657af2c648b71cbb84ff6b1e581 --- /dev/null +++ b/infer/lib/uvr5_pack/lib_v5/modelparams/3band_44100_msb2.json @@ -0,0 +1,43 @@ +{ + "mid_side_b2": true, + "bins": 640, + "unstable_bins": 7, + "reduction_bins": 565, + "band": { + "1": { + "sr": 11025, + "hl": 108, + "n_fft": 1024, + "crop_start": 0, + "crop_stop": 187, + "lpf_start": 92, + "lpf_stop": 186, + "res_type": "polyphase" + }, + "2": { + "sr": 22050, + "hl": 216, + "n_fft": 768, + "crop_start": 0, + "crop_stop": 212, + "hpf_start": 68, + "hpf_stop": 34, + "lpf_start": 174, + "lpf_stop": 209, + "res_type": "polyphase" + }, + "3": { + "sr": 44100, + "hl": 432, + "n_fft": 640, + "crop_start": 66, + "crop_stop": 307, + "hpf_start": 86, + "hpf_stop": 72, + "res_type": "kaiser_fast" + } + }, + "sr": 44100, + "pre_filter_start": 639, + "pre_filter_stop": 640 +} diff --git a/infer/lib/uvr5_pack/lib_v5/modelparams/4band_44100.json b/infer/lib/uvr5_pack/lib_v5/modelparams/4band_44100.json new file mode 100644 index 0000000000000000000000000000000000000000..4ae850a08f6fe11e7a5a0267f3be35f993cc4eb6 --- /dev/null +++ b/infer/lib/uvr5_pack/lib_v5/modelparams/4band_44100.json @@ -0,0 +1,54 @@ +{ + "bins": 768, + "unstable_bins": 7, + "reduction_bins": 668, + "band": { + "1": { + "sr": 11025, + "hl": 128, + "n_fft": 1024, + "crop_start": 0, + "crop_stop": 186, + "lpf_start": 37, + "lpf_stop": 73, + "res_type": "polyphase" + }, + "2": { + "sr": 11025, + "hl": 128, + "n_fft": 512, + "crop_start": 4, + "crop_stop": 185, + "hpf_start": 36, + "hpf_stop": 18, + "lpf_start": 93, + "lpf_stop": 185, + "res_type": "polyphase" + }, + "3": { + "sr": 22050, + "hl": 256, + "n_fft": 512, + "crop_start": 46, + "crop_stop": 186, + "hpf_start": 93, + "hpf_stop": 46, + "lpf_start": 164, + "lpf_stop": 186, + "res_type": "polyphase" + }, + "4": { + "sr": 44100, + "hl": 512, + "n_fft": 768, + "crop_start": 121, + "crop_stop": 382, + "hpf_start": 138, + "hpf_stop": 123, + "res_type": "sinc_medium" + } + }, + "sr": 
44100, + "pre_filter_start": 740, + "pre_filter_stop": 768 +} diff --git a/infer/lib/uvr5_pack/lib_v5/modelparams/4band_44100_mid.json b/infer/lib/uvr5_pack/lib_v5/modelparams/4band_44100_mid.json new file mode 100644 index 0000000000000000000000000000000000000000..6346701543891938e69fc35754b58b8da9b561d6 --- /dev/null +++ b/infer/lib/uvr5_pack/lib_v5/modelparams/4band_44100_mid.json @@ -0,0 +1,55 @@ +{ + "bins": 768, + "unstable_bins": 7, + "mid_side": true, + "reduction_bins": 668, + "band": { + "1": { + "sr": 11025, + "hl": 128, + "n_fft": 1024, + "crop_start": 0, + "crop_stop": 186, + "lpf_start": 37, + "lpf_stop": 73, + "res_type": "polyphase" + }, + "2": { + "sr": 11025, + "hl": 128, + "n_fft": 512, + "crop_start": 4, + "crop_stop": 185, + "hpf_start": 36, + "hpf_stop": 18, + "lpf_start": 93, + "lpf_stop": 185, + "res_type": "polyphase" + }, + "3": { + "sr": 22050, + "hl": 256, + "n_fft": 512, + "crop_start": 46, + "crop_stop": 186, + "hpf_start": 93, + "hpf_stop": 46, + "lpf_start": 164, + "lpf_stop": 186, + "res_type": "polyphase" + }, + "4": { + "sr": 44100, + "hl": 512, + "n_fft": 768, + "crop_start": 121, + "crop_stop": 382, + "hpf_start": 138, + "hpf_stop": 123, + "res_type": "sinc_medium" + } + }, + "sr": 44100, + "pre_filter_start": 740, + "pre_filter_stop": 768 +} diff --git a/infer/lib/uvr5_pack/lib_v5/modelparams/4band_44100_msb.json b/infer/lib/uvr5_pack/lib_v5/modelparams/4band_44100_msb.json new file mode 100644 index 0000000000000000000000000000000000000000..0bf477114c585236da7c48ffd81960919da38b81 --- /dev/null +++ b/infer/lib/uvr5_pack/lib_v5/modelparams/4band_44100_msb.json @@ -0,0 +1,55 @@ +{ + "mid_side_b": true, + "bins": 768, + "unstable_bins": 7, + "reduction_bins": 668, + "band": { + "1": { + "sr": 11025, + "hl": 128, + "n_fft": 1024, + "crop_start": 0, + "crop_stop": 186, + "lpf_start": 37, + "lpf_stop": 73, + "res_type": "polyphase" + }, + "2": { + "sr": 11025, + "hl": 128, + "n_fft": 512, + "crop_start": 4, + "crop_stop": 185, + "hpf_start": 36, + "hpf_stop": 18, + "lpf_start": 93, + "lpf_stop": 185, + "res_type": "polyphase" + }, + "3": { + "sr": 22050, + "hl": 256, + "n_fft": 512, + "crop_start": 46, + "crop_stop": 186, + "hpf_start": 93, + "hpf_stop": 46, + "lpf_start": 164, + "lpf_stop": 186, + "res_type": "polyphase" + }, + "4": { + "sr": 44100, + "hl": 512, + "n_fft": 768, + "crop_start": 121, + "crop_stop": 382, + "hpf_start": 138, + "hpf_stop": 123, + "res_type": "sinc_medium" + } + }, + "sr": 44100, + "pre_filter_start": 740, + "pre_filter_stop": 768 +} \ No newline at end of file diff --git a/infer/lib/uvr5_pack/lib_v5/modelparams/4band_44100_msb2.json b/infer/lib/uvr5_pack/lib_v5/modelparams/4band_44100_msb2.json new file mode 100644 index 0000000000000000000000000000000000000000..0bf477114c585236da7c48ffd81960919da38b81 --- /dev/null +++ b/infer/lib/uvr5_pack/lib_v5/modelparams/4band_44100_msb2.json @@ -0,0 +1,55 @@ +{ + "mid_side_b": true, + "bins": 768, + "unstable_bins": 7, + "reduction_bins": 668, + "band": { + "1": { + "sr": 11025, + "hl": 128, + "n_fft": 1024, + "crop_start": 0, + "crop_stop": 186, + "lpf_start": 37, + "lpf_stop": 73, + "res_type": "polyphase" + }, + "2": { + "sr": 11025, + "hl": 128, + "n_fft": 512, + "crop_start": 4, + "crop_stop": 185, + "hpf_start": 36, + "hpf_stop": 18, + "lpf_start": 93, + "lpf_stop": 185, + "res_type": "polyphase" + }, + "3": { + "sr": 22050, + "hl": 256, + "n_fft": 512, + "crop_start": 46, + "crop_stop": 186, + "hpf_start": 93, + "hpf_stop": 46, + "lpf_start": 164, + "lpf_stop": 186, + "res_type": 
"polyphase" + }, + "4": { + "sr": 44100, + "hl": 512, + "n_fft": 768, + "crop_start": 121, + "crop_stop": 382, + "hpf_start": 138, + "hpf_stop": 123, + "res_type": "sinc_medium" + } + }, + "sr": 44100, + "pre_filter_start": 740, + "pre_filter_stop": 768 +} \ No newline at end of file diff --git a/infer/lib/uvr5_pack/lib_v5/modelparams/4band_44100_reverse.json b/infer/lib/uvr5_pack/lib_v5/modelparams/4band_44100_reverse.json new file mode 100644 index 0000000000000000000000000000000000000000..779a1c908357cccedcd22b695ca68df13c1967bd --- /dev/null +++ b/infer/lib/uvr5_pack/lib_v5/modelparams/4band_44100_reverse.json @@ -0,0 +1,55 @@ +{ + "reverse": true, + "bins": 768, + "unstable_bins": 7, + "reduction_bins": 668, + "band": { + "1": { + "sr": 11025, + "hl": 128, + "n_fft": 1024, + "crop_start": 0, + "crop_stop": 186, + "lpf_start": 37, + "lpf_stop": 73, + "res_type": "polyphase" + }, + "2": { + "sr": 11025, + "hl": 128, + "n_fft": 512, + "crop_start": 4, + "crop_stop": 185, + "hpf_start": 36, + "hpf_stop": 18, + "lpf_start": 93, + "lpf_stop": 185, + "res_type": "polyphase" + }, + "3": { + "sr": 22050, + "hl": 256, + "n_fft": 512, + "crop_start": 46, + "crop_stop": 186, + "hpf_start": 93, + "hpf_stop": 46, + "lpf_start": 164, + "lpf_stop": 186, + "res_type": "polyphase" + }, + "4": { + "sr": 44100, + "hl": 512, + "n_fft": 768, + "crop_start": 121, + "crop_stop": 382, + "hpf_start": 138, + "hpf_stop": 123, + "res_type": "sinc_medium" + } + }, + "sr": 44100, + "pre_filter_start": 740, + "pre_filter_stop": 768 +} \ No newline at end of file diff --git a/infer/lib/uvr5_pack/lib_v5/modelparams/4band_44100_sw.json b/infer/lib/uvr5_pack/lib_v5/modelparams/4band_44100_sw.json new file mode 100644 index 0000000000000000000000000000000000000000..1fefd4aa50bf6c744294fbb305888742c96e4c4c --- /dev/null +++ b/infer/lib/uvr5_pack/lib_v5/modelparams/4band_44100_sw.json @@ -0,0 +1,55 @@ +{ + "stereo_w": true, + "bins": 768, + "unstable_bins": 7, + "reduction_bins": 668, + "band": { + "1": { + "sr": 11025, + "hl": 128, + "n_fft": 1024, + "crop_start": 0, + "crop_stop": 186, + "lpf_start": 37, + "lpf_stop": 73, + "res_type": "polyphase" + }, + "2": { + "sr": 11025, + "hl": 128, + "n_fft": 512, + "crop_start": 4, + "crop_stop": 185, + "hpf_start": 36, + "hpf_stop": 18, + "lpf_start": 93, + "lpf_stop": 185, + "res_type": "polyphase" + }, + "3": { + "sr": 22050, + "hl": 256, + "n_fft": 512, + "crop_start": 46, + "crop_stop": 186, + "hpf_start": 93, + "hpf_stop": 46, + "lpf_start": 164, + "lpf_stop": 186, + "res_type": "polyphase" + }, + "4": { + "sr": 44100, + "hl": 512, + "n_fft": 768, + "crop_start": 121, + "crop_stop": 382, + "hpf_start": 138, + "hpf_stop": 123, + "res_type": "sinc_medium" + } + }, + "sr": 44100, + "pre_filter_start": 740, + "pre_filter_stop": 768 +} \ No newline at end of file diff --git a/infer/lib/uvr5_pack/lib_v5/modelparams/4band_v2.json b/infer/lib/uvr5_pack/lib_v5/modelparams/4band_v2.json new file mode 100644 index 0000000000000000000000000000000000000000..af798108de02a7243335e71be5c57e4094a5d7b1 --- /dev/null +++ b/infer/lib/uvr5_pack/lib_v5/modelparams/4band_v2.json @@ -0,0 +1,54 @@ +{ + "bins": 672, + "unstable_bins": 8, + "reduction_bins": 637, + "band": { + "1": { + "sr": 7350, + "hl": 80, + "n_fft": 640, + "crop_start": 0, + "crop_stop": 85, + "lpf_start": 25, + "lpf_stop": 53, + "res_type": "polyphase" + }, + "2": { + "sr": 7350, + "hl": 80, + "n_fft": 320, + "crop_start": 4, + "crop_stop": 87, + "hpf_start": 25, + "hpf_stop": 12, + "lpf_start": 31, + "lpf_stop": 62, + 
"res_type": "polyphase" + }, + "3": { + "sr": 14700, + "hl": 160, + "n_fft": 512, + "crop_start": 17, + "crop_stop": 216, + "hpf_start": 48, + "hpf_stop": 24, + "lpf_start": 139, + "lpf_stop": 210, + "res_type": "polyphase" + }, + "4": { + "sr": 44100, + "hl": 480, + "n_fft": 960, + "crop_start": 78, + "crop_stop": 383, + "hpf_start": 130, + "hpf_stop": 86, + "res_type": "kaiser_fast" + } + }, + "sr": 44100, + "pre_filter_start": 668, + "pre_filter_stop": 672 +} \ No newline at end of file diff --git a/infer/lib/uvr5_pack/lib_v5/modelparams/4band_v2_sn.json b/infer/lib/uvr5_pack/lib_v5/modelparams/4band_v2_sn.json new file mode 100644 index 0000000000000000000000000000000000000000..319b99810f364946da7a30b15b916a5309981608 --- /dev/null +++ b/infer/lib/uvr5_pack/lib_v5/modelparams/4band_v2_sn.json @@ -0,0 +1,55 @@ +{ + "bins": 672, + "unstable_bins": 8, + "reduction_bins": 637, + "band": { + "1": { + "sr": 7350, + "hl": 80, + "n_fft": 640, + "crop_start": 0, + "crop_stop": 85, + "lpf_start": 25, + "lpf_stop": 53, + "res_type": "polyphase" + }, + "2": { + "sr": 7350, + "hl": 80, + "n_fft": 320, + "crop_start": 4, + "crop_stop": 87, + "hpf_start": 25, + "hpf_stop": 12, + "lpf_start": 31, + "lpf_stop": 62, + "res_type": "polyphase" + }, + "3": { + "sr": 14700, + "hl": 160, + "n_fft": 512, + "crop_start": 17, + "crop_stop": 216, + "hpf_start": 48, + "hpf_stop": 24, + "lpf_start": 139, + "lpf_stop": 210, + "res_type": "polyphase" + }, + "4": { + "sr": 44100, + "hl": 480, + "n_fft": 960, + "crop_start": 78, + "crop_stop": 383, + "hpf_start": 130, + "hpf_stop": 86, + "convert_channels": "stereo_n", + "res_type": "kaiser_fast" + } + }, + "sr": 44100, + "pre_filter_start": 668, + "pre_filter_stop": 672 +} \ No newline at end of file diff --git a/infer/lib/uvr5_pack/lib_v5/modelparams/4band_v3.json b/infer/lib/uvr5_pack/lib_v5/modelparams/4band_v3.json new file mode 100644 index 0000000000000000000000000000000000000000..2a73bc97ac545145a75bdca7addc5d59f5b8574b --- /dev/null +++ b/infer/lib/uvr5_pack/lib_v5/modelparams/4band_v3.json @@ -0,0 +1,54 @@ +{ + "bins": 672, + "unstable_bins": 8, + "reduction_bins": 530, + "band": { + "1": { + "sr": 7350, + "hl": 80, + "n_fft": 640, + "crop_start": 0, + "crop_stop": 85, + "lpf_start": 25, + "lpf_stop": 53, + "res_type": "polyphase" + }, + "2": { + "sr": 7350, + "hl": 80, + "n_fft": 320, + "crop_start": 4, + "crop_stop": 87, + "hpf_start": 25, + "hpf_stop": 12, + "lpf_start": 31, + "lpf_stop": 62, + "res_type": "polyphase" + }, + "3": { + "sr": 14700, + "hl": 160, + "n_fft": 512, + "crop_start": 17, + "crop_stop": 216, + "hpf_start": 48, + "hpf_stop": 24, + "lpf_start": 139, + "lpf_stop": 210, + "res_type": "polyphase" + }, + "4": { + "sr": 44100, + "hl": 480, + "n_fft": 960, + "crop_start": 78, + "crop_stop": 383, + "hpf_start": 130, + "hpf_stop": 86, + "res_type": "kaiser_fast" + } + }, + "sr": 44100, + "pre_filter_start": 668, + "pre_filter_stop": 672 +} \ No newline at end of file diff --git a/infer/lib/uvr5_pack/lib_v5/modelparams/ensemble.json b/infer/lib/uvr5_pack/lib_v5/modelparams/ensemble.json new file mode 100644 index 0000000000000000000000000000000000000000..ca96bf19c593dbe127e1a013ae456ac093602e28 --- /dev/null +++ b/infer/lib/uvr5_pack/lib_v5/modelparams/ensemble.json @@ -0,0 +1,43 @@ +{ + "mid_side_b2": true, + "bins": 1280, + "unstable_bins": 7, + "reduction_bins": 565, + "band": { + "1": { + "sr": 11025, + "hl": 108, + "n_fft": 2048, + "crop_start": 0, + "crop_stop": 374, + "lpf_start": 92, + "lpf_stop": 186, + "res_type": "polyphase" + }, + 
"2": { + "sr": 22050, + "hl": 216, + "n_fft": 1536, + "crop_start": 0, + "crop_stop": 424, + "hpf_start": 68, + "hpf_stop": 34, + "lpf_start": 348, + "lpf_stop": 418, + "res_type": "polyphase" + }, + "3": { + "sr": 44100, + "hl": 432, + "n_fft": 1280, + "crop_start": 132, + "crop_stop": 614, + "hpf_start": 172, + "hpf_stop": 144, + "res_type": "polyphase" + } + }, + "sr": 44100, + "pre_filter_start": 1280, + "pre_filter_stop": 1280 +} \ No newline at end of file diff --git a/infer/lib/uvr5_pack/lib_v5/nets.py b/infer/lib/uvr5_pack/lib_v5/nets.py new file mode 100644 index 0000000000000000000000000000000000000000..7d0341f05217f875e5975cf1449cd1578fc1edd5 --- /dev/null +++ b/infer/lib/uvr5_pack/lib_v5/nets.py @@ -0,0 +1,123 @@ +import layers +import torch +import torch.nn.functional as F +from torch import nn + +from . import spec_utils + + +class BaseASPPNet(nn.Module): + def __init__(self, nin, ch, dilations=(4, 8, 16)): + super(BaseASPPNet, self).__init__() + self.enc1 = layers.Encoder(nin, ch, 3, 2, 1) + self.enc2 = layers.Encoder(ch, ch * 2, 3, 2, 1) + self.enc3 = layers.Encoder(ch * 2, ch * 4, 3, 2, 1) + self.enc4 = layers.Encoder(ch * 4, ch * 8, 3, 2, 1) + + self.aspp = layers.ASPPModule(ch * 8, ch * 16, dilations) + + self.dec4 = layers.Decoder(ch * (8 + 16), ch * 8, 3, 1, 1) + self.dec3 = layers.Decoder(ch * (4 + 8), ch * 4, 3, 1, 1) + self.dec2 = layers.Decoder(ch * (2 + 4), ch * 2, 3, 1, 1) + self.dec1 = layers.Decoder(ch * (1 + 2), ch, 3, 1, 1) + + def __call__(self, x): + h, e1 = self.enc1(x) + h, e2 = self.enc2(h) + h, e3 = self.enc3(h) + h, e4 = self.enc4(h) + + h = self.aspp(h) + + h = self.dec4(h, e4) + h = self.dec3(h, e3) + h = self.dec2(h, e2) + h = self.dec1(h, e1) + + return h + + +class CascadedASPPNet(nn.Module): + def __init__(self, n_fft): + super(CascadedASPPNet, self).__init__() + self.stg1_low_band_net = BaseASPPNet(2, 16) + self.stg1_high_band_net = BaseASPPNet(2, 16) + + self.stg2_bridge = layers.Conv2DBNActiv(18, 8, 1, 1, 0) + self.stg2_full_band_net = BaseASPPNet(8, 16) + + self.stg3_bridge = layers.Conv2DBNActiv(34, 16, 1, 1, 0) + self.stg3_full_band_net = BaseASPPNet(16, 32) + + self.out = nn.Conv2d(32, 2, 1, bias=False) + self.aux1_out = nn.Conv2d(16, 2, 1, bias=False) + self.aux2_out = nn.Conv2d(16, 2, 1, bias=False) + + self.max_bin = n_fft // 2 + self.output_bin = n_fft // 2 + 1 + + self.offset = 128 + + def forward(self, x, aggressiveness=None): + mix = x.detach() + x = x.clone() + + x = x[:, :, : self.max_bin] + + bandw = x.size()[2] // 2 + aux1 = torch.cat( + [ + self.stg1_low_band_net(x[:, :, :bandw]), + self.stg1_high_band_net(x[:, :, bandw:]), + ], + dim=2, + ) + + h = torch.cat([x, aux1], dim=1) + aux2 = self.stg2_full_band_net(self.stg2_bridge(h)) + + h = torch.cat([x, aux1, aux2], dim=1) + h = self.stg3_full_band_net(self.stg3_bridge(h)) + + mask = torch.sigmoid(self.out(h)) + mask = F.pad( + input=mask, + pad=(0, 0, 0, self.output_bin - mask.size()[2]), + mode="replicate", + ) + + if self.training: + aux1 = torch.sigmoid(self.aux1_out(aux1)) + aux1 = F.pad( + input=aux1, + pad=(0, 0, 0, self.output_bin - aux1.size()[2]), + mode="replicate", + ) + aux2 = torch.sigmoid(self.aux2_out(aux2)) + aux2 = F.pad( + input=aux2, + pad=(0, 0, 0, self.output_bin - aux2.size()[2]), + mode="replicate", + ) + return mask * mix, aux1 * mix, aux2 * mix + else: + if aggressiveness: + mask[:, :, : aggressiveness["split_bin"]] = torch.pow( + mask[:, :, : aggressiveness["split_bin"]], + 1 + aggressiveness["value"] / 3, + ) + mask[:, :, aggressiveness["split_bin"] :] 
= torch.pow( + mask[:, :, aggressiveness["split_bin"] :], + 1 + aggressiveness["value"], + ) + + return mask * mix + + def predict(self, x_mag, aggressiveness=None): + h = self.forward(x_mag, aggressiveness) + + if self.offset > 0: + h = h[:, :, :, self.offset : -self.offset] + assert h.size()[3] > 0 + + return h diff --git a/infer/lib/uvr5_pack/lib_v5/nets_123812KB.py b/infer/lib/uvr5_pack/lib_v5/nets_123812KB.py new file mode 100644 index 0000000000000000000000000000000000000000..64ed70ea012b799849f58ff0f6c3172b0576a505 --- /dev/null +++ b/infer/lib/uvr5_pack/lib_v5/nets_123812KB.py @@ -0,0 +1,122 @@ +import torch +import torch.nn.functional as F +from torch import nn + +from . import layers_123821KB as layers + + +class BaseASPPNet(nn.Module): + def __init__(self, nin, ch, dilations=(4, 8, 16)): + super(BaseASPPNet, self).__init__() + self.enc1 = layers.Encoder(nin, ch, 3, 2, 1) + self.enc2 = layers.Encoder(ch, ch * 2, 3, 2, 1) + self.enc3 = layers.Encoder(ch * 2, ch * 4, 3, 2, 1) + self.enc4 = layers.Encoder(ch * 4, ch * 8, 3, 2, 1) + + self.aspp = layers.ASPPModule(ch * 8, ch * 16, dilations) + + self.dec4 = layers.Decoder(ch * (8 + 16), ch * 8, 3, 1, 1) + self.dec3 = layers.Decoder(ch * (4 + 8), ch * 4, 3, 1, 1) + self.dec2 = layers.Decoder(ch * (2 + 4), ch * 2, 3, 1, 1) + self.dec1 = layers.Decoder(ch * (1 + 2), ch, 3, 1, 1) + + def __call__(self, x): + h, e1 = self.enc1(x) + h, e2 = self.enc2(h) + h, e3 = self.enc3(h) + h, e4 = self.enc4(h) + + h = self.aspp(h) + + h = self.dec4(h, e4) + h = self.dec3(h, e3) + h = self.dec2(h, e2) + h = self.dec1(h, e1) + + return h + + +class CascadedASPPNet(nn.Module): + def __init__(self, n_fft): + super(CascadedASPPNet, self).__init__() + self.stg1_low_band_net = BaseASPPNet(2, 32) + self.stg1_high_band_net = BaseASPPNet(2, 32) + + self.stg2_bridge = layers.Conv2DBNActiv(34, 16, 1, 1, 0) + self.stg2_full_band_net = BaseASPPNet(16, 32) + + self.stg3_bridge = layers.Conv2DBNActiv(66, 32, 1, 1, 0) + self.stg3_full_band_net = BaseASPPNet(32, 64) + + self.out = nn.Conv2d(64, 2, 1, bias=False) + self.aux1_out = nn.Conv2d(32, 2, 1, bias=False) + self.aux2_out = nn.Conv2d(32, 2, 1, bias=False) + + self.max_bin = n_fft // 2 + self.output_bin = n_fft // 2 + 1 + + self.offset = 128 + + def forward(self, x, aggressiveness=None): + mix = x.detach() + x = x.clone() + + x = x[:, :, : self.max_bin] + + bandw = x.size()[2] // 2 + aux1 = torch.cat( + [ + self.stg1_low_band_net(x[:, :, :bandw]), + self.stg1_high_band_net(x[:, :, bandw:]), + ], + dim=2, + ) + + h = torch.cat([x, aux1], dim=1) + aux2 = self.stg2_full_band_net(self.stg2_bridge(h)) + + h = torch.cat([x, aux1, aux2], dim=1) + h = self.stg3_full_band_net(self.stg3_bridge(h)) + + mask = torch.sigmoid(self.out(h)) + mask = F.pad( + input=mask, + pad=(0, 0, 0, self.output_bin - mask.size()[2]), + mode="replicate", + ) + + if self.training: + aux1 = torch.sigmoid(self.aux1_out(aux1)) + aux1 = F.pad( + input=aux1, + pad=(0, 0, 0, self.output_bin - aux1.size()[2]), + mode="replicate", + ) + aux2 = torch.sigmoid(self.aux2_out(aux2)) + aux2 = F.pad( + input=aux2, + pad=(0, 0, 0, self.output_bin - aux2.size()[2]), + mode="replicate", + ) + return mask * mix, aux1 * mix, aux2 * mix + else: + if aggressiveness: + mask[:, :, : aggressiveness["split_bin"]] = torch.pow( + mask[:, :, : aggressiveness["split_bin"]], + 1 + aggressiveness["value"] / 3, + ) + mask[:, :, aggressiveness["split_bin"] :] = torch.pow( + mask[:, :, aggressiveness["split_bin"] :], + 1 + aggressiveness["value"], + ) + + return mask * mix + + def 
predict(self, x_mag, aggressiveness=None): + h = self.forward(x_mag, aggressiveness) + + if self.offset > 0: + h = h[:, :, :, self.offset : -self.offset] + assert h.size()[3] > 0 + + return h diff --git a/infer/lib/uvr5_pack/lib_v5/nets_123821KB.py b/infer/lib/uvr5_pack/lib_v5/nets_123821KB.py new file mode 100644 index 0000000000000000000000000000000000000000..64ed70ea012b799849f58ff0f6c3172b0576a505 --- /dev/null +++ b/infer/lib/uvr5_pack/lib_v5/nets_123821KB.py @@ -0,0 +1,122 @@ +import torch +import torch.nn.functional as F +from torch import nn + +from . import layers_123821KB as layers + + +class BaseASPPNet(nn.Module): + def __init__(self, nin, ch, dilations=(4, 8, 16)): + super(BaseASPPNet, self).__init__() + self.enc1 = layers.Encoder(nin, ch, 3, 2, 1) + self.enc2 = layers.Encoder(ch, ch * 2, 3, 2, 1) + self.enc3 = layers.Encoder(ch * 2, ch * 4, 3, 2, 1) + self.enc4 = layers.Encoder(ch * 4, ch * 8, 3, 2, 1) + + self.aspp = layers.ASPPModule(ch * 8, ch * 16, dilations) + + self.dec4 = layers.Decoder(ch * (8 + 16), ch * 8, 3, 1, 1) + self.dec3 = layers.Decoder(ch * (4 + 8), ch * 4, 3, 1, 1) + self.dec2 = layers.Decoder(ch * (2 + 4), ch * 2, 3, 1, 1) + self.dec1 = layers.Decoder(ch * (1 + 2), ch, 3, 1, 1) + + def __call__(self, x): + h, e1 = self.enc1(x) + h, e2 = self.enc2(h) + h, e3 = self.enc3(h) + h, e4 = self.enc4(h) + + h = self.aspp(h) + + h = self.dec4(h, e4) + h = self.dec3(h, e3) + h = self.dec2(h, e2) + h = self.dec1(h, e1) + + return h + + +class CascadedASPPNet(nn.Module): + def __init__(self, n_fft): + super(CascadedASPPNet, self).__init__() + self.stg1_low_band_net = BaseASPPNet(2, 32) + self.stg1_high_band_net = BaseASPPNet(2, 32) + + self.stg2_bridge = layers.Conv2DBNActiv(34, 16, 1, 1, 0) + self.stg2_full_band_net = BaseASPPNet(16, 32) + + self.stg3_bridge = layers.Conv2DBNActiv(66, 32, 1, 1, 0) + self.stg3_full_band_net = BaseASPPNet(32, 64) + + self.out = nn.Conv2d(64, 2, 1, bias=False) + self.aux1_out = nn.Conv2d(32, 2, 1, bias=False) + self.aux2_out = nn.Conv2d(32, 2, 1, bias=False) + + self.max_bin = n_fft // 2 + self.output_bin = n_fft // 2 + 1 + + self.offset = 128 + + def forward(self, x, aggressiveness=None): + mix = x.detach() + x = x.clone() + + x = x[:, :, : self.max_bin] + + bandw = x.size()[2] // 2 + aux1 = torch.cat( + [ + self.stg1_low_band_net(x[:, :, :bandw]), + self.stg1_high_band_net(x[:, :, bandw:]), + ], + dim=2, + ) + + h = torch.cat([x, aux1], dim=1) + aux2 = self.stg2_full_band_net(self.stg2_bridge(h)) + + h = torch.cat([x, aux1, aux2], dim=1) + h = self.stg3_full_band_net(self.stg3_bridge(h)) + + mask = torch.sigmoid(self.out(h)) + mask = F.pad( + input=mask, + pad=(0, 0, 0, self.output_bin - mask.size()[2]), + mode="replicate", + ) + + if self.training: + aux1 = torch.sigmoid(self.aux1_out(aux1)) + aux1 = F.pad( + input=aux1, + pad=(0, 0, 0, self.output_bin - aux1.size()[2]), + mode="replicate", + ) + aux2 = torch.sigmoid(self.aux2_out(aux2)) + aux2 = F.pad( + input=aux2, + pad=(0, 0, 0, self.output_bin - aux2.size()[2]), + mode="replicate", + ) + return mask * mix, aux1 * mix, aux2 * mix + else: + if aggressiveness: + mask[:, :, : aggressiveness["split_bin"]] = torch.pow( + mask[:, :, : aggressiveness["split_bin"]], + 1 + aggressiveness["value"] / 3, + ) + mask[:, :, aggressiveness["split_bin"] :] = torch.pow( + mask[:, :, aggressiveness["split_bin"] :], + 1 + aggressiveness["value"], + ) + + return mask * mix + + def predict(self, x_mag, aggressiveness=None): + h = self.forward(x_mag, aggressiveness) + + if self.offset > 0: + h = h[:, :, 
:, self.offset : -self.offset] + assert h.size()[3] > 0 + + return h diff --git a/infer/lib/uvr5_pack/lib_v5/nets_33966KB.py b/infer/lib/uvr5_pack/lib_v5/nets_33966KB.py new file mode 100644 index 0000000000000000000000000000000000000000..6cb1e208d9684d98a9a19223e6753d7d3e7f3b31 --- /dev/null +++ b/infer/lib/uvr5_pack/lib_v5/nets_33966KB.py @@ -0,0 +1,122 @@ +import torch +import torch.nn.functional as F +from torch import nn + +from . import layers_33966KB as layers + + +class BaseASPPNet(nn.Module): + def __init__(self, nin, ch, dilations=(4, 8, 16, 32)): + super(BaseASPPNet, self).__init__() + self.enc1 = layers.Encoder(nin, ch, 3, 2, 1) + self.enc2 = layers.Encoder(ch, ch * 2, 3, 2, 1) + self.enc3 = layers.Encoder(ch * 2, ch * 4, 3, 2, 1) + self.enc4 = layers.Encoder(ch * 4, ch * 8, 3, 2, 1) + + self.aspp = layers.ASPPModule(ch * 8, ch * 16, dilations) + + self.dec4 = layers.Decoder(ch * (8 + 16), ch * 8, 3, 1, 1) + self.dec3 = layers.Decoder(ch * (4 + 8), ch * 4, 3, 1, 1) + self.dec2 = layers.Decoder(ch * (2 + 4), ch * 2, 3, 1, 1) + self.dec1 = layers.Decoder(ch * (1 + 2), ch, 3, 1, 1) + + def __call__(self, x): + h, e1 = self.enc1(x) + h, e2 = self.enc2(h) + h, e3 = self.enc3(h) + h, e4 = self.enc4(h) + + h = self.aspp(h) + + h = self.dec4(h, e4) + h = self.dec3(h, e3) + h = self.dec2(h, e2) + h = self.dec1(h, e1) + + return h + + +class CascadedASPPNet(nn.Module): + def __init__(self, n_fft): + super(CascadedASPPNet, self).__init__() + self.stg1_low_band_net = BaseASPPNet(2, 16) + self.stg1_high_band_net = BaseASPPNet(2, 16) + + self.stg2_bridge = layers.Conv2DBNActiv(18, 8, 1, 1, 0) + self.stg2_full_band_net = BaseASPPNet(8, 16) + + self.stg3_bridge = layers.Conv2DBNActiv(34, 16, 1, 1, 0) + self.stg3_full_band_net = BaseASPPNet(16, 32) + + self.out = nn.Conv2d(32, 2, 1, bias=False) + self.aux1_out = nn.Conv2d(16, 2, 1, bias=False) + self.aux2_out = nn.Conv2d(16, 2, 1, bias=False) + + self.max_bin = n_fft // 2 + self.output_bin = n_fft // 2 + 1 + + self.offset = 128 + + def forward(self, x, aggressiveness=None): + mix = x.detach() + x = x.clone() + + x = x[:, :, : self.max_bin] + + bandw = x.size()[2] // 2 + aux1 = torch.cat( + [ + self.stg1_low_band_net(x[:, :, :bandw]), + self.stg1_high_band_net(x[:, :, bandw:]), + ], + dim=2, + ) + + h = torch.cat([x, aux1], dim=1) + aux2 = self.stg2_full_band_net(self.stg2_bridge(h)) + + h = torch.cat([x, aux1, aux2], dim=1) + h = self.stg3_full_band_net(self.stg3_bridge(h)) + + mask = torch.sigmoid(self.out(h)) + mask = F.pad( + input=mask, + pad=(0, 0, 0, self.output_bin - mask.size()[2]), + mode="replicate", + ) + + if self.training: + aux1 = torch.sigmoid(self.aux1_out(aux1)) + aux1 = F.pad( + input=aux1, + pad=(0, 0, 0, self.output_bin - aux1.size()[2]), + mode="replicate", + ) + aux2 = torch.sigmoid(self.aux2_out(aux2)) + aux2 = F.pad( + input=aux2, + pad=(0, 0, 0, self.output_bin - aux2.size()[2]), + mode="replicate", + ) + return mask * mix, aux1 * mix, aux2 * mix + else: + if aggressiveness: + mask[:, :, : aggressiveness["split_bin"]] = torch.pow( + mask[:, :, : aggressiveness["split_bin"]], + 1 + aggressiveness["value"] / 3, + ) + mask[:, :, aggressiveness["split_bin"] :] = torch.pow( + mask[:, :, aggressiveness["split_bin"] :], + 1 + aggressiveness["value"], + ) + + return mask * mix + + def predict(self, x_mag, aggressiveness=None): + h = self.forward(x_mag, aggressiveness) + + if self.offset > 0: + h = h[:, :, :, self.offset : -self.offset] + assert h.size()[3] > 0 + + return h diff --git 
a/infer/lib/uvr5_pack/lib_v5/nets_537227KB.py b/infer/lib/uvr5_pack/lib_v5/nets_537227KB.py new file mode 100644 index 0000000000000000000000000000000000000000..dda9b8fe03874ad9a609855c609bdbe4d70a5a6b --- /dev/null +++ b/infer/lib/uvr5_pack/lib_v5/nets_537227KB.py @@ -0,0 +1,123 @@ +import numpy as np +import torch +import torch.nn.functional as F +from torch import nn + +from . import layers_537238KB as layers + + +class BaseASPPNet(nn.Module): + def __init__(self, nin, ch, dilations=(4, 8, 16)): + super(BaseASPPNet, self).__init__() + self.enc1 = layers.Encoder(nin, ch, 3, 2, 1) + self.enc2 = layers.Encoder(ch, ch * 2, 3, 2, 1) + self.enc3 = layers.Encoder(ch * 2, ch * 4, 3, 2, 1) + self.enc4 = layers.Encoder(ch * 4, ch * 8, 3, 2, 1) + + self.aspp = layers.ASPPModule(ch * 8, ch * 16, dilations) + + self.dec4 = layers.Decoder(ch * (8 + 16), ch * 8, 3, 1, 1) + self.dec3 = layers.Decoder(ch * (4 + 8), ch * 4, 3, 1, 1) + self.dec2 = layers.Decoder(ch * (2 + 4), ch * 2, 3, 1, 1) + self.dec1 = layers.Decoder(ch * (1 + 2), ch, 3, 1, 1) + + def __call__(self, x): + h, e1 = self.enc1(x) + h, e2 = self.enc2(h) + h, e3 = self.enc3(h) + h, e4 = self.enc4(h) + + h = self.aspp(h) + + h = self.dec4(h, e4) + h = self.dec3(h, e3) + h = self.dec2(h, e2) + h = self.dec1(h, e1) + + return h + + +class CascadedASPPNet(nn.Module): + def __init__(self, n_fft): + super(CascadedASPPNet, self).__init__() + self.stg1_low_band_net = BaseASPPNet(2, 64) + self.stg1_high_band_net = BaseASPPNet(2, 64) + + self.stg2_bridge = layers.Conv2DBNActiv(66, 32, 1, 1, 0) + self.stg2_full_band_net = BaseASPPNet(32, 64) + + self.stg3_bridge = layers.Conv2DBNActiv(130, 64, 1, 1, 0) + self.stg3_full_band_net = BaseASPPNet(64, 128) + + self.out = nn.Conv2d(128, 2, 1, bias=False) + self.aux1_out = nn.Conv2d(64, 2, 1, bias=False) + self.aux2_out = nn.Conv2d(64, 2, 1, bias=False) + + self.max_bin = n_fft // 2 + self.output_bin = n_fft // 2 + 1 + + self.offset = 128 + + def forward(self, x, aggressiveness=None): + mix = x.detach() + x = x.clone() + + x = x[:, :, : self.max_bin] + + bandw = x.size()[2] // 2 + aux1 = torch.cat( + [ + self.stg1_low_band_net(x[:, :, :bandw]), + self.stg1_high_band_net(x[:, :, bandw:]), + ], + dim=2, + ) + + h = torch.cat([x, aux1], dim=1) + aux2 = self.stg2_full_band_net(self.stg2_bridge(h)) + + h = torch.cat([x, aux1, aux2], dim=1) + h = self.stg3_full_band_net(self.stg3_bridge(h)) + + mask = torch.sigmoid(self.out(h)) + mask = F.pad( + input=mask, + pad=(0, 0, 0, self.output_bin - mask.size()[2]), + mode="replicate", + ) + + if self.training: + aux1 = torch.sigmoid(self.aux1_out(aux1)) + aux1 = F.pad( + input=aux1, + pad=(0, 0, 0, self.output_bin - aux1.size()[2]), + mode="replicate", + ) + aux2 = torch.sigmoid(self.aux2_out(aux2)) + aux2 = F.pad( + input=aux2, + pad=(0, 0, 0, self.output_bin - aux2.size()[2]), + mode="replicate", + ) + return mask * mix, aux1 * mix, aux2 * mix + else: + if aggressiveness: + mask[:, :, : aggressiveness["split_bin"]] = torch.pow( + mask[:, :, : aggressiveness["split_bin"]], + 1 + aggressiveness["value"] / 3, + ) + mask[:, :, aggressiveness["split_bin"] :] = torch.pow( + mask[:, :, aggressiveness["split_bin"] :], + 1 + aggressiveness["value"], + ) + + return mask * mix + + def predict(self, x_mag, aggressiveness=None): + h = self.forward(x_mag, aggressiveness) + + if self.offset > 0: + h = h[:, :, :, self.offset : -self.offset] + assert h.size()[3] > 0 + + return h diff --git a/infer/lib/uvr5_pack/lib_v5/nets_537238KB.py b/infer/lib/uvr5_pack/lib_v5/nets_537238KB.py new 
file mode 100644 index 0000000000000000000000000000000000000000..dda9b8fe03874ad9a609855c609bdbe4d70a5a6b --- /dev/null +++ b/infer/lib/uvr5_pack/lib_v5/nets_537238KB.py @@ -0,0 +1,123 @@ +import numpy as np +import torch +import torch.nn.functional as F +from torch import nn + +from . import layers_537238KB as layers + + +class BaseASPPNet(nn.Module): + def __init__(self, nin, ch, dilations=(4, 8, 16)): + super(BaseASPPNet, self).__init__() + self.enc1 = layers.Encoder(nin, ch, 3, 2, 1) + self.enc2 = layers.Encoder(ch, ch * 2, 3, 2, 1) + self.enc3 = layers.Encoder(ch * 2, ch * 4, 3, 2, 1) + self.enc4 = layers.Encoder(ch * 4, ch * 8, 3, 2, 1) + + self.aspp = layers.ASPPModule(ch * 8, ch * 16, dilations) + + self.dec4 = layers.Decoder(ch * (8 + 16), ch * 8, 3, 1, 1) + self.dec3 = layers.Decoder(ch * (4 + 8), ch * 4, 3, 1, 1) + self.dec2 = layers.Decoder(ch * (2 + 4), ch * 2, 3, 1, 1) + self.dec1 = layers.Decoder(ch * (1 + 2), ch, 3, 1, 1) + + def __call__(self, x): + h, e1 = self.enc1(x) + h, e2 = self.enc2(h) + h, e3 = self.enc3(h) + h, e4 = self.enc4(h) + + h = self.aspp(h) + + h = self.dec4(h, e4) + h = self.dec3(h, e3) + h = self.dec2(h, e2) + h = self.dec1(h, e1) + + return h + + +class CascadedASPPNet(nn.Module): + def __init__(self, n_fft): + super(CascadedASPPNet, self).__init__() + self.stg1_low_band_net = BaseASPPNet(2, 64) + self.stg1_high_band_net = BaseASPPNet(2, 64) + + self.stg2_bridge = layers.Conv2DBNActiv(66, 32, 1, 1, 0) + self.stg2_full_band_net = BaseASPPNet(32, 64) + + self.stg3_bridge = layers.Conv2DBNActiv(130, 64, 1, 1, 0) + self.stg3_full_band_net = BaseASPPNet(64, 128) + + self.out = nn.Conv2d(128, 2, 1, bias=False) + self.aux1_out = nn.Conv2d(64, 2, 1, bias=False) + self.aux2_out = nn.Conv2d(64, 2, 1, bias=False) + + self.max_bin = n_fft // 2 + self.output_bin = n_fft // 2 + 1 + + self.offset = 128 + + def forward(self, x, aggressiveness=None): + mix = x.detach() + x = x.clone() + + x = x[:, :, : self.max_bin] + + bandw = x.size()[2] // 2 + aux1 = torch.cat( + [ + self.stg1_low_band_net(x[:, :, :bandw]), + self.stg1_high_band_net(x[:, :, bandw:]), + ], + dim=2, + ) + + h = torch.cat([x, aux1], dim=1) + aux2 = self.stg2_full_band_net(self.stg2_bridge(h)) + + h = torch.cat([x, aux1, aux2], dim=1) + h = self.stg3_full_band_net(self.stg3_bridge(h)) + + mask = torch.sigmoid(self.out(h)) + mask = F.pad( + input=mask, + pad=(0, 0, 0, self.output_bin - mask.size()[2]), + mode="replicate", + ) + + if self.training: + aux1 = torch.sigmoid(self.aux1_out(aux1)) + aux1 = F.pad( + input=aux1, + pad=(0, 0, 0, self.output_bin - aux1.size()[2]), + mode="replicate", + ) + aux2 = torch.sigmoid(self.aux2_out(aux2)) + aux2 = F.pad( + input=aux2, + pad=(0, 0, 0, self.output_bin - aux2.size()[2]), + mode="replicate", + ) + return mask * mix, aux1 * mix, aux2 * mix + else: + if aggressiveness: + mask[:, :, : aggressiveness["split_bin"]] = torch.pow( + mask[:, :, : aggressiveness["split_bin"]], + 1 + aggressiveness["value"] / 3, + ) + mask[:, :, aggressiveness["split_bin"] :] = torch.pow( + mask[:, :, aggressiveness["split_bin"] :], + 1 + aggressiveness["value"], + ) + + return mask * mix + + def predict(self, x_mag, aggressiveness=None): + h = self.forward(x_mag, aggressiveness) + + if self.offset > 0: + h = h[:, :, :, self.offset : -self.offset] + assert h.size()[3] > 0 + + return h diff --git a/infer/lib/uvr5_pack/lib_v5/nets_61968KB.py b/infer/lib/uvr5_pack/lib_v5/nets_61968KB.py new file mode 100644 index 
0000000000000000000000000000000000000000..64ed70ea012b799849f58ff0f6c3172b0576a505 --- /dev/null +++ b/infer/lib/uvr5_pack/lib_v5/nets_61968KB.py @@ -0,0 +1,122 @@ +import torch +import torch.nn.functional as F +from torch import nn + +from . import layers_123821KB as layers + + +class BaseASPPNet(nn.Module): + def __init__(self, nin, ch, dilations=(4, 8, 16)): + super(BaseASPPNet, self).__init__() + self.enc1 = layers.Encoder(nin, ch, 3, 2, 1) + self.enc2 = layers.Encoder(ch, ch * 2, 3, 2, 1) + self.enc3 = layers.Encoder(ch * 2, ch * 4, 3, 2, 1) + self.enc4 = layers.Encoder(ch * 4, ch * 8, 3, 2, 1) + + self.aspp = layers.ASPPModule(ch * 8, ch * 16, dilations) + + self.dec4 = layers.Decoder(ch * (8 + 16), ch * 8, 3, 1, 1) + self.dec3 = layers.Decoder(ch * (4 + 8), ch * 4, 3, 1, 1) + self.dec2 = layers.Decoder(ch * (2 + 4), ch * 2, 3, 1, 1) + self.dec1 = layers.Decoder(ch * (1 + 2), ch, 3, 1, 1) + + def __call__(self, x): + h, e1 = self.enc1(x) + h, e2 = self.enc2(h) + h, e3 = self.enc3(h) + h, e4 = self.enc4(h) + + h = self.aspp(h) + + h = self.dec4(h, e4) + h = self.dec3(h, e3) + h = self.dec2(h, e2) + h = self.dec1(h, e1) + + return h + + +class CascadedASPPNet(nn.Module): + def __init__(self, n_fft): + super(CascadedASPPNet, self).__init__() + self.stg1_low_band_net = BaseASPPNet(2, 32) + self.stg1_high_band_net = BaseASPPNet(2, 32) + + self.stg2_bridge = layers.Conv2DBNActiv(34, 16, 1, 1, 0) + self.stg2_full_band_net = BaseASPPNet(16, 32) + + self.stg3_bridge = layers.Conv2DBNActiv(66, 32, 1, 1, 0) + self.stg3_full_band_net = BaseASPPNet(32, 64) + + self.out = nn.Conv2d(64, 2, 1, bias=False) + self.aux1_out = nn.Conv2d(32, 2, 1, bias=False) + self.aux2_out = nn.Conv2d(32, 2, 1, bias=False) + + self.max_bin = n_fft // 2 + self.output_bin = n_fft // 2 + 1 + + self.offset = 128 + + def forward(self, x, aggressiveness=None): + mix = x.detach() + x = x.clone() + + x = x[:, :, : self.max_bin] + + bandw = x.size()[2] // 2 + aux1 = torch.cat( + [ + self.stg1_low_band_net(x[:, :, :bandw]), + self.stg1_high_band_net(x[:, :, bandw:]), + ], + dim=2, + ) + + h = torch.cat([x, aux1], dim=1) + aux2 = self.stg2_full_band_net(self.stg2_bridge(h)) + + h = torch.cat([x, aux1, aux2], dim=1) + h = self.stg3_full_band_net(self.stg3_bridge(h)) + + mask = torch.sigmoid(self.out(h)) + mask = F.pad( + input=mask, + pad=(0, 0, 0, self.output_bin - mask.size()[2]), + mode="replicate", + ) + + if self.training: + aux1 = torch.sigmoid(self.aux1_out(aux1)) + aux1 = F.pad( + input=aux1, + pad=(0, 0, 0, self.output_bin - aux1.size()[2]), + mode="replicate", + ) + aux2 = torch.sigmoid(self.aux2_out(aux2)) + aux2 = F.pad( + input=aux2, + pad=(0, 0, 0, self.output_bin - aux2.size()[2]), + mode="replicate", + ) + return mask * mix, aux1 * mix, aux2 * mix + else: + if aggressiveness: + mask[:, :, : aggressiveness["split_bin"]] = torch.pow( + mask[:, :, : aggressiveness["split_bin"]], + 1 + aggressiveness["value"] / 3, + ) + mask[:, :, aggressiveness["split_bin"] :] = torch.pow( + mask[:, :, aggressiveness["split_bin"] :], + 1 + aggressiveness["value"], + ) + + return mask * mix + + def predict(self, x_mag, aggressiveness=None): + h = self.forward(x_mag, aggressiveness) + + if self.offset > 0: + h = h[:, :, :, self.offset : -self.offset] + assert h.size()[3] > 0 + + return h diff --git a/infer/lib/uvr5_pack/lib_v5/nets_new.py b/infer/lib/uvr5_pack/lib_v5/nets_new.py new file mode 100644 index 0000000000000000000000000000000000000000..9170aae4c7b5026a1e8c0dd80d6df1f632fc995d --- /dev/null +++ 
b/infer/lib/uvr5_pack/lib_v5/nets_new.py @@ -0,0 +1,133 @@ +import torch +import torch.nn.functional as F +from torch import nn + +from . import layers_new + + +class BaseNet(nn.Module): + def __init__( + self, nin, nout, nin_lstm, nout_lstm, dilations=((4, 2), (8, 4), (12, 6)) + ): + super(BaseNet, self).__init__() + self.enc1 = layers_new.Conv2DBNActiv(nin, nout, 3, 1, 1) + self.enc2 = layers_new.Encoder(nout, nout * 2, 3, 2, 1) + self.enc3 = layers_new.Encoder(nout * 2, nout * 4, 3, 2, 1) + self.enc4 = layers_new.Encoder(nout * 4, nout * 6, 3, 2, 1) + self.enc5 = layers_new.Encoder(nout * 6, nout * 8, 3, 2, 1) + + self.aspp = layers_new.ASPPModule(nout * 8, nout * 8, dilations, dropout=True) + + self.dec4 = layers_new.Decoder(nout * (6 + 8), nout * 6, 3, 1, 1) + self.dec3 = layers_new.Decoder(nout * (4 + 6), nout * 4, 3, 1, 1) + self.dec2 = layers_new.Decoder(nout * (2 + 4), nout * 2, 3, 1, 1) + self.lstm_dec2 = layers_new.LSTMModule(nout * 2, nin_lstm, nout_lstm) + self.dec1 = layers_new.Decoder(nout * (1 + 2) + 1, nout * 1, 3, 1, 1) + + def __call__(self, x): + e1 = self.enc1(x) + e2 = self.enc2(e1) + e3 = self.enc3(e2) + e4 = self.enc4(e3) + e5 = self.enc5(e4) + + h = self.aspp(e5) + + h = self.dec4(h, e4) + h = self.dec3(h, e3) + h = self.dec2(h, e2) + h = torch.cat([h, self.lstm_dec2(h)], dim=1) + h = self.dec1(h, e1) + + return h + + +class CascadedNet(nn.Module): + def __init__(self, n_fft, nout=32, nout_lstm=128): + super(CascadedNet, self).__init__() + + self.max_bin = n_fft // 2 + self.output_bin = n_fft // 2 + 1 + self.nin_lstm = self.max_bin // 2 + self.offset = 64 + + self.stg1_low_band_net = nn.Sequential( + BaseNet(2, nout // 2, self.nin_lstm // 2, nout_lstm), + layers_new.Conv2DBNActiv(nout // 2, nout // 4, 1, 1, 0), + ) + + self.stg1_high_band_net = BaseNet( + 2, nout // 4, self.nin_lstm // 2, nout_lstm // 2 + ) + + self.stg2_low_band_net = nn.Sequential( + BaseNet(nout // 4 + 2, nout, self.nin_lstm // 2, nout_lstm), + layers_new.Conv2DBNActiv(nout, nout // 2, 1, 1, 0), + ) + self.stg2_high_band_net = BaseNet( + nout // 4 + 2, nout // 2, self.nin_lstm // 2, nout_lstm // 2 + ) + + self.stg3_full_band_net = BaseNet( + 3 * nout // 4 + 2, nout, self.nin_lstm, nout_lstm + ) + + self.out = nn.Conv2d(nout, 2, 1, bias=False) + self.aux_out = nn.Conv2d(3 * nout // 4, 2, 1, bias=False) + + def forward(self, x): + x = x[:, :, : self.max_bin] + + bandw = x.size()[2] // 2 + l1_in = x[:, :, :bandw] + h1_in = x[:, :, bandw:] + l1 = self.stg1_low_band_net(l1_in) + h1 = self.stg1_high_band_net(h1_in) + aux1 = torch.cat([l1, h1], dim=2) + + l2_in = torch.cat([l1_in, l1], dim=1) + h2_in = torch.cat([h1_in, h1], dim=1) + l2 = self.stg2_low_band_net(l2_in) + h2 = self.stg2_high_band_net(h2_in) + aux2 = torch.cat([l2, h2], dim=2) + + f3_in = torch.cat([x, aux1, aux2], dim=1) + f3 = self.stg3_full_band_net(f3_in) + + mask = torch.sigmoid(self.out(f3)) + mask = F.pad( + input=mask, + pad=(0, 0, 0, self.output_bin - mask.size()[2]), + mode="replicate", + ) + + if self.training: + aux = torch.cat([aux1, aux2], dim=1) + aux = torch.sigmoid(self.aux_out(aux)) + aux = F.pad( + input=aux, + pad=(0, 0, 0, self.output_bin - aux.size()[2]), + mode="replicate", + ) + return mask, aux + else: + return mask + + def predict_mask(self, x): + mask = self.forward(x) + + if self.offset > 0: + mask = mask[:, :, :, self.offset : -self.offset] + assert mask.size()[3] > 0 + + return mask + + def predict(self, x, aggressiveness=None): + mask = self.forward(x) + pred_mag = x * mask + + if self.offset > 0: + pred_mag 
= pred_mag[:, :, :, self.offset : -self.offset] + assert pred_mag.size()[3] > 0 + + return pred_mag diff --git a/infer/lib/uvr5_pack/lib_v5/spec_utils.py b/infer/lib/uvr5_pack/lib_v5/spec_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..9ee75d426385851fee2eae9dfaeaa569468d0e2c --- /dev/null +++ b/infer/lib/uvr5_pack/lib_v5/spec_utils.py @@ -0,0 +1,676 @@ +import hashlib +import json +import math +import os + +import librosa +import numpy as np +import soundfile as sf +from tqdm import tqdm + + +def crop_center(h1, h2): + h1_shape = h1.size() + h2_shape = h2.size() + + if h1_shape[3] == h2_shape[3]: + return h1 + elif h1_shape[3] < h2_shape[3]: + raise ValueError("h1_shape[3] must be greater than h2_shape[3]") + + # s_freq = (h2_shape[2] - h1_shape[2]) // 2 + # e_freq = s_freq + h1_shape[2] + s_time = (h1_shape[3] - h2_shape[3]) // 2 + e_time = s_time + h2_shape[3] + h1 = h1[:, :, :, s_time:e_time] + + return h1 + + +def wave_to_spectrogram( + wave, hop_length, n_fft, mid_side=False, mid_side_b2=False, reverse=False +): + if reverse: + wave_left = np.flip(np.asfortranarray(wave[0])) + wave_right = np.flip(np.asfortranarray(wave[1])) + elif mid_side: + wave_left = np.asfortranarray(np.add(wave[0], wave[1]) / 2) + wave_right = np.asfortranarray(np.subtract(wave[0], wave[1])) + elif mid_side_b2: + wave_left = np.asfortranarray(np.add(wave[1], wave[0] * 0.5)) + wave_right = np.asfortranarray(np.subtract(wave[0], wave[1] * 0.5)) + else: + wave_left = np.asfortranarray(wave[0]) + wave_right = np.asfortranarray(wave[1]) + + spec_left = librosa.stft(wave_left, n_fft=n_fft, hop_length=hop_length) + spec_right = librosa.stft(wave_right, n_fft=n_fft, hop_length=hop_length) + + spec = np.asfortranarray([spec_left, spec_right]) + + return spec + + +def wave_to_spectrogram_mt( + wave, hop_length, n_fft, mid_side=False, mid_side_b2=False, reverse=False +): + import threading + + if reverse: + wave_left = np.flip(np.asfortranarray(wave[0])) + wave_right = np.flip(np.asfortranarray(wave[1])) + elif mid_side: + wave_left = np.asfortranarray(np.add(wave[0], wave[1]) / 2) + wave_right = np.asfortranarray(np.subtract(wave[0], wave[1])) + elif mid_side_b2: + wave_left = np.asfortranarray(np.add(wave[1], wave[0] * 0.5)) + wave_right = np.asfortranarray(np.subtract(wave[0], wave[1] * 0.5)) + else: + wave_left = np.asfortranarray(wave[0]) + wave_right = np.asfortranarray(wave[1]) + + def run_thread(**kwargs): + global spec_left + spec_left = librosa.stft(**kwargs) + + thread = threading.Thread( + target=run_thread, + kwargs={"y": wave_left, "n_fft": n_fft, "hop_length": hop_length}, + ) + thread.start() + spec_right = librosa.stft(wave_right, n_fft=n_fft, hop_length=hop_length) + thread.join() + + spec = np.asfortranarray([spec_left, spec_right]) + + return spec + + +def combine_spectrograms(specs, mp): + l = min([specs[i].shape[2] for i in specs]) + spec_c = np.zeros(shape=(2, mp.param["bins"] + 1, l), dtype=np.complex64) + offset = 0 + bands_n = len(mp.param["band"]) + + for d in range(1, bands_n + 1): + h = mp.param["band"][d]["crop_stop"] - mp.param["band"][d]["crop_start"] + spec_c[:, offset : offset + h, :l] = specs[d][ + :, mp.param["band"][d]["crop_start"] : mp.param["band"][d]["crop_stop"], :l + ] + offset += h + + if offset > mp.param["bins"]: + raise ValueError("Too much bins") + + # lowpass fiter + if ( + mp.param["pre_filter_start"] > 0 + ): # and mp.param['band'][bands_n]['res_type'] in ['scipy', 'polyphase']: + if bands_n == 1: + spec_c = fft_lp_filter( + spec_c, 
mp.param["pre_filter_start"], mp.param["pre_filter_stop"] + ) + else: + gp = 1 + for b in range( + mp.param["pre_filter_start"] + 1, mp.param["pre_filter_stop"] + ): + g = math.pow( + 10, -(b - mp.param["pre_filter_start"]) * (3.5 - gp) / 20.0 + ) + gp = g + spec_c[:, b, :] *= g + + return np.asfortranarray(spec_c) + + +def spectrogram_to_image(spec, mode="magnitude"): + if mode == "magnitude": + if np.iscomplexobj(spec): + y = np.abs(spec) + else: + y = spec + y = np.log10(y**2 + 1e-8) + elif mode == "phase": + if np.iscomplexobj(spec): + y = np.angle(spec) + else: + y = spec + + y -= y.min() + y *= 255 / y.max() + img = np.uint8(y) + + if y.ndim == 3: + img = img.transpose(1, 2, 0) + img = np.concatenate([np.max(img, axis=2, keepdims=True), img], axis=2) + + return img + + +def reduce_vocal_aggressively(X, y, softmask): + v = X - y + y_mag_tmp = np.abs(y) + v_mag_tmp = np.abs(v) + + v_mask = v_mag_tmp > y_mag_tmp + y_mag = np.clip(y_mag_tmp - v_mag_tmp * v_mask * softmask, 0, np.inf) + + return y_mag * np.exp(1.0j * np.angle(y)) + + +def mask_silence(mag, ref, thres=0.2, min_range=64, fade_size=32): + if min_range < fade_size * 2: + raise ValueError("min_range must be >= fade_area * 2") + + mag = mag.copy() + + idx = np.where(ref.mean(axis=(0, 1)) < thres)[0] + starts = np.insert(idx[np.where(np.diff(idx) != 1)[0] + 1], 0, idx[0]) + ends = np.append(idx[np.where(np.diff(idx) != 1)[0]], idx[-1]) + uninformative = np.where(ends - starts > min_range)[0] + if len(uninformative) > 0: + starts = starts[uninformative] + ends = ends[uninformative] + old_e = None + for s, e in zip(starts, ends): + if old_e is not None and s - old_e < fade_size: + s = old_e - fade_size * 2 + + if s != 0: + weight = np.linspace(0, 1, fade_size) + mag[:, :, s : s + fade_size] += weight * ref[:, :, s : s + fade_size] + else: + s -= fade_size + + if e != mag.shape[2]: + weight = np.linspace(1, 0, fade_size) + mag[:, :, e - fade_size : e] += weight * ref[:, :, e - fade_size : e] + else: + e += fade_size + + mag[:, :, s + fade_size : e - fade_size] += ref[ + :, :, s + fade_size : e - fade_size + ] + old_e = e + + return mag + + +def align_wave_head_and_tail(a, b): + l = min([a[0].size, b[0].size]) + + return a[:l, :l], b[:l, :l] + + +def cache_or_load(mix_path, inst_path, mp): + mix_basename = os.path.splitext(os.path.basename(mix_path))[0] + inst_basename = os.path.splitext(os.path.basename(inst_path))[0] + + cache_dir = "mph{}".format( + hashlib.sha1(json.dumps(mp.param, sort_keys=True).encode("utf-8")).hexdigest() + ) + mix_cache_dir = os.path.join("cache", cache_dir) + inst_cache_dir = os.path.join("cache", cache_dir) + + os.makedirs(mix_cache_dir, exist_ok=True) + os.makedirs(inst_cache_dir, exist_ok=True) + + mix_cache_path = os.path.join(mix_cache_dir, mix_basename + ".npy") + inst_cache_path = os.path.join(inst_cache_dir, inst_basename + ".npy") + + if os.path.exists(mix_cache_path) and os.path.exists(inst_cache_path): + X_spec_m = np.load(mix_cache_path) + y_spec_m = np.load(inst_cache_path) + else: + X_wave, y_wave, X_spec_s, y_spec_s = {}, {}, {}, {} + + for d in range(len(mp.param["band"]), 0, -1): + bp = mp.param["band"][d] + + if d == len(mp.param["band"]): # high-end band + X_wave[d], _ = librosa.load( + mix_path, + sr=bp["sr"], + mono=False, + dtype=np.float32, + res_type=bp["res_type"] + ) + y_wave[d], _ = librosa.load( + inst_path, + sr=bp["sr"], + mono=False, + dtype=np.float32, + res_type=bp["res_type"], + ) + else: # lower bands + X_wave[d] = librosa.resample( + X_wave[d + 1], + 
orig_sr=mp.param["band"][d + 1]["sr"], + target_sr=bp["sr"], + res_type=bp["res_type"], + ) + y_wave[d] = librosa.resample( + y_wave[d + 1], + orig_sr=mp.param["band"][d + 1]["sr"], + target_sr=bp["sr"], + res_type=bp["res_type"], + ) + + X_wave[d], y_wave[d] = align_wave_head_and_tail(X_wave[d], y_wave[d]) + + X_spec_s[d] = wave_to_spectrogram( + X_wave[d], + bp["hl"], + bp["n_fft"], + mp.param["mid_side"], + mp.param["mid_side_b2"], + mp.param["reverse"], + ) + y_spec_s[d] = wave_to_spectrogram( + y_wave[d], + bp["hl"], + bp["n_fft"], + mp.param["mid_side"], + mp.param["mid_side_b2"], + mp.param["reverse"], + ) + + del X_wave, y_wave + + X_spec_m = combine_spectrograms(X_spec_s, mp) + y_spec_m = combine_spectrograms(y_spec_s, mp) + + if X_spec_m.shape != y_spec_m.shape: + raise ValueError("The combined spectrograms are different: " + mix_path) + + _, ext = os.path.splitext(mix_path) + + np.save(mix_cache_path, X_spec_m) + np.save(inst_cache_path, y_spec_m) + + return X_spec_m, y_spec_m + + +def spectrogram_to_wave(spec, hop_length, mid_side, mid_side_b2, reverse): + spec_left = np.asfortranarray(spec[0]) + spec_right = np.asfortranarray(spec[1]) + + wave_left = librosa.istft(spec_left, hop_length=hop_length) + wave_right = librosa.istft(spec_right, hop_length=hop_length) + + if reverse: + return np.asfortranarray([np.flip(wave_left), np.flip(wave_right)]) + elif mid_side: + return np.asfortranarray( + [np.add(wave_left, wave_right / 2), np.subtract(wave_left, wave_right / 2)] + ) + elif mid_side_b2: + return np.asfortranarray( + [ + np.add(wave_right / 1.25, 0.4 * wave_left), + np.subtract(wave_left / 1.25, 0.4 * wave_right), + ] + ) + else: + return np.asfortranarray([wave_left, wave_right]) + + +def spectrogram_to_wave_mt(spec, hop_length, mid_side, reverse, mid_side_b2): + import threading + + spec_left = np.asfortranarray(spec[0]) + spec_right = np.asfortranarray(spec[1]) + + def run_thread(**kwargs): + global wave_left + wave_left = librosa.istft(**kwargs) + + thread = threading.Thread( + target=run_thread, kwargs={"stft_matrix": spec_left, "hop_length": hop_length} + ) + thread.start() + wave_right = librosa.istft(spec_right, hop_length=hop_length) + thread.join() + + if reverse: + return np.asfortranarray([np.flip(wave_left), np.flip(wave_right)]) + elif mid_side: + return np.asfortranarray( + [np.add(wave_left, wave_right / 2), np.subtract(wave_left, wave_right / 2)] + ) + elif mid_side_b2: + return np.asfortranarray( + [ + np.add(wave_right / 1.25, 0.4 * wave_left), + np.subtract(wave_left / 1.25, 0.4 * wave_right), + ] + ) + else: + return np.asfortranarray([wave_left, wave_right]) + + +def cmb_spectrogram_to_wave(spec_m, mp, extra_bins_h=None, extra_bins=None): + wave_band = {} + bands_n = len(mp.param["band"]) + offset = 0 + + for d in range(1, bands_n + 1): + bp = mp.param["band"][d] + spec_s = np.ndarray( + shape=(2, bp["n_fft"] // 2 + 1, spec_m.shape[2]), dtype=complex + ) + h = bp["crop_stop"] - bp["crop_start"] + spec_s[:, bp["crop_start"] : bp["crop_stop"], :] = spec_m[ + :, offset : offset + h, : + ] + + offset += h + if d == bands_n: # higher + if extra_bins_h: # if --high_end_process bypass + max_bin = bp["n_fft"] // 2 + spec_s[:, max_bin - extra_bins_h : max_bin, :] = extra_bins[ + :, :extra_bins_h, : + ] + if bp["hpf_start"] > 0: + spec_s = fft_hp_filter(spec_s, bp["hpf_start"], bp["hpf_stop"] - 1) + if bands_n == 1: + wave = spectrogram_to_wave( + spec_s, + bp["hl"], + mp.param["mid_side"], + mp.param["mid_side_b2"], + mp.param["reverse"], + ) + else: + wave = 
np.add( + wave, + spectrogram_to_wave( + spec_s, + bp["hl"], + mp.param["mid_side"], + mp.param["mid_side_b2"], + mp.param["reverse"], + ), + ) + else: + sr = mp.param["band"][d + 1]["sr"] + if d == 1: # lower + spec_s = fft_lp_filter(spec_s, bp["lpf_start"], bp["lpf_stop"]) + wave = librosa.resample( + spectrogram_to_wave( + spec_s, + bp["hl"], + mp.param["mid_side"], + mp.param["mid_side_b2"], + mp.param["reverse"], + ), + orig_sr=bp["sr"], + target_sr=sr, + res_type="sinc_fastest", + ) + else: # mid + spec_s = fft_hp_filter(spec_s, bp["hpf_start"], bp["hpf_stop"] - 1) + spec_s = fft_lp_filter(spec_s, bp["lpf_start"], bp["lpf_stop"]) + wave2 = np.add( + wave, + spectrogram_to_wave( + spec_s, + bp["hl"], + mp.param["mid_side"], + mp.param["mid_side_b2"], + mp.param["reverse"], + ), + ) + # wave = librosa.core.resample(wave2, bp['sr'], sr, res_type="sinc_fastest") + wave = librosa.resample(wave2, orig_sr=bp["sr"], target_sr=sr, res_type="scipy") + + return wave.T + + +def fft_lp_filter(spec, bin_start, bin_stop): + g = 1.0 + for b in range(bin_start, bin_stop): + g -= 1 / (bin_stop - bin_start) + spec[:, b, :] = g * spec[:, b, :] + + spec[:, bin_stop:, :] *= 0 + + return spec + + +def fft_hp_filter(spec, bin_start, bin_stop): + g = 1.0 + for b in range(bin_start, bin_stop, -1): + g -= 1 / (bin_start - bin_stop) + spec[:, b, :] = g * spec[:, b, :] + + spec[:, 0 : bin_stop + 1, :] *= 0 + + return spec + + +def mirroring(a, spec_m, input_high_end, mp): + if "mirroring" == a: + mirror = np.flip( + np.abs( + spec_m[ + :, + mp.param["pre_filter_start"] + - 10 + - input_high_end.shape[1] : mp.param["pre_filter_start"] + - 10, + :, + ] + ), + 1, + ) + mirror = mirror * np.exp(1.0j * np.angle(input_high_end)) + + return np.where( + np.abs(input_high_end) <= np.abs(mirror), input_high_end, mirror + ) + + if "mirroring2" == a: + mirror = np.flip( + np.abs( + spec_m[ + :, + mp.param["pre_filter_start"] + - 10 + - input_high_end.shape[1] : mp.param["pre_filter_start"] + - 10, + :, + ] + ), + 1, + ) + mi = np.multiply(mirror, input_high_end * 1.7) + + return np.where(np.abs(input_high_end) <= np.abs(mi), input_high_end, mi) + + +def ensembling(a, specs): + for i in range(1, len(specs)): + if i == 1: + spec = specs[0] + + ln = min([spec.shape[2], specs[i].shape[2]]) + spec = spec[:, :, :ln] + specs[i] = specs[i][:, :, :ln] + + if "min_mag" == a: + spec = np.where(np.abs(specs[i]) <= np.abs(spec), specs[i], spec) + if "max_mag" == a: + spec = np.where(np.abs(specs[i]) >= np.abs(spec), specs[i], spec) + + return spec + + +def stft(wave, nfft, hl): + wave_left = np.asfortranarray(wave[0]) + wave_right = np.asfortranarray(wave[1]) + spec_left = librosa.stft(wave_left, n_fft=nfft, hop_length=hl) + spec_right = librosa.stft(wave_right, n_fft=nfft, hop_length=hl) + spec = np.asfortranarray([spec_left, spec_right]) + + return spec + + +def istft(spec, hl): + spec_left = np.asfortranarray(spec[0]) + spec_right = np.asfortranarray(spec[1]) + + wave_left = librosa.istft(spec_left, hop_length=hl) + wave_right = librosa.istft(spec_right, hop_length=hl) + wave = np.asfortranarray([wave_left, wave_right]) + + +if __name__ == "__main__": + import argparse + import sys + import time + + import cv2 + from model_param_init import ModelParameters + + p = argparse.ArgumentParser() + p.add_argument( + "--algorithm", + "-a", + type=str, + choices=["invert", "invert_p", "min_mag", "max_mag", "deep", "align"], + default="min_mag", + ) + p.add_argument( + "--model_params", + "-m", + type=str, + default=os.path.join("modelparams", 
"1band_sr44100_hl512.json"), + ) + p.add_argument("--output_name", "-o", type=str, default="output") + p.add_argument("--vocals_only", "-v", action="store_true") + p.add_argument("input", nargs="+") + args = p.parse_args() + + start_time = time.time() + + if args.algorithm.startswith("invert") and len(args.input) != 2: + raise ValueError("There should be two input files.") + + if not args.algorithm.startswith("invert") and len(args.input) < 2: + raise ValueError("There must be at least two input files.") + + wave, specs = {}, {} + mp = ModelParameters(args.model_params) + + for i in range(len(args.input)): + spec = {} + + for d in range(len(mp.param["band"]), 0, -1): + bp = mp.param["band"][d] + + if d == len(mp.param["band"]): # high-end band + wave[d], _ = librosa.load( + args.input[i], + sr=bp["sr"], + mono=False, + dtype=np.float32, + res_type=bp["res_type"], + ) + + if len(wave[d].shape) == 1: # mono to stereo + wave[d] = np.array([wave[d], wave[d]]) + else: # lower bands + wave[d] = librosa.resample( + wave[d + 1], + orig_sr=mp.param["band"][d + 1]["sr"], + target_sr=bp["sr"], + res_type=bp["res_type"], + ) + + spec[d] = wave_to_spectrogram( + wave[d], + bp["hl"], + bp["n_fft"], + mp.param["mid_side"], + mp.param["mid_side_b2"], + mp.param["reverse"], + ) + + specs[i] = combine_spectrograms(spec, mp) + + del wave + + if args.algorithm == "deep": + d_spec = np.where(np.abs(specs[0]) <= np.abs(spec[1]), specs[0], spec[1]) + v_spec = d_spec - specs[1] + sf.write( + os.path.join("{}.wav".format(args.output_name)), + cmb_spectrogram_to_wave(v_spec, mp), + mp.param["sr"], + ) + + if args.algorithm.startswith("invert"): + ln = min([specs[0].shape[2], specs[1].shape[2]]) + specs[0] = specs[0][:, :, :ln] + specs[1] = specs[1][:, :, :ln] + + if "invert_p" == args.algorithm: + X_mag = np.abs(specs[0]) + y_mag = np.abs(specs[1]) + max_mag = np.where(X_mag >= y_mag, X_mag, y_mag) + v_spec = specs[1] - max_mag * np.exp(1.0j * np.angle(specs[0])) + else: + specs[1] = reduce_vocal_aggressively(specs[0], specs[1], 0.2) + v_spec = specs[0] - specs[1] + + if not args.vocals_only: + X_mag = np.abs(specs[0]) + y_mag = np.abs(specs[1]) + v_mag = np.abs(v_spec) + + X_image = spectrogram_to_image(X_mag) + y_image = spectrogram_to_image(y_mag) + v_image = spectrogram_to_image(v_mag) + + cv2.imwrite("{}_X.png".format(args.output_name), X_image) + cv2.imwrite("{}_y.png".format(args.output_name), y_image) + cv2.imwrite("{}_v.png".format(args.output_name), v_image) + + sf.write( + "{}_X.wav".format(args.output_name), + cmb_spectrogram_to_wave(specs[0], mp), + mp.param["sr"], + ) + sf.write( + "{}_y.wav".format(args.output_name), + cmb_spectrogram_to_wave(specs[1], mp), + mp.param["sr"], + ) + + sf.write( + "{}_v.wav".format(args.output_name), + cmb_spectrogram_to_wave(v_spec, mp), + mp.param["sr"], + ) + else: + if not args.algorithm == "deep": + sf.write( + os.path.join("ensembled", "{}.wav".format(args.output_name)), + cmb_spectrogram_to_wave(ensembling(args.algorithm, specs), mp), + mp.param["sr"], + ) + + if args.algorithm == "align": + trackalignment = [ + { + "file1": '"{}"'.format(args.input[0]), + "file2": '"{}"'.format(args.input[1]), + } + ] + + for i, e in tqdm(enumerate(trackalignment), desc="Performing Alignment..."): + os.system(f"python lib/align_tracks.py {e['file1']} {e['file2']}") + + # print('Total time: {0:.{1}f}s'.format(time.time() - start_time, 1)) diff --git a/infer/lib/uvr5_pack/name_params.json b/infer/lib/uvr5_pack/name_params.json new file mode 100644 index 
0000000000000000000000000000000000000000..39f096e1bbfb0e5e69b22d86e96e946c43884bea --- /dev/null +++ b/infer/lib/uvr5_pack/name_params.json @@ -0,0 +1,263 @@ +{ + "equivalent" : [ + { + "model_hash_name" : [ + { + "hash_name": "47939caf0cfe52a0e81442b85b971dfd", + "model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/4band_44100.json", + "param_name": "4band_44100" + }, + { + "hash_name": "4e4ecb9764c50a8c414fee6e10395bbe", + "model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/4band_v2.json", + "param_name": "4band_v2" + }, + { + "hash_name": "ca106edd563e034bde0bdec4bb7a4b36", + "model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/4band_v2.json", + "param_name": "4band_v2" + }, + { + "hash_name": "e60a1e84803ce4efc0a6551206cc4b71", + "model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/4band_44100.json", + "param_name": "4band_44100" + }, + { + "hash_name": "a82f14e75892e55e994376edbf0c8435", + "model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/4band_44100.json", + "param_name": "4band_44100" + }, + { + "hash_name": "6dd9eaa6f0420af9f1d403aaafa4cc06", + "model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/4band_v2_sn.json", + "param_name": "4band_v2_sn" + }, + { + "hash_name": "08611fb99bd59eaa79ad27c58d137727", + "model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/4band_v2_sn.json", + "param_name": "4band_v2_sn" + }, + { + "hash_name": "5c7bbca45a187e81abbbd351606164e5", + "model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/3band_44100_msb2.json", + "param_name": "3band_44100_msb2" + }, + { + "hash_name": "d6b2cb685a058a091e5e7098192d3233", + "model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/3band_44100_msb2.json", + "param_name": "3band_44100_msb2" + }, + { + "hash_name": "c1b9f38170a7c90e96f027992eb7c62b", + "model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/4band_44100.json", + "param_name": "4band_44100" + }, + { + "hash_name": "c3448ec923fa0edf3d03a19e633faa53", + "model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/4band_44100.json", + "param_name": "4band_44100" + }, + { + "hash_name": "68aa2c8093d0080704b200d140f59e54", + "model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/3band_44100.json", + "param_name": "3band_44100" + }, + { + "hash_name": "fdc83be5b798e4bd29fe00fe6600e147", + "model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/3band_44100_mid.json", + "param_name": "3band_44100_mid.json" + }, + { + "hash_name": "2ce34bc92fd57f55db16b7a4def3d745", + "model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/3band_44100_mid.json", + "param_name": "3band_44100_mid.json" + }, + { + "hash_name": "52fdca89576f06cf4340b74a4730ee5f", + "model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/4band_44100.json", + "param_name": "4band_44100.json" + }, + { + "hash_name": "41191165b05d38fc77f072fa9e8e8a30", + "model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/4band_44100.json", + "param_name": "4band_44100.json" + }, + { + "hash_name": "89e83b511ad474592689e562d5b1f80e", + "model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/2band_32000.json", + "param_name": "2band_32000.json" + }, + { + "hash_name": "0b954da81d453b716b114d6d7c95177f", + "model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/2band_32000.json", + "param_name": "2band_32000.json" + } + + ], + "v4 Models": [ + { + "hash_name": "6a00461c51c2920fd68937d4609ed6c8", + "model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/1band_sr16000_hl512.json", + "param_name": "1band_sr16000_hl512" + }, + { + "hash_name": "0ab504864d20f1bd378fe9c81ef37140", + "model_params": 
"infer/lib/uvr5_pack/lib_v5/modelparams/1band_sr32000_hl512.json", + "param_name": "1band_sr32000_hl512" + }, + { + "hash_name": "7dd21065bf91c10f7fccb57d7d83b07f", + "model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/1band_sr32000_hl512.json", + "param_name": "1band_sr32000_hl512" + }, + { + "hash_name": "80ab74d65e515caa3622728d2de07d23", + "model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/1band_sr32000_hl512.json", + "param_name": "1band_sr32000_hl512" + }, + { + "hash_name": "edc115e7fc523245062200c00caa847f", + "model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/1band_sr33075_hl384.json", + "param_name": "1band_sr33075_hl384" + }, + { + "hash_name": "28063e9f6ab5b341c5f6d3c67f2045b7", + "model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/1band_sr33075_hl384.json", + "param_name": "1band_sr33075_hl384" + }, + { + "hash_name": "b58090534c52cbc3e9b5104bad666ef2", + "model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/1band_sr44100_hl512.json", + "param_name": "1band_sr44100_hl512" + }, + { + "hash_name": "0cdab9947f1b0928705f518f3c78ea8f", + "model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/1band_sr44100_hl512.json", + "param_name": "1band_sr44100_hl512" + }, + { + "hash_name": "ae702fed0238afb5346db8356fe25f13", + "model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/1band_sr44100_hl1024.json", + "param_name": "1band_sr44100_hl1024" + } + ] + } + ], + "User Models" : [ + { + "1 Band": [ + { + "hash_name": "1band_sr16000_hl512", + "model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/1band_sr16000_hl512.json", + "param_name": "1band_sr16000_hl512" + }, + { + "hash_name": "1band_sr32000_hl512", + "model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/1band_sr32000_hl512.json", + "param_name": "1band_sr16000_hl512" + }, + { + "hash_name": "1band_sr33075_hl384", + "model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/1band_sr33075_hl384.json", + "param_name": "1band_sr33075_hl384" + }, + { + "hash_name": "1band_sr44100_hl256", + "model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/1band_sr44100_hl256.json", + "param_name": "1band_sr44100_hl256" + }, + { + "hash_name": "1band_sr44100_hl512", + "model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/1band_sr44100_hl512.json", + "param_name": "1band_sr44100_hl512" + }, + { + "hash_name": "1band_sr44100_hl1024", + "model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/1band_sr44100_hl1024.json", + "param_name": "1band_sr44100_hl1024" + } + ], + "2 Band": [ + { + "hash_name": "2band_44100_lofi", + "model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/2band_44100_lofi.json", + "param_name": "2band_44100_lofi" + }, + { + "hash_name": "2band_32000", + "model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/2band_32000.json", + "param_name": "2band_32000" + }, + { + "hash_name": "2band_48000", + "model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/2band_48000.json", + "param_name": "2band_48000" + } + ], + "3 Band": [ + { + "hash_name": "3band_44100", + "model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/3band_44100.json", + "param_name": "3band_44100" + }, + { + "hash_name": "3band_44100_mid", + "model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/3band_44100_mid.json", + "param_name": "3band_44100_mid" + }, + { + "hash_name": "3band_44100_msb2", + "model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/3band_44100_msb2.json", + "param_name": "3band_44100_msb2" + } + ], + "4 Band": [ + { + "hash_name": "4band_44100", + "model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/4band_44100.json", + 
"param_name": "4band_44100" + }, + { + "hash_name": "4band_44100_mid", + "model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/4band_44100_mid.json", + "param_name": "4band_44100_mid" + }, + { + "hash_name": "4band_44100_msb", + "model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/4band_44100_msb.json", + "param_name": "4band_44100_msb" + }, + { + "hash_name": "4band_44100_msb2", + "model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/4band_44100_msb2.json", + "param_name": "4band_44100_msb2" + }, + { + "hash_name": "4band_44100_reverse", + "model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/4band_44100_reverse.json", + "param_name": "4band_44100_reverse" + }, + { + "hash_name": "4band_44100_sw", + "model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/4band_44100_sw.json", + "param_name": "4band_44100_sw" + }, + { + "hash_name": "4band_v2", + "model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/4band_v2.json", + "param_name": "4band_v2" + }, + { + "hash_name": "4band_v2_sn", + "model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/4band_v2_sn.json", + "param_name": "4band_v2_sn" + }, + { + "hash_name": "tmodelparam", + "model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/tmodelparam.json", + "param_name": "User Model Param Set" + } + ] + } + ] +} \ No newline at end of file diff --git a/infer/lib/uvr5_pack/utils.py b/infer/lib/uvr5_pack/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..2cefcd4208fb324a14bb7a05ba57607309358919 --- /dev/null +++ b/infer/lib/uvr5_pack/utils.py @@ -0,0 +1,121 @@ +import json + +import numpy as np +import torch +from tqdm import tqdm + + +def load_data(file_name: str = "./infer/lib/uvr5_pack/name_params.json") -> dict: + with open(file_name, "r") as f: + data = json.load(f) + + return data + + +def make_padding(width, cropsize, offset): + left = offset + roi_size = cropsize - left * 2 + if roi_size == 0: + roi_size = cropsize + right = roi_size - (width % roi_size) + left + + return left, right, roi_size + + +def inference(X_spec, device, model, aggressiveness, data): + """ + data : dic configs + """ + + def _execute( + X_mag_pad, roi_size, n_window, device, model, aggressiveness, is_half=True + ): + model.eval() + with torch.no_grad(): + preds = [] + + iterations = [n_window] + + total_iterations = sum(iterations) + for i in tqdm(range(n_window)): + start = i * roi_size + X_mag_window = X_mag_pad[ + None, :, :, start : start + data["window_size"] + ] + X_mag_window = torch.from_numpy(X_mag_window) + if is_half: + X_mag_window = X_mag_window.half() + X_mag_window = X_mag_window.to(device) + + pred = model.predict(X_mag_window, aggressiveness) + + pred = pred.detach().cpu().numpy() + preds.append(pred[0]) + + pred = np.concatenate(preds, axis=2) + return pred + + def preprocess(X_spec): + X_mag = np.abs(X_spec) + X_phase = np.angle(X_spec) + + return X_mag, X_phase + + X_mag, X_phase = preprocess(X_spec) + + coef = X_mag.max() + X_mag_pre = X_mag / coef + + n_frame = X_mag_pre.shape[2] + pad_l, pad_r, roi_size = make_padding(n_frame, data["window_size"], model.offset) + n_window = int(np.ceil(n_frame / roi_size)) + + X_mag_pad = np.pad(X_mag_pre, ((0, 0), (0, 0), (pad_l, pad_r)), mode="constant") + + if list(model.state_dict().values())[0].dtype == torch.float16: + is_half = True + else: + is_half = False + pred = _execute( + X_mag_pad, roi_size, n_window, device, model, aggressiveness, is_half + ) + pred = pred[:, :, :n_frame] + + if data["tta"]: + pad_l += roi_size // 2 + pad_r += roi_size // 2 + n_window += 1 + 
+ X_mag_pad = np.pad(X_mag_pre, ((0, 0), (0, 0), (pad_l, pad_r)), mode="constant") + + pred_tta = _execute( + X_mag_pad, roi_size, n_window, device, model, aggressiveness, is_half + ) + pred_tta = pred_tta[:, :, roi_size // 2 :] + pred_tta = pred_tta[:, :, :n_frame] + + return (pred + pred_tta) * 0.5 * coef, X_mag, np.exp(1.0j * X_phase) + else: + return pred * coef, X_mag, np.exp(1.0j * X_phase) + + +def _get_name_params(model_path, model_hash): + data = load_data() + flag = False + ModelName = model_path + for type in list(data): + for model in list(data[type][0]): + for i in range(len(data[type][0][model])): + if str(data[type][0][model][i]["hash_name"]) == model_hash: + flag = True + elif str(data[type][0][model][i]["hash_name"]) in ModelName: + flag = True + + if flag: + model_params_auto = data[type][0][model][i]["model_params"] + param_name_auto = data[type][0][model][i]["param_name"] + if type == "equivalent": + return param_name_auto, model_params_auto + else: + flag = False + return param_name_auto, model_params_auto diff --git a/infer/modules/ipex/__init__.py b/infer/modules/ipex/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..beebe50b563d09a0f5db704285291eaf7f214453 --- /dev/null +++ b/infer/modules/ipex/__init__.py @@ -0,0 +1,190 @@ +import os +import sys +import contextlib +import torch +import intel_extension_for_pytorch as ipex # pylint: disable=import-error, unused-import +from .hijacks import ipex_hijacks +from .attention import attention_init + +# pylint: disable=protected-access, missing-function-docstring, line-too-long + + +def ipex_init(): # pylint: disable=too-many-statements + try: + # Replace cuda with xpu: + torch.cuda.current_device = torch.xpu.current_device + torch.cuda.current_stream = torch.xpu.current_stream + torch.cuda.device = torch.xpu.device + torch.cuda.device_count = torch.xpu.device_count + torch.cuda.device_of = torch.xpu.device_of + torch.cuda.get_device_name = torch.xpu.get_device_name + torch.cuda.get_device_properties = torch.xpu.get_device_properties + torch.cuda.init = torch.xpu.init + torch.cuda.is_available = torch.xpu.is_available + torch.cuda.is_initialized = torch.xpu.is_initialized + torch.cuda.is_current_stream_capturing = lambda: False + torch.cuda.set_device = torch.xpu.set_device + torch.cuda.stream = torch.xpu.stream + torch.cuda.synchronize = torch.xpu.synchronize + torch.cuda.Event = torch.xpu.Event + torch.cuda.Stream = torch.xpu.Stream + torch.cuda.FloatTensor = torch.xpu.FloatTensor + torch.Tensor.cuda = torch.Tensor.xpu + torch.Tensor.is_cuda = torch.Tensor.is_xpu + torch.cuda._initialization_lock = torch.xpu.lazy_init._initialization_lock + torch.cuda._initialized = torch.xpu.lazy_init._initialized + torch.cuda._lazy_seed_tracker = torch.xpu.lazy_init._lazy_seed_tracker + torch.cuda._queued_calls = torch.xpu.lazy_init._queued_calls + torch.cuda._tls = torch.xpu.lazy_init._tls + torch.cuda.threading = torch.xpu.lazy_init.threading + torch.cuda.traceback = torch.xpu.lazy_init.traceback + torch.cuda.Optional = torch.xpu.Optional + torch.cuda.__cached__ = torch.xpu.__cached__ + torch.cuda.__loader__ = torch.xpu.__loader__ + torch.cuda.ComplexFloatStorage = torch.xpu.ComplexFloatStorage + torch.cuda.Tuple = torch.xpu.Tuple + torch.cuda.streams = torch.xpu.streams + torch.cuda._lazy_new = torch.xpu._lazy_new + torch.cuda.FloatStorage = torch.xpu.FloatStorage + torch.cuda.Any = torch.xpu.Any + torch.cuda.__doc__ = torch.xpu.__doc__ + torch.cuda.default_generators = torch.xpu.default_generators + 
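+ # Editor note: the remaining assignments also forward tensor/storage types
+ # (HalfTensor, IntStorage, ...) and lazy-init internals, so third-party code
+ # that touches torch.cuda attributes directly still resolves against the
+ # XPU backend.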
torch.cuda.HalfTensor = torch.xpu.HalfTensor + torch.cuda._get_device_index = torch.xpu._get_device_index + torch.cuda.__path__ = torch.xpu.__path__ + torch.cuda.Device = torch.xpu.Device + torch.cuda.IntTensor = torch.xpu.IntTensor + torch.cuda.ByteStorage = torch.xpu.ByteStorage + torch.cuda.set_stream = torch.xpu.set_stream + torch.cuda.BoolStorage = torch.xpu.BoolStorage + torch.cuda.os = torch.xpu.os + torch.cuda.torch = torch.xpu.torch + torch.cuda.BFloat16Storage = torch.xpu.BFloat16Storage + torch.cuda.Union = torch.xpu.Union + torch.cuda.DoubleTensor = torch.xpu.DoubleTensor + torch.cuda.ShortTensor = torch.xpu.ShortTensor + torch.cuda.LongTensor = torch.xpu.LongTensor + torch.cuda.IntStorage = torch.xpu.IntStorage + torch.cuda.LongStorage = torch.xpu.LongStorage + torch.cuda.__annotations__ = torch.xpu.__annotations__ + torch.cuda.__package__ = torch.xpu.__package__ + torch.cuda.__builtins__ = torch.xpu.__builtins__ + torch.cuda.CharTensor = torch.xpu.CharTensor + torch.cuda.List = torch.xpu.List + torch.cuda._lazy_init = torch.xpu._lazy_init + torch.cuda.BFloat16Tensor = torch.xpu.BFloat16Tensor + torch.cuda.DoubleStorage = torch.xpu.DoubleStorage + torch.cuda.ByteTensor = torch.xpu.ByteTensor + torch.cuda.StreamContext = torch.xpu.StreamContext + torch.cuda.ComplexDoubleStorage = torch.xpu.ComplexDoubleStorage + torch.cuda.ShortStorage = torch.xpu.ShortStorage + torch.cuda._lazy_call = torch.xpu._lazy_call + torch.cuda.HalfStorage = torch.xpu.HalfStorage + torch.cuda.random = torch.xpu.random + torch.cuda._device = torch.xpu._device + torch.cuda.classproperty = torch.xpu.classproperty + torch.cuda.__name__ = torch.xpu.__name__ + torch.cuda._device_t = torch.xpu._device_t + torch.cuda.warnings = torch.xpu.warnings + torch.cuda.__spec__ = torch.xpu.__spec__ + torch.cuda.BoolTensor = torch.xpu.BoolTensor + torch.cuda.CharStorage = torch.xpu.CharStorage + torch.cuda.__file__ = torch.xpu.__file__ + torch.cuda._is_in_bad_fork = torch.xpu.lazy_init._is_in_bad_fork + # torch.cuda.is_current_stream_capturing = torch.xpu.is_current_stream_capturing + + # Memory: + torch.cuda.memory = torch.xpu.memory + if "linux" in sys.platform and "WSL2" in os.popen("uname -a").read(): + torch.xpu.empty_cache = lambda: None + torch.cuda.empty_cache = torch.xpu.empty_cache + torch.cuda.memory_stats = torch.xpu.memory_stats + torch.cuda.memory_summary = torch.xpu.memory_summary + torch.cuda.memory_snapshot = torch.xpu.memory_snapshot + torch.cuda.memory_allocated = torch.xpu.memory_allocated + torch.cuda.max_memory_allocated = torch.xpu.max_memory_allocated + torch.cuda.memory_reserved = torch.xpu.memory_reserved + torch.cuda.memory_cached = torch.xpu.memory_reserved + torch.cuda.max_memory_reserved = torch.xpu.max_memory_reserved + torch.cuda.max_memory_cached = torch.xpu.max_memory_reserved + torch.cuda.reset_peak_memory_stats = torch.xpu.reset_peak_memory_stats + torch.cuda.reset_max_memory_cached = torch.xpu.reset_peak_memory_stats + torch.cuda.reset_max_memory_allocated = torch.xpu.reset_peak_memory_stats + torch.cuda.memory_stats_as_nested_dict = torch.xpu.memory_stats_as_nested_dict + torch.cuda.reset_accumulated_memory_stats = ( + torch.xpu.reset_accumulated_memory_stats + ) + + # RNG: + torch.cuda.get_rng_state = torch.xpu.get_rng_state + torch.cuda.get_rng_state_all = torch.xpu.get_rng_state_all + torch.cuda.set_rng_state = torch.xpu.set_rng_state + torch.cuda.set_rng_state_all = torch.xpu.set_rng_state_all + torch.cuda.manual_seed = torch.xpu.manual_seed + torch.cuda.manual_seed_all = 
torch.xpu.manual_seed_all + torch.cuda.seed = torch.xpu.seed + torch.cuda.seed_all = torch.xpu.seed_all + torch.cuda.initial_seed = torch.xpu.initial_seed + + # AMP: + torch.cuda.amp = torch.xpu.amp + if not hasattr(torch.cuda.amp, "common"): + torch.cuda.amp.common = contextlib.nullcontext() + torch.cuda.amp.common.amp_definitely_not_available = lambda: False + try: + torch.cuda.amp.GradScaler = torch.xpu.amp.GradScaler + except Exception: # pylint: disable=broad-exception-caught + try: + from .gradscaler import ( + gradscaler_init, + ) # pylint: disable=import-outside-toplevel, import-error + + gradscaler_init() + torch.cuda.amp.GradScaler = torch.xpu.amp.GradScaler + except Exception: # pylint: disable=broad-exception-caught + torch.cuda.amp.GradScaler = ipex.cpu.autocast._grad_scaler.GradScaler + + # C + torch._C._cuda_getCurrentRawStream = ipex._C._getCurrentStream + ipex._C._DeviceProperties.major = 2023 + ipex._C._DeviceProperties.minor = 2 + + # Fix functions with ipex: + torch.cuda.mem_get_info = lambda device=None: [ + ( + torch.xpu.get_device_properties(device).total_memory + - torch.xpu.memory_allocated(device) + ), + torch.xpu.get_device_properties(device).total_memory, + ] + torch._utils._get_available_device_type = lambda: "xpu" + torch.has_cuda = True + torch.cuda.has_half = True + torch.cuda.is_bf16_supported = lambda *args, **kwargs: True + torch.cuda.is_fp16_supported = lambda *args, **kwargs: True + torch.version.cuda = "11.7" + torch.cuda.get_device_capability = lambda *args, **kwargs: [11, 7] + torch.cuda.get_device_properties.major = 11 + torch.cuda.get_device_properties.minor = 7 + torch.cuda.ipc_collect = lambda *args, **kwargs: None + torch.cuda.utilization = lambda *args, **kwargs: 0 + if hasattr(torch.xpu, "getDeviceIdListForCard"): + torch.cuda.getDeviceIdListForCard = torch.xpu.getDeviceIdListForCard + torch.cuda.get_device_id_list_per_card = torch.xpu.getDeviceIdListForCard + else: + torch.cuda.getDeviceIdListForCard = torch.xpu.get_device_id_list_per_card + torch.cuda.get_device_id_list_per_card = ( + torch.xpu.get_device_id_list_per_card + ) + + ipex_hijacks() + attention_init() + try: + from .diffusers import ipex_diffusers + + ipex_diffusers() + except Exception: # pylint: disable=broad-exception-caught + pass + except Exception as e: + return False, e + return True, None diff --git a/infer/modules/ipex/attention.py b/infer/modules/ipex/attention.py new file mode 100644 index 0000000000000000000000000000000000000000..d9f2a1352fa72ef5b51bfb4737b3f6bae7363ac2 --- /dev/null +++ b/infer/modules/ipex/attention.py @@ -0,0 +1,218 @@ +import torch +import intel_extension_for_pytorch as ipex # pylint: disable=import-error, unused-import + +# pylint: disable=protected-access, missing-function-docstring, line-too-long + +original_torch_bmm = torch.bmm + + +def torch_bmm(input, mat2, *, out=None): + if input.dtype != mat2.dtype: + mat2 = mat2.to(input.dtype) + + # ARC GPUs can't allocate more than 4GB to a single block, Slice it: + batch_size_attention, input_tokens, mat2_shape = ( + input.shape[0], + input.shape[1], + mat2.shape[2], + ) + block_multiply = input.element_size() + slice_block_size = input_tokens * mat2_shape / 1024 / 1024 * block_multiply + block_size = batch_size_attention * slice_block_size + + split_slice_size = batch_size_attention + if block_size > 4: + do_split = True + # Find something divisible with the input_tokens + while (split_slice_size * slice_block_size) > 4: + split_slice_size = split_slice_size // 2 + if split_slice_size <= 1: + 
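+ # Halving bottomed out: fall back to one batch element per bmm call and
+ # stop shrinking the slice further.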
split_slice_size = 1 + break + else: + do_split = False + + split_2_slice_size = input_tokens + if split_slice_size * slice_block_size > 4: + slice_block_size2 = split_slice_size * mat2_shape / 1024 / 1024 * block_multiply + do_split_2 = True + # Find something divisible with the input_tokens + while (split_2_slice_size * slice_block_size2) > 4: + split_2_slice_size = split_2_slice_size // 2 + if split_2_slice_size <= 1: + split_2_slice_size = 1 + break + else: + do_split_2 = False + + if do_split: + hidden_states = torch.zeros( + input.shape[0], + input.shape[1], + mat2.shape[2], + device=input.device, + dtype=input.dtype, + ) + for i in range(batch_size_attention // split_slice_size): + start_idx = i * split_slice_size + end_idx = (i + 1) * split_slice_size + if do_split_2: + for i2 in range( + input_tokens // split_2_slice_size + ): # pylint: disable=invalid-name + start_idx_2 = i2 * split_2_slice_size + end_idx_2 = (i2 + 1) * split_2_slice_size + hidden_states[start_idx:end_idx, start_idx_2:end_idx_2] = ( + original_torch_bmm( + input[start_idx:end_idx, start_idx_2:end_idx_2], + mat2[start_idx:end_idx, start_idx_2:end_idx_2], + out=out, + ) + ) + else: + hidden_states[start_idx:end_idx] = original_torch_bmm( + input[start_idx:end_idx], mat2[start_idx:end_idx], out=out + ) + else: + return original_torch_bmm(input, mat2, out=out) + return hidden_states + + +original_scaled_dot_product_attention = torch.nn.functional.scaled_dot_product_attention + + +def scaled_dot_product_attention( + query, key, value, attn_mask=None, dropout_p=0.0, is_causal=False +): + # ARC GPUs can't allocate more than 4GB to a single block, Slice it: + if len(query.shape) == 3: + batch_size_attention, query_tokens, shape_four = query.shape + shape_one = 1 + no_shape_one = True + else: + shape_one, batch_size_attention, query_tokens, shape_four = query.shape + no_shape_one = False + + block_multiply = query.element_size() + slice_block_size = ( + shape_one * query_tokens * shape_four / 1024 / 1024 * block_multiply + ) + block_size = batch_size_attention * slice_block_size + + split_slice_size = batch_size_attention + if block_size > 4: + do_split = True + # Find something divisible with the shape_one + while (split_slice_size * slice_block_size) > 4: + split_slice_size = split_slice_size // 2 + if split_slice_size <= 1: + split_slice_size = 1 + break + else: + do_split = False + + split_2_slice_size = query_tokens + if split_slice_size * slice_block_size > 4: + slice_block_size2 = ( + shape_one * split_slice_size * shape_four / 1024 / 1024 * block_multiply + ) + do_split_2 = True + # Find something divisible with the batch_size_attention + while (split_2_slice_size * slice_block_size2) > 4: + split_2_slice_size = split_2_slice_size // 2 + if split_2_slice_size <= 1: + split_2_slice_size = 1 + break + else: + do_split_2 = False + + if do_split: + hidden_states = torch.zeros(query.shape, device=query.device, dtype=query.dtype) + for i in range(batch_size_attention // split_slice_size): + start_idx = i * split_slice_size + end_idx = (i + 1) * split_slice_size + if do_split_2: + for i2 in range( + query_tokens // split_2_slice_size + ): # pylint: disable=invalid-name + start_idx_2 = i2 * split_2_slice_size + end_idx_2 = (i2 + 1) * split_2_slice_size + if no_shape_one: + hidden_states[start_idx:end_idx, start_idx_2:end_idx_2] = ( + original_scaled_dot_product_attention( + query[start_idx:end_idx, start_idx_2:end_idx_2], + key[start_idx:end_idx, start_idx_2:end_idx_2], + value[start_idx:end_idx, start_idx_2:end_idx_2], + 
attn_mask=( + attn_mask[start_idx:end_idx, start_idx_2:end_idx_2] + if attn_mask is not None + else attn_mask + ), + dropout_p=dropout_p, + is_causal=is_causal, + ) + ) + else: + hidden_states[:, start_idx:end_idx, start_idx_2:end_idx_2] = ( + original_scaled_dot_product_attention( + query[:, start_idx:end_idx, start_idx_2:end_idx_2], + key[:, start_idx:end_idx, start_idx_2:end_idx_2], + value[:, start_idx:end_idx, start_idx_2:end_idx_2], + attn_mask=( + attn_mask[ + :, start_idx:end_idx, start_idx_2:end_idx_2 + ] + if attn_mask is not None + else attn_mask + ), + dropout_p=dropout_p, + is_causal=is_causal, + ) + ) + else: + if no_shape_one: + hidden_states[start_idx:end_idx] = ( + original_scaled_dot_product_attention( + query[start_idx:end_idx], + key[start_idx:end_idx], + value[start_idx:end_idx], + attn_mask=( + attn_mask[start_idx:end_idx] + if attn_mask is not None + else attn_mask + ), + dropout_p=dropout_p, + is_causal=is_causal, + ) + ) + else: + hidden_states[:, start_idx:end_idx] = ( + original_scaled_dot_product_attention( + query[:, start_idx:end_idx], + key[:, start_idx:end_idx], + value[:, start_idx:end_idx], + attn_mask=( + attn_mask[:, start_idx:end_idx] + if attn_mask is not None + else attn_mask + ), + dropout_p=dropout_p, + is_causal=is_causal, + ) + ) + else: + return original_scaled_dot_product_attention( + query, + key, + value, + attn_mask=attn_mask, + dropout_p=dropout_p, + is_causal=is_causal, + ) + return hidden_states + + +def attention_init(): + # ARC GPUs can't allocate more than 4GB to a single block: + torch.bmm = torch_bmm + torch.nn.functional.scaled_dot_product_attention = scaled_dot_product_attention diff --git a/infer/modules/ipex/gradscaler.py b/infer/modules/ipex/gradscaler.py new file mode 100644 index 0000000000000000000000000000000000000000..9b505007ff88e333ced3252d2e2476d4dd5937e5 --- /dev/null +++ b/infer/modules/ipex/gradscaler.py @@ -0,0 +1,187 @@ +from collections import defaultdict +import torch +import intel_extension_for_pytorch as ipex # pylint: disable=import-error, unused-import +import intel_extension_for_pytorch._C as core # pylint: disable=import-error, unused-import + +# pylint: disable=protected-access, missing-function-docstring, line-too-long + +OptState = ipex.cpu.autocast._grad_scaler.OptState +_MultiDeviceReplicator = ipex.cpu.autocast._grad_scaler._MultiDeviceReplicator +_refresh_per_optimizer_state = ( + ipex.cpu.autocast._grad_scaler._refresh_per_optimizer_state +) + + +def _unscale_grads_( + self, optimizer, inv_scale, found_inf, allow_fp16 +): # pylint: disable=unused-argument + per_device_inv_scale = _MultiDeviceReplicator(inv_scale) + per_device_found_inf = _MultiDeviceReplicator(found_inf) + + # To set up _amp_foreach_non_finite_check_and_unscale_, split grads by device and dtype. + # There could be hundreds of grads, so we'd like to iterate through them just once. + # However, we don't know their devices or dtypes in advance. + + # https://stackoverflow.com/questions/5029934/defaultdict-of-defaultdict + # Google says mypy struggles with defaultdicts type annotations. 
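+ # Grads are bucketed by device and dtype so one fused
+ # _amp_foreach_non_finite_check_and_unscale_ call can unscale each bucket
+ # and flag inf/NaN values; in this port every grad is first moved to the
+ # CPU, which is where the fused op is invoked below.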
+ per_device_and_dtype_grads = defaultdict(lambda: defaultdict(list)) # type: ignore[var-annotated] + # sync grad to master weight + if hasattr(optimizer, "sync_grad"): + optimizer.sync_grad() + with torch.no_grad(): + for group in optimizer.param_groups: + for param in group["params"]: + if param.grad is None: + continue + if (not allow_fp16) and param.grad.dtype == torch.float16: + raise ValueError("Attempting to unscale FP16 gradients.") + if param.grad.is_sparse: + # is_coalesced() == False means the sparse grad has values with duplicate indices. + # coalesce() deduplicates indices and adds all values that have the same index. + # For scaled fp16 values, there's a good chance coalescing will cause overflow, + # so we should check the coalesced _values(). + if param.grad.dtype is torch.float16: + param.grad = param.grad.coalesce() + to_unscale = param.grad._values() + else: + to_unscale = param.grad + + # -: is there a way to split by device and dtype without appending in the inner loop? + to_unscale = to_unscale.to("cpu") + per_device_and_dtype_grads[to_unscale.device][to_unscale.dtype].append( + to_unscale + ) + + for _, per_dtype_grads in per_device_and_dtype_grads.items(): + for grads in per_dtype_grads.values(): + core._amp_foreach_non_finite_check_and_unscale_( + grads, + per_device_found_inf.get("cpu"), + per_device_inv_scale.get("cpu"), + ) + + return per_device_found_inf._per_device_tensors + + +def unscale_(self, optimizer): + """ + Divides ("unscales") the optimizer's gradient tensors by the scale factor. + :meth:`unscale_` is optional, serving cases where you need to + :ref:`modify or inspect gradients` + between the backward pass(es) and :meth:`step`. + If :meth:`unscale_` is not called explicitly, gradients will be unscaled automatically during :meth:`step`. + Simple example, using :meth:`unscale_` to enable clipping of unscaled gradients:: + ... + scaler.scale(loss).backward() + scaler.unscale_(optimizer) + torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm) + scaler.step(optimizer) + scaler.update() + Args: + optimizer (torch.optim.Optimizer): Optimizer that owns the gradients to be unscaled. + .. warning:: + :meth:`unscale_` should only be called once per optimizer per :meth:`step` call, + and only after all gradients for that optimizer's assigned parameters have been accumulated. + Calling :meth:`unscale_` twice for a given optimizer between each :meth:`step` triggers a RuntimeError. + .. warning:: + :meth:`unscale_` may unscale sparse gradients out of place, replacing the ``.grad`` attribute. + """ + if not self._enabled: + return + + self._check_scale_growth_tracker("unscale_") + + optimizer_state = self._per_optimizer_states[id(optimizer)] + + if optimizer_state["stage"] is OptState.UNSCALED: # pylint: disable=no-else-raise + raise RuntimeError( + "unscale_() has already been called on this optimizer since the last update()." + ) + elif optimizer_state["stage"] is OptState.STEPPED: + raise RuntimeError("unscale_() is being called after step().") + + # FP32 division can be imprecise for certain compile options, so we carry out the reciprocal in FP64. 
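+ # inv_scale is 1 / scale, computed as a float64 reciprocal on the CPU and
+ # cast back to float32 on the scale's device; found_inf starts at zero and
+ # is set non-zero by the fused unscale op if any gradient overflowed.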
+ assert self._scale is not None + inv_scale = ( + self._scale.to("cpu").double().reciprocal().float().to(self._scale.device) + ) + found_inf = torch.full((1,), 0.0, dtype=torch.float32, device=self._scale.device) + + optimizer_state["found_inf_per_device"] = self._unscale_grads_( + optimizer, inv_scale, found_inf, False + ) + optimizer_state["stage"] = OptState.UNSCALED + + +def update(self, new_scale=None): + """ + Updates the scale factor. + If any optimizer steps were skipped the scale is multiplied by ``backoff_factor`` + to reduce it. If ``growth_interval`` unskipped iterations occurred consecutively, + the scale is multiplied by ``growth_factor`` to increase it. + Passing ``new_scale`` sets the new scale value manually. (``new_scale`` is not + used directly, it's used to fill GradScaler's internal scale tensor. So if + ``new_scale`` was a tensor, later in-place changes to that tensor will not further + affect the scale GradScaler uses internally.) + Args: + new_scale (float or :class:`torch.FloatTensor`, optional, default=None): New scale factor. + .. warning:: + :meth:`update` should only be called at the end of the iteration, after ``scaler.step(optimizer)`` has + been invoked for all optimizers used this iteration. + """ + if not self._enabled: + return + + _scale, _growth_tracker = self._check_scale_growth_tracker("update") + + if new_scale is not None: + # Accept a new user-defined scale. + if isinstance(new_scale, float): + self._scale.fill_(new_scale) # type: ignore[union-attr] + else: + reason = "new_scale should be a float or a 1-element torch.FloatTensor with requires_grad=False." + assert isinstance(new_scale, torch.FloatTensor), reason # type: ignore[attr-defined] + assert new_scale.numel() == 1, reason + assert new_scale.requires_grad is False, reason + self._scale.copy_(new_scale) # type: ignore[union-attr] + else: + # Consume shared inf/nan data collected from optimizers to update the scale. + # If all found_inf tensors are on the same device as self._scale, this operation is asynchronous. + found_infs = [ + found_inf.to(device="cpu", non_blocking=True) + for state in self._per_optimizer_states.values() + for found_inf in state["found_inf_per_device"].values() + ] + + assert len(found_infs) > 0, "No inf checks were recorded prior to update." + + found_inf_combined = found_infs[0] + if len(found_infs) > 1: + for i in range(1, len(found_infs)): + found_inf_combined += found_infs[i] + + to_device = _scale.device + _scale = _scale.to("cpu") + _growth_tracker = _growth_tracker.to("cpu") + + core._amp_update_scale_( + _scale, + _growth_tracker, + found_inf_combined, + self._growth_factor, + self._backoff_factor, + self._growth_interval, + ) + + _scale = _scale.to(to_device) + _growth_tracker = _growth_tracker.to(to_device) + # To prepare for next iteration, clear the data collected from optimizers this iteration. 
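+ # Typical use of the patched scaler (a sketch only; `model`, `opt`, `x`,
+ # `y` and `loss_fn` are placeholders, not names from this repo):
+ #     scaler = torch.xpu.amp.GradScaler()
+ #     with torch.xpu.amp.autocast():
+ #         loss = loss_fn(model(x), y)
+ #     scaler.scale(loss).backward()
+ #     scaler.unscale_(opt)   # optional, e.g. before gradient clipping
+ #     scaler.step(opt)
+ #     scaler.update()        # ends the iteration; per-optimizer states reset below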
+ self._per_optimizer_states = defaultdict(_refresh_per_optimizer_state) + + +def gradscaler_init(): + torch.xpu.amp.GradScaler = ipex.cpu.autocast._grad_scaler.GradScaler + torch.xpu.amp.GradScaler._unscale_grads_ = _unscale_grads_ + torch.xpu.amp.GradScaler.unscale_ = unscale_ + torch.xpu.amp.GradScaler.update = update + return torch.xpu.amp.GradScaler diff --git a/infer/modules/ipex/hijacks.py b/infer/modules/ipex/hijacks.py new file mode 100644 index 0000000000000000000000000000000000000000..6d5b36855b3e1da01482029fd0b77d6dc105d95a --- /dev/null +++ b/infer/modules/ipex/hijacks.py @@ -0,0 +1,365 @@ +import contextlib +import importlib +import torch +import intel_extension_for_pytorch as ipex # pylint: disable=import-error, unused-import + +# pylint: disable=protected-access, missing-function-docstring, line-too-long, unnecessary-lambda, no-else-return + + +class CondFunc: # pylint: disable=missing-class-docstring + def __new__(cls, orig_func, sub_func, cond_func): + self = super(CondFunc, cls).__new__(cls) + if isinstance(orig_func, str): + func_path = orig_func.split(".") + for i in range(len(func_path) - 1, -1, -1): + try: + resolved_obj = importlib.import_module(".".join(func_path[:i])) + break + except ImportError: + pass + for attr_name in func_path[i:-1]: + resolved_obj = getattr(resolved_obj, attr_name) + orig_func = getattr(resolved_obj, func_path[-1]) + setattr( + resolved_obj, + func_path[-1], + lambda *args, **kwargs: self(*args, **kwargs), + ) + self.__init__(orig_func, sub_func, cond_func) + return lambda *args, **kwargs: self(*args, **kwargs) + + def __init__(self, orig_func, sub_func, cond_func): + self.__orig_func = orig_func + self.__sub_func = sub_func + self.__cond_func = cond_func + + def __call__(self, *args, **kwargs): + if not self.__cond_func or self.__cond_func(self.__orig_func, *args, **kwargs): + return self.__sub_func(self.__orig_func, *args, **kwargs) + else: + return self.__orig_func(*args, **kwargs) + + +_utils = torch.utils.data._utils + + +def _shutdown_workers(self): + if ( + torch.utils.data._utils is None + or torch.utils.data._utils.python_exit_status is True + or torch.utils.data._utils.python_exit_status is None + ): + return + if hasattr(self, "_shutdown") and not self._shutdown: + self._shutdown = True + try: + if hasattr(self, "_pin_memory_thread"): + self._pin_memory_thread_done_event.set() + self._worker_result_queue.put((None, None)) + self._pin_memory_thread.join() + self._worker_result_queue.cancel_join_thread() + self._worker_result_queue.close() + self._workers_done_event.set() + for worker_id in range(len(self._workers)): + if self._persistent_workers or self._workers_status[worker_id]: + self._mark_worker_as_unavailable(worker_id, shutdown=True) + for w in self._workers: # pylint: disable=invalid-name + w.join(timeout=torch.utils.data._utils.MP_STATUS_CHECK_INTERVAL) + for q in self._index_queues: # pylint: disable=invalid-name + q.cancel_join_thread() + q.close() + finally: + if self._worker_pids_set: + torch.utils.data._utils.signal_handling._remove_worker_pids(id(self)) + self._worker_pids_set = False + for w in self._workers: # pylint: disable=invalid-name + if w.is_alive(): + w.terminate() + + +class DummyDataParallel( + torch.nn.Module +): # pylint: disable=missing-class-docstring, unused-argument, too-few-public-methods + def __new__( + cls, module, device_ids=None, output_device=None, dim=0 + ): # pylint: disable=unused-argument + if isinstance(device_ids, list) and len(device_ids) > 1: + print("IPEX backend doesn't support 
DataParallel on multiple XPU devices") + return module.to("xpu") + + +def return_null_context(*args, **kwargs): # pylint: disable=unused-argument + return contextlib.nullcontext() + + +def check_device(device): + return bool( + (isinstance(device, torch.device) and device.type == "cuda") + or (isinstance(device, str) and "cuda" in device) + or isinstance(device, int) + ) + + +def return_xpu(device): + return ( + f"xpu:{device[-1]}" + if isinstance(device, str) and ":" in device + else ( + f"xpu:{device}" + if isinstance(device, int) + else torch.device("xpu") if isinstance(device, torch.device) else "xpu" + ) + ) + + +def ipex_no_cuda(orig_func, *args, **kwargs): + torch.cuda.is_available = lambda: False + orig_func(*args, **kwargs) + torch.cuda.is_available = torch.xpu.is_available + + +original_autocast = torch.autocast + + +def ipex_autocast(*args, **kwargs): + if len(args) > 0 and args[0] == "cuda": + return original_autocast("xpu", *args[1:], **kwargs) + else: + return original_autocast(*args, **kwargs) + + +original_torch_cat = torch.cat + + +def torch_cat(tensor, *args, **kwargs): + if len(tensor) == 3 and ( + tensor[0].dtype != tensor[1].dtype or tensor[2].dtype != tensor[1].dtype + ): + return original_torch_cat( + [tensor[0].to(tensor[1].dtype), tensor[1], tensor[2].to(tensor[1].dtype)], + *args, + **kwargs, + ) + else: + return original_torch_cat(tensor, *args, **kwargs) + + +original_interpolate = torch.nn.functional.interpolate + + +def interpolate( + tensor, + size=None, + scale_factor=None, + mode="nearest", + align_corners=None, + recompute_scale_factor=None, + antialias=False, +): # pylint: disable=too-many-arguments + if antialias or align_corners is not None: + return_device = tensor.device + return_dtype = tensor.dtype + return original_interpolate( + tensor.to("cpu", dtype=torch.float32), + size=size, + scale_factor=scale_factor, + mode=mode, + align_corners=align_corners, + recompute_scale_factor=recompute_scale_factor, + antialias=antialias, + ).to(return_device, dtype=return_dtype) + else: + return original_interpolate( + tensor, + size=size, + scale_factor=scale_factor, + mode=mode, + align_corners=align_corners, + recompute_scale_factor=recompute_scale_factor, + antialias=antialias, + ) + + +original_linalg_solve = torch.linalg.solve + + +def linalg_solve(A, B, *args, **kwargs): # pylint: disable=invalid-name + if A.device != torch.device("cpu") or B.device != torch.device("cpu"): + return_device = A.device + return original_linalg_solve(A.to("cpu"), B.to("cpu"), *args, **kwargs).to( + return_device + ) + else: + return original_linalg_solve(A, B, *args, **kwargs) + + +def ipex_hijacks(): + CondFunc( + "torch.Tensor.to", + lambda orig_func, self, device=None, *args, **kwargs: orig_func( + self, return_xpu(device), *args, **kwargs + ), + lambda orig_func, self, device=None, *args, **kwargs: check_device(device), + ) + CondFunc( + "torch.Tensor.cuda", + lambda orig_func, self, device=None, *args, **kwargs: orig_func( + self, return_xpu(device), *args, **kwargs + ), + lambda orig_func, self, device=None, *args, **kwargs: check_device(device), + ) + CondFunc( + "torch.empty", + lambda orig_func, *args, device=None, **kwargs: orig_func( + *args, device=return_xpu(device), **kwargs + ), + lambda orig_func, *args, device=None, **kwargs: check_device(device), + ) + CondFunc( + "torch.load", + lambda orig_func, *args, map_location=None, **kwargs: orig_func( + *args, return_xpu(map_location), **kwargs + ), + lambda orig_func, *args, map_location=None, **kwargs: map_location 
is None + or check_device(map_location), + ) + CondFunc( + "torch.randn", + lambda orig_func, *args, device=None, **kwargs: orig_func( + *args, device=return_xpu(device), **kwargs + ), + lambda orig_func, *args, device=None, **kwargs: check_device(device), + ) + CondFunc( + "torch.ones", + lambda orig_func, *args, device=None, **kwargs: orig_func( + *args, device=return_xpu(device), **kwargs + ), + lambda orig_func, *args, device=None, **kwargs: check_device(device), + ) + CondFunc( + "torch.zeros", + lambda orig_func, *args, device=None, **kwargs: orig_func( + *args, device=return_xpu(device), **kwargs + ), + lambda orig_func, *args, device=None, **kwargs: check_device(device), + ) + CondFunc( + "torch.tensor", + lambda orig_func, *args, device=None, **kwargs: orig_func( + *args, device=return_xpu(device), **kwargs + ), + lambda orig_func, *args, device=None, **kwargs: check_device(device), + ) + CondFunc( + "torch.linspace", + lambda orig_func, *args, device=None, **kwargs: orig_func( + *args, device=return_xpu(device), **kwargs + ), + lambda orig_func, *args, device=None, **kwargs: check_device(device), + ) + + CondFunc( + "torch.Generator", + lambda orig_func, device=None: torch.xpu.Generator(device), + lambda orig_func, device=None: device is not None + and device != torch.device("cpu") + and device != "cpu", + ) + + CondFunc( + "torch.batch_norm", + lambda orig_func, input, weight, bias, *args, **kwargs: orig_func( + input, + ( + weight + if weight is not None + else torch.ones(input.size()[1], device=input.device) + ), + ( + bias + if bias is not None + else torch.zeros(input.size()[1], device=input.device) + ), + *args, + **kwargs, + ), + lambda orig_func, input, *args, **kwargs: input.device != torch.device("cpu"), + ) + CondFunc( + "torch.instance_norm", + lambda orig_func, input, weight, bias, *args, **kwargs: orig_func( + input, + ( + weight + if weight is not None + else torch.ones(input.size()[1], device=input.device) + ), + ( + bias + if bias is not None + else torch.zeros(input.size()[1], device=input.device) + ), + *args, + **kwargs, + ), + lambda orig_func, input, *args, **kwargs: input.device != torch.device("cpu"), + ) + + # Functions with dtype errors: + CondFunc( + "torch.nn.modules.GroupNorm.forward", + lambda orig_func, self, input: orig_func( + self, input.to(self.weight.data.dtype) + ), + lambda orig_func, self, input: input.dtype != self.weight.data.dtype, + ) + CondFunc( + "torch.nn.modules.linear.Linear.forward", + lambda orig_func, self, input: orig_func( + self, input.to(self.weight.data.dtype) + ), + lambda orig_func, self, input: input.dtype != self.weight.data.dtype, + ) + CondFunc( + "torch.nn.modules.conv.Conv2d.forward", + lambda orig_func, self, input: orig_func( + self, input.to(self.weight.data.dtype) + ), + lambda orig_func, self, input: input.dtype != self.weight.data.dtype, + ) + CondFunc( + "torch.nn.functional.layer_norm", + lambda orig_func, input, normalized_shape=None, weight=None, *args, **kwargs: orig_func( + input.to(weight.data.dtype), normalized_shape, weight, *args, **kwargs + ), + lambda orig_func, input, normalized_shape=None, weight=None, *args, **kwargs: weight + is not None + and input.dtype != weight.data.dtype, + ) + + # Diffusers Float64 (ARC GPUs doesn't support double or Float64): + if not torch.xpu.has_fp64_dtype(): + CondFunc( + "torch.from_numpy", + lambda orig_func, ndarray: orig_func(ndarray.astype("float32")), + lambda orig_func, ndarray: ndarray.dtype == float, + ) + + # Broken functions when torch.cuda.is_available is 
True: + CondFunc( + "torch.utils.data.dataloader._BaseDataLoaderIter.__init__", + lambda orig_func, *args, **kwargs: ipex_no_cuda(orig_func, *args, **kwargs), + lambda orig_func, *args, **kwargs: True, + ) + + # Functions that make compile mad with CondFunc: + torch.utils.data.dataloader._MultiProcessingDataLoaderIter._shutdown_workers = ( + _shutdown_workers + ) + torch.nn.DataParallel = DummyDataParallel + torch.autocast = ipex_autocast + torch.cat = torch_cat + torch.linalg.solve = linalg_solve + torch.nn.functional.interpolate = interpolate + torch.backends.cuda.sdp_kernel = return_null_context diff --git a/infer/modules/onnx/export.py b/infer/modules/onnx/export.py new file mode 100644 index 0000000000000000000000000000000000000000..bfc84578c6b10c8891c64856bce36c8b94e331c4 --- /dev/null +++ b/infer/modules/onnx/export.py @@ -0,0 +1,54 @@ +import torch +import onnxsim +import onnx +from infer.lib.infer_pack.models_onnx import SynthesizerTrnMsNSFsidM + +def export_onnx(ModelPath, ExportedPath): + cpt = torch.load(ModelPath, map_location="cpu", weights_only=False) + cpt["config"][-3] = cpt["weight"]["emb_g.weight"].shape[0] + vec_channels = 256 if cpt.get("version", "v1") == "v1" else 768 + + test_phone = torch.rand(1, 200, vec_channels) # hidden unit + test_phone_lengths = torch.tensor([200]).long() # hidden unit 长度(貌似没啥用) + test_pitch = torch.randint(size=(1, 200), low=5, high=255) # 基频(单位赫兹) + test_pitchf = torch.rand(1, 200) # nsf基频 + test_ds = torch.LongTensor([0]) # 说话人ID + test_rnd = torch.rand(1, 192, 200) # 噪声(加入随机因子) + + device = "cpu" # 导出时设备(不影响使用模型) + + net_g = SynthesizerTrnMsNSFsidM( + *cpt["config"], is_half=False, version=cpt.get("version", "v1") + ) # fp32导出(C++要支持fp16必须手动将内存重新排列所以暂时不用fp16) + net_g.load_state_dict(cpt["weight"], strict=False) + input_names = ["phone", "phone_lengths", "pitch", "pitchf", "ds", "rnd"] + output_names = [ + "audio", + ] + # net_g.construct_spkmixmap(n_speaker) 多角色混合轨道导出 + torch.onnx.export( + net_g, + ( + test_phone.to(device), + test_phone_lengths.to(device), + test_pitch.to(device), + test_pitchf.to(device), + test_ds.to(device), + test_rnd.to(device), + ), + ExportedPath, + dynamic_axes={ + "phone": [1], + "pitch": [1], + "pitchf": [1], + "rnd": [2], + }, + do_constant_folding=False, + opset_version=18, + verbose=False, + input_names=input_names, + output_names=output_names, + ) + model, _ = onnxsim.simplify(ExportedPath) + onnx.save(model, ExportedPath) + return "Finished" diff --git a/infer/modules/train/extract/extract_f0_print.py b/infer/modules/train/extract/extract_f0_print.py new file mode 100644 index 0000000000000000000000000000000000000000..9d231e4e86db204704ce894a2b12ebad38665064 --- /dev/null +++ b/infer/modules/train/extract/extract_f0_print.py @@ -0,0 +1,175 @@ +import os +import sys +import traceback + +import parselmouth + +now_dir = os.getcwd() +sys.path.append(now_dir) +import logging + +import numpy as np +import pyworld + +from infer.lib.audio import load_audio + +logging.getLogger("numba").setLevel(logging.WARNING) +from multiprocessing import Process + +exp_dir = sys.argv[1] +f = open("%s/extract_f0_feature.log" % exp_dir, "a+") + + +def printt(strr): + print(strr) + f.write("%s\n" % strr) + f.flush() + + +n_p = int(sys.argv[2]) +f0method = sys.argv[3] + + +class FeatureInput(object): + def __init__(self, samplerate=16000, hop_size=160): + self.fs = samplerate + self.hop = hop_size + + self.f0_bin = 256 + self.f0_max = 1100.0 + self.f0_min = 50.0 + self.f0_mel_min = 1127 * np.log(1 + self.f0_min / 700) + 
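+ # coarse_f0() below maps f0 onto a mel-like scale, mel = 1127*ln(1 + f0/700),
+ # stretched linearly over bins 1..255 (bin 1 also absorbs unvoiced frames,
+ # where f0 == 0). Rough worked example: f0 = 440 Hz gives mel ~= 549.6,
+ # which lands near bin 122 given the 50-1100 Hz range set above.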
self.f0_mel_max = 1127 * np.log(1 + self.f0_max / 700) + + def compute_f0(self, path, f0_method): + x = load_audio(path, self.fs) + p_len = x.shape[0] // self.hop + if f0_method == "pm": + time_step = 160 / 16000 * 1000 + f0_min = 50 + f0_max = 1100 + f0 = ( + parselmouth.Sound(x, self.fs) + .to_pitch_ac( + time_step=time_step / 1000, + voicing_threshold=0.6, + pitch_floor=f0_min, + pitch_ceiling=f0_max, + ) + .selected_array["frequency"] + ) + pad_size = (p_len - len(f0) + 1) // 2 + if pad_size > 0 or p_len - len(f0) - pad_size > 0: + f0 = np.pad( + f0, [[pad_size, p_len - len(f0) - pad_size]], mode="constant" + ) + elif f0_method == "harvest": + f0, t = pyworld.harvest( + x.astype(np.double), + fs=self.fs, + f0_ceil=self.f0_max, + f0_floor=self.f0_min, + frame_period=1000 * self.hop / self.fs, + ) + f0 = pyworld.stonemask(x.astype(np.double), f0, t, self.fs) + elif f0_method == "dio": + f0, t = pyworld.dio( + x.astype(np.double), + fs=self.fs, + f0_ceil=self.f0_max, + f0_floor=self.f0_min, + frame_period=1000 * self.hop / self.fs, + ) + f0 = pyworld.stonemask(x.astype(np.double), f0, t, self.fs) + elif f0_method == "rmvpe": + if hasattr(self, "model_rmvpe") == False: + from infer.lib.rmvpe import RMVPE + + print("Loading rmvpe model") + self.model_rmvpe = RMVPE( + "assets/rmvpe/rmvpe.pt", is_half=False, device="cpu" + ) + f0 = self.model_rmvpe.infer_from_audio(x, thred=0.03) + return f0 + + def coarse_f0(self, f0): + f0_mel = 1127 * np.log(1 + f0 / 700) + f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - self.f0_mel_min) * ( + self.f0_bin - 2 + ) / (self.f0_mel_max - self.f0_mel_min) + 1 + + # use 0 or 1 + f0_mel[f0_mel <= 1] = 1 + f0_mel[f0_mel > self.f0_bin - 1] = self.f0_bin - 1 + f0_coarse = np.rint(f0_mel).astype(int) + assert f0_coarse.max() <= 255 and f0_coarse.min() >= 1, ( + f0_coarse.max(), + f0_coarse.min(), + ) + return f0_coarse + + def go(self, paths, f0_method): + if len(paths) == 0: + printt("no-f0-todo") + else: + printt("todo-f0-%s" % len(paths)) + n = max(len(paths) // 5, 1) # 每个进程最多打印5条 + for idx, (inp_path, opt_path1, opt_path2) in enumerate(paths): + try: + if idx % n == 0: + printt("f0ing,now-%s,all-%s,-%s" % (idx, len(paths), inp_path)) + if ( + os.path.exists(opt_path1 + ".npy") == True + and os.path.exists(opt_path2 + ".npy") == True + ): + continue + featur_pit = self.compute_f0(inp_path, f0_method) + np.save( + opt_path2, + featur_pit, + allow_pickle=False, + ) # nsf + coarse_pit = self.coarse_f0(featur_pit) + np.save( + opt_path1, + coarse_pit, + allow_pickle=False, + ) # ori + except: + printt("f0fail-%s-%s-%s" % (idx, inp_path, traceback.format_exc())) + + +if __name__ == "__main__": + # exp_dir=r"E:\codes\py39\dataset\mi-test" + # n_p=16 + # f = open("%s/log_extract_f0.log"%exp_dir, "w") + printt(" ".join(sys.argv)) + featureInput = FeatureInput() + paths = [] + inp_root = "%s/1_16k_wavs" % (exp_dir) + opt_root1 = "%s/2a_f0" % (exp_dir) + opt_root2 = "%s/2b-f0nsf" % (exp_dir) + + os.makedirs(opt_root1, exist_ok=True) + os.makedirs(opt_root2, exist_ok=True) + for name in sorted(list(os.listdir(inp_root))): + inp_path = "%s/%s" % (inp_root, name) + if "spec" in inp_path: + continue + opt_path1 = "%s/%s" % (opt_root1, name) + opt_path2 = "%s/%s" % (opt_root2, name) + paths.append([inp_path, opt_path1, opt_path2]) + + ps = [] + for i in range(n_p): + p = Process( + target=featureInput.go, + args=( + paths[i::n_p], + f0method, + ), + ) + ps.append(p) + p.start() + for i in range(n_p): + ps[i].join() diff --git a/infer/modules/train/extract/extract_f0_rmvpe.py 
b/infer/modules/train/extract/extract_f0_rmvpe.py new file mode 100644 index 0000000000000000000000000000000000000000..90b20738c2efe719c3c053be25755b55da65e666 --- /dev/null +++ b/infer/modules/train/extract/extract_f0_rmvpe.py @@ -0,0 +1,141 @@ +import os +import sys +import traceback + +import parselmouth + +now_dir = os.getcwd() +sys.path.append(now_dir) +import logging + +import numpy as np +import pyworld + +from infer.lib.audio import load_audio + +logging.getLogger("numba").setLevel(logging.WARNING) + +n_part = int(sys.argv[1]) +i_part = int(sys.argv[2]) +i_gpu = sys.argv[3] +os.environ["CUDA_VISIBLE_DEVICES"] = str(i_gpu) +exp_dir = sys.argv[4] +is_half = sys.argv[5] +f = open("%s/extract_f0_feature.log" % exp_dir, "a+") + + +def printt(strr): + print(strr) + f.write("%s\n" % strr) + f.flush() + + +class FeatureInput(object): + def __init__(self, samplerate=16000, hop_size=160): + self.fs = samplerate + self.hop = hop_size + + self.f0_bin = 256 + self.f0_max = 1100.0 + self.f0_min = 50.0 + self.f0_mel_min = 1127 * np.log(1 + self.f0_min / 700) + self.f0_mel_max = 1127 * np.log(1 + self.f0_max / 700) + + def compute_f0(self, path, f0_method): + x = load_audio(path, self.fs) + # p_len = x.shape[0] // self.hop + if f0_method == "rmvpe": + if hasattr(self, "model_rmvpe") == False: + from infer.lib.rmvpe import RMVPE + + print("Loading rmvpe model") + self.model_rmvpe = RMVPE( + "assets/rmvpe/rmvpe.pt", is_half=is_half, device="cuda" + ) + f0 = self.model_rmvpe.infer_from_audio(x, thred=0.03) + return f0 + + def coarse_f0(self, f0): + f0_mel = 1127 * np.log(1 + f0 / 700) + f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - self.f0_mel_min) * ( + self.f0_bin - 2 + ) / (self.f0_mel_max - self.f0_mel_min) + 1 + + # use 0 or 1 + f0_mel[f0_mel <= 1] = 1 + f0_mel[f0_mel > self.f0_bin - 1] = self.f0_bin - 1 + f0_coarse = np.rint(f0_mel).astype(int) + assert f0_coarse.max() <= 255 and f0_coarse.min() >= 1, ( + f0_coarse.max(), + f0_coarse.min(), + ) + return f0_coarse + + def go(self, paths, f0_method): + if len(paths) == 0: + printt("no-f0-todo") + else: + printt("todo-f0-%s" % len(paths)) + n = max(len(paths) // 5, 1) # 每个进程最多打印5条 + for idx, (inp_path, opt_path1, opt_path2) in enumerate(paths): + try: + if idx % n == 0: + printt("f0ing,now-%s,all-%s,-%s" % (idx, len(paths), inp_path)) + if ( + os.path.exists(opt_path1 + ".npy") == True + and os.path.exists(opt_path2 + ".npy") == True + ): + continue + featur_pit = self.compute_f0(inp_path, f0_method) + np.save( + opt_path2, + featur_pit, + allow_pickle=False, + ) # nsf + coarse_pit = self.coarse_f0(featur_pit) + np.save( + opt_path1, + coarse_pit, + allow_pickle=False, + ) # ori + except: + printt("f0fail-%s-%s-%s" % (idx, inp_path, traceback.format_exc())) + + +if __name__ == "__main__": + # exp_dir=r"E:\codes\py39\dataset\mi-test" + # n_p=16 + # f = open("%s/log_extract_f0.log"%exp_dir, "w") + printt(" ".join(sys.argv)) + featureInput = FeatureInput() + paths = [] + inp_root = "%s/1_16k_wavs" % (exp_dir) + opt_root1 = "%s/2a_f0" % (exp_dir) + opt_root2 = "%s/2b-f0nsf" % (exp_dir) + + os.makedirs(opt_root1, exist_ok=True) + os.makedirs(opt_root2, exist_ok=True) + for name in sorted(list(os.listdir(inp_root))): + inp_path = "%s/%s" % (inp_root, name) + if "spec" in inp_path: + continue + opt_path1 = "%s/%s" % (opt_root1, name) + opt_path2 = "%s/%s" % (opt_root2, name) + paths.append([inp_path, opt_path1, opt_path2]) + try: + featureInput.go(paths[i_part::n_part], "rmvpe") + except: + printt("f0_all_fail-%s" % (traceback.format_exc())) + # ps = [] + # 
for i in range(n_p): + # p = Process( + # target=featureInput.go, + # args=( + # paths[i::n_p], + # f0method, + # ), + # ) + # ps.append(p) + # p.start() + # for i in range(n_p): + # ps[i].join() diff --git a/infer/modules/train/extract/extract_f0_rmvpe_dml.py b/infer/modules/train/extract/extract_f0_rmvpe_dml.py new file mode 100644 index 0000000000000000000000000000000000000000..243e825005bd46dfd464f6d49ecf78f0abf03dc2 --- /dev/null +++ b/infer/modules/train/extract/extract_f0_rmvpe_dml.py @@ -0,0 +1,139 @@ +import os +import sys +import traceback + +import parselmouth + +now_dir = os.getcwd() +sys.path.append(now_dir) +import logging + +import numpy as np +import pyworld + +from infer.lib.audio import load_audio + +logging.getLogger("numba").setLevel(logging.WARNING) + +exp_dir = sys.argv[1] +import torch_directml + +device = torch_directml.device(torch_directml.default_device()) +f = open("%s/extract_f0_feature.log" % exp_dir, "a+") + + +def printt(strr): + print(strr) + f.write("%s\n" % strr) + f.flush() + + +class FeatureInput(object): + def __init__(self, samplerate=16000, hop_size=160): + self.fs = samplerate + self.hop = hop_size + + self.f0_bin = 256 + self.f0_max = 1100.0 + self.f0_min = 50.0 + self.f0_mel_min = 1127 * np.log(1 + self.f0_min / 700) + self.f0_mel_max = 1127 * np.log(1 + self.f0_max / 700) + + def compute_f0(self, path, f0_method): + x = load_audio(path, self.fs) + # p_len = x.shape[0] // self.hop + if f0_method == "rmvpe": + if hasattr(self, "model_rmvpe") == False: + from infer.lib.rmvpe import RMVPE + + print("Loading rmvpe model") + self.model_rmvpe = RMVPE( + "assets/rmvpe/rmvpe.pt", is_half=False, device=device + ) + f0 = self.model_rmvpe.infer_from_audio(x, thred=0.03) + return f0 + + def coarse_f0(self, f0): + f0_mel = 1127 * np.log(1 + f0 / 700) + f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - self.f0_mel_min) * ( + self.f0_bin - 2 + ) / (self.f0_mel_max - self.f0_mel_min) + 1 + + # use 0 or 1 + f0_mel[f0_mel <= 1] = 1 + f0_mel[f0_mel > self.f0_bin - 1] = self.f0_bin - 1 + f0_coarse = np.rint(f0_mel).astype(int) + assert f0_coarse.max() <= 255 and f0_coarse.min() >= 1, ( + f0_coarse.max(), + f0_coarse.min(), + ) + return f0_coarse + + def go(self, paths, f0_method): + if len(paths) == 0: + printt("no-f0-todo") + else: + printt("todo-f0-%s" % len(paths)) + n = max(len(paths) // 5, 1) # 每个进程最多打印5条 + for idx, (inp_path, opt_path1, opt_path2) in enumerate(paths): + try: + if idx % n == 0: + printt("f0ing,now-%s,all-%s,-%s" % (idx, len(paths), inp_path)) + if ( + os.path.exists(opt_path1 + ".npy") == True + and os.path.exists(opt_path2 + ".npy") == True + ): + continue + featur_pit = self.compute_f0(inp_path, f0_method) + np.save( + opt_path2, + featur_pit, + allow_pickle=False, + ) # nsf + coarse_pit = self.coarse_f0(featur_pit) + np.save( + opt_path1, + coarse_pit, + allow_pickle=False, + ) # ori + except: + printt("f0fail-%s-%s-%s" % (idx, inp_path, traceback.format_exc())) + + +if __name__ == "__main__": + # exp_dir=r"E:\codes\py39\dataset\mi-test" + # n_p=16 + # f = open("%s/log_extract_f0.log"%exp_dir, "w") + printt(" ".join(sys.argv)) + featureInput = FeatureInput() + paths = [] + inp_root = "%s/1_16k_wavs" % (exp_dir) + opt_root1 = "%s/2a_f0" % (exp_dir) + opt_root2 = "%s/2b-f0nsf" % (exp_dir) + + os.makedirs(opt_root1, exist_ok=True) + os.makedirs(opt_root2, exist_ok=True) + for name in sorted(list(os.listdir(inp_root))): + inp_path = "%s/%s" % (inp_root, name) + if "spec" in inp_path: + continue + opt_path1 = "%s/%s" % (opt_root1, name) + opt_path2 = 
"%s/%s" % (opt_root2, name) + paths.append([inp_path, opt_path1, opt_path2]) + try: + featureInput.go(paths, "rmvpe") + except: + printt("f0_all_fail-%s" % (traceback.format_exc())) + # ps = [] + # for i in range(n_p): + # p = Process( + # target=featureInput.go, + # args=( + # paths[i::n_p], + # f0method, + # ), + # ) + # ps.append(p) + # p.start() + # for i in range(n_p): + # ps[i].join() diff --git a/infer/modules/train/extract_feature_print.py b/infer/modules/train/extract_feature_print.py new file mode 100644 index 0000000000000000000000000000000000000000..96a69dee4614dc9c7cbafd24247f244b4e28f9d4 --- /dev/null +++ b/infer/modules/train/extract_feature_print.py @@ -0,0 +1,142 @@ +import os +import sys +import traceback + +os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1" +os.environ["PYTORCH_MPS_HIGH_WATERMARK_RATIO"] = "0.0" + +device = sys.argv[1] +n_part = int(sys.argv[2]) +i_part = int(sys.argv[3]) +if len(sys.argv) == 7: + exp_dir = sys.argv[4] + version = sys.argv[5] + is_half = sys.argv[6].lower() == "true" +else: + i_gpu = sys.argv[4] + exp_dir = sys.argv[5] + os.environ["CUDA_VISIBLE_DEVICES"] = str(i_gpu) + version = sys.argv[6] + is_half = sys.argv[7].lower() == "true" +import fairseq +import numpy as np +import soundfile as sf +import torch +import torch.nn.functional as F + +if "privateuseone" not in device: + device = "cpu" + if torch.cuda.is_available(): + device = "cuda" + elif torch.backends.mps.is_available(): + device = "mps" +else: + import torch_directml + + device = torch_directml.device(torch_directml.default_device()) + + def forward_dml(ctx, x, scale): + ctx.scale = scale + res = x.clone().detach() + return res + + fairseq.modules.grad_multiply.GradMultiply.forward = forward_dml + +f = open("%s/extract_f0_feature.log" % exp_dir, "a+") + + +def printt(strr): + print(strr) + f.write("%s\n" % strr) + f.flush() + + +printt(" ".join(sys.argv)) +model_path = "assets/hubert/hubert_base.pt" + +printt("exp_dir: " + exp_dir) +wavPath = "%s/1_16k_wavs" % exp_dir +outPath = ( + "%s/3_feature256" % exp_dir if version == "v1" else "%s/3_feature768" % exp_dir +) +os.makedirs(outPath, exist_ok=True) + + +# wave must be 16k, hop_size=320 +def readwave(wav_path, normalize=False): + wav, sr = sf.read(wav_path) + assert sr == 16000 + feats = torch.from_numpy(wav).float() + if feats.dim() == 2: # double channels + feats = feats.mean(-1) + assert feats.dim() == 1, feats.dim() + if normalize: + with torch.no_grad(): + feats = F.layer_norm(feats, feats.shape) + feats = feats.view(1, -1) + return feats + + +# HuBERT model +printt("load model(s) from {}".format(model_path)) +# if hubert model is exist +if os.access(model_path, os.F_OK) == False: + printt( + "Error: Extracting is shut down because %s does not exist, you may download it from https://huggingface.co/lj1995/VoiceConversionWebUI/tree/main" + % model_path + ) + exit(0) +models, saved_cfg, task = fairseq.checkpoint_utils.load_model_ensemble_and_task( + [model_path], + suffix="", +) +model = models[0] +model = model.to(device) +printt("move model to %s" % device) +if is_half: + if device not in ["mps", "cpu"]: + model = model.half() +model.eval() + +todo = sorted(list(os.listdir(wavPath)))[i_part::n_part] +n = max(1, len(todo) // 10) # 最多打印十条 +if len(todo) == 0: + printt("no-feature-todo") +else: + printt("all-feature-%s" % len(todo)) + for idx, file in enumerate(todo): + try: + if file.endswith(".wav"): + wav_path = "%s/%s" % (wavPath, file) + out_path = "%s/%s" % (outPath, file.replace("wav", "npy")) + + if 
os.path.exists(out_path): + continue + + feats = readwave(wav_path, normalize=saved_cfg.task.normalize) + padding_mask = torch.BoolTensor(feats.shape).fill_(False) + inputs = { + "source": ( + feats.half().to(device) + if is_half and device not in ["mps", "cpu"] + else feats.to(device) + ), + "padding_mask": padding_mask.to(device), + "output_layer": 9 if version == "v1" else 12, # layer 9 + } + with torch.no_grad(): + logits = model.extract_features(**inputs) + feats = ( + model.final_proj(logits[0]) if version == "v1" else logits[0] + ) + + feats = feats.squeeze(0).float().cpu().numpy() + if np.isnan(feats).sum() == 0: + np.save(out_path, feats, allow_pickle=False) + else: + printt("%s-contains nan" % file) + if idx % n == 0: + printt("now-%s,all-%s,%s,%s" % (len(todo), idx, file, feats.shape)) + except: + printt(traceback.format_exc()) + printt("all-feature-done") diff --git a/infer/modules/train/preprocess.py b/infer/modules/train/preprocess.py new file mode 100644 index 0000000000000000000000000000000000000000..00c7445aaec7ef5d8dd9834852a6bdb16c2b417f --- /dev/null +++ b/infer/modules/train/preprocess.py @@ -0,0 +1,142 @@ +import multiprocessing +import os +import sys + +from scipy import signal + +now_dir = os.getcwd() +sys.path.append(now_dir) +print(*sys.argv[1:]) +inp_root = sys.argv[1] +sr = int(sys.argv[2]) +n_p = int(sys.argv[3]) +exp_dir = sys.argv[4] +noparallel = sys.argv[5] == "True" +per = float(sys.argv[6]) +import os +import traceback + +import librosa +import numpy as np +from scipy.io import wavfile + +from infer.lib.audio import load_audio +from infer.lib.slicer2 import Slicer + +f = open("%s/preprocess.log" % exp_dir, "a+") + + +def println(strr): + print(strr) + f.write("%s\n" % strr) + f.flush() + + +class PreProcess: + def __init__(self, sr, exp_dir, per=3.7): + self.slicer = Slicer( + sr=sr, + threshold=-42, + min_length=1500, + min_interval=400, + hop_size=15, + max_sil_kept=500, + ) + self.sr = sr + self.bh, self.ah = signal.butter(N=5, Wn=48, btype="high", fs=self.sr) + self.per = per + self.overlap = 0.3 + self.tail = self.per + self.overlap + self.max = 0.9 + self.alpha = 0.75 + self.exp_dir = exp_dir + self.gt_wavs_dir = "%s/0_gt_wavs" % exp_dir + self.wavs16k_dir = "%s/1_16k_wavs" % exp_dir + os.makedirs(self.exp_dir, exist_ok=True) + os.makedirs(self.gt_wavs_dir, exist_ok=True) + os.makedirs(self.wavs16k_dir, exist_ok=True) + + def norm_write(self, tmp_audio, idx0, idx1): + tmp_max = np.abs(tmp_audio).max() + if tmp_max > 2.5: + print("%s-%s-%s-filtered" % (idx0, idx1, tmp_max)) + return + tmp_audio = (tmp_audio / tmp_max * (self.max * self.alpha)) + ( + 1 - self.alpha + ) * tmp_audio + wavfile.write( + "%s/%s_%s.wav" % (self.gt_wavs_dir, idx0, idx1), + self.sr, + tmp_audio.astype(np.float32), + ) + tmp_audio = librosa.resample( + tmp_audio, orig_sr=self.sr, target_sr=16000 + ) # , res_type="soxr_vhq" + wavfile.write( + "%s/%s_%s.wav" % (self.wavs16k_dir, idx0, idx1), + 16000, + tmp_audio.astype(np.float32), + ) + + def pipeline(self, path, idx0): + try: + audio = load_audio(path, self.sr) + # zero phased digital filter cause pre-ringing noise... 
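+            # filtfilt would run the 5th-order Butterworth high-pass (Wn=48 Hz, designed in
+            # __init__) forward and backward for zero phase, but zero-phase filtering smears
+            # energy ahead of transients (pre-ringing/pre-echo). The causal lfilter below
+            # trades a small phase shift for avoiding that artifact.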
+ # audio = signal.filtfilt(self.bh, self.ah, audio) + audio = signal.lfilter(self.bh, self.ah, audio) + + idx1 = 0 + for audio in self.slicer.slice(audio): + i = 0 + while 1: + start = int(self.sr * (self.per - self.overlap) * i) + i += 1 + if len(audio[start:]) > self.tail * self.sr: + tmp_audio = audio[start : start + int(self.per * self.sr)] + self.norm_write(tmp_audio, idx0, idx1) + idx1 += 1 + else: + tmp_audio = audio[start:] + idx1 += 1 + break + self.norm_write(tmp_audio, idx0, idx1) + println("%s\t-> Success" % path) + except: + println("%s\t-> %s" % (path, traceback.format_exc())) + + def pipeline_mp(self, infos): + for path, idx0 in infos: + self.pipeline(path, idx0) + + def pipeline_mp_inp_dir(self, inp_root, n_p): + try: + infos = [ + ("%s/%s" % (inp_root, name), idx) + for idx, name in enumerate(sorted(list(os.listdir(inp_root)))) + ] + if noparallel: + for i in range(n_p): + self.pipeline_mp(infos[i::n_p]) + else: + ps = [] + for i in range(n_p): + p = multiprocessing.Process( + target=self.pipeline_mp, args=(infos[i::n_p],) + ) + ps.append(p) + p.start() + for i in range(n_p): + ps[i].join() + except: + println("Fail. %s" % traceback.format_exc()) + + +def preprocess_trainset(inp_root, sr, n_p, exp_dir, per): + pp = PreProcess(sr, exp_dir, per) + println("start preprocess") + pp.pipeline_mp_inp_dir(inp_root, n_p) + println("end preprocess") + + +if __name__ == "__main__": + preprocess_trainset(inp_root, sr, n_p, exp_dir, per) diff --git a/infer/modules/train/train.py b/infer/modules/train/train.py new file mode 100644 index 0000000000000000000000000000000000000000..ccda369ca6da4857da41f3d6217977cb35327f3b --- /dev/null +++ b/infer/modules/train/train.py @@ -0,0 +1,640 @@ +import os +import sys +import logging + +logger = logging.getLogger(__name__) + +now_dir = os.getcwd() +sys.path.append(os.path.join(now_dir)) + +import datetime + +from infer.lib.train import utils + +hps = utils.get_hparams() +os.environ["CUDA_VISIBLE_DEVICES"] = hps.gpus.replace("-", ",") +n_gpus = len(hps.gpus.split("-")) +from random import randint, shuffle + +import torch + +try: + import intel_extension_for_pytorch as ipex # pylint: disable=import-error, unused-import + + if torch.xpu.is_available(): + from infer.modules.ipex import ipex_init + from infer.modules.ipex.gradscaler import gradscaler_init + from torch.xpu.amp import autocast + + GradScaler = gradscaler_init() + ipex_init() + else: + from torch.cuda.amp import GradScaler, autocast +except Exception: + from torch.cuda.amp import GradScaler, autocast + +torch.backends.cudnn.deterministic = False +torch.backends.cudnn.benchmark = False +from time import sleep +from time import time as ttime + +import torch.distributed as dist +import torch.multiprocessing as mp +from torch.nn import functional as F +from torch.nn.parallel import DistributedDataParallel as DDP +from torch.utils.data import DataLoader +from torch.utils.tensorboard import SummaryWriter + +from infer.lib.infer_pack import commons +from infer.lib.train.data_utils import ( + DistributedBucketSampler, + TextAudioCollate, + TextAudioCollateMultiNSFsid, + TextAudioLoader, + TextAudioLoaderMultiNSFsid, +) + +if hps.version == "v1": + from infer.lib.infer_pack.models import MultiPeriodDiscriminator + from infer.lib.infer_pack.models import SynthesizerTrnMs256NSFsid as RVC_Model_f0 + from infer.lib.infer_pack.models import ( + SynthesizerTrnMs256NSFsid_nono as RVC_Model_nof0, + ) +else: + from infer.lib.infer_pack.models import ( + SynthesizerTrnMs768NSFsid as RVC_Model_f0, + 
SynthesizerTrnMs768NSFsid_nono as RVC_Model_nof0, + MultiPeriodDiscriminatorV2 as MultiPeriodDiscriminator, + ) + +from infer.lib.train.losses import ( + discriminator_loss, + feature_loss, + generator_loss, + kl_loss, +) +from infer.lib.train.mel_processing import mel_spectrogram_torch, spec_to_mel_torch +from infer.lib.train.process_ckpt import savee + +global_step = 0 + + +class EpochRecorder: + def __init__(self): + self.last_time = ttime() + + def record(self): + now_time = ttime() + elapsed_time = now_time - self.last_time + self.last_time = now_time + elapsed_time_str = str(datetime.timedelta(seconds=elapsed_time)) + current_time = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S") + return f"[{current_time}] | ({elapsed_time_str})" + + +def main(): + n_gpus = torch.cuda.device_count() + + if torch.cuda.is_available() == False and torch.backends.mps.is_available() == True: + n_gpus = 1 + if n_gpus < 1: + # patch to unblock people without gpus. there is probably a better way. + print("NO GPU DETECTED: falling back to CPU - this may take a while") + n_gpus = 1 + os.environ["MASTER_ADDR"] = "localhost" + os.environ["MASTER_PORT"] = str(randint(20000, 55555)) + children = [] + logger = utils.get_logger(hps.model_dir) + for i in range(n_gpus): + subproc = mp.Process( + target=run, + args=(i, n_gpus, hps, logger), + ) + children.append(subproc) + subproc.start() + + for i in range(n_gpus): + children[i].join() + + +def run(rank, n_gpus, hps, logger: logging.Logger): + global global_step + if rank == 0: + # logger = utils.get_logger(hps.model_dir) + logger.info(hps) + # utils.check_git_hash(hps.model_dir) + writer = SummaryWriter(log_dir=hps.model_dir) + writer_eval = SummaryWriter(log_dir=os.path.join(hps.model_dir, "eval")) + + dist.init_process_group( + backend="gloo", init_method="env://", world_size=n_gpus, rank=rank + ) + torch.manual_seed(hps.train.seed) + if torch.cuda.is_available(): + torch.cuda.set_device(rank) + + if hps.if_f0 == 1: + train_dataset = TextAudioLoaderMultiNSFsid(hps.data.training_files, hps.data) + else: + train_dataset = TextAudioLoader(hps.data.training_files, hps.data) + train_sampler = DistributedBucketSampler( + train_dataset, + hps.train.batch_size * n_gpus, + # [100, 200, 300, 400, 500, 600, 700, 800, 900, 1000, 1200,1400], # 16s + [100, 200, 300, 400, 500, 600, 700, 800, 900], # 16s + num_replicas=n_gpus, + rank=rank, + shuffle=True, + ) + # It is possible that dataloader's workers are out of shared memory. Please try to raise your shared memory limit. 
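+    # Each DataLoader worker hands its prefetched batches to the main process through
+    # shared memory, so fewer workers lowers /dev/shm pressure; persistent_workers=True
+    # and prefetch_factor=8 below keep the GPU fed despite the smaller worker pool.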
+ # num_workers=8 -> num_workers=4 + if hps.if_f0 == 1: + collate_fn = TextAudioCollateMultiNSFsid() + else: + collate_fn = TextAudioCollate() + train_loader = DataLoader( + train_dataset, + num_workers=4, + shuffle=False, + pin_memory=True, + collate_fn=collate_fn, + batch_sampler=train_sampler, + persistent_workers=True, + prefetch_factor=8, + ) + if hps.if_f0 == 1: + net_g = RVC_Model_f0( + hps.data.filter_length // 2 + 1, + hps.train.segment_size // hps.data.hop_length, + **hps.model, + is_half=hps.train.fp16_run, + sr=hps.sample_rate, + ) + else: + net_g = RVC_Model_nof0( + hps.data.filter_length // 2 + 1, + hps.train.segment_size // hps.data.hop_length, + **hps.model, + is_half=hps.train.fp16_run, + ) + if torch.cuda.is_available(): + net_g = net_g.cuda(rank) + net_d = MultiPeriodDiscriminator(hps.model.use_spectral_norm) + if torch.cuda.is_available(): + net_d = net_d.cuda(rank) + optim_g = torch.optim.AdamW( + net_g.parameters(), + hps.train.learning_rate, + betas=hps.train.betas, + eps=hps.train.eps, + ) + optim_d = torch.optim.AdamW( + net_d.parameters(), + hps.train.learning_rate, + betas=hps.train.betas, + eps=hps.train.eps, + ) + # net_g = DDP(net_g, device_ids=[rank], find_unused_parameters=True) + # net_d = DDP(net_d, device_ids=[rank], find_unused_parameters=True) + if hasattr(torch, "xpu") and torch.xpu.is_available(): + pass + elif torch.cuda.is_available(): + net_g = DDP(net_g, device_ids=[rank]) + net_d = DDP(net_d, device_ids=[rank]) + else: + net_g = DDP(net_g) + net_d = DDP(net_d) + + try: # 如果能加载自动resume + _, _, _, epoch_str = utils.load_checkpoint( + utils.latest_checkpoint_path(hps.model_dir, "D_*.pth"), net_d, optim_d + ) # D多半加载没事 + if rank == 0: + logger.info("loaded D") + # _, _, _, epoch_str = utils.load_checkpoint(utils.latest_checkpoint_path(hps.model_dir, "G_*.pth"), net_g, optim_g,load_opt=0) + _, _, _, epoch_str = utils.load_checkpoint( + utils.latest_checkpoint_path(hps.model_dir, "G_*.pth"), net_g, optim_g + ) + global_step = (epoch_str - 1) * len(train_loader) + # epoch_str = 1 + # global_step = 0 + except: # 如果首次不能加载,加载pretrain + # traceback.print_exc() + epoch_str = 1 + global_step = 0 + if hps.pretrainG != "": + if rank == 0: + logger.info("loaded pretrained %s" % (hps.pretrainG)) + if hasattr(net_g, "module"): + logger.info( + net_g.module.load_state_dict( + torch.load(hps.pretrainG, map_location="cpu", weights_only=False)["model"] + ) + ) ##测试不加载优化器 + else: + logger.info( + net_g.load_state_dict( + torch.load(hps.pretrainG, map_location="cpu", weights_only=False)["model"] + ) + ) ##测试不加载优化器 + if hps.pretrainD != "": + if rank == 0: + logger.info("loaded pretrained %s" % (hps.pretrainD)) + if hasattr(net_d, "module"): + logger.info( + net_d.module.load_state_dict( + torch.load(hps.pretrainD, map_location="cpu", weights_only=False)["model"] + ) + ) + else: + logger.info( + net_d.load_state_dict( + torch.load(hps.pretrainD, map_location="cpu", weights_only=False)["model"] + ) + ) + + scheduler_g = torch.optim.lr_scheduler.ExponentialLR( + optim_g, gamma=hps.train.lr_decay, last_epoch=epoch_str - 2 + ) + scheduler_d = torch.optim.lr_scheduler.ExponentialLR( + optim_d, gamma=hps.train.lr_decay, last_epoch=epoch_str - 2 + ) + + scaler = GradScaler(enabled=hps.train.fp16_run) + + cache = [] + for epoch in range(epoch_str, hps.train.epochs + 1): + if rank == 0: + train_and_evaluate( + rank, + epoch, + hps, + [net_g, net_d], + [optim_g, optim_d], + [scheduler_g, scheduler_d], + scaler, + [train_loader, None], + logger, + [writer, writer_eval], + cache, + 
) + else: + train_and_evaluate( + rank, + epoch, + hps, + [net_g, net_d], + [optim_g, optim_d], + [scheduler_g, scheduler_d], + scaler, + [train_loader, None], + None, + None, + cache, + ) + scheduler_g.step() + scheduler_d.step() + + +def train_and_evaluate( + rank, epoch, hps, nets, optims, schedulers, scaler, loaders, logger, writers, cache +): + net_g, net_d = nets + optim_g, optim_d = optims + train_loader, eval_loader = loaders + if writers is not None: + writer, writer_eval = writers + + train_loader.batch_sampler.set_epoch(epoch) + global global_step + + net_g.train() + net_d.train() + + # Prepare data iterator + if hps.if_cache_data_in_gpu == True: + # Use Cache + data_iterator = cache + if cache == []: + # Make new cache + for batch_idx, info in enumerate(train_loader): + # Unpack + if hps.if_f0 == 1: + ( + phone, + phone_lengths, + pitch, + pitchf, + spec, + spec_lengths, + wave, + wave_lengths, + sid, + ) = info + else: + ( + phone, + phone_lengths, + spec, + spec_lengths, + wave, + wave_lengths, + sid, + ) = info + # Load on CUDA + if torch.cuda.is_available(): + phone = phone.cuda(rank, non_blocking=True) + phone_lengths = phone_lengths.cuda(rank, non_blocking=True) + if hps.if_f0 == 1: + pitch = pitch.cuda(rank, non_blocking=True) + pitchf = pitchf.cuda(rank, non_blocking=True) + sid = sid.cuda(rank, non_blocking=True) + spec = spec.cuda(rank, non_blocking=True) + spec_lengths = spec_lengths.cuda(rank, non_blocking=True) + wave = wave.cuda(rank, non_blocking=True) + wave_lengths = wave_lengths.cuda(rank, non_blocking=True) + # Cache on list + if hps.if_f0 == 1: + cache.append( + ( + batch_idx, + ( + phone, + phone_lengths, + pitch, + pitchf, + spec, + spec_lengths, + wave, + wave_lengths, + sid, + ), + ) + ) + else: + cache.append( + ( + batch_idx, + ( + phone, + phone_lengths, + spec, + spec_lengths, + wave, + wave_lengths, + sid, + ), + ) + ) + else: + # Load shuffled cache + shuffle(cache) + else: + # Loader + data_iterator = enumerate(train_loader) + + # Run steps + epoch_recorder = EpochRecorder() + for batch_idx, info in data_iterator: + # Data + ## Unpack + if hps.if_f0 == 1: + ( + phone, + phone_lengths, + pitch, + pitchf, + spec, + spec_lengths, + wave, + wave_lengths, + sid, + ) = info + else: + phone, phone_lengths, spec, spec_lengths, wave, wave_lengths, sid = info + ## Load on CUDA + if (hps.if_cache_data_in_gpu == False) and torch.cuda.is_available(): + phone = phone.cuda(rank, non_blocking=True) + phone_lengths = phone_lengths.cuda(rank, non_blocking=True) + if hps.if_f0 == 1: + pitch = pitch.cuda(rank, non_blocking=True) + pitchf = pitchf.cuda(rank, non_blocking=True) + sid = sid.cuda(rank, non_blocking=True) + spec = spec.cuda(rank, non_blocking=True) + spec_lengths = spec_lengths.cuda(rank, non_blocking=True) + wave = wave.cuda(rank, non_blocking=True) + # wave_lengths = wave_lengths.cuda(rank, non_blocking=True) + + # Calculate + with autocast(enabled=hps.train.fp16_run): + if hps.if_f0 == 1: + ( + y_hat, + ids_slice, + x_mask, + z_mask, + (z, z_p, m_p, logs_p, m_q, logs_q), + ) = net_g(phone, phone_lengths, pitch, pitchf, spec, spec_lengths, sid) + else: + ( + y_hat, + ids_slice, + x_mask, + z_mask, + (z, z_p, m_p, logs_p, m_q, logs_q), + ) = net_g(phone, phone_lengths, spec, spec_lengths, sid) + mel = spec_to_mel_torch( + spec, + hps.data.filter_length, + hps.data.n_mel_channels, + hps.data.sampling_rate, + hps.data.mel_fmin, + hps.data.mel_fmax, + ) + y_mel = commons.slice_segments( + mel, ids_slice, hps.train.segment_size // hps.data.hop_length + ) + 
with autocast(enabled=False): + y_hat_mel = mel_spectrogram_torch( + y_hat.float().squeeze(1), + hps.data.filter_length, + hps.data.n_mel_channels, + hps.data.sampling_rate, + hps.data.hop_length, + hps.data.win_length, + hps.data.mel_fmin, + hps.data.mel_fmax, + ) + if hps.train.fp16_run == True: + y_hat_mel = y_hat_mel.half() + wave = commons.slice_segments( + wave, ids_slice * hps.data.hop_length, hps.train.segment_size + ) # slice + + # Discriminator + y_d_hat_r, y_d_hat_g, _, _ = net_d(wave, y_hat.detach()) + with autocast(enabled=False): + loss_disc, losses_disc_r, losses_disc_g = discriminator_loss( + y_d_hat_r, y_d_hat_g + ) + optim_d.zero_grad() + scaler.scale(loss_disc).backward() + scaler.unscale_(optim_d) + grad_norm_d = commons.clip_grad_value_(net_d.parameters(), None) + scaler.step(optim_d) + + with autocast(enabled=hps.train.fp16_run): + # Generator + y_d_hat_r, y_d_hat_g, fmap_r, fmap_g = net_d(wave, y_hat) + with autocast(enabled=False): + loss_mel = F.l1_loss(y_mel, y_hat_mel) * hps.train.c_mel + loss_kl = kl_loss(z_p, logs_q, m_p, logs_p, z_mask) * hps.train.c_kl + loss_fm = feature_loss(fmap_r, fmap_g) + loss_gen, losses_gen = generator_loss(y_d_hat_g) + loss_gen_all = loss_gen + loss_fm + loss_mel + loss_kl + optim_g.zero_grad() + scaler.scale(loss_gen_all).backward() + scaler.unscale_(optim_g) + grad_norm_g = commons.clip_grad_value_(net_g.parameters(), None) + scaler.step(optim_g) + scaler.update() + + if rank == 0: + if global_step % hps.train.log_interval == 0: + lr = optim_g.param_groups[0]["lr"] + logger.info( + "Train Epoch: {} [{:.0f}%]".format( + epoch, 100.0 * batch_idx / len(train_loader) + ) + ) + # Amor For Tensorboard display + if loss_mel > 75: + loss_mel = 75 + if loss_kl > 9: + loss_kl = 9 + + logger.info([global_step, lr]) + logger.info( + f"loss_disc={loss_disc:.3f}, loss_gen={loss_gen:.3f}, loss_fm={loss_fm:.3f},loss_mel={loss_mel:.3f}, loss_kl={loss_kl:.3f}" + ) + scalar_dict = { + "loss/g/total": loss_gen_all, + "loss/d/total": loss_disc, + "learning_rate": lr, + "grad_norm_d": grad_norm_d, + "grad_norm_g": grad_norm_g, + } + scalar_dict.update( + { + "loss/g/fm": loss_fm, + "loss/g/mel": loss_mel, + "loss/g/kl": loss_kl, + } + ) + + scalar_dict.update( + {"loss/g/{}".format(i): v for i, v in enumerate(losses_gen)} + ) + scalar_dict.update( + {"loss/d_r/{}".format(i): v for i, v in enumerate(losses_disc_r)} + ) + scalar_dict.update( + {"loss/d_g/{}".format(i): v for i, v in enumerate(losses_disc_g)} + ) + image_dict = { + "slice/mel_org": utils.plot_spectrogram_to_numpy( + y_mel[0].data.cpu().numpy() + ), + "slice/mel_gen": utils.plot_spectrogram_to_numpy( + y_hat_mel[0].data.cpu().numpy() + ), + "all/mel": utils.plot_spectrogram_to_numpy( + mel[0].data.cpu().numpy() + ), + } + utils.summarize( + writer=writer, + global_step=global_step, + images=image_dict, + scalars=scalar_dict, + ) + global_step += 1 + # /Run steps + + if epoch % hps.save_every_epoch == 0 and rank == 0: + if hps.if_latest == 0: + utils.save_checkpoint( + net_g, + optim_g, + hps.train.learning_rate, + epoch, + os.path.join(hps.model_dir, "G_{}.pth".format(global_step)), + ) + utils.save_checkpoint( + net_d, + optim_d, + hps.train.learning_rate, + epoch, + os.path.join(hps.model_dir, "D_{}.pth".format(global_step)), + ) + else: + utils.save_checkpoint( + net_g, + optim_g, + hps.train.learning_rate, + epoch, + os.path.join(hps.model_dir, "G_{}.pth".format(2333333)), + ) + utils.save_checkpoint( + net_d, + optim_d, + hps.train.learning_rate, + epoch, + os.path.join(hps.model_dir, 
"D_{}.pth".format(2333333)), + ) + if rank == 0 and hps.save_every_weights == "1": + if hasattr(net_g, "module"): + ckpt = net_g.module.state_dict() + else: + ckpt = net_g.state_dict() + logger.info( + "saving ckpt %s_e%s:%s" + % ( + hps.name, + epoch, + savee( + ckpt, + hps.sample_rate, + hps.if_f0, + hps.name + "_e%s_s%s" % (epoch, global_step), + epoch, + hps.version, + hps, + ), + ) + ) + + if rank == 0: + logger.info("====> Epoch: {} {}".format(epoch, epoch_recorder.record())) + if epoch >= hps.total_epoch and rank == 0: + logger.info("Training is done. The program is closed.") + + if hasattr(net_g, "module"): + ckpt = net_g.module.state_dict() + else: + ckpt = net_g.state_dict() + logger.info( + "saving final ckpt:%s" + % ( + savee( + ckpt, hps.sample_rate, hps.if_f0, hps.name, epoch, hps.version, hps + ) + ) + ) + sleep(1) + os._exit(2333333) + + +if __name__ == "__main__": + torch.multiprocessing.set_start_method("spawn") + main() diff --git a/infer/modules/uvr5/mdxnet.py b/infer/modules/uvr5/mdxnet.py new file mode 100644 index 0000000000000000000000000000000000000000..daa5ca314cb0f289137a68943d2845b88c5e8cea --- /dev/null +++ b/infer/modules/uvr5/mdxnet.py @@ -0,0 +1,256 @@ +import os +import logging + +logger = logging.getLogger(__name__) + +import librosa +import numpy as np +import soundfile as sf +import torch +from tqdm import tqdm + +cpu = torch.device("cpu") + + +class ConvTDFNetTrim: + def __init__( + self, device, model_name, target_name, L, dim_f, dim_t, n_fft, hop=1024 + ): + super(ConvTDFNetTrim, self).__init__() + + self.dim_f = dim_f + self.dim_t = 2**dim_t + self.n_fft = n_fft + self.hop = hop + self.n_bins = self.n_fft // 2 + 1 + self.chunk_size = hop * (self.dim_t - 1) + self.window = torch.hann_window(window_length=self.n_fft, periodic=True).to( + device + ) + self.target_name = target_name + self.blender = "blender" in model_name + + self.dim_c = 4 + out_c = self.dim_c * 4 if target_name == "*" else self.dim_c + self.freq_pad = torch.zeros( + [1, out_c, self.n_bins - self.dim_f, self.dim_t] + ).to(device) + + self.n = L // 2 + + def stft(self, x): + x = x.reshape([-1, self.chunk_size]) + x = torch.stft( + x, + n_fft=self.n_fft, + hop_length=self.hop, + window=self.window, + center=True, + return_complex=True, + ) + x = torch.view_as_real(x) + x = x.permute([0, 3, 1, 2]) + x = x.reshape([-1, 2, 2, self.n_bins, self.dim_t]).reshape( + [-1, self.dim_c, self.n_bins, self.dim_t] + ) + return x[:, :, : self.dim_f] + + def istft(self, x, freq_pad=None): + freq_pad = ( + self.freq_pad.repeat([x.shape[0], 1, 1, 1]) + if freq_pad is None + else freq_pad + ) + x = torch.cat([x, freq_pad], -2) + c = 4 * 2 if self.target_name == "*" else 2 + x = x.reshape([-1, c, 2, self.n_bins, self.dim_t]).reshape( + [-1, 2, self.n_bins, self.dim_t] + ) + x = x.permute([0, 2, 3, 1]) + x = x.contiguous() + x = torch.view_as_complex(x) + x = torch.istft( + x, n_fft=self.n_fft, hop_length=self.hop, window=self.window, center=True + ) + return x.reshape([-1, c, self.chunk_size]) + + +def get_models(device, dim_f, dim_t, n_fft): + return ConvTDFNetTrim( + device=device, + model_name="Conv-TDF", + target_name="vocals", + L=11, + dim_f=dim_f, + dim_t=dim_t, + n_fft=n_fft, + ) + + +class Predictor: + def __init__(self, args): + import onnxruntime as ort + + logger.info(ort.get_available_providers()) + self.args = args + self.model_ = get_models( + device=cpu, dim_f=args.dim_f, dim_t=args.dim_t, n_fft=args.n_fft + ) + self.model = ort.InferenceSession( + os.path.join(args.onnx, 
self.model_.target_name + ".onnx"), + providers=[ + "CUDAExecutionProvider", + "DmlExecutionProvider", + "CPUExecutionProvider", + ], + ) + logger.info("ONNX load done") + + def demix(self, mix): + samples = mix.shape[-1] + margin = self.args.margin + chunk_size = self.args.chunks * 44100 + assert not margin == 0, "margin cannot be zero!" + if margin > chunk_size: + margin = chunk_size + + segmented_mix = {} + + if self.args.chunks == 0 or samples < chunk_size: + chunk_size = samples + + counter = -1 + for skip in range(0, samples, chunk_size): + counter += 1 + + s_margin = 0 if counter == 0 else margin + end = min(skip + chunk_size + margin, samples) + + start = skip - s_margin + + segmented_mix[skip] = mix[:, start:end].copy() + if end == samples: + break + + sources = self.demix_base(segmented_mix, margin_size=margin) + """ + mix:(2,big_sample) + segmented_mix:offset->(2,small_sample) + sources:(1,2,big_sample) + """ + return sources + + def demix_base(self, mixes, margin_size): + chunked_sources = [] + progress_bar = tqdm(total=len(mixes)) + progress_bar.set_description("Processing") + for mix in mixes: + cmix = mixes[mix] + sources = [] + n_sample = cmix.shape[1] + model = self.model_ + trim = model.n_fft // 2 + gen_size = model.chunk_size - 2 * trim + pad = gen_size - n_sample % gen_size + mix_p = np.concatenate( + (np.zeros((2, trim)), cmix, np.zeros((2, pad)), np.zeros((2, trim))), 1 + ) + mix_waves = [] + i = 0 + while i < n_sample + pad: + waves = np.array(mix_p[:, i : i + model.chunk_size]) + mix_waves.append(waves) + i += gen_size + mix_waves = torch.tensor(mix_waves, dtype=torch.float32).to(cpu) + with torch.no_grad(): + _ort = self.model + spek = model.stft(mix_waves) + if self.args.denoise: + spec_pred = ( + -_ort.run(None, {"input": -spek.cpu().numpy()})[0] * 0.5 + + _ort.run(None, {"input": spek.cpu().numpy()})[0] * 0.5 + ) + tar_waves = model.istft(torch.tensor(spec_pred)) + else: + tar_waves = model.istft( + torch.tensor(_ort.run(None, {"input": spek.cpu().numpy()})[0]) + ) + tar_signal = ( + tar_waves[:, :, trim:-trim] + .transpose(0, 1) + .reshape(2, -1) + .numpy()[:, :-pad] + ) + + start = 0 if mix == 0 else margin_size + end = None if mix == list(mixes.keys())[::-1][0] else -margin_size + if margin_size == 0: + end = None + sources.append(tar_signal[:, start:end]) + + progress_bar.update(1) + + chunked_sources.append(sources) + _sources = np.concatenate(chunked_sources, axis=-1) + # del self.model + progress_bar.close() + return _sources + + def prediction(self, m, vocal_root, others_root, format): + os.makedirs(vocal_root, exist_ok=True) + os.makedirs(others_root, exist_ok=True) + basename = os.path.basename(m) + mix, rate = librosa.load(m, mono=False, sr=44100) + if mix.ndim == 1: + mix = np.asfortranarray([mix, mix]) + mix = mix.T + sources = self.demix(mix.T) + opt = sources[0].T + if format in ["wav", "flac"]: + sf.write( + "%s/%s_main_vocal.%s" % (vocal_root, basename, format), mix - opt, rate + ) + sf.write("%s/%s_others.%s" % (others_root, basename, format), opt, rate) + else: + path_vocal = "%s/%s_main_vocal.wav" % (vocal_root, basename) + path_other = "%s/%s_others.wav" % (others_root, basename) + sf.write(path_vocal, mix - opt, rate) + sf.write(path_other, opt, rate) + opt_path_vocal = path_vocal[:-4] + ".%s" % format + opt_path_other = path_other[:-4] + ".%s" % format + if os.path.exists(path_vocal): + os.system( + 'ffmpeg -i "%s" -vn "%s" -q:a 2 -y' % (path_vocal, opt_path_vocal) + ) + if os.path.exists(opt_path_vocal): + try: + os.remove(path_vocal) + 
except: + pass + if os.path.exists(path_other): + os.system( + 'ffmpeg -i "%s" -vn "%s" -q:a 2 -y' % (path_other, opt_path_other) + ) + if os.path.exists(opt_path_other): + try: + os.remove(path_other) + except: + pass + + +class MDXNetDereverb: + def __init__(self, chunks, device): + self.onnx = "assets/uvr5_weights/onnx_dereverb_By_FoxJoy" + self.shifts = 10 # 'Predict with randomised equivariant stabilisation' + self.mixing = "min_mag" # ['default','min_mag','max_mag'] + self.chunks = chunks + self.margin = 44100 + self.dim_t = 9 + self.dim_f = 3072 + self.n_fft = 6144 + self.denoise = True + self.pred = Predictor(self) + self.device = device + + def _path_audio_(self, input, vocal_root, others_root, format, is_hp3=False): + self.pred.prediction(input, vocal_root, others_root, format) diff --git a/infer/modules/uvr5/modules.py b/infer/modules/uvr5/modules.py new file mode 100644 index 0000000000000000000000000000000000000000..e2a511888cd0739720a30fa226831755225d0368 --- /dev/null +++ b/infer/modules/uvr5/modules.py @@ -0,0 +1,177 @@ +import os +import traceback +import logging + +logger = logging.getLogger(__name__) + +import ffmpeg +import torch + +from configs.config import Config +from infer.modules.uvr5.mdxnet import MDXNetDereverb +from infer.modules.uvr5.vr import AudioPre, AudioPreDeEcho + +# 导入彩色日志 +try: + from lib.logger import log +except ImportError: + log = None + +config = Config() + + +def uvr(model_name, inp_root, save_root_vocal, paths, save_root_ins, agg, format0): + infos = [] + try: + if log: + log.progress(f"开始UVR5人声分离...") + log.model(f"模型: {model_name}") + log.detail(f"输入目录: {inp_root}") + log.detail(f"人声输出: {save_root_vocal}") + log.detail(f"伴奏输出: {save_root_ins}") + log.config(f"激进度: {agg}, 格式: {format0}") + + inp_root = inp_root.strip(" ").strip('"').strip("\n").strip('"').strip(" ") + save_root_vocal = ( + save_root_vocal.strip(" ").strip('"').strip("\n").strip('"').strip(" ") + ) + save_root_ins = ( + save_root_ins.strip(" ").strip('"').strip("\n").strip('"').strip(" ") + ) + if model_name == "onnx_dereverb_By_FoxJoy": + if log: + log.model("加载MDXNet去混响模型...") + pre_fun = MDXNetDereverb(15, config.device) + else: + func = AudioPre if "DeEcho" not in model_name else AudioPreDeEcho + if log: + log.model(f"加载VR模型: {func.__name__}") + pre_fun = func( + agg=int(agg), + model_path=os.path.join( + os.getenv("weight_uvr5_root"), model_name + ".pth" + ), + device=config.device, + is_half=config.is_half, + ) + is_hp3 = "HP3" in model_name + if log: + log.detail(f"HP3模式: {is_hp3}") + + if inp_root != "": + paths = [os.path.join(inp_root, name) for name in os.listdir(inp_root)] + else: + paths = [path.name for path in paths] + + if log: + log.detail(f"待处理文件数: {len(paths)}") + + for idx, path in enumerate(paths): + if log: + log.progress(f"处理文件 {idx+1}/{len(paths)}: {os.path.basename(path)}") + + inp_path = os.path.join(inp_root, path) + need_reformat = 1 + done = 0 + try: + info = ffmpeg.probe(inp_path, cmd="ffprobe") + channels = info["streams"][0]["channels"] + sample_rate = info["streams"][0]["sample_rate"] + if log: + log.audio(f"音频信息: {channels}声道, {sample_rate}Hz") + + if ( + channels == 2 + and sample_rate == "44100" + ): + need_reformat = 0 + if log: + log.detail("格式符合要求,直接处理") + if "DeEcho" in model_name: + pre_fun._path_audio_( + inp_path, save_root_vocal, save_root_ins, format0, is_hp3=is_hp3 + ) + else: + pre_fun._path_audio_( + inp_path, save_root_ins, save_root_vocal, format0, is_hp3=is_hp3 + ) + done = 1 + except: + need_reformat = 1 + traceback.print_exc() 
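+                # Probing or direct processing threw, so keep need_reformat=1 and fall back
+                # to the ffmpeg call below, which rewrites the input as 2-channel 44100 Hz
+                # PCM before separation.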
+ if log: + log.warning("无法探测音频格式,将进行重格式化") + + if need_reformat == 1: + tmp_path = "%s/%s.reformatted.wav" % ( + os.path.join(os.environ["TEMP"]), + os.path.basename(inp_path), + ) + if log: + log.detail(f"重格式化音频: {tmp_path}") + os.system( + 'ffmpeg -i "%s" -vn -acodec pcm_s16le -ac 2 -ar 44100 "%s" -y' + % (inp_path, tmp_path) + ) + inp_path = tmp_path + try: + if done == 0: + if log: + log.progress("执行人声分离...") + if "DeEcho" in model_name: + pre_fun._path_audio_( + inp_path, save_root_vocal, save_root_ins, format0 + ) + else: + pre_fun._path_audio_( + inp_path, save_root_ins, save_root_vocal, format0 + ) + infos.append("%s->Success" % (os.path.basename(inp_path))) + if log: + log.success(f"{os.path.basename(inp_path)} 处理成功") + yield "\n".join(infos) + except: + try: + if done == 0: + pre_fun._path_audio_( + inp_path, save_root_ins, save_root_vocal, format0 + ) + infos.append("%s->Success" % (os.path.basename(inp_path))) + if log: + log.success(f"{os.path.basename(inp_path)} 处理成功(重试)") + yield "\n".join(infos) + except: + error_msg = traceback.format_exc() + infos.append( + "%s->%s" % (os.path.basename(inp_path), error_msg) + ) + if log: + log.error(f"{os.path.basename(inp_path)} 处理失败:\n{error_msg}") + yield "\n".join(infos) + except: + error_msg = traceback.format_exc() + infos.append(error_msg) + if log: + log.error(f"UVR5处理失败:\n{error_msg}") + yield "\n".join(infos) + finally: + try: + if log: + log.detail("清理模型资源...") + if model_name == "onnx_dereverb_By_FoxJoy": + del pre_fun.pred.model + del pre_fun.pred.model_ + else: + del pre_fun.model + del pre_fun + except: + traceback.print_exc() + if torch.cuda.is_available(): + torch.cuda.empty_cache() + logger.info("Executed torch.cuda.empty_cache()") + if log: + log.detail("已清理CUDA缓存") + + if log: + log.success("UVR5处理完成") + yield "\n".join(infos) diff --git a/infer/modules/uvr5/vr.py b/infer/modules/uvr5/vr.py new file mode 100644 index 0000000000000000000000000000000000000000..38050c61140224b74743b7773a4f9ccf5233f90a --- /dev/null +++ b/infer/modules/uvr5/vr.py @@ -0,0 +1,464 @@ +import os +import logging + +logger = logging.getLogger(__name__) + +import librosa +import numpy as np +import soundfile as sf +import torch + +from infer.lib.uvr5_pack.lib_v5 import nets_61968KB as Nets +from infer.lib.uvr5_pack.lib_v5 import spec_utils +from infer.lib.uvr5_pack.lib_v5.model_param_init import ModelParameters +from infer.lib.uvr5_pack.lib_v5.nets_new import CascadedNet +from infer.lib.uvr5_pack.utils import inference + +# 导入彩色日志 +try: + from lib.logger import log +except ImportError: + log = None + + +class AudioPre: + def __init__(self, agg, model_path, device, is_half, tta=False): + self.model_path = model_path + self.device = device + self.data = { + # Processing Options + "postprocess": False, + "tta": tta, + # Constants + "window_size": 512, + "agg": agg, + "high_end_process": "mirroring", + } + if log: + log.model(f"加载UVR5模型: {os.path.basename(model_path)}") + log.detail(f"设备: {device}, 半精度: {is_half}") + log.config(f"激进度: {agg}, 窗口大小: 512") + + mp = ModelParameters("infer/lib/uvr5_pack/lib_v5/modelparams/4band_v2.json") + model = Nets.CascadedASPPNet(mp.param["bins"] * 2) + cpk = torch.load(model_path, map_location="cpu", weights_only=False) + model.load_state_dict(cpk) + model.eval() + if is_half: + model = model.half().to(device) + else: + model = model.to(device) + + self.mp = mp + self.model = model + if log: + log.success("UVR5模型加载完成") + + def _path_audio_( + self, music_file, ins_root=None, vocal_root=None, format="flac", 
is_hp3=False + ): + if ins_root is None and vocal_root is None: + return "No save root." + name = os.path.basename(music_file) + if log: + log.audio(f"处理音频文件: {name}") + log.detail(f"输入路径: {music_file}") + log.detail(f"伴奏输出: {ins_root}") + log.detail(f"人声输出: {vocal_root}") + log.config(f"输出格式: {format}, HP3模式: {is_hp3}") + + if ins_root is not None: + os.makedirs(ins_root, exist_ok=True) + if vocal_root is not None: + os.makedirs(vocal_root, exist_ok=True) + X_wave, y_wave, X_spec_s, y_spec_s = {}, {}, {}, {} + bands_n = len(self.mp.param["band"]) + if log: + log.detail(f"频段数量: {bands_n}") + + # print(bands_n) + for d in range(bands_n, 0, -1): + bp = self.mp.param["band"][d] + if log: + log.detail(f"处理频段 {d}: 采样率={bp['sr']}, 重采样类型={bp['res_type']}") + if d == bands_n: # high-end band + ( + X_wave[d], + _, + ) = librosa.load( # 理论上librosa读取可能对某些音频有bug,应该上ffmpeg读取,但是太麻烦了弃坑 + music_file, + sr=bp["sr"], + mono=False, + dtype=np.float32, + res_type=bp["res_type"], + ) + if X_wave[d].ndim == 1: + X_wave[d] = np.asfortranarray([X_wave[d], X_wave[d]]) + if log: + log.detail(f"加载高频段: shape={X_wave[d].shape}") + else: # lower bands + X_wave[d] = librosa.resample( + X_wave[d + 1], + orig_sr=self.mp.param["band"][d + 1]["sr"], + target_sr=bp["sr"], + res_type=bp["res_type"], + ) + if log: + log.detail(f"重采样频段 {d}: shape={X_wave[d].shape}") + # Stft of wave source + X_spec_s[d] = spec_utils.wave_to_spectrogram_mt( + X_wave[d], + bp["hl"], + bp["n_fft"], + self.mp.param["mid_side"], + self.mp.param["mid_side_b2"], + self.mp.param["reverse"], + ) + # pdb.set_trace() + if d == bands_n and self.data["high_end_process"] != "none": + input_high_end_h = (bp["n_fft"] // 2 - bp["crop_stop"]) + ( + self.mp.param["pre_filter_stop"] - self.mp.param["pre_filter_start"] + ) + input_high_end = X_spec_s[d][ + :, bp["n_fft"] // 2 - input_high_end_h : bp["n_fft"] // 2, : + ] + + if log: + log.progress("合并频谱图...") + X_spec_m = spec_utils.combine_spectrograms(X_spec_s, self.mp) + aggresive_set = float(self.data["agg"] / 100) + aggressiveness = { + "value": aggresive_set, + "split_bin": self.mp.param["band"][1]["crop_stop"], + } + if log: + log.detail(f"激进度设置: {aggresive_set}") + log.progress("执行模型推理...") + + with torch.no_grad(): + pred, X_mag, X_phase = inference( + X_spec_m, self.device, self.model, aggressiveness, self.data + ) + # Postprocess + if self.data["postprocess"]: + if log: + log.detail("执行后处理...") + pred_inv = np.clip(X_mag - pred, 0, np.inf) + pred = spec_utils.mask_silence(pred, pred_inv) + y_spec_m = pred * X_phase + v_spec_m = X_spec_m - y_spec_m + + if ins_root is not None: + if log: + log.progress("生成伴奏音频...") + if self.data["high_end_process"].startswith("mirroring"): + input_high_end_ = spec_utils.mirroring( + self.data["high_end_process"], y_spec_m, input_high_end, self.mp + ) + wav_instrument = spec_utils.cmb_spectrogram_to_wave( + y_spec_m, self.mp, input_high_end_h, input_high_end_ + ) + else: + wav_instrument = spec_utils.cmb_spectrogram_to_wave(y_spec_m, self.mp) + logger.info("%s instruments done" % name) + if log: + log.success(f"{name} 伴奏分离完成") + if is_hp3 == True: + head = "vocal_" + else: + head = "instrument_" + if format in ["wav", "flac"]: + output_path = os.path.join( + ins_root, + head + "{}_{}.{}".format(name, self.data["agg"], format), + ) + sf.write( + output_path, + (np.array(wav_instrument) * 32768).astype("int16"), + self.mp.param["sr"], + ) + if log: + log.audio(f"保存伴奏: {os.path.basename(output_path)}") + else: + path = os.path.join( + ins_root, head + "{}_{}.wav".format(name, 
self.data["agg"]) + ) + sf.write( + path, + (np.array(wav_instrument) * 32768).astype("int16"), + self.mp.param["sr"], + ) + if os.path.exists(path): + opt_format_path = path[:-4] + ".%s" % format + if log: + log.detail(f"转换格式: {format}") + os.system('ffmpeg -i "%s" -vn "%s" -q:a 2 -y' % (path, opt_format_path)) + if os.path.exists(opt_format_path): + try: + os.remove(path) + except: + pass + if log: + log.audio(f"保存伴奏: {os.path.basename(opt_format_path)}") + if vocal_root is not None: + if log: + log.progress("生成人声音频...") + if is_hp3 == True: + head = "instrument_" + else: + head = "vocal_" + if self.data["high_end_process"].startswith("mirroring"): + input_high_end_ = spec_utils.mirroring( + self.data["high_end_process"], v_spec_m, input_high_end, self.mp + ) + wav_vocals = spec_utils.cmb_spectrogram_to_wave( + v_spec_m, self.mp, input_high_end_h, input_high_end_ + ) + else: + wav_vocals = spec_utils.cmb_spectrogram_to_wave(v_spec_m, self.mp) + logger.info("%s vocals done" % name) + if log: + log.success(f"{name} 人声分离完成") + if format in ["wav", "flac"]: + output_path = os.path.join( + vocal_root, + head + "{}_{}.{}".format(name, self.data["agg"], format), + ) + sf.write( + output_path, + (np.array(wav_vocals) * 32768).astype("int16"), + self.mp.param["sr"], + ) + if log: + log.audio(f"保存人声: {os.path.basename(output_path)}") + else: + path = os.path.join( + vocal_root, head + "{}_{}.wav".format(name, self.data["agg"]) + ) + sf.write( + path, + (np.array(wav_vocals) * 32768).astype("int16"), + self.mp.param["sr"], + ) + if os.path.exists(path): + opt_format_path = path[:-4] + ".%s" % format + if log: + log.detail(f"转换格式: {format}") + os.system('ffmpeg -i "%s" -vn "%s" -q:a 2 -y' % (path, opt_format_path)) + if os.path.exists(opt_format_path): + try: + os.remove(path) + except: + pass + if log: + log.audio(f"保存人声: {os.path.basename(opt_format_path)}") + + +class AudioPreDeEcho: + def __init__(self, agg, model_path, device, is_half, tta=False): + self.model_path = model_path + self.device = device + self.data = { + # Processing Options + "postprocess": False, + "tta": tta, + # Constants + "window_size": 512, + "agg": agg, + "high_end_process": "mirroring", + } + if log: + log.model(f"加载UVR5 DeEcho模型: {os.path.basename(model_path)}") + log.detail(f"设备: {device}, 半精度: {is_half}") + log.config(f"激进度: {agg}, 窗口大小: 512") + + mp = ModelParameters("infer/lib/uvr5_pack/lib_v5/modelparams/4band_v3.json") + nout = 64 if "DeReverb" in model_path else 48 + if log: + log.detail(f"模型输出通道: {nout}") + model = CascadedNet(mp.param["bins"] * 2, nout) + cpk = torch.load(model_path, map_location="cpu", weights_only=False) + model.load_state_dict(cpk) + model.eval() + if is_half: + model = model.half().to(device) + else: + model = model.to(device) + + self.mp = mp + self.model = model + if log: + log.success("UVR5 DeEcho模型加载完成") + + def _path_audio_( + self, music_file, vocal_root=None, ins_root=None, format="flac", is_hp3=False + ): # 3个VR模型vocal和ins是反的 + if ins_root is None and vocal_root is None: + return "No save root." 
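+        # As noted on the signature, the DeEcho/DeReverb checkpoints swap the two stems:
+        # the branch writing under ins_root uses a "vocal_" prefix and the vocal_root
+        # branch uses "instrument_", mirroring the inverted roles.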
+ name = os.path.basename(music_file) + if log: + log.audio(f"DeEcho处理音频: {name}") + log.detail(f"输入路径: {music_file}") + + if ins_root is not None: + os.makedirs(ins_root, exist_ok=True) + if vocal_root is not None: + os.makedirs(vocal_root, exist_ok=True) + X_wave, y_wave, X_spec_s, y_spec_s = {}, {}, {}, {} + bands_n = len(self.mp.param["band"]) + if log: + log.detail(f"频段数量: {bands_n}") + + for d in range(bands_n, 0, -1): + bp = self.mp.param["band"][d] + if log: + log.detail(f"处理频段 {d}: 采样率={bp['sr']}") + if d == bands_n: # high-end band + ( + X_wave[d], + _, + ) = librosa.load( # 理论上librosa读取可能对某些音频有bug,应该上ffmpeg读取,但是太麻烦了弃坑 + music_file, + sr=bp["sr"], + mono=False, + dtype=np.float32, + res_type=bp["res_type"], + ) + if X_wave[d].ndim == 1: + X_wave[d] = np.asfortranarray([X_wave[d], X_wave[d]]) + else: # lower bands + X_wave[d] = librosa.resample( + X_wave[d + 1], + orig_sr=self.mp.param["band"][d + 1]["sr"], + target_sr=bp["sr"], + res_type=bp["res_type"], + ) + # Stft of wave source + X_spec_s[d] = spec_utils.wave_to_spectrogram_mt( + X_wave[d], + bp["hl"], + bp["n_fft"], + self.mp.param["mid_side"], + self.mp.param["mid_side_b2"], + self.mp.param["reverse"], + ) + # pdb.set_trace() + if d == bands_n and self.data["high_end_process"] != "none": + input_high_end_h = (bp["n_fft"] // 2 - bp["crop_stop"]) + ( + self.mp.param["pre_filter_stop"] - self.mp.param["pre_filter_start"] + ) + input_high_end = X_spec_s[d][ + :, bp["n_fft"] // 2 - input_high_end_h : bp["n_fft"] // 2, : + ] + + if log: + log.progress("合并频谱图并执行推理...") + X_spec_m = spec_utils.combine_spectrograms(X_spec_s, self.mp) + aggresive_set = float(self.data["agg"] / 100) + aggressiveness = { + "value": aggresive_set, + "split_bin": self.mp.param["band"][1]["crop_stop"], + } + with torch.no_grad(): + pred, X_mag, X_phase = inference( + X_spec_m, self.device, self.model, aggressiveness, self.data + ) + # Postprocess + if self.data["postprocess"]: + pred_inv = np.clip(X_mag - pred, 0, np.inf) + pred = spec_utils.mask_silence(pred, pred_inv) + y_spec_m = pred * X_phase + v_spec_m = X_spec_m - y_spec_m + + if ins_root is not None: + if log: + log.progress("生成伴奏音频...") + if self.data["high_end_process"].startswith("mirroring"): + input_high_end_ = spec_utils.mirroring( + self.data["high_end_process"], y_spec_m, input_high_end, self.mp + ) + wav_instrument = spec_utils.cmb_spectrogram_to_wave( + y_spec_m, self.mp, input_high_end_h, input_high_end_ + ) + else: + wav_instrument = spec_utils.cmb_spectrogram_to_wave(y_spec_m, self.mp) + logger.info("%s instruments done" % name) + if log: + log.success(f"{name} 伴奏分离完成") + if format in ["wav", "flac"]: + output_path = os.path.join( + ins_root, + "vocal_{}_{}.{}".format(name, self.data["agg"], format), + ) + sf.write( + output_path, + (np.array(wav_instrument) * 32768).astype("int16"), + self.mp.param["sr"], + ) + if log: + log.audio(f"保存伴奏: {os.path.basename(output_path)}") + else: + path = os.path.join( + ins_root, "vocal_{}_{}.wav".format(name, self.data["agg"]) + ) + sf.write( + path, + (np.array(wav_instrument) * 32768).astype("int16"), + self.mp.param["sr"], + ) + if os.path.exists(path): + opt_format_path = path[:-4] + ".%s" % format + if log: + log.detail(f"转换格式: {format}") + os.system('ffmpeg -i "%s" -vn "%s" -q:a 2 -y' % (path, opt_format_path)) + if os.path.exists(opt_format_path): + try: + os.remove(path) + except: + pass + if vocal_root is not None: + if log: + log.progress("生成人声音频...") + if self.data["high_end_process"].startswith("mirroring"): + input_high_end_ = 
spec_utils.mirroring( + self.data["high_end_process"], v_spec_m, input_high_end, self.mp + ) + wav_vocals = spec_utils.cmb_spectrogram_to_wave( + v_spec_m, self.mp, input_high_end_h, input_high_end_ + ) + else: + wav_vocals = spec_utils.cmb_spectrogram_to_wave(v_spec_m, self.mp) + logger.info("%s vocals done" % name) + if log: + log.success(f"{name} 人声分离完成") + if format in ["wav", "flac"]: + output_path = os.path.join( + vocal_root, + "instrument_{}_{}.{}".format(name, self.data["agg"], format), + ) + sf.write( + output_path, + (np.array(wav_vocals) * 32768).astype("int16"), + self.mp.param["sr"], + ) + if log: + log.audio(f"保存人声: {os.path.basename(output_path)}") + else: + path = os.path.join( + vocal_root, "instrument_{}_{}.wav".format(name, self.data["agg"]) + ) + sf.write( + path, + (np.array(wav_vocals) * 32768).astype("int16"), + self.mp.param["sr"], + ) + if os.path.exists(path): + opt_format_path = path[:-4] + ".%s" % format + if log: + log.detail(f"转换格式: {format}") + os.system('ffmpeg -i "%s" -vn "%s" -q:a 2 -y' % (path, opt_format_path)) + if os.path.exists(opt_format_path): + try: + os.remove(path) + except: + pass diff --git a/infer/modules/vc/__init__.py b/infer/modules/vc/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/infer/modules/vc/modules.py b/infer/modules/vc/modules.py new file mode 100644 index 0000000000000000000000000000000000000000..93982a68440525bda53feaecb76f38fadb213d0f --- /dev/null +++ b/infer/modules/vc/modules.py @@ -0,0 +1,376 @@ +import traceback +import logging + +logger = logging.getLogger(__name__) + +import numpy as np +import soundfile as sf +import torch +from io import BytesIO + +from infer.lib.audio import load_audio, wav2 +from infer.lib.infer_pack.models import ( + SynthesizerTrnMs256NSFsid, + SynthesizerTrnMs256NSFsid_nono, + SynthesizerTrnMs768NSFsid, + SynthesizerTrnMs768NSFsid_nono, +) +from infer.modules.vc.pipeline import Pipeline +from infer.modules.vc.utils import * + +# 导入彩色日志 +try: + from lib.logger import log +except ImportError: + log = None + + +class VC: + def __init__(self, config): + self.n_spk = None + self.tgt_sr = None + self.net_g = None + self.pipeline = None + self.cpt = None + self.version = None + self.if_f0 = None + self.version = None + self.hubert_model = None + + self.config = config + + def get_vc(self, sid, *to_return_protect): + logger.info("Get sid: " + sid) + if log: + log.model(f"获取模型: {sid}") + + to_return_protect0 = { + "visible": self.if_f0 != 0, + "value": ( + to_return_protect[0] if self.if_f0 != 0 and to_return_protect else 0.5 + ), + "__type__": "update", + } + to_return_protect1 = { + "visible": self.if_f0 != 0, + "value": ( + to_return_protect[1] if self.if_f0 != 0 and to_return_protect else 0.33 + ), + "__type__": "update", + } + + if sid == "" or sid == []: + if ( + self.hubert_model is not None + ): # 考虑到轮询, 需要加个判断看是否 sid 是由有模型切换到无模型的 + logger.info("Clean model cache") + if log: + log.detail("清理模型缓存...") + del (self.net_g, self.n_spk, self.hubert_model, self.tgt_sr) # ,cpt + self.hubert_model = self.net_g = self.n_spk = self.hubert_model = ( + self.tgt_sr + ) = None + if torch.cuda.is_available(): + torch.cuda.empty_cache() + if log: + log.detail("已清理CUDA缓存") + ###楼下不这么折腾清理不干净 + self.if_f0 = self.cpt.get("f0", 1) + self.version = self.cpt.get("version", "v1") + if self.version == "v1": + if self.if_f0 == 1: + self.net_g = SynthesizerTrnMs256NSFsid( + *self.cpt["config"], is_half=self.config.is_half + ) + else: + 
self.net_g = SynthesizerTrnMs256NSFsid_nono(*self.cpt["config"]) + elif self.version == "v2": + if self.if_f0 == 1: + self.net_g = SynthesizerTrnMs768NSFsid( + *self.cpt["config"], is_half=self.config.is_half + ) + else: + self.net_g = SynthesizerTrnMs768NSFsid_nono(*self.cpt["config"]) + del self.net_g, self.cpt + if torch.cuda.is_available(): + torch.cuda.empty_cache() + return ( + {"visible": False, "__type__": "update"}, + { + "visible": True, + "value": to_return_protect0, + "__type__": "update", + }, + { + "visible": True, + "value": to_return_protect1, + "__type__": "update", + }, + "", + "", + ) + person = f'{os.getenv("weight_root")}/{sid}' + logger.info(f"Loading: {person}") + if log: + log.model(f"加载模型文件: {person}") + + self.cpt = torch.load(person, map_location="cpu", weights_only=False) + self.tgt_sr = self.cpt["config"][-1] + self.cpt["config"][-3] = self.cpt["weight"]["emb_g.weight"].shape[0] # n_spk + self.if_f0 = self.cpt.get("f0", 1) + self.version = self.cpt.get("version", "v1") + + if log: + log.config(f"模型版本: {self.version}") + log.config(f"目标采样率: {self.tgt_sr} Hz") + log.config(f"F0支持: {'是' if self.if_f0 else '否'}") + + synthesizer_class = { + ("v1", 1): SynthesizerTrnMs256NSFsid, + ("v1", 0): SynthesizerTrnMs256NSFsid_nono, + ("v2", 1): SynthesizerTrnMs768NSFsid, + ("v2", 0): SynthesizerTrnMs768NSFsid_nono, + } + + if log: + log.detail(f"选择合成器: {synthesizer_class.get((self.version, self.if_f0), SynthesizerTrnMs256NSFsid).__name__}") + + self.net_g = synthesizer_class.get( + (self.version, self.if_f0), SynthesizerTrnMs256NSFsid + )(*self.cpt["config"], is_half=self.config.is_half) + + del self.net_g.enc_q + + self.net_g.load_state_dict(self.cpt["weight"], strict=False) + self.net_g.eval().to(self.config.device) + if self.config.is_half: + self.net_g = self.net_g.half() + if log: + log.detail("使用半精度模式") + else: + self.net_g = self.net_g.float() + if log: + log.detail("使用全精度模式") + + if log: + log.progress("初始化推理管道...") + self.pipeline = Pipeline(self.tgt_sr, self.config) + n_spk = self.cpt["config"][-3] + if log: + log.config(f"说话人数量: {n_spk}") + + index = {"value": get_index_path_from_model(sid), "__type__": "update"} + logger.info("Select index: " + index["value"]) + if log: + log.model(f"选择索引: {index['value']}") + log.success("模型加载完成") + + return ( + ( + {"visible": True, "maximum": n_spk, "__type__": "update"}, + to_return_protect0, + to_return_protect1, + index, + index, + ) + if to_return_protect + else {"visible": True, "maximum": n_spk, "__type__": "update"} + ) + + def vc_single( + self, + sid, + input_audio_path, + f0_up_key, + f0_file, + f0_method, + file_index, + file_index2, + index_rate, + filter_radius, + resample_sr, + rms_mix_rate, + protect, + ): + if input_audio_path is None: + return "You need to upload an audio", None + + if log: + log.progress("开始单文件人声转换...") + log.audio(f"输入音频: {input_audio_path}") + log.config(f"音调偏移: {f0_up_key} 半音") + log.config(f"F0方法: {f0_method}") + log.config(f"索引率: {index_rate}") + log.config(f"滤波半径: {filter_radius}") + log.config(f"RMS混合率: {rms_mix_rate}") + log.config(f"保护系数: {protect}") + + f0_up_key = int(f0_up_key) + try: + if log: + log.detail("加载音频文件...") + audio = load_audio(input_audio_path, 16000) + audio_max = np.abs(audio).max() / 0.95 + if audio_max > 1: + audio /= audio_max + if log: + log.detail(f"音频归一化: 峰值={audio_max:.4f}") + + if log: + log.detail(f"音频长度: {len(audio)} 样本 ({len(audio)/16000:.2f} 秒)") + + times = [0, 0, 0] + + if self.hubert_model is None: + if log: + log.model("加载HuBERT模型...") + 
self.hubert_model = load_hubert(self.config) + if log: + log.success("HuBERT模型加载完成") + + if file_index: + file_index = ( + file_index.strip(" ") + .strip('"') + .strip("\n") + .strip('"') + .strip(" ") + .replace("trained", "added") + ) + elif file_index2: + file_index = file_index2 + else: + file_index = "" # 防止小白写错,自动帮他替换掉 + + if log and file_index: + log.model(f"使用索引文件: {file_index}") + + if log: + log.progress("执行推理管道...") + + audio_opt = self.pipeline.pipeline( + self.hubert_model, + self.net_g, + sid, + audio, + input_audio_path, + times, + f0_up_key, + f0_method, + file_index, + index_rate, + self.if_f0, + filter_radius, + self.tgt_sr, + resample_sr, + rms_mix_rate, + self.version, + protect, + f0_file, + ) + if self.tgt_sr != resample_sr >= 16000: + tgt_sr = resample_sr + else: + tgt_sr = self.tgt_sr + index_info = ( + "Index:\n%s." % file_index + if os.path.exists(file_index) + else "Index not used." + ) + + if log: + log.success("推理完成") + log.detail(f"NPY时间: {times[0]:.2f}s, F0时间: {times[1]:.2f}s, 推理时间: {times[2]:.2f}s") + log.audio(f"输出采样率: {tgt_sr} Hz") + log.audio(f"输出长度: {len(audio_opt)} 样本") + + return ( + "Success.\n%s\nTime:\nnpy: %.2fs, f0: %.2fs, infer: %.2fs." + % (index_info, *times), + (tgt_sr, audio_opt), + ) + except: + info = traceback.format_exc() + logger.warning(info) + if log: + log.error(f"转换失败:\n{info}") + return info, (None, None) + + def vc_multi( + self, + sid, + dir_path, + opt_root, + paths, + f0_up_key, + f0_method, + file_index, + file_index2, + index_rate, + filter_radius, + resample_sr, + rms_mix_rate, + protect, + format1, + ): + try: + dir_path = ( + dir_path.strip(" ").strip('"').strip("\n").strip('"').strip(" ") + ) # 防止小白拷路径头尾带了空格和"和回车 + opt_root = opt_root.strip(" ").strip('"').strip("\n").strip('"').strip(" ") + os.makedirs(opt_root, exist_ok=True) + try: + if dir_path != "": + paths = [ + os.path.join(dir_path, name) for name in os.listdir(dir_path) + ] + else: + paths = [path.name for path in paths] + except: + traceback.print_exc() + paths = [path.name for path in paths] + infos = [] + for path in paths: + info, opt = self.vc_single( + sid, + path, + f0_up_key, + None, + f0_method, + file_index, + file_index2, + # file_big_npy, + index_rate, + filter_radius, + resample_sr, + rms_mix_rate, + protect, + ) + if "Success" in info: + try: + tgt_sr, audio_opt = opt + if format1 in ["wav", "flac"]: + sf.write( + "%s/%s.%s" + % (opt_root, os.path.basename(path), format1), + audio_opt, + tgt_sr, + ) + else: + path = "%s/%s.%s" % ( + opt_root, + os.path.basename(path), + format1, + ) + with BytesIO() as wavf: + sf.write(wavf, audio_opt, tgt_sr, format="wav") + wavf.seek(0, 0) + with open(path, "wb") as outf: + wav2(wavf, outf, format1) + except: + info += traceback.format_exc() + infos.append("%s->%s" % (os.path.basename(path), info)) + yield "\n".join(infos) + yield "\n".join(infos) + except: + yield traceback.format_exc() diff --git a/infer/modules/vc/pipeline.py b/infer/modules/vc/pipeline.py new file mode 100644 index 0000000000000000000000000000000000000000..b92d0d760f73646bfc20ea1663d47af97b13d18f --- /dev/null +++ b/infer/modules/vc/pipeline.py @@ -0,0 +1,1286 @@ +import os +import sys +import traceback +import logging + +logger = logging.getLogger(__name__) + +from functools import lru_cache +from time import time as ttime + +import faiss +import librosa +import numpy as np +import parselmouth +import pyworld +import torch +import torch.nn.functional as F +import torchcrepe +from scipy import signal +from typing import Optional + +now_dir = 
os.getcwd() +sys.path.append(now_dir) + +# 导入彩色日志 +try: + from lib.logger import log +except ImportError: + log = None + +from lib.audio import soft_clip + +bh, ah = signal.butter(N=5, Wn=48, btype="high", fs=16000) + +input_audio_path2wav = {} + + +@lru_cache +def cache_harvest_f0(input_audio_path, fs, f0max, f0min, frame_period): + audio = input_audio_path2wav[input_audio_path] + f0, t = pyworld.harvest( + audio, + fs=fs, + f0_ceil=f0max, + f0_floor=f0min, + frame_period=frame_period, + ) + f0 = pyworld.stonemask(audio, f0, t, fs) + return f0 + + +def change_rms(data1, sr1, data2, sr2, rate): # 1是输入音频,2是输出音频,rate是2的占比 + rms1 = librosa.feature.rms( + y=data1, frame_length=sr1 // 2 * 2, hop_length=sr1 // 2 + ) # 每半秒一个点 + rms2 = librosa.feature.rms(y=data2, frame_length=sr2 // 2 * 2, hop_length=sr2 // 2) + rms1 = torch.from_numpy(rms1) + rms1 = F.interpolate( + rms1.unsqueeze(0), size=data2.shape[0], mode="linear" + ).squeeze() + rms2 = torch.from_numpy(rms2) + rms2 = F.interpolate( + rms2.unsqueeze(0), size=data2.shape[0], mode="linear" + ).squeeze() + rms2 = torch.max(rms2, torch.zeros_like(rms2) + 1e-6) + gain = torch.pow(rms1, torch.tensor(1 - rate)) * torch.pow(rms2, torch.tensor(rate - 1)) + # Reduced upper clamp: 4.0x over-amplifies noise in quiet sections, + # producing buzzy/electronic artifacts. 2.0x is sufficient for RMS matching. + gain = torch.clamp(gain, 0.3, 2.0) + data2 *= gain.numpy() + return data2 + + +def repair_f0( + f0: np.ndarray, + max_gap: int = 6, + mask: Optional[np.ndarray] = None, + min_mask_ratio: float = 0.6, +) -> np.ndarray: + """Fill short unvoiced gaps in F0 to reduce crack/tearing artifacts.""" + if f0 is None or len(f0) == 0: + return f0 + f0 = np.nan_to_num(f0, nan=0.0).astype(np.float32, copy=False) + voiced = f0 > 0 + if voiced.sum() < 2: + return f0 + + if mask is not None: + mask = mask.astype(bool, copy=False) + if len(mask) < len(f0): + mask = np.pad(mask, (0, len(f0) - len(mask)), mode="edge") + else: + mask = mask[: len(f0)] + + x = np.arange(len(f0)) + interp = np.interp(x, x[voiced], f0[voiced]) + + zero_idx = np.where(~voiced)[0] + if zero_idx.size == 0: + return f0 + + run_start = zero_idx[0] + prev = zero_idx[0] + for idx in zero_idx[1:]: + if idx == prev + 1: + prev = idx + continue + run_end = prev + run_len = run_end - run_start + 1 + if run_len <= max_gap and run_start > 0 and run_end < len(f0) - 1: + if mask is None or (mask[run_start : run_end + 1].mean() >= min_mask_ratio): + f0[run_start : run_end + 1] = interp[run_start : run_end + 1] + run_start = idx + prev = idx + run_end = prev + run_len = run_end - run_start + 1 + if run_len <= max_gap and run_start > 0 and run_end < len(f0) - 1: + if mask is None or (mask[run_start : run_end + 1].mean() >= min_mask_ratio): + f0[run_start : run_end + 1] = interp[run_start : run_end + 1] + + return f0 + + +def _normalize_rmvpe_hybrid_mode(mode: Optional[str]) -> str: + """Normalize user-facing hybrid mode aliases to internal fallback modes.""" + normalized = str(mode or "off").strip().lower() + if normalized in {"", "off", "none", "strict", "official", "rmvpe_strict", "rmvpe-strict", "raw", "rmvpe"}: + return "off" + if normalized in { + "fallback", + "smart", + "rmvpe+fallback", + "rmvpe_fallback", + "rmvpe-fallback", + "hybrid_fallback", + "hybrid-fallback", + "hybrid", + "auto", + "harvest", + "harvest_fallback", + "harvest-fallback", + }: + return "fallback" + return normalized + + +def _build_protect_mix_curve(pitchf: torch.Tensor, protect: float) -> torch.Tensor: + """Create a smooth 
protect curve for voiced/unvoiced transitions.""" + protect = float(np.clip(protect, 0.0, 1.0)) + if protect >= 1.0: + return torch.ones_like(pitchf, dtype=torch.float32) + + voiced = (pitchf > 0).detach().float().cpu().numpy() + if voiced.ndim == 2: + voiced_curve = voiced[0] + else: + voiced_curve = voiced.reshape(-1) + + smooth_kernel = np.array([1, 2, 3, 2, 1], dtype=np.float32) + smooth_kernel /= np.sum(smooth_kernel) + voiced_curve = np.convolve(voiced_curve, smooth_kernel, mode="same") + voiced_curve = np.convolve(voiced_curve, smooth_kernel, mode="same") + voiced_curve = np.clip(voiced_curve, 0.0, 1.0) + + mix_curve = protect + (1.0 - protect) * voiced_curve + mix_curve = torch.from_numpy(mix_curve.astype(np.float32)).to(pitchf.device) + if pitchf.ndim == 2: + mix_curve = mix_curve.unsqueeze(0) + return mix_curve + + +def _compute_energy_mask( + audio: np.ndarray, + hop_length: int, + frame_length: int = 1024, + threshold_db: float = -50.0, +) -> np.ndarray: + """Return frames considered voiced based on RMS energy.""" + if audio is None or len(audio) == 0: + return np.zeros(0, dtype=bool) + rms = librosa.feature.rms( + y=audio, frame_length=frame_length, hop_length=hop_length, center=True + )[0] + if rms.size == 0: + return np.zeros(0, dtype=bool) + rms_db = 20 * np.log10(rms + 1e-6) + ref_db = np.percentile(rms_db, 95) + gate_db = ref_db + threshold_db + return rms_db >= gate_db + + +def _compute_harvest_f0( + audio: np.ndarray, + sr: int, + f0_min: float, + f0_max: float, + frame_period: float = 10.0, +) -> np.ndarray: + """Compute Harvest F0 for fallback filling.""" + audio = audio.astype(np.double, copy=False) + f0, t = pyworld.harvest( + audio, + fs=sr, + f0_ceil=f0_max, + f0_floor=f0_min, + frame_period=frame_period, + ) + f0 = pyworld.stonemask(audio, f0, t, sr) + return f0 + + +def _compute_crepe_f0( + audio: np.ndarray, + sr: int, + hop_length: int, + f0_min: float, + f0_max: float, + device: str, + periodicity_threshold: float = 0.1, + return_periodicity: bool = False, +) -> np.ndarray: + """Compute CREPE F0 for fallback filling.""" + audio_tensor = torch.tensor(np.copy(audio))[None].float() + f0, pd = torchcrepe.predict( + audio_tensor, + sr, + hop_length, + f0_min, + f0_max, + "full", + batch_size=512, + device=device, + return_periodicity=True, + ) + pd = torchcrepe.filter.median(pd, 3) + f0 = torchcrepe.filter.mean(f0, 3) + f0 = f0[0].cpu().numpy() + pd = pd[0].cpu().numpy() + if periodicity_threshold is not None: + f0[pd < periodicity_threshold] = 0 + if return_periodicity: + return f0, pd + return f0 + + +def _stabilize_f0( + f0: np.ndarray, + max_semitones: float = 6.0, + window: int = 2, + octave_fix: bool = True, +) -> tuple[np.ndarray, int, int]: + """Stabilize F0 by correcting octave errors and extreme jumps.""" + if f0 is None or len(f0) == 0: + return f0, 0, 0 + f0 = np.nan_to_num(f0, nan=0.0).astype(np.float32, copy=True) + voiced_idx = np.where(f0 > 0)[0] + if voiced_idx.size < 3: + return f0, 0, 0 + win = max(1, int(window)) + max_semi = float(max_semitones) + eps = 1e-6 + octave_fix_count = 0 + outlier_count = 0 + + for i in voiced_idx: + start = max(0, i - win) + end = min(len(f0), i + win + 1) + neighbors = f0[start:end] + neighbors = neighbors[neighbors > 0] + if neighbors.size < 3: + continue + med = float(np.median(neighbors)) + if med <= 0: + continue + + if octave_fix: + ratio = f0[i] / (med + eps) + if 1.9 < ratio < 2.1: + f0[i] = f0[i] * 0.5 + octave_fix_count += 1 + elif 0.48 < ratio < 0.52: + f0[i] = f0[i] * 2.0 + octave_fix_count += 1 + + if 
max_semi > 0: + semi_diff = 12.0 * abs(np.log2((f0[i] + eps) / (med + eps))) + if semi_diff > max_semi: + f0[i] = med + outlier_count += 1 + + return f0, octave_fix_count, outlier_count + + +def _limit_f0_slope( + f0: np.ndarray, + max_semitones: float = 8.0, +) -> tuple[np.ndarray, int]: + """Limit frame-to-frame pitch jumps to reduce harsh transitions.""" + if f0 is None or len(f0) == 0: + return f0, 0 + f0 = np.nan_to_num(f0, nan=0.0).astype(np.float32, copy=True) + max_semi = float(max_semitones) + if max_semi <= 0: + return f0, 0 + max_ratio = 2 ** (max_semi / 12.0) + min_ratio = 1.0 / max_ratio + changed = 0 + prev = None + for i in range(len(f0)): + if f0[i] <= 0: + continue + if prev is None: + prev = f0[i] + continue + ratio = f0[i] / (prev + 1e-6) + if ratio > max_ratio: + f0[i] = prev * max_ratio + changed += 1 + elif ratio < min_ratio: + f0[i] = prev * min_ratio + changed += 1 + prev = f0[i] + return f0, changed + + +class Pipeline(object): + def __init__(self, tgt_sr, config): + self.x_pad, self.x_query, self.x_center, self.x_max, self.is_half = ( + config.x_pad, + config.x_query, + config.x_center, + config.x_max, + config.is_half, + ) + self.disable_chunking = bool(getattr(config, "disable_chunking", False)) + self.sr = 16000 # hubert输入采样率 + self.window = 160 # 每帧点数 + self.t_pad = self.sr * self.x_pad # 每条前后pad时间 + self.t_pad_tgt = tgt_sr * self.x_pad + self.t_pad2 = self.t_pad * 2 + self.t_query = self.sr * self.x_query # 查询切点前后查询时间 + self.t_center = self.sr * self.x_center # 查询切点位置 + self.t_max = self.sr * self.x_max # 免查询时长阈值 + self.device = config.device + self.f0_min = float(getattr(config, "f0_min", 50)) + self.f0_max = float(getattr(config, "f0_max", 1100)) + if self.f0_max <= self.f0_min: + self.f0_max = max(self.f0_min + 1.0, 1100.0) + self.rmvpe_threshold = float(getattr(config, "rmvpe_threshold", 0.02)) + self.f0_energy_threshold_db = float(getattr(config, "f0_energy_threshold_db", -50)) + self.f0_hybrid_mode = _normalize_rmvpe_hybrid_mode( + getattr(config, "f0_hybrid_mode", "off") + ) + self.rmvpe_strict_modes = { + "", + "off", + "none", + "strict", + "official", + "rmvpe_strict", + "rmvpe-strict", + } + self.rmvpe_fallback_modes = { + "fallback", + "smart", + "rmvpe+fallback", + "rmvpe_fallback", + "rmvpe-fallback", + "hybrid_fallback", + "hybrid-fallback", + } + self.crepe_pd_threshold = float(getattr(config, "crepe_pd_threshold", 0.1)) + self.crepe_force_ratio = float(getattr(config, "crepe_force_ratio", 0.05)) + self.crepe_replace_semitones = float(getattr(config, "crepe_replace_semitones", 0.0)) + self.f0_fallback_context_radius = int(getattr(config, "f0_fallback_context_radius", 24)) + self.f0_fallback_repair_gap = int(getattr(config, "f0_fallback_repair_gap", 12)) + self.f0_fallback_post_gap = int(getattr(config, "f0_fallback_post_gap", 10)) + self.f0_fallback_use_crepe = bool(getattr(config, "f0_fallback_use_crepe", True)) + self.f0_fallback_crepe_max_ratio = float(getattr(config, "f0_fallback_crepe_max_ratio", 0.02)) + self.f0_fallback_crepe_max_frames = int(getattr(config, "f0_fallback_crepe_max_frames", 320)) + self.f0_stabilize = bool(getattr(config, "f0_stabilize", False)) + self.f0_stabilize_window = int(getattr(config, "f0_stabilize_window", 2)) + self.f0_stabilize_max_semitones = float( + getattr(config, "f0_stabilize_max_semitones", 6.0) + ) + self.f0_stabilize_octave = bool(getattr(config, "f0_stabilize_octave", True)) + self.f0_rate_limit = bool(getattr(config, "f0_rate_limit", False)) + self.f0_rate_limit_semitones = float( + getattr(config, 
"f0_rate_limit_semitones", 8.0) + ) + if self.crepe_force_ratio < 0: + self.crepe_force_ratio = 0.0 + if self.crepe_pd_threshold < 0: + self.crepe_pd_threshold = 0.0 + if self.crepe_replace_semitones < 0: + self.crepe_replace_semitones = 0.0 + if self.f0_fallback_context_radius < 1: + self.f0_fallback_context_radius = 1 + if self.f0_fallback_repair_gap < 0: + self.f0_fallback_repair_gap = 0 + if self.f0_fallback_post_gap < 0: + self.f0_fallback_post_gap = 0 + if self.f0_fallback_crepe_max_ratio < 0: + self.f0_fallback_crepe_max_ratio = 0.0 + if self.f0_fallback_crepe_max_frames < 0: + self.f0_fallback_crepe_max_frames = 0 + if self.f0_stabilize_window < 1: + self.f0_stabilize_window = 1 + if self.f0_stabilize_max_semitones < 0: + self.f0_stabilize_max_semitones = 0.0 + if self.f0_rate_limit_semitones < 0: + self.f0_rate_limit_semitones = 0.0 + + if log: + log.detail(f"Pipeline初始化: 目标采样率={tgt_sr}Hz") + log.detail(f"设备: {self.device}, 半精度: {self.is_half}") + log.detail(f"x_pad={self.x_pad}, x_query={self.x_query}, x_center={self.x_center}, x_max={self.x_max}") + log.detail(f"禁用分段: {self.disable_chunking}") + log.detail(f"F0范围: {self.f0_min}-{self.f0_max}Hz, RMVPE阈值: {self.rmvpe_threshold}") + log.detail( + f"F0混合: {self.f0_hybrid_mode}, CREPE阈值: {self.crepe_pd_threshold}, " + f"强制比率: {self.crepe_force_ratio}, 替换阈值(半音): {self.crepe_replace_semitones}" + ) + log.detail( + f"F0兜底: 上下文半径={self.f0_fallback_context_radius}, " + f"预修补长度={self.f0_fallback_repair_gap}, 后修补长度={self.f0_fallback_post_gap}, " + f"CREPE兜底={self.f0_fallback_use_crepe}, " + f"CREPE最大占比={self.f0_fallback_crepe_max_ratio:.2%}, " + f"CREPE最大帧数={self.f0_fallback_crepe_max_frames}" + ) + log.detail( + "RMVPE兜底: " + f"{'on' if self.f0_hybrid_mode in self.rmvpe_fallback_modes else 'off'}" + ) + log.detail( + f"F0稳定器: {self.f0_stabilize}, 窗口: {self.f0_stabilize_window}, " + f"最大跳变(半音): {self.f0_stabilize_max_semitones}, " + f"八度修正: {self.f0_stabilize_octave}" + ) + log.detail( + f"F0限速: {self.f0_rate_limit}, 最大跳变/帧(半音): {self.f0_rate_limit_semitones}" + ) + + def get_f0( + self, + input_audio_path, + x, + p_len, + f0_up_key, + f0_method, + filter_radius, + inp_f0=None, + ): + global input_audio_path2wav + time_step = self.window / self.sr * 1000 + f0_min = self.f0_min + f0_max = self.f0_max + # Mel quantization range MUST match training (50-1100Hz) regardless of + # extraction range, otherwise pitch embedding indices shift and the + # model produces degraded output on all notes. 
+ f0_mel_min = 1127 * np.log(1 + 50.0 / 700) + f0_mel_max = 1127 * np.log(1 + 1100.0 / 700) + + if log: + log.progress(f"提取F0: 方法={f0_method}") + log.detail(f"时间步长: {time_step:.2f}ms, F0范围: {f0_min}-{f0_max}Hz") + log.detail(f"音频长度: {len(x)} 样本, p_len: {p_len}") + + # 将hybrid映射到rmvpe+crepe模式 + if f0_method == "hybrid": + f0_method = "rmvpe" + # 临时设置hybrid模式 + original_hybrid_mode = self.f0_hybrid_mode + self.f0_hybrid_mode = "rmvpe+crepe" + restore_hybrid_mode = True + else: + restore_hybrid_mode = False + + if f0_method == "pm": + if log: + log.detail("使用Parselmouth提取F0...") + f0 = ( + parselmouth.Sound(x, self.sr) + .to_pitch_ac( + time_step=time_step / 1000, + voicing_threshold=0.6, + pitch_floor=f0_min, + pitch_ceiling=f0_max, + ) + .selected_array["frequency"] + ) + pad_size = (p_len - len(f0) + 1) // 2 + if pad_size > 0 or p_len - len(f0) - pad_size > 0: + f0 = np.pad( + f0, [[pad_size, p_len - len(f0) - pad_size]], mode="constant" + ) + if log: + log.detail(f"PM F0提取完成: shape={f0.shape}") + elif f0_method == "harvest": + if log: + log.detail("使用PyWorld Harvest提取F0...") + input_audio_path2wav[input_audio_path] = x.astype(np.double) + f0 = cache_harvest_f0(input_audio_path, self.sr, f0_max, f0_min, 10) + if filter_radius > 2: + f0 = signal.medfilt(f0, 3) + if log: + log.detail(f"应用中值滤波: radius={filter_radius}") + if log: + log.detail(f"Harvest F0提取完成: shape={f0.shape}") + elif f0_method == "crepe": + if log: + log.detail("使用CREPE提取F0...") + model = "full" + # Pick a batch size that doesn't cause memory errors on your gpu + batch_size = 512 + if log: + log.detail(f"CREPE模型: {model}, batch_size: {batch_size}") + # Compute pitch using first gpu + audio = torch.tensor(np.copy(x))[None].float() + f0, pd = torchcrepe.predict( + audio, + self.sr, + self.window, + f0_min, + f0_max, + model, + batch_size=batch_size, + device=self.device, + return_periodicity=True, + ) + pd = torchcrepe.filter.median(pd, 3) + f0 = torchcrepe.filter.mean(f0, 3) + f0[pd < 0.1] = 0 + f0 = f0[0].cpu().numpy() + if log: + log.detail(f"CREPE F0提取完成: shape={f0.shape}") + elif f0_method == "rmvpe": + if self.f0_hybrid_mode in ("crepe", "crepe_only", "crepe-only"): + if log: + log.detail("使用CREPE全量F0 (质量优先)...") + f0 = _compute_crepe_f0( + x, + self.sr, + self.window, + f0_min, + f0_max, + self.device, + periodicity_threshold=self.crepe_pd_threshold, + ) + if log: + log.detail(f"CREPE F0提取完成: shape={f0.shape}") + else: + if log: + log.detail("使用RMVPE提取F0...") + if not hasattr(self, "model_rmvpe"): + from infer.lib.rmvpe import RMVPE + + rmvpe_path = "%s/rmvpe.pt" % os.environ["rmvpe_root"] + logger.info( + "Loading rmvpe model,%s" % rmvpe_path + ) + if log: + log.model(f"加载RMVPE模型: {rmvpe_path}") + self.model_rmvpe = RMVPE( + rmvpe_path, + is_half=self.is_half, + device=self.device, + ) + if log: + log.success("RMVPE模型加载完成") + # Slightly lower threshold to reduce short unvoiced dropouts + f0 = self.model_rmvpe.infer_from_audio(x, thred=self.rmvpe_threshold) + if log: + log.detail(f"RMVPE F0提取完成: shape={f0.shape}") + + if "privateuseone" in str(self.device): # clean ortruntime memory + del self.model_rmvpe.model + del self.model_rmvpe + logger.info("Cleaning ortruntime memory") + if log: + log.detail("清理ONNX Runtime内存") + + if self.f0_hybrid_mode in ("rmvpe+crepe", "rmvpe_crepe", "hybrid", "rmvpe-crepe"): + if log: + log.detail("启用RMVPE+CREPE混合F0 (质量优先)...") + crepe_f0, crepe_pd = _compute_crepe_f0( + x, + self.sr, + self.window, + f0_min, + f0_max, + self.device, + periodicity_threshold=self.crepe_pd_threshold, + 
return_periodicity=True, + ) + if len(crepe_f0) < len(f0): + crepe_f0 = np.pad(crepe_f0, (0, len(f0) - len(crepe_f0)), mode="edge") + crepe_pd = np.pad(crepe_pd, (0, len(f0) - len(crepe_pd)), mode="edge") + else: + crepe_f0 = crepe_f0[: len(f0)] + crepe_pd = crepe_pd[: len(f0)] + + crepe_mask = crepe_f0 > 0 + drop_ratio = float(np.sum(f0 <= 0)) / max(len(f0), 1) + replace_mask = (f0 <= 0) & crepe_mask + if drop_ratio >= self.crepe_force_ratio: + replace_mask = crepe_mask + + if self.crepe_replace_semitones > 0: + both_voiced = (f0 > 0) & crepe_mask + if np.any(both_voiced): + diff_semi = np.zeros_like(f0, dtype=np.float32) + diff_semi[both_voiced] = np.abs( + 12.0 + * np.log2( + (f0[both_voiced] + 1e-6) / (crepe_f0[both_voiced] + 1e-6) + ) + ) + replace_mask |= both_voiced & (diff_semi >= self.crepe_replace_semitones) + + replaced = int(np.sum(replace_mask)) + f0[replace_mask] = crepe_f0[replace_mask] + if log: + log.detail( + f"CREPE混合完成: 掉线比率={drop_ratio:.2%}, " + f"替换帧={replaced}/{len(f0)}" + ) + + f0 *= pow(2, f0_up_key / 12) + if log: + log.detail(f"应用音调偏移: {f0_up_key} 半音, 倍率: {pow(2, f0_up_key / 12):.4f}") + # with open("test.txt","w")as f:f.write("\n".join([str(i)for i in f0.tolist()])) + tf0 = self.sr // self.window # 每秒f0点数 + if inp_f0 is not None: + if log: + log.detail("应用自定义F0曲线...") + delta_t = np.round( + (inp_f0[:, 0].max() - inp_f0[:, 0].min()) * tf0 + 1 + ).astype("int16") + replace_f0 = np.interp( + list(range(delta_t)), inp_f0[:, 0] * 100, inp_f0[:, 1] + ) + shape = f0[self.x_pad * tf0 : self.x_pad * tf0 + len(replace_f0)].shape[0] + f0[self.x_pad * tf0 : self.x_pad * tf0 + len(replace_f0)] = replace_f0[ + :shape + ] + else: + use_rmvpe_fallback = ( + f0_method == "rmvpe" + and self.f0_hybrid_mode not in self.rmvpe_strict_modes + and self.f0_hybrid_mode in self.rmvpe_fallback_modes + ) + + if use_rmvpe_fallback: + energy_mask = _compute_energy_mask( + x, hop_length=self.window, threshold_db=self.f0_energy_threshold_db + ) + if energy_mask.size > 0: + if len(energy_mask) < len(f0): + energy_mask = np.pad( + energy_mask, (0, len(f0) - len(energy_mask)), mode="edge" + ) + else: + energy_mask = energy_mask[: len(f0)] + else: + energy_mask = None + + # Repair short unvoiced gaps only when fallback mode is explicitly enabled. + f0 = repair_f0( + f0, + max_gap=self.f0_fallback_repair_gap, + mask=energy_mask, + ) + + # Conservative F0 fallback: + # only fill dropouts that are surrounded by voiced context. 
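+                # Illustrative note (added commentary, not executed code): the block
+                # below measures, per frame, the distance to the nearest voiced frame
+                # on each side via running max/min scans.  For example, with
+                # voiced_seed = [T, F, F, F, T] and idx = [0, 1, 2, 3, 4]:
+                #   left_seen  = [0, 0, 0, 0, 4]   (last voiced index at or before i)
+                #   right_seen = [0, 4, 4, 4, 4]   (next voiced index at or after i)
+                # A dropout frame is only eligible for filling when both distances
+                # are <= f0_fallback_context_radius, i.e. it is a short hole inside a
+                # voiced region rather than genuine silence.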
+ if energy_mask is not None: + voiced_seed = f0 > 0 + if np.any(voiced_seed): + idx = np.arange(len(f0)) + left_seen = np.where(voiced_seed, idx, -10**9) + left_seen = np.maximum.accumulate(left_seen) + right_seen = np.where(voiced_seed, idx, 10**9) + right_seen = np.minimum.accumulate(right_seen[::-1])[::-1] + context_radius = self.f0_fallback_context_radius + left_near = (idx - left_seen) <= context_radius + right_near = (right_seen - idx) <= context_radius + voiced_context = left_near & right_near + else: + voiced_context = np.zeros_like(f0, dtype=bool) + + need_fill = (f0 <= 0) & energy_mask & voiced_context + if np.any(need_fill): + if log: + log.detail( + f"RMVPE掉线帧(主唱上下文): {int(need_fill.sum())}/{len(f0)},启用保守兜底" + ) + + f0_min_fb = max(30.0, f0_min - 20.0) + f0_max_fb = min(1800.0, f0_max + 200.0) + f0_fb = _compute_harvest_f0(x, self.sr, f0_min_fb, f0_max_fb, 10.0) + if len(f0_fb) < len(f0): + f0_fb = np.pad(f0_fb, (0, len(f0) - len(f0_fb)), mode="edge") + else: + f0_fb = f0_fb[: len(f0)] + + fill_mask = need_fill & (f0_fb > 0) + f0[fill_mask] = f0_fb[fill_mask] + + need_fill2 = (f0 <= 0) & energy_mask & voiced_context + need_fill2_count = int(np.sum(need_fill2)) + need_fill2_ratio = float(need_fill2_count) / max(len(f0), 1) + if np.any(need_fill2) and self.f0_fallback_use_crepe: + allow_crepe_fallback = ( + need_fill2_count <= self.f0_fallback_crepe_max_frames + and need_fill2_ratio <= self.f0_fallback_crepe_max_ratio + ) + else: + allow_crepe_fallback = False + + if np.any(need_fill2) and allow_crepe_fallback: + if log: + log.detail( + f"Harvest后仍掉线(主唱上下文): {int(need_fill2.sum())}/{len(f0)},启用CREPE兜底" + ) + f0_cr = _compute_crepe_f0( + x, + self.sr, + self.window, + f0_min_fb, + f0_max_fb, + self.device, + periodicity_threshold=self.crepe_pd_threshold, + ) + if len(f0_cr) < len(f0): + f0_cr = np.pad(f0_cr, (0, len(f0) - len(f0_cr)), mode="edge") + else: + f0_cr = f0_cr[: len(f0)] + + # Require cross-estimator agreement when both estimators are voiced. + both_voiced = (f0_cr > 0) & (f0_fb > 0) + agree_mask = np.zeros_like(f0, dtype=bool) + if np.any(both_voiced): + semitone_diff = np.abs( + 12.0 * np.log2((f0_cr + 1e-6) / (f0_fb + 1e-6)) + ) + agree_mask = both_voiced & (semitone_diff <= 2.0) + + fill_mask2 = need_fill2 & ( + ((f0_cr > 0) & (f0_fb <= 0)) | agree_mask + ) + f0[fill_mask2] = f0_cr[fill_mask2] + elif np.any(need_fill2) and log: + log.detail( + f"Harvest后仍掉线(主唱上下文): {need_fill2_count}/{len(f0)}," + "已跳过CREPE兜底(超出保守阈值)" + ) + + final_drop = (f0 <= 0) & energy_mask & voiced_context + if np.any(final_drop) and log: + log.detail( + f"保守兜底后保留无声帧: {int(final_drop.sum())}/{len(f0)}" + ) + + # Only smooth short, context-consistent gaps. 
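+                # Illustrative note (added commentary, not executed code): repair_f0
+                # linearly interpolates across short unvoiced runs only.  For example,
+                # with max_gap = 3 and
+                #   f0 = [200, 0, 0, 220, 0, 0, 0, 0, 0, 230]
+                # the 2-frame hole becomes ~[206.7, 213.3] while the 5-frame hole is
+                # left at zero (treated as a real unvoiced span).  The voiced_context
+                # mask additionally requires the run to sit mostly inside voiced
+                # context (>= min_mask_ratio of frames) before it is filled.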
+ f0 = repair_f0( + f0, + max_gap=self.f0_fallback_post_gap, + mask=voiced_context, + ) + elif f0_method == "rmvpe" and log: + log.detail("RMVPE严格模式: 不启用Harvest/CREPE兜底,仅使用RMVPE原始结果") + + if self.f0_stabilize: + f0, octave_fixed, outlier_fixed = _stabilize_f0( + f0, + max_semitones=self.f0_stabilize_max_semitones, + window=self.f0_stabilize_window, + octave_fix=self.f0_stabilize_octave, + ) + if log: + log.detail( + f"F0稳定器完成: 八度修正={octave_fixed}, 跳变修正={outlier_fixed}" + ) + if self.f0_rate_limit: + f0, rate_fixed = _limit_f0_slope( + f0, + max_semitones=self.f0_rate_limit_semitones, + ) + if log: + log.detail(f"F0限速完成: 修正帧={rate_fixed}") + + # with open("test_opt.txt","w")as f:f.write("\n".join([str(i)for i in f0.tolist()])) + f0bak = f0.copy() + + f0_mel = 1127 * np.log(1 + f0 / 700) + f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - f0_mel_min) * 254 / ( + f0_mel_max - f0_mel_min + ) + 1 + f0_mel[f0_mel <= 1] = 1 + f0_mel[f0_mel > 255] = 255 + f0_coarse = np.rint(f0_mel).astype(np.int32) + + if log: + log.detail(f"F0处理完成: coarse shape={f0_coarse.shape}, bak shape={f0bak.shape}") + + # 恢复原始hybrid模式设置 + if restore_hybrid_mode: + self.f0_hybrid_mode = original_hybrid_mode + + return f0_coarse, f0bak + + def vc( + self, + model, + net_g, + sid, + audio0, + pitch, + pitchf, + times, + index, + big_npy, + index_rate, + version, + protect, + energy_ref_db=None, + ): # ,file_index,file_big_npy + if log: + log.detail(f"VC推理: 音频长度={len(audio0)}, 版本={version}, 保护={protect}") + + feats = torch.from_numpy(audio0) + if self.is_half: + feats = feats.half() + else: + feats = feats.float() + if feats.dim() == 2: # double channels + feats = feats.mean(-1) + assert feats.dim() == 1, feats.dim() + feats = feats.view(1, -1) + padding_mask = torch.BoolTensor(feats.shape).to(self.device).fill_(False) + + inputs = { + "source": feats.to(self.device), + "padding_mask": padding_mask, + "output_layer": 9 if version == "v1" else 12, + } + if log: + log.detail(f"HuBERT输出层: {inputs['output_layer']}") + + t0 = ttime() + with torch.no_grad(): + logits = model.extract_features(**inputs) + feats = model.final_proj(logits[0]) if version == "v1" else logits[0] + + if log: + log.detail(f"特征提取完成: shape={feats.shape}") + + if protect < 0.5 and pitch is not None and pitchf is not None: + feats0 = feats.clone() + if ( + not isinstance(index, type(None)) + and not isinstance(big_npy, type(None)) + and index_rate != 0 + ): + if log: + log.detail(f"应用索引检索: index_rate={index_rate}") + npy = feats[0].cpu().numpy() + if self.is_half: + npy = npy.astype("float32") + + # _, I = index.search(npy, 1) + # npy = big_npy[I.squeeze()] + + score, ix = index.search(npy, k=8) + weight = np.square(1 / score) + weight /= weight.sum(axis=1, keepdims=True) + npy = np.sum(big_npy[ix] * np.expand_dims(weight, axis=2), axis=1) + + if self.is_half: + npy = npy.astype("float16") + feats = ( + torch.from_numpy(npy).unsqueeze(0).to(self.device) * index_rate + + (1 - index_rate) * feats + ) + if log: + log.detail("索引混合完成") + + feats = F.interpolate(feats.permute(0, 2, 1), scale_factor=2).permute(0, 2, 1) + if protect < 0.5 and pitch is not None and pitchf is not None: + feats0 = F.interpolate(feats0.permute(0, 2, 1), scale_factor=2).permute( + 0, 2, 1 + ) + t1 = ttime() + p_len = audio0.shape[0] // self.window + if feats.shape[1] < p_len: + p_len = feats.shape[1] + if pitch is not None and pitchf is not None: + pitch = pitch[:, :p_len] + pitchf = pitchf[:, :p_len] + + if protect < 0.5 and pitch is not None and pitchf is not None: + if log: + log.detail(f"应用保护: 
protect={protect}") + pitchff = _build_protect_mix_curve(pitchf, protect).unsqueeze(-1) + feats = feats * pitchff + feats0 * (1 - pitchff) + feats = feats.to(feats0.dtype) + p_len = torch.tensor([p_len], device=self.device).long() + + # --- 能量感知软门控(所有特征操作完成后、推理前)--- + # 使用连续衰减曲线代替硬二值化,避免静音/有声边界的撕裂伪影。 + _p_len_val = p_len.item() if isinstance(p_len, torch.Tensor) else int(p_len) + _audio_np = audio0.astype(np.float32) + _frame_rms = librosa.feature.rms( + y=_audio_np, frame_length=self.window * 2, hop_length=self.window, center=True + )[0] + if _frame_rms.ndim > 1: + _frame_rms = _frame_rms[0] + if len(_frame_rms) > _p_len_val: + _frame_rms = _frame_rms[:_p_len_val] + elif len(_frame_rms) < _p_len_val: + _frame_rms = np.pad(_frame_rms, (0, _p_len_val - len(_frame_rms)), mode='edge') + + _energy_db = 20.0 * np.log10(_frame_rms + 1e-8) + _ref = energy_ref_db if energy_ref_db is not None else float(np.percentile(_energy_db, 95)) + # Soft gate: sigmoid curve centered at ref-45dB with 6dB transition width. + # Frames well above threshold → gain≈1; frames well below → gain≈0.05 + # (keep a small floor to avoid zero-feature shock to the network). + _silence_center = _ref - 45.0 + _transition_width = 6.0 # dB for the sigmoid ramp + _energy_gate = 1.0 / (1.0 + np.exp(-(_energy_db - _silence_center) / (_transition_width / 4.0))) + # Apply floor: never fully zero features (network handles near-zero better than hard zero) + _energy_gate = np.clip(_energy_gate, 0.05, 1.0) + # Smooth temporally + _sm = np.array([1, 2, 3, 2, 1], dtype=np.float32) + _sm /= _sm.sum() + _energy_gate = np.convolve(_energy_gate, _sm, mode='same')[:_p_len_val] + _energy_gate = np.clip(_energy_gate, 0.05, 1.0) + + # Apply soft gate to features + _feat_len = feats.shape[1] + if len(_energy_gate) > _feat_len: + _feat_gate = _energy_gate[:_feat_len] + elif len(_energy_gate) < _feat_len: + _feat_gate = np.pad(_energy_gate, (0, _feat_len - len(_energy_gate)), mode='constant', constant_values=1.0) + else: + _feat_gate = _energy_gate + _gate_t = torch.from_numpy(_feat_gate.astype(np.float32)).to(feats.device).unsqueeze(0).unsqueeze(-1) + feats = feats * _gate_t + + # F0 soft gating: consistently soft-attenuate both pitch confidence and pitch value + if pitch is not None and pitchf is not None: + _pitch_len = pitch.shape[1] + if len(_energy_gate) > _pitch_len: + _f0_gate = _energy_gate[:_pitch_len] + elif len(_energy_gate) < _pitch_len: + _f0_gate = np.pad(_energy_gate, (0, _pitch_len - len(_energy_gate)), mode='constant', constant_values=1.0) + else: + _f0_gate = _energy_gate + _f0_gate_t = torch.from_numpy(_f0_gate.astype(np.float32)).to(pitch.device).unsqueeze(0) + pitchf = pitchf * _f0_gate_t + # Soft-blend pitch toward silence bin (1) instead of hard switch + _silence_pitch = torch.ones_like(pitch) + _blend = _f0_gate_t.unsqueeze(-1) if _f0_gate_t.dim() < pitch.dim() else _f0_gate_t + pitch = (pitch.float() * _blend + _silence_pitch.float() * (1.0 - _blend)).long() + + if log: + log.detail("执行神经网络推理...") + + with torch.no_grad(): + hasp = pitch is not None and pitchf is not None + arg = (feats, p_len, pitch, pitchf, sid) if hasp else (feats, p_len, sid) + audio1 = (net_g.infer(*arg)[0][0, 0]).data.cpu().float().numpy() + del hasp, arg + del feats, p_len, padding_mask + if torch.cuda.is_available(): + torch.cuda.empty_cache() + t2 = ttime() + times[0] += t1 - t0 + times[2] += t2 - t1 + + if log: + log.detail(f"VC推理完成: 输出长度={len(audio1)}, 耗时={t2-t0:.3f}s") + + return audio1 + + def pipeline( + self, + model, + net_g, + sid, + audio, 
+ input_audio_path, + times, + f0_up_key, + f0_method, + file_index, + index_rate, + if_f0, + filter_radius, + tgt_sr, + resample_sr, + rms_mix_rate, + version, + protect, + f0_file=None, + ): + if log: + log.progress("开始推理管道...") + log.detail(f"输入音频: {input_audio_path}") + log.detail(f"音频长度: {len(audio)} 样本 ({len(audio)/16000:.2f}秒)") + log.config(f"F0方法: {f0_method}, 音调偏移: {f0_up_key}") + log.config(f"索引率: {index_rate}, 滤波半径: {filter_radius}") + log.config(f"目标采样率: {tgt_sr}Hz, 重采样: {resample_sr}Hz") + log.config(f"RMS混合率: {rms_mix_rate}, 保护: {protect}") + log.config(f"版本: {version}, F0启用: {if_f0}") + + if ( + file_index != "" + # and file_big_npy != "" + # and os.path.exists(file_big_npy) == True + and os.path.exists(file_index) + and index_rate != 0 + ): + try: + if log: + log.model(f"加载索引文件: {file_index}") + index = faiss.read_index(file_index) + # big_npy = np.load(file_big_npy) + big_npy = index.reconstruct_n(0, index.ntotal) + if log: + log.detail(f"索引加载完成: {index.ntotal} 个向量") + except: + traceback.print_exc() + if log: + log.warning("索引加载失败,将不使用索引") + index = big_npy = None + else: + index = big_npy = None + if log: + log.detail("未使用索引文件") + + if log: + log.detail("应用高通滤波...") + audio = signal.filtfilt(bh, ah, audio) + + # 全局能量参考(用于分段 vc() 的能量遮蔽阈值一致性) + _global_rms = librosa.feature.rms( + y=audio, frame_length=self.window * 2, hop_length=self.window, center=True + )[0] + if _global_rms.ndim > 1: + _global_rms = _global_rms[0] + if _global_rms.size > 0: + _global_energy_db = 20.0 * np.log10(_global_rms + 1e-8) + _global_ref_db = float(np.percentile(_global_energy_db, 95)) + else: + _global_ref_db = -20.0 + + audio_pad = np.pad(audio, (self.window // 2, self.window // 2), mode="reflect") + opt_ts = [] + if not self.disable_chunking and audio_pad.shape[0] > self.t_max: + if log: + log.detail(f"音频较长,进行分段处理: {audio_pad.shape[0]} > {self.t_max}") + audio_sum = np.zeros_like(audio) + for i in range(self.window): + audio_sum += np.abs(audio_pad[i : i - self.window]) + for t in range(self.t_center, audio.shape[0], self.t_center): + opt_ts.append( + t + - self.t_query + + np.where( + audio_sum[t - self.t_query : t + self.t_query] + == audio_sum[t - self.t_query : t + self.t_query].min() + )[0][0] + ) + if log: + log.detail(f"分段数量: {len(opt_ts) + 1}") + else: + if log: + if self.disable_chunking: + log.detail("已禁用分段,单次处理") + else: + log.detail("音频较短,单次处理") + + s = 0 + audio_opt = [] + t = None + t1 = ttime() + audio_pad = np.pad(audio, (self.t_pad, self.t_pad), mode="reflect") + p_len = audio_pad.shape[0] // self.window + if log: + log.detail(f"填充后音频长度: {audio_pad.shape[0]}, p_len: {p_len}") + + inp_f0 = None + if hasattr(f0_file, "name"): + try: + if log: + log.detail(f"加载自定义F0文件: {f0_file.name}") + with open(f0_file.name, "r") as f: + lines = f.read().strip("\n").split("\n") + inp_f0 = [] + for line in lines: + inp_f0.append([float(i) for i in line.split(",")]) + inp_f0 = np.array(inp_f0, dtype="float32") + if log: + log.detail(f"自定义F0加载完成: {inp_f0.shape}") + except: + traceback.print_exc() + if log: + log.warning("自定义F0加载失败") + + sid = torch.tensor(sid, device=self.device).unsqueeze(0).long() + pitch, pitchf = None, None + if if_f0 == 1: + if log: + log.progress("提取基频(F0)...") + pitch, pitchf = self.get_f0( + input_audio_path, + audio_pad, + p_len, + f0_up_key, + f0_method, + filter_radius, + inp_f0, + ) + pitch = pitch[:p_len] + pitchf = pitchf[:p_len] + if "mps" not in str(self.device) or "xpu" not in str(self.device): + pitchf = pitchf.astype(np.float32) + pitch = torch.tensor(pitch, 
device=self.device).unsqueeze(0).long() + pitchf = torch.tensor(pitchf, device=self.device).unsqueeze(0).float() + if log: + log.success("F0提取完成") + t2 = ttime() + times[1] += t2 - t1 + if log: + log.detail(f"F0提取耗时: {t2-t1:.3f}s") + + # 分段推理(带交叉淡入淡出消除边界撕裂) + segment_count = len(opt_ts) + 1 + current_segment = 0 + + # Crossfade length at target rate (~12ms). Each boundary segment + # keeps this many extra samples from the normally-trimmed padding + # region. The overlap between adjacent segments is 2 * _xfade_tgt. + _xfade_tgt = min(int(0.012 * tgt_sr), self.t_pad_tgt // 4) if len(opt_ts) > 0 else 0 + + def _trim_segment(raw, is_first, is_last): + """Trim padding from vc() output, keeping crossfade overlap.""" + left = self.t_pad_tgt if is_first else (self.t_pad_tgt - _xfade_tgt) + right = self.t_pad_tgt if is_last else (self.t_pad_tgt - _xfade_tgt) + return raw[left : -right] if right > 0 else raw[left:] + + for idx, t in enumerate(opt_ts): + current_segment += 1 + if log: + log.progress(f"处理分段 {current_segment}/{segment_count}...") + t = t // self.window * self.window + if if_f0 == 1: + raw = self.vc( + model, + net_g, + sid, + audio_pad[s : t + self.t_pad2 + self.window], + pitch[:, s // self.window : (t + self.t_pad2) // self.window], + pitchf[:, s // self.window : (t + self.t_pad2) // self.window], + times, + index, + big_npy, + index_rate, + version, + protect, + energy_ref_db=_global_ref_db, + ) + else: + raw = self.vc( + model, + net_g, + sid, + audio_pad[s : t + self.t_pad2 + self.window], + None, + None, + times, + index, + big_npy, + index_rate, + version, + protect, + energy_ref_db=_global_ref_db, + ) + audio_opt.append(_trim_segment(raw, is_first=(idx == 0), is_last=False)) + s = t + + # 最后一段 + if log: + log.progress(f"处理分段 {segment_count}/{segment_count}...") + if if_f0 == 1: + raw = self.vc( + model, + net_g, + sid, + audio_pad[t:], + pitch[:, t // self.window :] if t is not None else pitch, + pitchf[:, t // self.window :] if t is not None else pitchf, + times, + index, + big_npy, + index_rate, + version, + protect, + energy_ref_db=_global_ref_db, + ) + else: + raw = self.vc( + model, + net_g, + sid, + audio_pad[t:], + None, + None, + times, + index, + big_npy, + index_rate, + version, + protect, + energy_ref_db=_global_ref_db, + ) + audio_opt.append(_trim_segment(raw, is_first=(len(opt_ts) == 0), is_last=True)) + + if log: + log.detail("合并音频分段...") + + # Overlap-add crossfade: adjacent segments share 2*_xfade_tgt + # samples of overlapping content (same original audio region + # processed as part of different chunks). Linear crossfade + # ensures amplitude-preserving smooth transition. 
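+        # Illustrative note (added commentary, not executed code): with
+        # tgt_sr = 48000 and a sufficiently large t_pad_tgt,
+        #   _xfade_tgt = min(int(0.012 * 48000), self.t_pad_tgt // 4) = 576 samples,
+        # so adjacent segments overlap by 2 * 576 = 1152 samples.  Across that
+        # overlap the earlier segment is weighted by np.linspace(1.0, 0.0, 1152)
+        # and the later one by the complementary ramp, so the two gains always sum
+        # to 1.0 and the seam stays amplitude-neutral.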
+ if len(audio_opt) > 1 and _xfade_tgt > 0: + overlap = 2 * _xfade_tgt + result = audio_opt[0] + for seg in audio_opt[1:]: + xf = min(overlap, len(result), len(seg)) + if xf > 1: + fade_out = np.linspace(1.0, 0.0, xf, dtype=np.float32) + fade_in = 1.0 - fade_out + blended = result[-xf:] * fade_out + seg[:xf] * fade_in + result = np.concatenate([result[:-xf], blended, seg[xf:]]) + else: + result = np.concatenate([result, seg]) + audio_opt = result + else: + audio_opt = np.concatenate(audio_opt) if audio_opt else np.array([], dtype=np.float32) + + if rms_mix_rate != 1: + if log: + log.detail(f"应用RMS混合: rate={rms_mix_rate}") + audio_opt = change_rms(audio, 16000, audio_opt, tgt_sr, rms_mix_rate) + + if tgt_sr != resample_sr >= 16000: + if log: + log.detail(f"重采样: {tgt_sr}Hz -> {resample_sr}Hz") + audio_opt = librosa.resample( + audio_opt, orig_sr=tgt_sr, target_sr=resample_sr + ) + + peak_before_clip = float(np.max(np.abs(audio_opt))) + audio_opt = soft_clip(audio_opt, threshold=0.9, ceiling=0.99) + if log and peak_before_clip > 0.9: + peak_after_clip = float(np.max(np.abs(audio_opt))) + log.detail( + f"音频软削波: 峰值 {peak_before_clip:.4f} -> {peak_after_clip:.4f}" + ) + audio_opt = np.clip(audio_opt, -0.99, 0.99) + audio_opt = (audio_opt * 32767.0).astype(np.int16) + + del pitch, pitchf, sid + if torch.cuda.is_available(): + torch.cuda.empty_cache() + if log: + log.detail("已清理CUDA缓存") + + if log: + log.success(f"推理管道完成: 输出长度={len(audio_opt)} 样本") + + return audio_opt + diff --git a/infer/modules/vc/utils.py b/infer/modules/vc/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..c12a27a10e97aa236d248bb08089123ef8b8fc0f --- /dev/null +++ b/infer/modules/vc/utils.py @@ -0,0 +1,104 @@ +import os +import logging +import functools +import torch + +logger = logging.getLogger(__name__) + +try: + from fairseq import checkpoint_utils + FAIRSEQ_AVAILABLE = True +except Exception: + FAIRSEQ_AVAILABLE = False + + +def _patch_torch_load(): + """Patch torch.load to default weights_only=False for fairseq compatibility (PyTorch 2.6+).""" + _original = torch.load + + @functools.wraps(_original) + def _patched(*args, **kwargs): + kwargs.setdefault("weights_only", False) + return _original(*args, **kwargs) + + return _original, _patched + + +def get_index_path_from_model(sid): + return next( + ( + f + for f in [ + os.path.join(root, name) + for root, _, files in os.walk(os.getenv("index_root"), topdown=False) + for name in files + if name.endswith(".index") and "trained" not in name + ] + if sid.split(".")[0] in f + ), + "", + ) + + +def load_hubert(config): + if FAIRSEQ_AVAILABLE: + _original, _patched = _patch_torch_load() + torch.load = _patched + try: + models, _, _ = checkpoint_utils.load_model_ensemble_and_task( + ["assets/hubert/hubert_base.pt"], + suffix="", + ) + finally: + torch.load = _original + hubert_model = models[0] + hubert_model = hubert_model.to(config.device) + if config.is_half: + hubert_model = hubert_model.half() + else: + hubert_model = hubert_model.float() + return hubert_model.eval() + + try: + import torchaudio + + class HubertWrapper: + def __init__(self, model): + self.model = model + self.final_proj = getattr(model, "final_proj", torch.nn.Identity()) + + def extract_features(self, source, padding_mask=None, output_layer=None): + feats, _ = self.model.extract_features(source) + if output_layer is None: + idx = -1 + else: + idx = min(output_layer - 1, len(feats) - 1) + return (feats[idx], None) + + def to(self, device): + self.model = self.model.to(device) + 
return self + + def half(self): + self.model = self.model.half() + return self + + def float(self): + self.model = self.model.float() + return self + + def eval(self): + self.model.eval() + return self + + model = torchaudio.pipelines.HUBERT_BASE.get_model() + hubert_model = HubertWrapper(model).to(config.device) + if config.is_half: + hubert_model = hubert_model.half() + else: + hubert_model = hubert_model.float() + return hubert_model.eval() + except Exception as e: + raise RuntimeError( + "HuBERT 模型加载失败,请检查 fairseq 和 torchaudio 是否已安装" + ) from e diff --git a/infer/official_adapter.py b/infer/official_adapter.py new file mode 100644 index 0000000000000000000000000000000000000000..776b3e0814d160a22ee380b161c86714227b2967 --- /dev/null +++ b/infer/official_adapter.py @@ -0,0 +1,678 @@ +# -*- coding: utf-8 -*- +""" +Adapter for official RVC WebUI modules (VC + UVR5). +""" +from __future__ import annotations + +import json +import os +import re +import shutil +import subprocess +import sys +from pathlib import Path +from typing import Optional, Tuple + +import soundfile as sf + +from configs.config import Config as OfficialConfig +from lib.logger import log + + +def _load_app_config(root_dir: Path) -> dict: + config_path = root_dir / "configs" / "config.json" + if config_path.exists(): + with open(config_path, "r", encoding="utf-8") as f: + return json.load(f) + return {} + + +def _get_cfg_value(app_cfg: dict, key: str, default): + cover_cfg = app_cfg.get("cover") + if isinstance(cover_cfg, dict) and key in cover_cfg: + return cover_cfg.get(key, default) + return app_cfg.get(key, default) + + +def _to_float(value, default): + try: + return float(value) + except (TypeError, ValueError): + return float(default) + + +def _resolve_index_path(model_path: Path, index_path: Optional[str]) -> Optional[Path]: + """Best-effort resolve of the matching FAISS index for a model.""" + if index_path: + idx_path = Path(index_path) + if idx_path.exists(): + return idx_path + + direct_candidate = model_path.with_suffix(".index") + if direct_candidate.exists(): + return direct_candidate + + index_files = list(model_path.parent.glob("*.index")) + if not index_files: + return None + if len(index_files) == 1: + return index_files[0] + + def _normalize_name(text: str) -> str: + return re.sub(r"[^a-z0-9]+", "", text.lower()) + + def _tokenize_name(text: str): + return [token for token in re.split(r"[^a-z0-9]+", text.lower()) if len(token) >= 2] + + model_norm = _normalize_name(model_path.stem) + model_tokens = set(_tokenize_name(model_path.stem)) + + best_match = None + best_score = -1 + for idx in index_files: + idx_norm = _normalize_name(idx.stem) + idx_tokens = set(_tokenize_name(idx.stem)) + score = 0 + if idx_norm == model_norm: + score += 1000 + if model_norm and (model_norm in idx_norm or idx_norm in model_norm): + score += 300 + score += len(model_tokens & idx_tokens) * 40 + if "added" in idx.stem.lower(): + score += 10 + if score > best_score: + best_score = score + best_match = idx + + if best_match is not None and best_score > 0: + return best_match + return None + + +def setup_official_env(root_dir: Path) -> dict: + """Set env vars used by official modules.""" + log.detail("配置官方模块环境变量...") + app_cfg = _load_app_config(root_dir) + paths = app_cfg.get("paths", {}) + + weights_dir = root_dir / app_cfg.get("weights_dir", paths.get("weights", "assets/weights")) + rmvpe_root = root_dir / app_cfg.get("rmvpe_path", paths.get("rmvpe", "assets/rmvpe/rmvpe.pt")) + rmvpe_root = rmvpe_root.parent + uvr5_root = 
root_dir / "assets" / "uvr5_weights" + + official_models = weights_dir / "official_models" + official_indexes = weights_dir / "official_indexes" + official_models.mkdir(parents=True, exist_ok=True) + official_indexes.mkdir(parents=True, exist_ok=True) + uvr5_root.mkdir(parents=True, exist_ok=True) + + os.environ["weight_root"] = str(official_models) + os.environ["index_root"] = str(official_indexes) + os.environ["outside_index_root"] = str(official_indexes) + os.environ["rmvpe_root"] = str(rmvpe_root) + os.environ["weight_uvr5_root"] = str(uvr5_root) + + log.detail(f"模型目录: {official_models}") + log.detail(f"索引目录: {official_indexes}") + log.detail(f"UVR5目录: {uvr5_root}") + log.detail(f"RMVPE目录: {rmvpe_root}") + + # Ensure official config cache directories exist. + inuse_root = root_dir / "configs" / "inuse" + (inuse_root / "v1").mkdir(parents=True, exist_ok=True) + (inuse_root / "v2").mkdir(parents=True, exist_ok=True) + + return { + "official_models": official_models, + "official_indexes": official_indexes, + "uvr5_root": uvr5_root, + } + + +def export_model_to_official( + official_models: Path, + official_indexes: Path, + model_path: str, + index_path: Optional[str] +) -> Tuple[str, Optional[str]]: + """Copy model/index into official layout and return sid + index path.""" + model_path = Path(model_path) + sid = f"{model_path.stem}.pth" + target_model = official_models / sid + + log.detail(f"导出模型到官方目录: {sid}") + + if not target_model.exists() or target_model.stat().st_size != model_path.stat().st_size: + log.detail(f"复制模型文件: {model_path} -> {target_model}") + shutil.copy(model_path, target_model) + else: + log.detail(f"模型文件已存在,跳过复制") + + target_index_path = None + resolved_index = _resolve_index_path(model_path, index_path) + if resolved_index is not None: + if index_path and Path(index_path).exists(): + log.detail(f"使用指定索引文件: {resolved_index.name}") + else: + log.detail(f"自动匹配索引文件: {resolved_index.name}") + target_index = official_indexes / f"{model_path.stem}.index" + if not target_index.exists() or target_index.stat().st_size != resolved_index.stat().st_size: + log.detail(f"复制索引文件: {resolved_index} -> {target_index}") + shutil.copy(resolved_index, target_index) + else: + log.detail("索引文件已存在,跳过复制") + target_index_path = str(target_index) + + return sid, target_index_path + + +def _resolve_uvr5_model(uvr5_root: Path, model_name: Optional[str]) -> Optional[str]: + """Resolve UVR5 model name (without extension).""" + if model_name: + stem = model_name.replace(".pth", "").replace(".onnx", "") + cand_pth = uvr5_root / f"{stem}.pth" + cand_onnx = uvr5_root / f"{stem}.onnx" + if cand_pth.exists() or cand_onnx.exists(): + return stem + + for name in os.listdir(uvr5_root): + if name.endswith(".pth") or "onnx" in name: + return name.replace(".pth", "").replace(".onnx", "") + return None + + +def separate_uvr5( + input_audio: str, + temp_dir: Path, + model_name: Optional[str], + agg: int = 10, + fmt: str = "wav" +) -> Tuple[str, str]: + """Run UVR5 separation and return vocals/ins paths.""" + log.progress("开始UVR5人声分离...") + log.detail(f"输入音频: {input_audio}") + log.detail(f"临时目录: {temp_dir}") + log.config(f"激进度: {agg}, 输出格式: {fmt}") + + setup_official_env(Path(__file__).parent.parent) + from infer.modules.uvr5.modules import uvr as uvr5_run + try: + import ffmpeg # noqa: F401 + except Exception as e: + raise ImportError("请先安装 ffmpeg-python") from e + temp_dir.mkdir(parents=True, exist_ok=True) + vocals_dir = temp_dir / "vocal" + ins_dir = temp_dir / "ins" + vocals_dir.mkdir(parents=True, 
exist_ok=True) + ins_dir.mkdir(parents=True, exist_ok=True) + + log.detail(f"人声输出目录: {vocals_dir}") + log.detail(f"伴奏输出目录: {ins_dir}") + + uvr5_root = Path(os.environ["weight_uvr5_root"]) + model_name = _resolve_uvr5_model(uvr5_root, model_name) + if not model_name: + raise FileNotFoundError( + f"UVR5 模型未找到,请将模型放入: {uvr5_root}" + ) + + log.model(f"使用UVR5模型: {model_name}") + + # Official UVR5 expects a directory input; ensure only one file is present. + input_dir = temp_dir / "input" + if input_dir.exists(): + shutil.rmtree(input_dir) + input_dir.mkdir(parents=True, exist_ok=True) + input_file = input_dir / Path(input_audio).name + log.detail(f"复制输入文件到: {input_file}") + shutil.copy2(input_audio, input_file) + + log.progress("正在执行UVR5分离...") + # generator yields progress + for progress_info in uvr5_run( + model_name, + str(input_dir), + str(vocals_dir), + [], + str(ins_dir), + agg, + fmt, + ): + if progress_info: + log.detail(f"UVR5进度: {progress_info}") + + vocal_files = sorted(vocals_dir.glob(f"*.{fmt}"), key=lambda p: p.stat().st_mtime) + ins_files = sorted(ins_dir.glob(f"*.{fmt}"), key=lambda p: p.stat().st_mtime) + if not vocal_files or not ins_files: + raise RuntimeError("UVR5 分离失败,未生成输出文件") + + log.success(f"UVR5分离完成") + log.audio(f"人声文件: {vocal_files[-1].name}") + log.audio(f"伴奏文件: {ins_files[-1].name}") + + return str(vocal_files[-1]), str(ins_files[-1]) + + +def convert_vocals_official( + vocals_path: str, + output_path: str, + model_path: str, + index_path: Optional[str], + f0_method: str, + pitch_shift: int, + index_rate: float, + filter_radius: int, + rms_mix_rate: float, + protect: float, + speaker_id: int = 0, + repair_profile: bool = False, +) -> str: + """Run official VC pipeline on vocals.""" + rms_mix_rate = float(max(0.0, min(1.0, rms_mix_rate))) + # Official vc pipeline uses the opposite convention: 1=off, 0=strongest. + official_rms_mix_rate = 1.0 - rms_mix_rate + + log.progress("开始官方VC人声转换...") + log.detail(f"输入人声: {vocals_path}") + log.detail(f"输出路径: {output_path}") + log.model(f"RVC模型: {Path(model_path).name}") + if index_path: + log.model(f"索引文件: {Path(index_path).name}") + + log.config(f"F0方法: {f0_method}") + log.config(f"音调偏移: {pitch_shift} 半音") + log.config(f"索引率: {index_rate}") + log.config(f"滤波半径: {filter_radius}") + log.config(f"RMS混合率: {rms_mix_rate}") + log.config(f"保护系数: {protect}") + if repair_profile: + log.config("唱歌修复: 开启") + + root_dir = Path(__file__).parent.parent + env_paths = setup_official_env(root_dir) + + log.detail("导入官方VC模块...") + from infer.modules.vc.modules import VC + + sid, official_index = export_model_to_official( + env_paths["official_models"], + env_paths["official_indexes"], + model_path, + index_path + ) + log.detail(f"模型SID: {sid}") + if official_index: + log.detail(f"官方索引路径: {official_index}") + + log.detail("初始化官方配置...") + config = OfficialConfig() + app_cfg = _load_app_config(root_dir) + config.disable_chunking = bool(app_cfg.get("disable_chunking", False)) + if "cover" in app_cfg and isinstance(app_cfg["cover"], dict): + config.disable_chunking = bool(app_cfg["cover"].get("disable_chunking", config.disable_chunking)) + config.f0_min = _to_float(_get_cfg_value(app_cfg, "f0_min", 50), 50) + config.f0_max = _to_float(_get_cfg_value(app_cfg, "f0_max", 1100), 1100) + if config.f0_max <= config.f0_min: + config.f0_max = max(config.f0_min + 1.0, 1100.0) + # Keep RMVPE extraction aligned with RVC training pitch embedding range. + # Allowing much wider ranges (e.g. 
1600Hz) often tracks higher harmonics + # instead of the fundamental and introduces synthetic buzzing artifacts. + if f0_method == "rmvpe": + if config.f0_min != 50.0 or config.f0_max != 1100.0: + log.warning( + "检测到RMVPE F0范围偏离RVC训练范围,已强制使用 50-1100Hz 以避免误跟踪高次谐波" + ) + config.f0_min = 50.0 + config.f0_max = 1100.0 + config.rmvpe_threshold = _to_float(_get_cfg_value(app_cfg, "rmvpe_threshold", 0.02), 0.02) + config.f0_energy_threshold_db = _to_float( + _get_cfg_value(app_cfg, "f0_energy_threshold_db", -50), -50 + ) + config.f0_hybrid_mode = str(_get_cfg_value(app_cfg, "f0_hybrid_mode", "off")) + config.crepe_pd_threshold = _to_float( + _get_cfg_value(app_cfg, "crepe_pd_threshold", 0.1), 0.1 + ) + config.crepe_force_ratio = _to_float( + _get_cfg_value(app_cfg, "crepe_force_ratio", 0.05), 0.05 + ) + config.crepe_replace_semitones = _to_float( + _get_cfg_value(app_cfg, "crepe_replace_semitones", 0.0), 0.0 + ) + config.f0_stabilize = bool(_get_cfg_value(app_cfg, "f0_stabilize", False)) + config.f0_stabilize_window = int(_get_cfg_value(app_cfg, "f0_stabilize_window", 2)) + config.f0_stabilize_max_semitones = _to_float( + _get_cfg_value(app_cfg, "f0_stabilize_max_semitones", 6.0), 6.0 + ) + config.f0_stabilize_octave = bool(_get_cfg_value(app_cfg, "f0_stabilize_octave", True)) + config.f0_rate_limit = bool(_get_cfg_value(app_cfg, "f0_rate_limit", False)) + config.f0_rate_limit_semitones = _to_float( + _get_cfg_value(app_cfg, "f0_rate_limit_semitones", 8.0), 8.0 + ) + if repair_profile: + config.is_half = False + config.f0_hybrid_mode = "fallback" + config.f0_energy_threshold_db = -42.0 + config.f0_fallback_context_radius = 12 + config.f0_fallback_repair_gap = 6 + config.f0_fallback_post_gap = 4 + config.f0_fallback_use_crepe = True + config.f0_fallback_crepe_max_ratio = 0.006 + config.f0_fallback_crepe_max_frames = 160 + config.f0_stabilize = True + config.f0_rate_limit = True + log.detail("唱歌修复配置已应用: FP32, 更保守F0兜底, F0稳定器, F0限速") + log.detail(f"设备: {config.device}, 半精度: {config.is_half}") + log.config(f"F0范围: {config.f0_min}-{config.f0_max}Hz") + log.config(f"RMVPE阈值: {config.rmvpe_threshold}") + log.config(f"F0能量阈值: {config.f0_energy_threshold_db}dB") + log.config( + f"F0混合: {config.f0_hybrid_mode}, CREPE阈值: {config.crepe_pd_threshold}, " + f"强制比率: {config.crepe_force_ratio}, 替换阈值(半音): {config.crepe_replace_semitones}" + ) + log.config( + f"F0稳定器: {config.f0_stabilize}, 窗口: {config.f0_stabilize_window}, " + f"最大跳变(半音): {config.f0_stabilize_max_semitones}, " + f"八度修正: {config.f0_stabilize_octave}" + ) + log.config( + f"F0限速: {config.f0_rate_limit}, 最大跳变/帧(半音): {config.f0_rate_limit_semitones}" + ) + + log.model("初始化VC实例...") + vc = VC(config) + + log.progress(f"加载模型: {sid}") + vc.get_vc(sid) + + spk_max = 1 + try: + if getattr(vc, "cpt", None) is not None: + spk_max = int(vc.cpt["config"][-3]) + except Exception: + spk_max = 1 + spk_max = max(1, spk_max) + spk_id = int(max(0, min(spk_max - 1, int(speaker_id)))) + if spk_id != int(speaker_id): + log.warning(f"说话人ID超出范围,已自动修正为 {spk_id} (可用范围: 0-{spk_max - 1})") + log.progress("执行人声转换...") + log.detail(f"说话人ID: {spk_id}") + + info, (sr, audio) = vc.vc_single( + spk_id, + vocals_path, + pitch_shift, + None, + f0_method, + official_index or "", + "", + index_rate, + filter_radius, + 0, + official_rms_mix_rate, + protect, + ) + + if sr is None or audio is None: + log.error(f"VC转换失败: {info}") + raise RuntimeError(info) + + log.detail(f"转换信息: {info}") + log.detail(f"输出采样率: {sr} Hz") + log.detail(f"输出音频长度: {len(audio)} 样本") + + log.progress(f"保存输出文件: 
{output_path}") + sf.write(output_path, audio, sr) + + output_size = Path(output_path).stat().st_size + log.success(f"官方VC转换完成: {output_path}") + log.audio(f"输出文件大小: {output_size / 1024 / 1024:.2f} MB") + + return output_path + + + + + + +def _sync_upstream_reference_asset(src: Path, dst: Path, label: str) -> None: + """Ensure vendored official tree has the same runtime asset expected upstream.""" + if not src.exists(): + raise FileNotFoundError(f"{label} not found: {src}") + dst.parent.mkdir(parents=True, exist_ok=True) + if not dst.exists() or dst.stat().st_size != src.stat().st_size: + shutil.copy2(src, dst) + log.detail(f"同步官方资源: {src.name} -> {dst}") + + + +def setup_upstream_official_env(root_dir: Path) -> dict: + """Prepare vendored upstream RVC layout and environment.""" + log.detail("准备内置官方 RVC 环境...") + official_root = root_dir / "_official_rvc" + if not official_root.exists(): + raise FileNotFoundError(f"Upstream RVC directory not found: {official_root}") + + official_models = official_root / "assets" / "weights" + official_indexes = official_root / "assets" / "indices" + official_rmvpe_root = official_root / "assets" / "rmvpe" + official_hubert_root = official_root / "assets" / "hubert" + official_uvr5_root = official_root / "assets" / "uvr5_weights" + official_models.mkdir(parents=True, exist_ok=True) + official_indexes.mkdir(parents=True, exist_ok=True) + official_rmvpe_root.mkdir(parents=True, exist_ok=True) + official_hubert_root.mkdir(parents=True, exist_ok=True) + official_uvr5_root.mkdir(parents=True, exist_ok=True) + + _sync_upstream_reference_asset( + root_dir / "assets" / "hubert" / "hubert_base.pt", + official_hubert_root / "hubert_base.pt", + "HuBERT model", + ) + _sync_upstream_reference_asset( + root_dir / "assets" / "rmvpe" / "rmvpe.pt", + official_rmvpe_root / "rmvpe.pt", + "RMVPE model", + ) + + os.environ["weight_root"] = str(official_models) + os.environ["index_root"] = str(official_indexes) + os.environ["outside_index_root"] = str(official_indexes) + os.environ["rmvpe_root"] = str(official_rmvpe_root) + os.environ["weight_uvr5_root"] = str(official_uvr5_root) + + log.detail(f"官方根目录: {official_root}") + log.detail(f"官方模型目录: {official_models}") + log.detail(f"官方索引目录: {official_indexes}") + log.detail(f"官方RMVPE目录: {official_rmvpe_root}") + log.detail(f"官方HuBERT目录: {official_hubert_root}") + log.detail(f"官方UVR5目录: {official_uvr5_root}") + + return { + "official_root": official_root, + "official_models": official_models, + "official_indexes": official_indexes, + "official_rmvpe_root": official_rmvpe_root, + "official_hubert_root": official_hubert_root, + "official_uvr5_root": official_uvr5_root, + } + + + +def convert_vocals_official_upstream( + vocals_path: str, + output_path: str, + model_path: str, + index_path: Optional[str], + f0_method: str, + pitch_shift: int, + index_rate: float, + filter_radius: int, + rms_mix_rate: float, + protect: float, + speaker_id: int = 0, +) -> str: + """Run vendored upstream official RVC in an isolated subprocess.""" + root_dir = Path(__file__).parent.parent + env_paths = setup_upstream_official_env(root_dir) + + sid, official_index = export_model_to_official( + env_paths["official_models"], + env_paths["official_indexes"], + model_path, + index_path, + ) + + official_rms_mix_rate = 1.0 - float(rms_mix_rate) + runner_path = root_dir / "infer" / "official_upstream_runner.py" + env = os.environ.copy() + env["PYTHONIOENCODING"] = "utf-8" + + command = [ + sys.executable, + str(runner_path), + "--sid", + sid, + "--vocals-path", + 
str(vocals_path), + "--output-path", + str(output_path), + "--f0-method", + str(f0_method), + "--pitch-shift", + str(int(pitch_shift)), + "--index-path", + str(official_index or ""), + "--index-rate", + str(float(index_rate)), + "--filter-radius", + str(int(filter_radius)), + "--rms-mix-rate", + str(float(official_rms_mix_rate)), + "--protect", + str(float(protect)), + "--speaker-id", + str(int(speaker_id)), + ] + + log.progress("开始内置官方VC转换...") + log.detail(f"官方模型SID: {sid}") + if official_index: + log.detail(f"官方索引路径: {official_index}") + log.detail(f"官方RMS混合率: {official_rms_mix_rate}") + + try: + subprocess.run( + command, + cwd=env_paths["official_root"], + env=env, + check=True, + ) + except subprocess.CalledProcessError as exc: + raise RuntimeError(f"内置官方VC转换失败,退出码: {exc.returncode}") from exc + + output_file = Path(output_path) + if not output_file.exists(): + raise RuntimeError(f"内置官方VC未生成输出文件: {output_path}") + + output_size = output_file.stat().st_size + log.success(f"内置官方VC转换完成: {output_path}") + log.audio(f"输出文件大小: {output_size / 1024 / 1024:.2f} MB") + return output_path + + + +def _sync_upstream_uvr5_model(root_dir: Path, official_uvr5_root: Path, model_name: Optional[str]) -> str: + """Copy the selected UVR5 model into vendored official layout and return the stem.""" + source_root = root_dir / "assets" / "uvr5_weights" + if not source_root.exists(): + raise FileNotFoundError(f"UVR5 模型目录未找到: {source_root}") + + candidates = [] + if model_name: + stem = model_name.replace('.pth', '').replace('.onnx', '') + candidates.extend([source_root / f"{stem}.pth", source_root / f"{stem}.onnx", source_root / stem]) + else: + candidates.extend(sorted(source_root.glob('*.pth'))) + candidates.extend(sorted(source_root.glob('*.onnx'))) + + source_model = next((candidate for candidate in candidates if candidate.exists()), None) + if source_model is None: + raise FileNotFoundError(f"未找到可用的 UVR5 模型: {model_name or '自动选择'}") + + target_model = official_uvr5_root / source_model.name + if not target_model.exists() or target_model.stat().st_size != source_model.stat().st_size: + shutil.copy2(source_model, target_model) + log.detail(f"同步官方UVR5模型: {source_model.name} -> {target_model}") + return source_model.stem + + + +def separate_uvr5_official_upstream( + input_audio: str, + temp_dir: Path, + model_name: Optional[str], + agg: int = 10, + fmt: str = "wav", +) -> Tuple[str, str]: + """Run vendored upstream UVR5 separation in an isolated subprocess.""" + root_dir = Path(__file__).parent.parent + env_paths = setup_upstream_official_env(root_dir) + resolved_model_name = _sync_upstream_uvr5_model(root_dir, env_paths["official_uvr5_root"], model_name) + + temp_dir.mkdir(parents=True, exist_ok=True) + input_dir = temp_dir / "input" + vocals_dir = temp_dir / "vocal" + ins_dir = temp_dir / "ins" + if input_dir.exists(): + shutil.rmtree(input_dir) + input_dir.mkdir(parents=True, exist_ok=True) + vocals_dir.mkdir(parents=True, exist_ok=True) + ins_dir.mkdir(parents=True, exist_ok=True) + input_file = input_dir / Path(input_audio).name + shutil.copy2(input_audio, input_file) + + runner_path = root_dir / "infer" / "official_upstream_uvr_runner.py" + env = os.environ.copy() + env["PYTHONIOENCODING"] = "utf-8" + command = [ + sys.executable, + str(runner_path), + "--model-name", + resolved_model_name, + "--input-dir", + str(input_dir), + "--save-root-vocal", + str(vocals_dir), + "--save-root-ins", + str(ins_dir), + "--agg", + str(int(agg)), + "--format", + str(fmt), + ] + + log.progress("开始内置官方UVR5分离...") + 
log.detail(f"官方UVR5模型: {resolved_model_name}") + log.detail(f"官方UVR5输入目录: {input_dir}") + log.detail(f"官方UVR5人声输出: {vocals_dir}") + log.detail(f"官方UVR5伴奏输出: {ins_dir}") + + try: + subprocess.run( + command, + cwd=env_paths["official_root"], + env=env, + check=True, + ) + except subprocess.CalledProcessError as exc: + raise RuntimeError(f"内置官方UVR5分离失败,退出码: {exc.returncode}") from exc + + vocal_files = sorted(vocals_dir.glob(f"*.{fmt}"), key=lambda p: p.stat().st_mtime) + ins_files = sorted(ins_dir.glob(f"*.{fmt}"), key=lambda p: p.stat().st_mtime) + if not vocal_files or not ins_files: + raise RuntimeError("内置官方UVR5分离失败,未生成输出文件") + + log.success("内置官方UVR5分离完成") + log.audio(f"人声文件: {vocal_files[-1].name}") + log.audio(f"伴奏文件: {ins_files[-1].name}") + return str(vocal_files[-1]), str(ins_files[-1]) diff --git a/infer/official_upstream_runner.py b/infer/official_upstream_runner.py new file mode 100644 index 0000000000000000000000000000000000000000..8fa433aed2da58e583ca610befcad123d36ce482 --- /dev/null +++ b/infer/official_upstream_runner.py @@ -0,0 +1,95 @@ +# -*- coding: utf-8 -*- +"""Subprocess runner for vendored upstream RVC conversion.""" +from __future__ import annotations + +import argparse +import logging +import os +import re +import sys +from pathlib import Path + +import soundfile as sf + + +def parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser(description="Run vendored upstream RVC VC") + parser.add_argument("--sid", required=True) + parser.add_argument("--vocals-path", required=True) + parser.add_argument("--output-path", required=True) + parser.add_argument("--f0-method", required=True) + parser.add_argument("--pitch-shift", type=int, required=True) + parser.add_argument("--index-path", default="") + parser.add_argument("--index-rate", type=float, required=True) + parser.add_argument("--filter-radius", type=int, required=True) + parser.add_argument("--rms-mix-rate", type=float, required=True) + parser.add_argument("--protect", type=float, required=True) + parser.add_argument("--speaker-id", type=int, required=True) + return parser.parse_args() + + +def main() -> int: + args = parse_args() + repo_root = Path(__file__).resolve().parent.parent + official_root = repo_root / "_official_rvc" + + logging.basicConfig(level=logging.INFO, format="%(asctime)s | %(levelname)s | %(name)s | %(message)s") + + sys.path.insert(0, str(official_root)) + os.chdir(official_root) + sys.argv = [sys.argv[0]] + + from configs.config import Config # type: ignore + from infer.modules.vc.modules import VC # type: ignore + + config = Config() + vc = VC(config) + vc.get_vc(args.sid) + + spk_max = 1 + try: + if getattr(vc, "cpt", None) is not None: + spk_max = int(vc.cpt["config"][-3]) + except Exception: + spk_max = 1 + spk_id = max(0, min(max(1, spk_max) - 1, int(args.speaker_id))) + + info, (sr, audio) = vc.vc_single( + spk_id, + args.vocals_path, + args.pitch_shift, + None, + args.f0_method, + args.index_path, + "", + args.index_rate, + args.filter_radius, + 0, + args.rms_mix_rate, + args.protect, + ) + if sr is None or audio is None: + raise RuntimeError(info) + + output_path = Path(args.output_path) + output_path.parent.mkdir(parents=True, exist_ok=True) + sf.write(output_path, audio, sr) + + match = re.search( + r"Time:\s*npy:\s*([0-9.]+)s,\s*f0:\s*([0-9.]+)s,\s*infer:\s*([0-9.]+)s\.", + str(info), + re.IGNORECASE | re.MULTILINE, + ) + print("转换成功。", flush=True) + if args.index_path: + print(f"索引:{args.index_path}", flush=True) + if match: + print( + f"耗时:npy {match.group(1)}s,f0 
{match.group(2)}s,推理 {match.group(3)}s", + flush=True, + ) + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/infer/official_upstream_uvr_runner.py b/infer/official_upstream_uvr_runner.py new file mode 100644 index 0000000000000000000000000000000000000000..ad8cf05cb9874708251b26b6b3c0ad2503896fd5 --- /dev/null +++ b/infer/official_upstream_uvr_runner.py @@ -0,0 +1,51 @@ +# -*- coding: utf-8 -*- +"""Subprocess runner for vendored upstream UVR5 separation.""" +from __future__ import annotations + +import argparse +import logging +import os +import sys +from pathlib import Path + + +def parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser(description="Run vendored upstream UVR5 separation") + parser.add_argument("--model-name", required=True) + parser.add_argument("--input-dir", required=True) + parser.add_argument("--save-root-vocal", required=True) + parser.add_argument("--save-root-ins", required=True) + parser.add_argument("--agg", type=int, required=True) + parser.add_argument("--format", required=True) + return parser.parse_args() + + +def main() -> int: + args = parse_args() + repo_root = Path(__file__).resolve().parent.parent + official_root = repo_root / "_official_rvc" + + logging.basicConfig(level=logging.INFO, format="%(asctime)s | %(levelname)s | %(name)s | %(message)s") + + sys.path.insert(0, str(official_root)) + os.chdir(official_root) + sys.argv = [sys.argv[0]] + + from infer.modules.uvr5.modules import uvr # type: ignore + + for progress_info in uvr( + args.model_name, + args.input_dir, + args.save_root_vocal, + [], + args.save_root_ins, + args.agg, + args.format, + ): + if progress_info: + print(progress_info, flush=True) + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/infer/pipeline.py b/infer/pipeline.py new file mode 100644 index 0000000000000000000000000000000000000000..c737dac8049192bd9b87a24286f9d8132e5b4ee7 --- /dev/null +++ b/infer/pipeline.py @@ -0,0 +1,1183 @@ +# -*- coding: utf-8 -*- +""" +RVC 推理管道 - 端到端 AI 翻唱 +""" +import os +import gc +import torch +import numpy as np +import faiss +from pathlib import Path +from typing import Optional, Tuple, Union +from scipy import signal as sp_signal + +from lib.audio import load_audio, save_audio, normalize_audio, soft_clip +from lib.device import get_device, empty_device_cache, supports_fp16 +from lib.logger import log +from infer.f0_extractor import get_f0_extractor, shift_f0, F0Method + +# 48Hz 高通 Butterworth 滤波器(与官方管道一致,去除低频隆隆声) +_bh, _ah = sp_signal.butter(N=5, Wn=48, btype="high", fs=16000) + + +class VoiceConversionPipeline: + """RVC 推理管道""" + + def __init__(self, device: str = "cuda"): + """ + 初始化管道 + + Args: + device: 计算设备 ("cuda" 或 "cpu") + """ + self.device = get_device(device) + self.hubert_model = None + self.hubert_model_type = None + self.hubert_layer = 12 + self.voice_model = None + self.index = None + self.f0_extractor = None + self.spk_count = 1 + self.model_version = "v2" # 默认 v2(768 维) + + # 默认参数 + self.sample_rate = 16000 # HuBERT 输入采样率 + self.output_sr = 48000 # 输出采样率 + + def unload_hubert(self): + """卸载 HuBERT 模型释放显存""" + if self.hubert_model is not None: + self.hubert_model.cpu() + del self.hubert_model + self.hubert_model = None + self.hubert_model_type = None + gc.collect() + empty_device_cache(self.device) + + def unload_f0_extractor(self): + """卸载 F0 提取器释放显存""" + if self.f0_extractor is not None: + # RMVPEExtractor.model 是 RMVPE 包装类,内部有 model 和 mel_extractor + if hasattr(self.f0_extractor, 'model') and 
self.f0_extractor.model is not None: + rmvpe = self.f0_extractor.model + # 卸载内部的 E2E 模型 + if hasattr(rmvpe, 'model') and rmvpe.model is not None: + rmvpe.model.cpu() + del rmvpe.model + rmvpe.model = None + # 卸载 mel_extractor + if hasattr(rmvpe, 'mel_extractor') and rmvpe.mel_extractor is not None: + rmvpe.mel_extractor.cpu() + del rmvpe.mel_extractor + rmvpe.mel_extractor = None + del self.f0_extractor.model + self.f0_extractor.model = None + del self.f0_extractor + self.f0_extractor = None + gc.collect() + empty_device_cache(self.device) + + def unload_voice_model(self): + """卸载语音模型释放显存""" + if self.voice_model is not None: + self.voice_model.cpu() + del self.voice_model + self.voice_model = None + gc.collect() + empty_device_cache(self.device) + + def unload_all(self): + """卸载所有模型""" + self.unload_hubert() + self.unload_f0_extractor() + self.unload_voice_model() + self.index = None + + def load_hubert(self, model_path: str): + """ + 加载 HuBERT 模型 + + Args: + model_path: HuBERT 模型路径(可以是本地 .pt 文件或 Hugging Face 模型名) + """ + # 优先使用 fairseq 兼容实现(官方实现) + if os.path.isfile(model_path): + try: + from fairseq import checkpoint_utils + + models, _, _ = checkpoint_utils.load_model_ensemble_and_task( + [model_path], + suffix="" + ) + model = models[0] + model = model.to(self.device).eval() + self.hubert_model = model + self.hubert_model_type = "fairseq" + log.info(f"HuBERT 模型已加载: {model_path} ({self.device})") + return + except Exception as e: + log.warning(f"fairseq 加载失败,尝试 torchaudio: {e}") + + try: + import torchaudio + + bundle = torchaudio.pipelines.HUBERT_BASE + model = bundle.get_model() + model = model.to(self.device).eval() + self.hubert_model = model + self.hubert_model_type = "torchaudio" + log.info( + f"HuBERT 模型已加载: torchaudio HUBERT_BASE ({self.device})" + ) + return + except Exception as e: + log.warning(f"torchaudio 加载失败,尝试 transformers: {e}") + + from transformers import HubertModel + + if os.path.isfile(model_path): + log.info("检测到本地模型文件,将使用 Hugging Face 预训练模型替代") + model_name = "facebook/hubert-base-ls960" + else: + model_name = model_path + + try: + self.hubert_model = HubertModel.from_pretrained(model_name) + except Exception as e: + log.warning(f"从网络加载失败,尝试使用本地缓存: {e}") + self.hubert_model = HubertModel.from_pretrained( + model_name, + local_files_only=True + ) + self.hubert_model = self.hubert_model.to(self.device).eval() + self.hubert_model_type = "transformers" + log.info(f"HuBERT 模型已加载: {model_name} ({self.device})") + + def load_voice_model(self, model_path: str) -> dict: + """ + 加载语音模型 + + Args: + model_path: 模型文件路径 (.pth) + + Returns: + dict: 模型信息 + """ + log.debug(f"正在加载语音模型: {model_path}") + cpt = torch.load(model_path, map_location="cpu", weights_only=False) + + log.debug(f"模型文件 keys: {cpt.keys()}") + + # 提取模型配置 + config = cpt.get("config", []) + self.output_sr = cpt.get("sr", 48000) + + log.debug(f"config 类型: {type(config)}, 内容: {config}") + log.debug(f"采样率: {self.output_sr}") + + # 处理 list 格式的 config(RVC v2 标准格式) + if isinstance(config, list) and len(config) >= 18: + model_config = { + "spec_channels": config[0], + "segment_size": config[1], + "inter_channels": config[2], + "hidden_channels": config[3], + "filter_channels": config[4], + "n_heads": config[5], + "n_layers": config[6], + "kernel_size": config[7], + "p_dropout": config[8], + "resblock": config[9], + "resblock_kernel_sizes": config[10], + "resblock_dilation_sizes": config[11], + "upsample_rates": config[12], + "upsample_initial_channel": config[13], + "upsample_kernel_sizes": config[14], + 
"spk_embed_dim": config[15], + "gin_channels": config[16], + } + # 使用 config 中的采样率(如果有) + if len(config) > 17: + self.output_sr = config[17] + elif isinstance(config, dict): + # 兼容 dict 格式 + model_config = config + else: + # 使用默认值 + log.warning("无法解析 config,使用默认值") + model_config = {} + + log.debug(f"解析后的配置: {model_config}") + + # 根据gin_channels选择正确的合成器 + # v1模型: gin_channels=256, 使用256维HuBERT特征 + # v2模型: gin_channels=256, 使用768维HuBERT特征 + # 判断依据:检查模型文件中是否有'version'字段,或根据实际权重形状判断 + gin_channels = model_config.get("gin_channels", 256) + + # 判断模型版本的优先级: + # 1. 检查'version'字段 + # 2. 检查权重形状 enc_p.emb_phone.weight + # 3. 默认v2 + model_version = None + + if "version" in cpt: + model_version = cpt["version"] + log.debug(f"从version字段检测到: {model_version}") + elif "weight" in cpt and "enc_p.emb_phone.weight" in cpt["weight"]: + # 检查 enc_p.emb_phone.weight 的形状 + # v1: [hidden_channels, 256] + # v2: [hidden_channels, 768] + emb_shape = cpt["weight"]["enc_p.emb_phone.weight"].shape + log.debug(f"enc_p.emb_phone.weight 形状: {emb_shape}") + if emb_shape[1] == 256: + model_version = "v1" + log.debug("从权重形状检测到: v1 (256维)") + elif emb_shape[1] == 768: + model_version = "v2" + log.debug("从权重形状检测到: v2 (768维)") + + # 根据检测结果选择合成器 + if model_version == "v1": + # v1模型:256维 + from infer.lib.infer_pack.models import SynthesizerTrnMs256NSFsid + synthesizer_class = SynthesizerTrnMs256NSFsid + self.model_version = "v1" + log.debug(f"使用v1合成器 (256维)") + else: + # v2模型:768维(默认) + from infer.lib.infer_pack.models import SynthesizerTrnMs768NSFsid + synthesizer_class = SynthesizerTrnMs768NSFsid + self.model_version = "v2" + log.debug(f"使用v2合成器 (768维)") + + # 加载模型权重 + self.voice_model = synthesizer_class( + spec_channels=model_config.get("spec_channels", 1025), + segment_size=model_config.get("segment_size", 32), + inter_channels=model_config.get("inter_channels", 192), + hidden_channels=model_config.get("hidden_channels", 192), + filter_channels=model_config.get("filter_channels", 768), + n_heads=model_config.get("n_heads", 2), + n_layers=model_config.get("n_layers", 6), + kernel_size=model_config.get("kernel_size", 3), + p_dropout=model_config.get("p_dropout", 0), + resblock=model_config.get("resblock", "1"), + resblock_kernel_sizes=model_config.get("resblock_kernel_sizes", [3, 7, 11]), + resblock_dilation_sizes=model_config.get("resblock_dilation_sizes", [[1, 3, 5], [1, 3, 5], [1, 3, 5]]), + upsample_rates=model_config.get("upsample_rates", [10, 10, 2, 2]), + upsample_initial_channel=model_config.get("upsample_initial_channel", 512), + upsample_kernel_sizes=model_config.get("upsample_kernel_sizes", [16, 16, 4, 4]), + spk_embed_dim=model_config.get("spk_embed_dim", 109), + gin_channels=model_config.get("gin_channels", 256), + sr=self.output_sr, + is_half=supports_fp16(self.device) # 根据设备能力决定是否使用半精度 + ) + self.spk_count = int(model_config.get("spk_embed_dim", 1) or 1) + + # 加载权重 + self.voice_model.load_state_dict(cpt["weight"], strict=False) + self.voice_model = self.voice_model.to(self.device).eval() + + model_info = { + "name": Path(model_path).stem, + "sample_rate": self.output_sr, + "version": cpt.get("version", "v2") + } + + log.info(f"语音模型已加载: {model_info['name']} ({self.output_sr}Hz)") + return model_info + + def load_index(self, index_path: str): + """ + 加载 FAISS 索引 + + Args: + index_path: 索引文件路径 (.index) + """ + self.index = faiss.read_index(index_path) + # 启用 direct_map 以支持 reconstruct() + try: + self.index.make_direct_map() + except Exception: + pass # 某些索引类型不支持,忽略 + log.info(f"索引已加载: {index_path}") + + def 
load_f0_extractor(self, method: F0Method = "rmvpe", + rmvpe_path: str = None): + """ + 加载 F0 提取器 + + Args: + method: F0 提取方法 + rmvpe_path: RMVPE 模型路径 + """ + self.f0_extractor = get_f0_extractor( + method, + device=str(self.device), + rmvpe_path=rmvpe_path + ) + log.info(f"F0 提取器已加载: {method}") + + @torch.no_grad() + def extract_features(self, audio: np.ndarray, use_final_proj: bool = False) -> torch.Tensor: + """ + 使用 HuBERT 提取特征 + + Args: + audio: 16kHz 音频数据 + use_final_proj: 是否使用 final_proj 将 768 维降到 256 维(v1 模型需要) + + Returns: + torch.Tensor: HuBERT 特征 + """ + if self.hubert_model is None: + raise RuntimeError("请先加载 HuBERT 模型") + + # 转换为张量 + audio_tensor = torch.from_numpy(audio).float().to(self.device) + if audio_tensor.dim() == 1: + audio_tensor = audio_tensor.unsqueeze(0) + + if self.hubert_model_type == "fairseq": + # v1 模型使用第 9 层,v2 模型使用第 12 层 + output_layer = 9 if use_final_proj else 12 + feats = self.hubert_model.extract_features( + audio_tensor, + padding_mask=None, + output_layer=output_layer + )[0] + # v1 模型需要 256 维特征,使用 final_proj 投影 + # v2 模型需要 768 维特征,不使用 final_proj + if use_final_proj and hasattr(self.hubert_model, 'final_proj'): + feats = self.hubert_model.final_proj(feats) + return feats + + if self.hubert_model_type == "torchaudio": + feats_list, _ = self.hubert_model.extract_features(audio_tensor) + layer_idx = min(self.hubert_layer - 1, len(feats_list) - 1) + return feats_list[layer_idx] + + # transformers fallback + outputs = self.hubert_model(audio_tensor, output_hidden_states=True) + layer_idx = min(self.hubert_layer, len(outputs.hidden_states) - 1) + return outputs.hidden_states[layer_idx] + + def search_index(self, features: np.ndarray, k: int = 8) -> np.ndarray: + """ + 在索引中搜索相似特征 + + Args: + features: 输入特征 + k: 返回的近邻数量 + + Returns: + np.ndarray: 检索到的特征 + """ + if self.index is None: + return features + + # 检查特征维度是否与索引匹配 + if features.shape[-1] != self.index.d: + log.warning(f"特征维度 ({features.shape[-1]}) 与索引维度 ({self.index.d}) 不匹配,跳过索引搜索") + return features + + # 搜索(使用距离倒数平方加权,与官方管道一致) + scores, indices = self.index.search(features, k) + + # 尝试重建特征,如果索引不支持则跳过 + try: + big_npy = self.index.reconstruct_n(0, self.index.ntotal) + except RuntimeError as e: + if "direct map" in str(e): + log.warning("索引不支持向量重建,跳过索引混合") + return features + raise + + # 距离倒数平方加权 + weight = np.square(1.0 / (scores + 1e-6)) + weight /= weight.sum(axis=1, keepdims=True) + retrieved = np.sum( + big_npy[indices] * np.expand_dims(weight, axis=2), axis=1 + ) + return retrieved + @staticmethod + def _f0_to_coarse( + f0: np.ndarray, + f0_min: float = 50.0, + f0_max: float = 1100.0 + ) -> np.ndarray: + """Convert F0 (Hz) to official RVC coarse bins (1-255).""" + f0 = np.asarray(f0, dtype=np.float32) + f0_max = max(float(f0_max), float(f0_min) + 1.0) + f0_mel_min = 1127 * np.log(1 + float(f0_min) / 700.0) + f0_mel_max = 1127 * np.log(1 + f0_max / 700.0) + f0_mel = 1127 * np.log1p(np.maximum(f0, 0.0) / 700.0) + voiced = f0_mel > 0 + f0_mel[voiced] = (f0_mel[voiced] - f0_mel_min) * 254 / (f0_mel_max - f0_mel_min) + 1 + f0_mel[f0_mel <= 1] = 1 + f0_mel[f0_mel > 255] = 255 + return np.rint(f0_mel).astype(np.int64) + def _apply_rms_mix( + self, + audio_out: np.ndarray, + audio_in: np.ndarray, + sr_out: int, + sr_in: int, + hop_length: int, + rms_mix_rate: float + ) -> np.ndarray: + """Match output RMS envelope to input RMS (0=off, 1=full match).""" + if rms_mix_rate <= 0: + return audio_out + + import librosa + + frame_length_in = 1024 + rms_in = librosa.feature.rms( + y=audio_in, + 
frame_length=frame_length_in, + hop_length=hop_length, + center=True + )[0] + + hop_out = int(round(hop_length * sr_out / sr_in)) + frame_length_out = int(round(frame_length_in * sr_out / sr_in)) + rms_out = librosa.feature.rms( + y=audio_out, + frame_length=frame_length_out, + hop_length=hop_out, + center=True + )[0] + + min_len = min(len(rms_in), len(rms_out)) + if min_len == 0: + return audio_out + + rms_in = rms_in[:min_len] + rms_out = rms_out[:min_len] + + gain = rms_in / (rms_out + 1e-6) + gain = np.clip(gain, 0.2, 4.0) + gain = gain ** rms_mix_rate + + gain_samples = np.repeat(gain, hop_out) + if len(gain_samples) < len(audio_out): + gain_samples = np.pad( + gain_samples, + (0, len(audio_out) - len(gain_samples)), + mode="edge" + ) + else: + gain_samples = gain_samples[:len(audio_out)] + + return audio_out * gain_samples + + def _apply_silence_gate( + self, + audio_out: np.ndarray, + audio_in: np.ndarray, + f0: np.ndarray, + sr_out: int, + sr_in: int, + hop_length: int, + threshold_db: float, + smoothing_ms: float, + min_silence_ms: float, + protect: float + ) -> np.ndarray: + """Silence gate based on input RMS and F0.""" + import librosa + + frame_length = 1024 + rms = librosa.feature.rms( + y=audio_in, + frame_length=frame_length, + hop_length=hop_length, + center=True + )[0] + + if len(rms) == 0 or len(f0) == 0: + return audio_out + + # Align RMS length to F0 length + if len(rms) < len(f0): + rms = np.pad(rms, (0, len(f0) - len(rms)), mode="edge") + else: + rms = rms[:len(f0)] + + rms_db = 20 * np.log10(rms + 1e-6) + ref_db = np.percentile(rms_db, 95) + gate_db = ref_db + threshold_db # threshold_db should be negative + + silent = (rms_db < gate_db) & (f0 <= 0) + + if min_silence_ms > 0: + min_frames = int( + round((min_silence_ms / 1000) * (sr_in / hop_length)) + ) + if min_frames > 1: + silent_int = silent.astype(int) + changes = np.diff( + np.concatenate(([0], silent_int, [0])) + ) + starts = np.where(changes == 1)[0] + ends = np.where(changes == -1)[0] + keep_silent = np.zeros_like(silent, dtype=bool) + for s, e in zip(starts, ends): + if e - s >= min_frames: + keep_silent[s:e] = True + silent = keep_silent + + mask = 1.0 - silent.astype(float) + + if smoothing_ms > 0: + smooth_frames = int( + round((smoothing_ms / 1000) * (sr_in / hop_length)) + ) + if smooth_frames > 1: + kernel = np.ones(smooth_frames) / smooth_frames + mask = np.convolve( + mask, + kernel, + mode="same" + ) + mask = np.clip(mask, 0.0, 1.0) + protect = float(np.clip(protect, 0.0, 1.0)) + if protect > 0: + mask = mask * (1.0 - protect) + protect + + samples_per_frame = int(round(sr_out * hop_length / sr_in)) + mask_samples = np.repeat(mask, samples_per_frame) + + if len(mask_samples) < len(audio_out): + mask_samples = np.pad( + mask_samples, + (0, len(audio_out) - len(mask_samples)), + mode="edge" + ) + else: + mask_samples = mask_samples[:len(audio_out)] + + return audio_out * mask_samples + + def _process_chunk( + self, + features: np.ndarray, + f0: np.ndarray, + use_fp16: bool = False, + speaker_id: int = 0, + ) -> np.ndarray: + """ + 处理单个音频块 + + Args: + features: HuBERT 特征 [T, C] + f0: F0 数组 + use_fp16: 是否使用 FP16 推理 + + Returns: + np.ndarray: 合成的音频 + """ + import torch.nn.functional as F + + log.debug(f"[_process_chunk] 输入特征: shape={features.shape}, dtype={features.dtype}") + log.debug(f"[_process_chunk] 输入特征统计: max={np.max(np.abs(features)):.4f}, mean={np.mean(np.abs(features)):.4f}, std={np.std(features):.4f}") + log.debug(f"[_process_chunk] 输入 F0: len={len(f0)}, max={np.max(f0):.1f}, 
min={np.min(f0):.1f}, non-zero={np.sum(f0 > 0)}") + + # 转换为张量 + features_tensor = torch.from_numpy(features).float().to(self.device).unsqueeze(0) + # HuBERT 输出帧率是 50fps (hop=320 @ 16kHz),但 RVC 模型期望 100fps + # 需要 2x 上采样特征 + # 注意:interpolate 需要 [B, C, T] 格式,但模型需要 [B, T, C] 格式 + features_tensor = F.interpolate(features_tensor.transpose(1, 2), scale_factor=2, mode='nearest').transpose(1, 2) + log.debug(f"[_process_chunk] 2x上采样后特征: shape={features_tensor.shape}") + + # F0 对齐到上采样后的特征长度 + # features_tensor 形状是 [B, T, C],所以时间维度是 shape[1] + target_len = features_tensor.shape[1] + original_f0_len = len(f0) + if len(f0) > target_len: + f0 = f0[:target_len] + elif len(f0) < target_len: + f0 = np.pad(f0, (0, target_len - len(f0)), mode='edge') + log.debug(f"[_process_chunk] F0 对齐: {original_f0_len} -> {len(f0)} (目标: {target_len})") + + f0_tensor = torch.from_numpy(f0.copy()).float().to(self.device).unsqueeze(0) + # 将 F0 (Hz) 转换为 pitch 索引 (0-255) + # RVC mel 量化映射到 coarse pitch bins + f0_coarse = torch.from_numpy(self._f0_to_coarse(f0)).to(self.device).unsqueeze(0) + log.debug(f"[_process_chunk] F0 张量: shape={f0_tensor.shape}, max={f0_tensor.max().item():.1f}, min={f0_tensor.min().item():.1f}") + log.debug(f"[_process_chunk] F0 coarse (pitch索引): shape={f0_coarse.shape}, max={f0_coarse.max().item()}, min={f0_coarse.min().item()}") + + safe_speaker_id = int(max(0, min(max(1, int(self.spk_count)) - 1, int(speaker_id)))) + sid = torch.tensor([safe_speaker_id], device=self.device) + log.debug(f"[_process_chunk] 说话人 ID: {sid.item()}") + + # FP16 推理 + log.debug(f"[_process_chunk] 开始推理, use_fp16={use_fp16}, device={self.device.type}") + if use_fp16 and supports_fp16(self.device): + with torch.amp.autocast(str(self.device.type)): + audio_out, x_mask, _ = self.voice_model.infer( + features_tensor, + torch.tensor([features_tensor.shape[1]], device=self.device), + f0_coarse, + f0_tensor, + sid + ) + else: + audio_out, x_mask, _ = self.voice_model.infer( + features_tensor, + torch.tensor([features_tensor.shape[1]], device=self.device), + f0_coarse, + f0_tensor, + sid + ) + + log.debug(f"[_process_chunk] 推理完成, audio_out: shape={audio_out.shape}, dtype={audio_out.dtype}") + log.debug(f"[_process_chunk] x_mask: shape={x_mask.shape}, sum={x_mask.sum().item()}") + + # 清理 + del features_tensor, f0_tensor, f0_coarse + empty_device_cache(self.device) + + audio_out = audio_out.squeeze().cpu().detach().float().numpy() + log.debug(f"Chunk audio: len={len(audio_out)}, max={np.max(np.abs(audio_out)):.4f}, min={np.min(audio_out):.4f}") + + # 注意:不再对 F0=0 区域应用硬静音 mask + # 辅音(如 k, t, s, p)通常没有基频(F0=0),硬静音会导致只剩元音 + # 如果需要降噪,应该在后处理阶段使用更智能的方法 + + return audio_out + + def convert( + self, + audio_path: str, + output_path: str, + pitch_shift: float = 0, + index_ratio: float = 0.2, + filter_radius: int = 3, + resample_sr: int = 0, + rms_mix_rate: float = 0.25, + protect: float = 0.33, + speaker_id: int = 0, + silence_gate: bool = True, + silence_threshold_db: float = -45.0, + silence_smoothing_ms: float = 50.0, + silence_min_duration_ms: float = 200.0 + ) -> str: + """ + 执行 RVC 推理 + + Args: + audio_path: 输入音频路径 + output_path: 输出音频路径 + pitch_shift: 音调偏移 (半音) + index_ratio: 索引混合比率 (0-1) + filter_radius: 中值滤波半径 + resample_sr: 重采样率 (0 表示不重采样) + rms_mix_rate: RMS 混合比率 + protect: 保护清辅音 + speaker_id: 说话人 ID(多说话人模型可调) + silence_gate: 启用静音门限(默认开启以消除静音段底噪) + silence_threshold_db: 静音阈值 (dB, 相对峰值) + silence_smoothing_ms: 门限平滑时长 (ms) + silence_min_duration_ms: 最短静音时长 (ms) + + Returns: + str: 输出文件路径 + """ + # 检查模型 + if self.voice_model is None: + raise 
RuntimeError("请先加载语音模型") + if self.hubert_model is None: + raise RuntimeError("请先加载 HuBERT 模型") + if self.f0_extractor is None: + raise RuntimeError("请先加载 F0 提取器") + + # 加载音频 + audio = load_audio(audio_path, sr=self.sample_rate) + audio = normalize_audio(audio) + rms_mix_rate = float(np.clip(rms_mix_rate, 0.0, 1.0)) + speaker_id = int(max(0, min(max(1, int(self.spk_count)) - 1, int(speaker_id)))) + + # 高通滤波去除低频隆隆声(与官方管道一致) + audio = sp_signal.filtfilt(_bh, _ah, audio).astype(np.float32) + + # 步骤1: 提取 F0 (使用 RMVPE 或 Hybrid) + f0 = self.f0_extractor.extract(audio) + + # 音调偏移 + if pitch_shift != 0: + f0 = shift_f0(f0, pitch_shift) + + # 智能中值滤波 - 仅在F0跳变过大时应用,保留自然颤音 + if filter_radius > 0: + from scipy.ndimage import median_filter + + # 计算F0跳变(半音) + f0_semitone_diff = np.abs(12 * np.log2((f0 + 1e-6) / (np.roll(f0, 1) + 1e-6))) + f0_semitone_diff[0] = 0 + + # 只对跳变超过2个半音的区域应用滤波 + need_filter = f0_semitone_diff > 2.0 + + # 扩展需要滤波的区域(前后各1帧) + kernel = np.ones(3, dtype=bool) + need_filter = np.convolve(need_filter, kernel, mode='same') + + # 应用滤波 + f0_filtered = median_filter(f0, size=filter_radius) + + # 高音区域 (>500Hz) 使用更温和的滤波,避免高音被过度平滑 + # 参考: RMVPE论文建议高频区域使用自适应平滑 + high_pitch_mask = f0 > 500 + + # 对高音区域使用更小的滤波半径 + if np.any(high_pitch_mask): + f0_filtered_high = median_filter(f0, size=max(1, filter_radius // 2)) + f0_filtered = np.where(high_pitch_mask, f0_filtered_high, f0_filtered) + + # 混合:只在需要的地方滤波,其他保留原始 + f0 = np.where(need_filter, f0_filtered, f0) + + # 释放 F0 提取器显存 + self.unload_f0_extractor() + + # 步骤2: 提取 HuBERT 特征 + # v1 模型需要 256 维特征(使用 final_proj),v2 模型需要 768 维 + use_final_proj = (self.model_version == "v1") + features = self.extract_features(audio, use_final_proj=use_final_proj) + features = features.squeeze(0).cpu().numpy() + + # 释放 HuBERT 显存 + self.unload_hubert() + + # 索引检索 (CPU 操作) + if self.index is not None and index_ratio > 0: + features_before_index = features.copy() + retrieved = self.search_index(features) + + # 简单的自适应索引混合(不使用白化和残差去除) + # 高音区域使用稍高的索引率 + adaptive_index_ratio = np.ones(len(features)) * index_ratio + + f0_per_feat = 2 + for fi in range(len(features)): + f0_start = fi * f0_per_feat + f0_end = min(f0_start + f0_per_feat, len(f0)) + if f0_end > f0_start: + f0_segment = f0[f0_start:f0_end] + avg_f0 = np.mean(f0_segment[f0_segment > 0]) if np.any(f0_segment > 0) else 0 + # 高音区域提升索引率 + if avg_f0 > 450: + adaptive_index_ratio[fi] = min(0.75, index_ratio * 1.3) + + adaptive_index_ratio = adaptive_index_ratio[:, np.newaxis] + features = features * (1 - adaptive_index_ratio) + retrieved * adaptive_index_ratio + + # 动态辅音保护:基于F0置信度和能量调整protect强度 + # 避免索引检索破坏辅音清晰度,与官方管道行为一致 + if protect < 0.5: + # 构建逐帧保护掩码:F0>0 的帧用 1.0(完全使用索引混合后特征), + # F0=0 的帧用 protect 值(大部分保留原始特征) + # F0 帧率是特征帧率的 2 倍 (hop 160 vs 320),需要下采样对齐 + f0_per_feat = 2 # 每个特征帧对应 2 个 F0 帧 + n_feat = features.shape[0] + protect_mask = np.ones(n_feat, dtype=np.float32) + + # 计算每个特征帧的F0稳定性和能量 + for fi in range(n_feat): + f0_start = fi * f0_per_feat + f0_end = min(f0_start + f0_per_feat, len(f0)) + if f0_end > f0_start: + f0_segment = f0[f0_start:f0_end] + # 无声段(F0=0):强保护,保留更多原始特征 + # 参考: "Voice Conversion for Articulation Disorders" 建议保护辅音 + if np.all(f0_segment <= 0): + # 提高无声段保护强度,从 protect 提升到 protect * 1.5 + protect_mask[fi] = min(0.8, protect * 1.5) + # F0不稳定段(方差大):中等保护 + elif len(f0_segment) > 1 and np.std(f0_segment) > 50: + protect_mask[fi] = protect + (1.0 - protect) * 0.3 + # 低能量段(可能是呼吸音):增强保护 + # 使用特征的L2范数作为能量指标 + feat_energy = np.linalg.norm(features_before_index[fi]) + if feat_energy < 0.5: # 低能量阈值 + 
protect_mask[fi] = min(0.8, protect * 1.3) + + # 平滑保护掩码,避免突变 + smooth_kernel = np.array([1, 2, 3, 2, 1], dtype=np.float32) + smooth_kernel /= np.sum(smooth_kernel) + protect_mask = np.convolve(protect_mask, smooth_kernel, mode="same") + protect_mask = np.convolve(protect_mask, smooth_kernel, mode="same") + protect_mask = np.clip(protect_mask, protect, 1.0) + protect_mask = protect_mask[:, np.newaxis] # [T, 1] 广播到 [T, C] + features = features * protect_mask + features_before_index * (1 - protect_mask) + + # --- 能量感知软门控(索引+protect 之后、分块推理之前)--- + # 注意:使用软门控而非硬清零,避免音量损失 + import librosa as _librosa_local + _hop_feat = 320 # HuBERT hop + _n_feat = features.shape[0] + _frame_rms = _librosa_local.feature.rms( + y=audio, frame_length=_hop_feat * 2, hop_length=_hop_feat, center=True + )[0] + if _frame_rms.ndim > 1: + _frame_rms = _frame_rms[0] + if len(_frame_rms) > _n_feat: + _frame_rms = _frame_rms[:_n_feat] + elif len(_frame_rms) < _n_feat: + _frame_rms = np.pad(_frame_rms, (0, _n_feat - len(_frame_rms)), mode='edge') + _energy_db = 20.0 * np.log10(_frame_rms + 1e-8) + _ref_db = float(np.percentile(_energy_db, 95)) if _frame_rms.size > 0 else -20.0 + + # 改进的软门控:使用渐变衰减而非硬清零,保留低能量内容 + _silence_threshold = _ref_db - 65.0 # 进一步放宽到-65dB(只处理极端静音) + _is_very_quiet = (_energy_db < _silence_threshold).astype(np.float32) + + # 检查F0:F0=0的帧更可能是静音(但也可能是辅音) + _f0_50fps = f0[::2] if len(f0) >= _n_feat * 2 else np.pad(f0[::2], (0, _n_feat - len(f0[::2])), mode='edge') + _f0_50fps = _f0_50fps[:_n_feat] + _is_unvoiced = (_f0_50fps <= 0).astype(np.float32) + + # 组合判断:极低能量 + 无声 = 可能静音 + _is_silence = _is_very_quiet * _is_unvoiced + + # 平滑门控曲线 + _sm = np.array([1, 2, 3, 2, 1], dtype=np.float32) + _sm /= _sm.sum() + _is_silence = np.convolve(_is_silence, _sm, mode='same')[:_n_feat] + + # 最小静音时长过滤(避免误判短暂的低能量辅音) + _min_silence_frames = 10 # 约200ms @ 50fps(更保守) + _silence_binary = (_is_silence > 0.7).astype(int) # 提高阈值到0.7 + _changes = np.diff(np.concatenate(([0], _silence_binary, [0]))) + _starts = np.where(_changes == 1)[0] + _ends = np.where(_changes == -1)[0] + _keep_silence = np.zeros_like(_silence_binary, dtype=bool) + for _s, _e in zip(_starts, _ends): + if _e - _s >= _min_silence_frames: + _keep_silence[_s:_e] = True + + # 软门控:使用渐变衰减而非硬清零(0.3-1.0 而非 0-1) + _energy_gate = np.where(_keep_silence, 0.3, 1.0).astype(np.float32) + + # 特征软门控(50fps)- 保留30%而非完全清零 + features = features * _energy_gate[:, np.newaxis] + + # F0 软清零(100fps = 特征帧率 × 2)- 保留30%而非完全清零 + _f0_gate = np.repeat(_energy_gate, 2) + if len(_f0_gate) > len(f0): + _f0_gate = _f0_gate[:len(f0)] + elif len(_f0_gate) < len(f0): + _f0_gate = np.pad(_f0_gate, (0, len(f0) - len(_f0_gate)), mode='constant', constant_values=1.0) + f0 = f0 * _f0_gate + + # 步骤3: 语音合成 (voice_model 推理) - 分块处理 + # 分块参数 - 增加重叠以减少边界伪影 + CHUNK_SECONDS = 30 # 每块 30 秒 + OVERLAP_SECONDS = 2.0 # 重叠 2.0 秒(从1.0增加到2.0,减少破音) + HOP_LENGTH = 320 # HuBERT hop length + + # 计算分块大小(以特征帧为单位) + chunk_frames = int(CHUNK_SECONDS * self.sample_rate / HOP_LENGTH) + overlap_frames = int(OVERLAP_SECONDS * self.sample_rate / HOP_LENGTH) + + total_frames = features.shape[0] + + # 如果音频短于一块,直接处理 + if total_frames <= chunk_frames: + audio_out = self._process_chunk(features, f0, speaker_id=speaker_id) + else: + # 分块处理 + log.info(f"音频较长 ({total_frames} 帧),启用分块处理...") + audio_chunks = [] + chunk_idx = 0 + + for start in range(0, total_frames, chunk_frames - overlap_frames): + end = min(start + chunk_frames, total_frames) + chunk_features = features[start:end] + + # 计算对应的 F0 范围 + # F0 帧率是特征帧率的 2 倍 (hop 160 vs 320) + 
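# 例如(仅示意):chunk_frames=1500、overlap_frames=100(即 30s/2s @ 16kHz、hop=320)时,start 依次为 0, 1400, 2800, ...,对应的 f0_start 为 0, 2800, 5600, ... + 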
f0_start = start * 2 + f0_end = min(end * 2, len(f0)) + chunk_f0 = f0[f0_start:f0_end] + + log.debug(f"处理块 {chunk_idx}: 帧 {start}-{end}") + + # 处理当前块 + chunk_audio = self._process_chunk(chunk_features, chunk_f0, speaker_id=speaker_id) + audio_chunks.append(chunk_audio) + chunk_idx += 1 + + # 清理显存 + gc.collect() + empty_device_cache(self.device) + + # 交叉淡入淡出拼接 + audio_out = self._crossfade_chunks(audio_chunks, overlap_frames) + log.info(f"分块处理完成,共 {chunk_idx} 块") + + # 后处理 + if isinstance(audio_out, tuple): + audio_out = audio_out[0] + audio_out = np.asarray(audio_out).flatten() + + # 重采样 + if resample_sr > 0 and resample_sr != self.output_sr: + import librosa + audio_out = librosa.resample( + audio_out, + orig_sr=self.output_sr, + target_sr=resample_sr + ) + save_sr = resample_sr + else: + save_sr = self.output_sr + + # 可选 RMS 包络混合 + if rms_mix_rate > 0: + audio_out = self._apply_rms_mix( + audio_out=audio_out, + audio_in=audio, + sr_out=save_sr, + sr_in=self.sample_rate, + hop_length=160, + rms_mix_rate=rms_mix_rate + ) + + # 可选静音门限 (减少无声段气声/噪声) + if silence_gate: + audio_out = self._apply_silence_gate( + audio_out=audio_out, + audio_in=audio, + f0=f0, + sr_out=save_sr, + sr_in=self.sample_rate, + hop_length=160, + threshold_db=silence_threshold_db, + smoothing_ms=silence_smoothing_ms, + min_silence_ms=silence_min_duration_ms, + protect=protect + ) + + # 应用人声清理后处理(减少齿音和呼吸音) + # 注意:为避免过度处理导致音质下降,默认禁用 + try: + from lib.vocal_cleanup import apply_vocal_cleanup + audio_out = apply_vocal_cleanup( + audio_out, + sr=save_sr, + reduce_sibilance_enabled=False, # 禁用齿音处理,避免音质损失 + reduce_breath_enabled=False, + sibilance_reduction_db=2.0, + breath_reduction_db=0.0 + ) + log.detail("已应用人声清理") + except Exception as e: + log.warning(f"人声清理失败: {e}") + + # 应用vocoder伪影修复(呼吸音电音和长音撕裂) + # 注意:只保留相位修复,禁用其他处理避免音量损失 + try: + from lib.vocoder_fix import apply_vocoder_artifact_fix + + # 将F0重采样到音频帧率 + if len(f0) > 0: + import librosa + # F0是100fps,需要对齐到音频帧率 + f0_resampled = librosa.resample( + f0.astype(np.float32), + orig_sr=100, # F0帧率 + target_sr=save_sr / (save_sr / 16000 * 160) # 音频帧率 + ) + else: + f0_resampled = None + + audio_out = apply_vocoder_artifact_fix( + audio_out, + sr=save_sr, + f0=f0_resampled, + chunk_boundaries=None, + fix_phase=True, # 保留相位修复(修复长音撕裂) + fix_breath=True, # 启用底噪清理(使用优化后的精准检测) + fix_sustained=False # 禁用长音稳定,避免音质损失 + ) + log.detail("已应用vocoder伪影修复(相位+底噪清理)") + except Exception as e: + log.warning(f"Vocoder伪影修复失败: {e}") + + # 峰值限幅(不改变整体响度,后续由 cover_pipeline 控制音量) + audio_out = soft_clip(audio_out, threshold=0.9, ceiling=0.99) + + # 保存 + save_audio(output_path, audio_out, sr=save_sr) + + return output_path + + def _crossfade_chunks(self, chunks: list, overlap_frames: int) -> np.ndarray: + """ + 使用 SOLA (Synchronized Overlap-Add) 拼接音频块 + + SOLA 通过在重叠区域搜索最佳相位对齐点来避免分块边界的撕裂伪影。 + 参考: w-okada/voice-changer Issue #163, DDSP-SVC 实现 + + Args: + chunks: 音频块列表 + overlap_frames: 重叠帧数(特征帧) + + Returns: + np.ndarray: 拼接后的音频 + """ + if len(chunks) == 1: + return chunks[0] + + # 正确计算重叠的音频样本数 + # 1 特征帧 = HOP_LENGTH 输入样本 @ 16kHz + # 输出样本数 = HOP_LENGTH * (output_sr / input_sr) + HOP_LENGTH = 320 + INPUT_SR = 16000 + output_sr = getattr(self, 'output_sr', 40000) + + # 每个特征帧对应的输出样本数 + samples_per_frame = int(HOP_LENGTH * output_sr / INPUT_SR) + overlap_samples = overlap_frames * samples_per_frame + + log.debug(f"SOLA Crossfade: overlap_frames={overlap_frames}, samples_per_frame={samples_per_frame}, overlap_samples={overlap_samples}") + + result = chunks[0] + + for i in range(1, len(chunks)): + chunk = 
chunks[i] + + # 确保重叠区域不超过任一块的长度 + actual_overlap = min(overlap_samples, len(result), len(chunk)) + + if actual_overlap > 0: + # SOLA: 在重叠区域搜索最佳相位对齐点 + # 搜索范围:不超过一个基频周期(约 100-200 样本 @ 48kHz) + search_range = min(int(output_sr * 0.005), actual_overlap // 4) # 5ms 或 1/4 重叠 + + # 提取前一块的尾部作为参考 + reference = result[-actual_overlap:] + + # 在新块的开头搜索最佳对齐位置 + best_offset = 0 + max_correlation = -1.0 + + for offset in range(max(0, -search_range), min(search_range + 1, len(chunk) - actual_overlap + 1)): + # 提取候选区域 + candidate_start = max(0, offset) + candidate_end = candidate_start + actual_overlap + + if candidate_end > len(chunk): + continue + + candidate = chunk[candidate_start:candidate_end] + + # 计算归一化互相关 + ref_norm = np.linalg.norm(reference) + cand_norm = np.linalg.norm(candidate) + + if ref_norm > 1e-6 and cand_norm > 1e-6: + correlation = np.dot(reference, candidate) / (ref_norm * cand_norm) + + if correlation > max_correlation: + max_correlation = correlation + best_offset = offset + + log.debug(f"SOLA chunk {i}: best_offset={best_offset}, correlation={max_correlation:.4f}") + + # 如果相关性太低(<0.3),说明信号不连续,使用简单crossfade避免伪影 + if max_correlation < 0.3: + log.debug(f"SOLA chunk {i}: low correlation, using simple crossfade") + fade_out = np.linspace(1, 0, actual_overlap) + fade_in = np.linspace(0, 1, actual_overlap) + result_end = result[-actual_overlap:] * fade_out + chunk_start = chunk[:actual_overlap] * fade_in + result = np.concatenate([ + result[:-actual_overlap], + result_end + chunk_start, + chunk[actual_overlap:] + ]) + continue + + # 在最佳对齐点应用交叉淡入淡出 + aligned_start = max(0, best_offset) + aligned_end = aligned_start + actual_overlap + + if aligned_end <= len(chunk): + # 创建淡入淡出曲线(使用余弦窗以获得更平滑的过渡) + fade_out = np.cos(np.linspace(0, np.pi / 2, actual_overlap)) ** 2 + fade_in = np.sin(np.linspace(0, np.pi / 2, actual_overlap)) ** 2 + + # 应用交叉淡入淡出 + result_end = result[-actual_overlap:] * fade_out + chunk_aligned = chunk[aligned_start:aligned_end] * fade_in + + # 拼接 + result = np.concatenate([ + result[:-actual_overlap], + result_end + chunk_aligned, + chunk[aligned_end:] + ]) + else: + # 对齐失败,回退到简单拼接 + log.warning(f"SOLA alignment failed for chunk {i}, using simple crossfade") + fade_out = np.linspace(1, 0, actual_overlap) + fade_in = np.linspace(0, 1, actual_overlap) + result_end = result[-actual_overlap:] * fade_out + chunk_start = chunk[:actual_overlap] * fade_in + result = np.concatenate([ + result[:-actual_overlap], + result_end + chunk_start, + chunk[actual_overlap:] + ]) + else: + # 无重叠,直接拼接 + result = np.concatenate([result, chunk]) + + return result + + +def list_voice_models(weights_dir: str = "assets/weights") -> list: + """ + 列出可用的语音模型 + + Args: + weights_dir: 模型目录 + + Returns: + list: 模型信息列表 + """ + models = [] + weights_path = Path(weights_dir) + + if not weights_path.exists(): + return models + + # 递归搜索所有子目录 + for pth_file in weights_path.glob("**/*.pth"): + # 查找对应的索引文件(同目录下) + index_file = pth_file.with_suffix(".index") + if not index_file.exists(): + # 尝试其他命名方式 + index_file = pth_file.parent / f"{pth_file.stem}_v2.index" + if not index_file.exists(): + # 尝试不区分大小写匹配 + for f in pth_file.parent.glob("*.index"): + if f.stem.lower() == pth_file.stem.lower(): + index_file = f + break + + models.append({ + "name": pth_file.stem, + "model_path": str(pth_file), + "index_path": str(index_file) if index_file.exists() else None + }) + + return models + + diff --git a/infer/separator.py b/infer/separator.py new file mode 100644 index 
0000000000000000000000000000000000000000..74623671906aadbff3e758bab85da447483f240c --- /dev/null +++ b/infer/separator.py @@ -0,0 +1,586 @@ +# -*- coding: utf-8 -*- +""" +人声分离模块 - 支持 Demucs 和 Mel-Band Roformer (audio-separator) +""" +import os +import gc +import shutil +import torch +import numpy as np +import soundfile as sf +from pathlib import Path +from typing import Tuple, Optional, Callable + +from lib.logger import log +from lib.device import get_device, empty_device_cache + +# Demucs 导入 +try: + from demucs.pretrained import get_model + from demucs.apply import apply_model + import torchaudio + DEMUCS_AVAILABLE = True +except ImportError: + DEMUCS_AVAILABLE = False + +# audio-separator 导入 (Mel-Band Roformer 等) +try: + from audio_separator.separator import Separator + AUDIO_SEPARATOR_AVAILABLE = True + # 抑制 audio-separator 的英文日志,我们有自己的中文日志 + import logging as _logging + _logging.getLogger("audio_separator").setLevel(_logging.WARNING) +except ImportError: + AUDIO_SEPARATOR_AVAILABLE = False + + +# Mel-Band Roformer 默认模型 +ROFORMER_DEFAULT_MODEL = "vocals_mel_band_roformer.ckpt" +KARAOKE_DEFAULT_MODEL = "mel_band_roformer_karaoke_gabox.ckpt" +KARAOKE_FALLBACK_MODELS = [ + "mel_band_roformer_karaoke_aufr33_viperx_sdr_10.1956.ckpt", +] + + +def _resolve_output_files(output_files, output_dir: Path) -> list[str]: + """Resolve relative output filenames returned by audio-separator.""" + resolved_files = [] + for file_name in output_files: + file_path = Path(file_name) + if not file_path.is_absolute(): + file_path = output_dir / file_path + resolved_files.append(str(file_path)) + return resolved_files + + +def _safe_move(src_path: str, dst_path: str) -> None: + """Move file with overwrite.""" + if src_path == dst_path: + return + dst = Path(dst_path) + if dst.exists(): + dst.unlink() + shutil.move(src_path, dst_path) + + +def _get_audio_activity_stats(audio_path: str) -> tuple[float, float, int]: + """Return simple activity stats for validating separator outputs.""" + audio, _ = sf.read(audio_path, dtype="float32", always_2d=True) + if audio.size == 0: + return 0.0, 0.0, 0 + + mono = np.mean(audio, axis=1, dtype=np.float32) + rms = float(np.sqrt(np.mean(np.square(mono), dtype=np.float64) + 1e-12)) + peak = float(np.max(np.abs(mono))) + nonzero = int(np.count_nonzero(np.abs(mono) > 1e-6)) + return rms, peak, nonzero + + +class RoformerSeparator: + """人声分离器 - 基于 Mel-Band Roformer (通过 audio-separator)""" + + def __init__( + self, + model_filename: str = ROFORMER_DEFAULT_MODEL, + device: str = "cuda", + ): + if not AUDIO_SEPARATOR_AVAILABLE: + raise ImportError( + "请安装 audio-separator: pip install audio-separator[gpu]" + ) + self.model_filename = model_filename + self.device = str(get_device(device)) + self.separator = None + + def load_model(self, output_dir: str = ""): + """加载 Roformer 模型""" + model_dir = str( + Path(__file__).parent.parent / "assets" / "separator_models" + ) + Path(model_dir).mkdir(parents=True, exist_ok=True) + + target_dir = output_dir or str( + Path(__file__).parent.parent / "temp" / "separator" + ) + + # Recreate the Separator when output_dir changes, because + # some audio-separator versions cache internal paths at init. 
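+ # i.e. reuse the cached instance only when the requested output dir matches the _init_output_dir recorded at load time; otherwise drop and rebuild it (see the branch below).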
+ if self.separator is not None: + if getattr(self, '_init_output_dir', None) == target_dir: + return + # output_dir changed — rebuild Separator + del self.separator + self.separator = None + gc.collect() + + log.info(f"正在加载 Mel-Band Roformer 模型: {self.model_filename}") + + self.separator = Separator( + log_level=_logging.WARNING, + output_dir=target_dir, + model_file_dir=model_dir, + ) + self._init_output_dir = target_dir + self.separator.load_model(self.model_filename) + log.info("Mel-Band Roformer 模型已加载") + + def separate( + self, + audio_path: str, + output_dir: str, + progress_callback: Optional[Callable[[str, float], None]] = None, + ) -> Tuple[str, str]: + """ + 分离人声和伴奏 + + Returns: + Tuple[vocals_path, accompaniment_path] + """ + output_path = Path(output_dir) + output_path.mkdir(parents=True, exist_ok=True) + + if progress_callback: + progress_callback("正在加载 Roformer 模型...", 0.1) + + self.load_model(output_dir=str(output_path)) + + # audio-separator 需要 output_dir 在实例上设置 + self.separator.output_dir = str(output_path) + + if progress_callback: + progress_callback("正在使用 Mel-Band Roformer 分离人声...", 0.3) + + output_files = self.separator.separate(audio_path) + + # audio-separator 返回的可能是纯文件名,需要拼上 output_dir + resolved_files = [] + for f in output_files: + p = Path(f) + if not p.is_absolute(): + p = output_path / p + resolved_files.append(str(p)) + + # Fallback: if resolved files don't exist, search the output dir + # for freshly created files. This handles cases where audio-separator + # writes to a slightly different path (e.g. after output_dir update + # on a reused Separator instance). + if resolved_files and not any(Path(f).exists() for f in resolved_files): + import glob as _glob + all_wavs = sorted( + _glob.glob(str(output_path / "*.wav")), + key=lambda x: os.path.getmtime(x), + reverse=True, + ) + # Take the most recent files (should be our separation output) + if len(all_wavs) >= 2: + resolved_files = all_wavs[:2] + elif len(all_wavs) == 1: + resolved_files = all_wavs[:1] + + # audio-separator 返回文件列表,通常 [primary, secondary] + # primary = Vocals, secondary = Instrumental (或反过来,取决于模型) + vocals_path = None + accompaniment_path = None + + for f in resolved_files: + f_lower = Path(f).name.lower() + # audio-separator uses parenthesized stem markers like (vocals), (other) + # Check these first to avoid false matches from model names (e.g. 
vocals_mel_band_roformer) + if "(other)" in f_lower or "(instrumental)" in f_lower or "(no_vocal" in f_lower: + accompaniment_path = f + elif "(vocal" in f_lower or "(primary)" in f_lower: + vocals_path = f + elif "instrument" in f_lower or "no_vocal" in f_lower or "secondary" in f_lower: + accompaniment_path = f + elif "vocal" in f_lower or "primary" in f_lower: + vocals_path = f + + # 如果无法通过文件名判断,按顺序分配 + if vocals_path is None and accompaniment_path is None and len(resolved_files) >= 2: + vocals_path = resolved_files[0] + accompaniment_path = resolved_files[1] + elif vocals_path is None and len(resolved_files) >= 1: + vocals_path = resolved_files[0] + elif accompaniment_path is None and len(resolved_files) >= 2: + accompaniment_path = resolved_files[1] + + # 重命名为标准名称 + final_vocals = str(output_path / "vocals.wav") + final_accompaniment = str(output_path / "accompaniment.wav") + + if vocals_path and vocals_path != final_vocals: + if not Path(vocals_path).exists(): + raise FileNotFoundError( + f"分离器输出人声文件不存在: {vocals_path}\n" + f"输出目录内容: {list(output_path.glob('*'))}" + ) + shutil.move(vocals_path, final_vocals) + if accompaniment_path and accompaniment_path != final_accompaniment: + if not Path(accompaniment_path).exists(): + raise FileNotFoundError( + f"分离器输出伴奏文件不存在: {accompaniment_path}\n" + f"输出目录内容: {list(output_path.glob('*'))}" + ) + shutil.move(accompaniment_path, final_accompaniment) + + if progress_callback: + progress_callback("Mel-Band Roformer 人声分离完成", 1.0) + + return final_vocals, final_accompaniment + + def unload_model(self): + """卸载模型释放显存""" + if self.separator is not None: + del self.separator + self.separator = None + gc.collect() + empty_device_cache() + + +class KaraokeSeparator: + """主唱/和声分离器 - 基于 Mel-Band Roformer Karaoke 模型""" + + def __init__( + self, + model_filename: str = KARAOKE_DEFAULT_MODEL, + device: str = "cuda", + ): + if not AUDIO_SEPARATOR_AVAILABLE: + raise ImportError( + "请安装 audio-separator: pip install audio-separator[gpu]" + ) + self.device = str(get_device(device)) + self.separator = None + self.active_model = None + + models = [model_filename] + for fallback in KARAOKE_FALLBACK_MODELS: + if fallback not in models: + models.append(fallback) + self.model_candidates = models + + def load_model(self, output_dir: str = ""): + """加载 Karaoke 模型(主模型失败时自动回退)""" + model_dir = str(Path(__file__).parent.parent / "assets" / "separator_models") + Path(model_dir).mkdir(parents=True, exist_ok=True) + + target_dir = output_dir or str( + Path(__file__).parent.parent / "temp" / "separator" + ) + + # Recreate the Separator when output_dir changes + if self.separator is not None: + if getattr(self, '_init_output_dir', None) == target_dir: + return + del self.separator + self.separator = None + self.active_model = None + gc.collect() + + last_error = None + for model_name in self.model_candidates: + try: + log.info(f"正在加载 Karaoke 模型: {model_name}") + separator = Separator( + log_level=_logging.WARNING, + output_dir=target_dir, + model_file_dir=model_dir, + ) + separator.load_model(model_name) + self.separator = separator + self._init_output_dir = target_dir + self.active_model = model_name + log.info("Karaoke 模型已加载") + return + except Exception as exc: + last_error = exc + log.warning(f"Karaoke 模型加载失败: {model_name} ({exc})") + + raise RuntimeError(f"无法加载 Karaoke 模型: {last_error}") + + @staticmethod + def _classify_stem(file_name: str) -> Optional[str]: + lower_name = file_name.lower() + + lead_markers = [ + "(vocals)", + "(lead)", + "(karaoke)", + "(main_vocal)", + 
"(main vocals)", + "_(vocals)_", + ] + backing_markers = [ + "(instrumental)", + "(other)", + "(backing)", + "(no_vocal", + "_(instrumental)_", + "_(other)_", + ] + + for marker in lead_markers: + if marker in lower_name: + return "lead" + for marker in backing_markers: + if marker in lower_name: + return "backing" + + if "vocals" in lower_name: + return "lead" + if "instrumental" in lower_name or "other" in lower_name: + return "backing" + return None + + def separate(self, audio_path: str, output_dir: str) -> Tuple[str, str]: + """ + 分离主唱和和声 + + Returns: + Tuple[lead_vocals_path, backing_vocals_path] + """ + output_path = Path(output_dir) + output_path.mkdir(parents=True, exist_ok=True) + + self.load_model(output_dir=str(output_path)) + self.separator.output_dir = str(output_path) + output_files = self.separator.separate(audio_path) + resolved_files = _resolve_output_files(output_files, output_path) + log.detail( + f"Karaoke分离器输出文件: {[Path(file_path).name for file_path in resolved_files]}" + ) + + lead_vocals_path = None + backing_vocals_path = None + for file_path in resolved_files: + stem_role = self._classify_stem(Path(file_path).name) + log.detail( + f" {Path(file_path).name} -> 分类为: {stem_role or 'unknown'}" + ) + if stem_role == "lead" and lead_vocals_path is None: + lead_vocals_path = file_path + elif stem_role == "backing" and backing_vocals_path is None: + backing_vocals_path = file_path + + if lead_vocals_path is None and resolved_files: + lead_vocals_path = resolved_files[0] + if backing_vocals_path is None: + for file_path in resolved_files: + if file_path != lead_vocals_path: + backing_vocals_path = file_path + break + + if not lead_vocals_path or not Path(lead_vocals_path).exists(): + raise FileNotFoundError( + f"Karaoke主唱轨未找到,输出文件: {[Path(p).name for p in resolved_files]}" + ) + if not backing_vocals_path or not Path(backing_vocals_path).exists(): + raise FileNotFoundError( + f"Karaoke和声轨未找到,输出文件: {[Path(p).name for p in resolved_files]}" + ) + + lead_rms, lead_peak, lead_nonzero = _get_audio_activity_stats(lead_vocals_path) + backing_rms, backing_peak, backing_nonzero = _get_audio_activity_stats(backing_vocals_path) + log.detail( + "Karaoke输出能量检测: " + f"lead_rms={lead_rms:.6f}, lead_peak={lead_peak:.6f}, lead_nonzero={lead_nonzero}; " + f"backing_rms={backing_rms:.6f}, backing_peak={backing_peak:.6f}, backing_nonzero={backing_nonzero}" + ) + + lead_is_nearly_silent = lead_nonzero == 0 or (lead_rms < 1e-5 and lead_peak < 1e-4) + backing_has_content = backing_nonzero > 0 and (backing_rms >= 5e-5 or backing_peak >= 5e-4) + if lead_is_nearly_silent and backing_has_content: + log.warning("Karaoke主唱轨几乎静音,检测到输出疑似反转,已自动交换主唱/和声") + lead_vocals_path, backing_vocals_path = backing_vocals_path, lead_vocals_path + + final_lead = str(output_path / "lead_vocals.wav") + final_backing = str(output_path / "backing_vocals.wav") + _safe_move(lead_vocals_path, final_lead) + _safe_move(backing_vocals_path, final_backing) + + return final_lead, final_backing + + def unload_model(self): + """卸载模型释放显存""" + if self.separator is not None: + del self.separator + self.separator = None + self.active_model = None + gc.collect() + empty_device_cache() + + +class VocalSeparator: + """人声分离器 - 基于 Demucs""" + + def __init__( + self, + model_name: str = "htdemucs", + device: str = "cuda", + shifts: int = 2, + overlap: float = 0.25, + split: bool = True + ): + """ + 初始化分离器 + + Args: + model_name: Demucs 模型名称 (htdemucs, htdemucs_ft, mdx_extra) + device: 计算设备 + """ + if not DEMUCS_AVAILABLE: + raise 
ImportError("请安装 demucs: pip install demucs") + + self.model_name = model_name + self.device = str(get_device(device)) + self.model = None + self.shifts = shifts + self.overlap = overlap + self.split = split + + def load_model(self): + """加载 Demucs 模型""" + if self.model is not None: + return + + log.info(f"正在加载 Demucs 模型: {self.model_name}") + self.model = get_model(self.model_name) + self.model.to(self.device) + self.model.eval() + log.info(f"Demucs 模型已加载 ({self.device})") + + def separate( + self, + audio_path: str, + output_dir: str, + progress_callback: Optional[Callable[[str, float], None]] = None + ) -> Tuple[str, str]: + """ + 分离人声和伴奏 + + Args: + audio_path: 输入音频路径 + output_dir: 输出目录 + progress_callback: 进度回调 (message, progress) + + Returns: + Tuple[vocals_path, accompaniment_path] + """ + self.load_model() + + output_path = Path(output_dir) + output_path.mkdir(parents=True, exist_ok=True) + + if progress_callback: + progress_callback("正在加载音频...", 0.1) + + # 加载音频 + waveform, sample_rate = torchaudio.load(audio_path) + + # 重采样到模型采样率 + if sample_rate != self.model.samplerate: + resampler = torchaudio.transforms.Resample(sample_rate, self.model.samplerate) + waveform = resampler(waveform) + + # 确保是立体声 + if waveform.shape[0] == 1: + waveform = waveform.repeat(2, 1) + elif waveform.shape[0] > 2: + waveform = waveform[:2] + + # 添加 batch 维度 + waveform = waveform.unsqueeze(0).to(self.device) + + if progress_callback: + progress_callback("正在分离人声...", 0.3) + + # 执行分离 + with torch.no_grad(): + try: + sources = apply_model( + self.model, + waveform, + device=self.device, + shifts=self.shifts, + overlap=self.overlap, + split=self.split + ) + except TypeError: + sources = apply_model(self.model, waveform, device=self.device) + + # sources 形状: (batch, sources, channels, samples) + # 获取各音轨索引 + source_names = self.model.sources + vocals_idx = source_names.index("vocals") + drums_idx = source_names.index("drums") + bass_idx = source_names.index("bass") + other_idx = source_names.index("other") + + # 提取人声 + vocals = sources[0, vocals_idx] # (channels, samples) + + # 合并非人声音轨作为伴奏 + accompaniment = sources[0, drums_idx] + sources[0, bass_idx] + sources[0, other_idx] + + if progress_callback: + progress_callback("正在保存分离结果...", 0.8) + + # 保存结果 + vocals_path = output_path / "vocals.wav" + accompaniment_path = output_path / "accompaniment.wav" + + # 保存为 WAV + torchaudio.save( + str(vocals_path), + vocals.cpu(), + self.model.samplerate + ) + torchaudio.save( + str(accompaniment_path), + accompaniment.cpu(), + self.model.samplerate + ) + + if progress_callback: + progress_callback("人声分离完成", 1.0) + + # 释放显存 + empty_device_cache() + + return str(vocals_path), str(accompaniment_path) + + def unload_model(self): + """卸载模型释放显存""" + if self.model is not None: + self.model.cpu() # 先移到 CPU + del self.model + self.model = None + gc.collect() + empty_device_cache() + + +def check_demucs_available() -> bool: + """检查 Demucs 是否可用""" + return DEMUCS_AVAILABLE + + +def check_roformer_available() -> bool: + """检查 audio-separator (Roformer) 是否可用""" + return AUDIO_SEPARATOR_AVAILABLE + + +def get_available_models() -> list: + """获取可用的分离模型列表""" + models = [] + if AUDIO_SEPARATOR_AVAILABLE: + models.append({ + "name": "roformer", + "description": "Mel-Band Roformer (Kimberley Jensen) - 高质量人声分离" + }) + if DEMUCS_AVAILABLE: + models.extend([ + {"name": "htdemucs", "description": "Demucs 默认模型,平衡质量和速度 (SDR ~9dB)"}, + {"name": "htdemucs_ft", "description": "Demucs 微调版本,质量更高但更慢"}, + {"name": "mdx_extra", "description": "MDX 
模型,适合某些音乐类型"}, + ]) + return models
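+
+# 用法示意(仅供参考;文件路径与输出目录均为示例,假设已安装 audio-separator 且分离模型文件可用):
+#   from infer.separator import RoformerSeparator, get_available_models
+#   print(get_available_models())
+#   separator = RoformerSeparator(device="cuda")
+#   vocals_path, accompaniment_path = separator.separate("input_song.wav", "temp/separator")
+#   separator.unload_model()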