Spaces:

mason369
/

AI-RVC

Sleeping

App Files Files Community

mason369 committed on Mar 10

Commit

762eecb

verified ·

1 Parent(s): b6f9c90

Upload folder using huggingface_hub

Browse files

Files changed (3) hide show

models/__init__.py +7 -0
models/rmvpe.py +439 -0
models/synthesizer.py +853 -0

models/__init__.py ADDED Viewed

	@@ -0,0 +1,7 @@

+# -*- coding: utf-8 -*-
+"""
+Model definition package.
+"""
+from .rmvpe import RMVPE
+__all__ = ["RMVPE"]

models/rmvpe.py ADDED Viewed

	@@ -0,0 +1,439 @@

+# -*- coding: utf-8 -*-
+"""
+RMVPE 模型 - 用于高质量 F0 提取
+"""
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import numpy as np
+from typing import Optional
+class BiGRU(nn.Module):
+    """Bidirectional, batch-first GRU that returns only the per-step output sequence."""
+    def __init__(self, input_features: int, hidden_features: int, num_layers: int):
+        super().__init__()
+        gru_options = dict(num_layers=num_layers, batch_first=True, bidirectional=True)
+        self.gru = nn.GRU(input_features, hidden_features, **gru_options)
+    def forward(self, x):
+        # nn.GRU returns (output, h_n); the final hidden state is discarded.
+        sequence_output, _ = self.gru(x)
+        return sequence_output
+class ConvBlockRes(nn.Module):
+    """Residual block: two 3x3 Conv-BatchNorm-ReLU stages added to a skip path.
+    The skip path is a learned 1x1 convolution when the channel count changes
+    (or when ``force_shortcut`` is set); otherwise the input is added directly.
+    """
+    def __init__(self, in_channels: int, out_channels: int, momentum: float = 0.01,
+                 force_shortcut: bool = False):
+        super().__init__()
+        stages = []
+        stage_in = in_channels
+        for _ in range(2):
+            stages.append(nn.Conv2d(stage_in, out_channels, 3, 1, 1, bias=False))
+            stages.append(nn.BatchNorm2d(out_channels, momentum=momentum))
+            stages.append(nn.ReLU())
+            stage_in = out_channels
+        self.conv = nn.Sequential(*stages)
+        # The 1x1 projection exists only when it is needed or explicitly requested.
+        self.has_shortcut = bool(in_channels != out_channels or force_shortcut)
+        if self.has_shortcut:
+            self.shortcut = nn.Conv2d(in_channels, out_channels, 1)
+    def forward(self, x):
+        main_path = self.conv(x)
+        skip_path = self.shortcut(x) if self.has_shortcut else x
+        return main_path + skip_path
+class EncoderBlock(nn.Module):
+    """Encoder stage: a chain of ConvBlockRes modules followed by average pooling."""
+    def __init__(self, in_channels: int, out_channels: int, kernel_size: int,
+                 n_blocks: int, momentum: float = 0.01):
+        super().__init__()
+        # Only the first block changes the channel count; the remaining ones keep it.
+        block_inputs = [in_channels] + [out_channels] * (n_blocks - 1)
+        self.conv = nn.ModuleList(
+            [ConvBlockRes(channels, out_channels, momentum) for channels in block_inputs]
+        )
+        self.pool = nn.AvgPool2d(kernel_size)
+    def forward(self, x):
+        for res_block in self.conv:
+            x = res_block(x)
+        # The un-pooled tensor is returned too so it can serve as a skip connection.
+        pooled = self.pool(x)
+        return pooled, x
+class Encoder(nn.Module):
+    """RMVPE encoder: input BatchNorm followed by ``n_encoders`` EncoderBlock stages.
+    Stage ``i`` outputs ``out_channels * 2**i`` channels. ``in_size`` is accepted
+    for signature compatibility but is not used by this implementation.
+    """
+    def __init__(self, in_channels: int, in_size: int, n_encoders: int,
+                 kernel_size: int, n_blocks: int, out_channels: int = 16,
+                 momentum: float = 0.01):
+        super().__init__()
+        self.n_encoders = n_encoders
+        self.bn = nn.BatchNorm2d(in_channels, momentum=momentum)
+        self.layers = nn.ModuleList()
+        self.latent_channels = []
+        stage_in = in_channels
+        for level in range(n_encoders):
+            stage_out = out_channels * (2 ** level)
+            self.layers.append(
+                EncoderBlock(stage_in, stage_out, kernel_size, n_blocks, momentum)
+            )
+            self.latent_channels.append(stage_out)
+            # The next stage consumes what this stage produced.
+            stage_in = stage_out
+    def forward(self, x):
+        x = self.bn(x)
+        skips = []
+        for stage in self.layers:
+            x, pre_pool = stage(x)
+            skips.append(pre_pool)
+        # Returns the deepest pooled feature map and the per-stage pre-pool tensors.
+        return x, skips
+class Intermediate(nn.Module):
+    """Bottleneck between encoder and decoder: ``n_inters`` IntermediateBlock stages."""
+    def __init__(self, in_channels: int, out_channels: int, n_inters: int,
+                 n_blocks: int, momentum: float = 0.01):
+        super().__init__()
+        self.layers = nn.ModuleList()
+        for index in range(n_inters):
+            is_first = index == 0
+            # Only the first stage maps in_channels -> out_channels (and forces a
+            # shortcut on its first block); later stages are out_channels -> out_channels.
+            stage_in = in_channels if is_first else out_channels
+            self.layers.append(
+                IntermediateBlock(stage_in, out_channels, n_blocks, momentum,
+                                  first_block_shortcut=is_first)
+            )
+    def forward(self, x):
+        for stage in self.layers:
+            x = stage(x)
+        return x
+class IntermediateBlock(nn.Module):
+    """A chain of ``n_blocks`` ConvBlockRes modules without pooling."""
+    def __init__(self, in_channels: int, out_channels: int, n_blocks: int,
+                 momentum: float = 0.01, first_block_shortcut: bool = False):
+        super().__init__()
+        # The first block may be asked to use a 1x1 shortcut even if channels match.
+        blocks = [
+            ConvBlockRes(in_channels, out_channels, momentum,
+                         force_shortcut=first_block_shortcut)
+        ]
+        blocks.extend(
+            ConvBlockRes(out_channels, out_channels, momentum)
+            for _ in range(n_blocks - 1)
+        )
+        self.conv = nn.ModuleList(blocks)
+    def forward(self, x):
+        for res_block in self.conv:
+            x = res_block(x)
+        return x
+class DecoderBlock(nn.Module):
+    """Decoder stage: transposed-conv upsampling, skip concatenation, residual blocks."""
+    def __init__(self, in_channels: int, out_channels: int, stride: int,
+                 n_blocks: int, momentum: float = 0.01):
+        super().__init__()
+        # conv1: 3x3 transposed convolution (padding=1, output_padding=1) + BatchNorm.
+        # NOTE(review): the upstream RVC RMVPE decoder appears to apply a ReLU after
+        # this BatchNorm - confirm against the reference implementation.
+        upsample = nn.ConvTranspose2d(in_channels, out_channels, 3, stride,
+                                      padding=1, output_padding=1, bias=False)
+        self.conv1 = nn.Sequential(
+            upsample,
+            nn.BatchNorm2d(out_channels, momentum=momentum)
+        )
+        # conv2: the first block sees out_channels * 2 inputs (upsampled + skip),
+        # every following block maps out_channels -> out_channels.
+        block_inputs = [out_channels * 2] + [out_channels] * (n_blocks - 1)
+        self.conv2 = nn.ModuleList(
+            [ConvBlockRes(channels, out_channels, momentum) for channels in block_inputs]
+        )
+    def forward(self, x, concat_tensor):
+        x = self.conv1(x)
+        # Align x with the skip tensor on the last two dims by padding on the
+        # bottom/right; a negative difference makes F.pad trim instead of pad.
+        extra_h = concat_tensor.size(2) - x.size(2)
+        extra_w = concat_tensor.size(3) - x.size(3)
+        if (extra_h, extra_w) != (0, 0):
+            x = F.pad(x, [0, extra_w, 0, extra_h])
+        merged = torch.cat([x, concat_tensor], dim=1)
+        for res_block in self.conv2:
+            merged = res_block(merged)
+        return merged
+class Decoder(nn.Module):
+    """RMVPE decoder: ``n_decoders`` DecoderBlock stages, halving channels each time."""
+    def __init__(self, in_channels: int, n_decoders: int, stride: int,
+                 n_blocks: int, out_channels: int = 16, momentum: float = 0.01):
+        super().__init__()
+        self.layers = nn.ModuleList()
+        for level in range(n_decoders):
+            remaining = n_decoders - 1 - level
+            block_out = out_channels * (2 ** remaining)
+            # After the first stage the input width is twice the stage's output width.
+            block_in = in_channels if level == 0 else out_channels * (2 ** (remaining + 1))
+            self.layers.append(
+                DecoderBlock(block_in, block_out, stride, n_blocks, momentum)
+            )
+    def forward(self, x, concat_tensors):
+        # Skip tensors are consumed deepest-first, i.e. from the end of the list.
+        for depth, stage in enumerate(self.layers):
+            skip = concat_tensors[-(depth + 1)]
+            x = stage(x, skip)
+        return x
+class DeepUnet(nn.Module):
+    """Deep U-Net: Encoder -> Intermediate -> Decoder with skip connections."""
+    def __init__(self, kernel_size: int, n_blocks: int, en_de_layers: int = 5,
+                 inter_layers: int = 4, in_channels: int = 1, en_out_channels: int = 16):
+        super().__init__()
+        # Width of the last encoder stage: en_out_channels * 2^(en_de_layers - 1)
+        # (256 with the defaults); the bottleneck doubles it (512 with the defaults).
+        deepest_encoder_width = en_out_channels * (2 ** (en_de_layers - 1))
+        bottleneck_width = 2 * deepest_encoder_width
+        # 128 is forwarded as the encoder's in_size argument.
+        self.encoder = Encoder(
+            in_channels, 128, en_de_layers, kernel_size, n_blocks, en_out_channels
+        )
+        self.intermediate = Intermediate(
+            deepest_encoder_width, bottleneck_width, inter_layers, n_blocks
+        )
+        # kernel_size doubles as the decoder's upsampling stride.
+        self.decoder = Decoder(
+            bottleneck_width, en_de_layers, kernel_size, n_blocks, en_out_channels
+        )
+    def forward(self, x):
+        encoded, skips = self.encoder(x)
+        bottleneck = self.intermediate(encoded)
+        return self.decoder(bottleneck, skips)
+class E2E(nn.Module):
+    """End-to-end RMVPE model: DeepUnet, a 3-channel conv head, then a 360-bin classifier."""
+    def __init__(self, n_blocks: int, n_gru: int, kernel_size: int,
+                 en_de_layers: int = 5, inter_layers: int = 4,
+                 in_channels: int = 1, en_out_channels: int = 16):
+        super().__init__()
+        self.unet = DeepUnet(
+            kernel_size, n_blocks, en_de_layers, inter_layers,
+            in_channels, en_out_channels
+        )
+        self.cnn = nn.Conv2d(en_out_channels, 3, 3, 1, 1)
+        # With a GRU the linear layer sees 2 * 256 bidirectional features;
+        # without one it consumes the flattened 3 * 128 features directly.
+        if n_gru:
+            head = [BiGRU(3 * 128, 256, n_gru), nn.Linear(512, 360)]
+        else:
+            head = [nn.Linear(3 * 128, 360)]
+        self.fc = nn.Sequential(*head, nn.Dropout(0.25), nn.Sigmoid())
+    def forward(self, mel):
+        # Accepts mel as [B, 128, T] or [B, 1, 128, T]; both are rearranged to
+        # [B, 1, T, 128] (time on height, mel bins on width), which the original
+        # author states is the layout the official implementation expects.
+        # Any other 4-D input is passed through unchanged.
+        if mel.dim() == 3:
+            mel = mel.transpose(-1, -2).unsqueeze(1)
+        elif mel.dim() == 4 and mel.shape[1] == 1:
+            mel = mel.transpose(-1, -2)
+        features = self.cnn(self.unet(mel))
+        # (batch, 3, T, 128) -> (batch, T, 3, 128) -> (batch, T, 384)
+        per_frame = features.transpose(1, 2).flatten(-2)
+        return self.fc(per_frame)
+class MelSpectrogram(nn.Module):
+    """Log-mel spectrogram extractor built on torch.stft and a librosa filterbank."""
+    def __init__(self, n_mel: int = 128, n_fft: int = 1024, win_size: int = 1024,
+                 hop_length: int = 160, sample_rate: int = 16000,
+                 fmin: int = 30, fmax: int = 8000):
+        super().__init__()
+        self.n_fft = n_fft
+        self.hop_length = hop_length
+        self.win_size = win_size
+        self.sample_rate = sample_rate
+        self.n_mel = n_mel
+        # Buffers follow the module across .to(device) calls without being parameters.
+        self.register_buffer(
+            "mel_basis", self._mel_filterbank(sample_rate, n_fft, n_mel, fmin, fmax)
+        )
+        self.register_buffer("window", torch.hann_window(win_size))
+    def _mel_filterbank(self, sr, n_fft, n_mels, fmin, fmax):
+        """Build the mel filterbank as a float32 tensor."""
+        import librosa
+        # htk=True is deliberate: the original author notes it is required for
+        # parity with the official RVC RMVPE.
+        filters = librosa.filters.mel(
+            sr=sr, n_fft=n_fft, n_mels=n_mels, fmin=fmin, fmax=fmax, htk=True
+        )
+        return torch.from_numpy(filters).float()
+    def forward(self, audio):
+        stft_options = dict(
+            hop_length=self.hop_length,
+            win_length=self.win_size,
+            window=self.window,
+            center=True,
+            pad_mode="reflect",
+            normalized=False,
+            onesided=True,
+            return_complex=True,
+        )
+        spec = torch.stft(audio, self.n_fft, **stft_options)
+        # Power spectrum (squared magnitude). The original author states this
+        # matches the official RMVPE.
+        # NOTE(review): upstream implementations appear to feed the plain magnitude
+        # into the mel filterbank - confirm before relying on parity.
+        power = torch.abs(spec) ** 2
+        mel = torch.matmul(self.mel_basis, power)
+        # Clamp before the log so silent bins do not produce -inf.
+        return torch.log(torch.clamp(mel, min=1e-5))
+class RMVPE:
+    """Wrapper that loads an RMVPE checkpoint and extracts F0 from audio."""
+    def __init__(self, model_path: str, device: str = "cuda"):
+        """
+        Args:
+            model_path: path of the checkpoint passed to torch.load
+            device: device the model and mel extractor are moved to
+        """
+        self.device = device
+        self.model = E2E(n_blocks=4, n_gru=1, kernel_size=2)
+        # SECURITY NOTE(review): weights_only=False lets torch.load unpickle arbitrary
+        # objects. Only load checkpoints from a trusted source, or switch to
+        # weights_only=True if the file is confirmed to be a plain state_dict.
+        ckpt = torch.load(model_path, map_location="cpu", weights_only=False)
+        self.model.load_state_dict(ckpt)
+        self.model = self.model.to(device).eval()
+        self.mel_extractor = MelSpectrogram().to(device)
+        # Cent value of each of the 360 salience bins (20-cent spacing), zero-padded
+        # by 4 on both sides so it lines up with the padded salience used in decoding.
+        cents_mapping = 20 * np.arange(360) + 1997.3794084376191
+        self.cents_mapping = np.pad(cents_mapping, (4, 4))
+    @torch.no_grad()
+    def infer_from_audio(self, audio: np.ndarray, thred: float = 0.03) -> np.ndarray:
+        """
+        Extract F0 from audio.
+        Args:
+            audio: 16 kHz audio samples
+            thred: confidence threshold; frames whose peak salience is not above
+                it are reported as 0 Hz
+        Returns:
+            np.ndarray: F0 sequence in Hz, one value per mel frame
+        """
+        audio = torch.from_numpy(audio).float().to(self.device)
+        if audio.dim() == 1:
+            audio = audio.unsqueeze(0)
+        # Mel spectrogram: [B, 128, T]
+        mel = self.mel_extractor(audio)
+        n_frames = mel.shape[-1]
+        # Pad the time axis up to a multiple of 32 (5 pooling stages, each /2).
+        n_pad = 32 * ((n_frames - 1) // 32 + 1) - n_frames
+        if n_pad > 0:
+            mel = F.pad(mel, (0, n_pad), mode='constant', value=0)
+        # E2E.forward takes care of the layout transpose.
+        hidden = self.model(mel)
+        # Drop the padded frames again.
+        hidden = hidden[:, :n_frames, :]
+        hidden = hidden.squeeze(0).cpu().numpy()
+        return self._decode(hidden, thred)
+    def _decode(self, hidden: np.ndarray, thred: float) -> np.ndarray:
+        """Convert a [T, 360] salience matrix into F0 values in Hz."""
+        cents = self._to_local_average_cents(hidden, thred)
+        f0 = 10 * (2 ** (cents / 1200))
+        # cents == 0 marks an unvoiced frame and maps to exactly 10 Hz; zero it.
+        f0[f0 == 10] = 0
+        return f0
+    def _to_local_average_cents(self, salience: np.ndarray, thred: float) -> np.ndarray:
+        """Salience-weighted average of cents over the 9 bins around each frame's peak.
+        Args:
+            salience: [T, 360] per-frame salience
+            thred: frames whose maximum salience is <= thred get 0 cents
+        Returns:
+            np.ndarray: [T] cents per frame (an empty array when T == 0)
+        """
+        # Peak bin of every frame.
+        peak = np.argmax(salience, axis=1)  # [T]
+        # Zero-pad the bin axis so the window never leaves the array; in padded
+        # coordinates the window peak-4 .. peak+4 becomes peak .. peak+8.
+        padded = np.pad(salience, ((0, 0), (4, 4)))  # [T, 368]
+        window = peak[:, None] + np.arange(9)  # [T, 9]
+        # Gather all windows at once instead of looping over frames in Python;
+        # this also keeps the arrays 2-D when there are no frames at all.
+        local_salience = np.take_along_axis(padded, window, axis=1)  # [T, 9]
+        local_cents = self.cents_mapping[window]  # [T, 9]
+        product_sum = np.sum(local_salience * local_cents, axis=1)
+        weight_sum = np.sum(local_salience, axis=1) + 1e-9
+        cents = product_sum / weight_sum
+        # Frames whose peak is not above the threshold are treated as unvoiced.
+        cents[np.max(padded, axis=1) <= thred] = 0
+        return cents

models/synthesizer.py ADDED Viewed

	@@ -0,0 +1,853 @@

+# -*- coding: utf-8 -*-
+"""
+RVC v2 合成器模型定义
+"""
+import math
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from typing import Optional, Tuple
+import numpy as np
+class LayerNorm(nn.Module):
+    """Layer normalization over the channel axis of channels-first tensors."""
+    def __init__(self, channels: int, eps: float = 1e-5):
+        super().__init__()
+        self.channels = channels
+        self.eps = eps
+        # Learnable per-channel scale (gamma) and shift (beta).
+        self.gamma = nn.Parameter(torch.ones(channels))
+        self.beta = nn.Parameter(torch.zeros(channels))
+    def forward(self, x):
+        # F.layer_norm normalizes the trailing dim, so move channels last and back:
+        # [B, C, T] -> [B, T, C] -> normalize -> [B, C, T]
+        channels_last = x.transpose(1, -1)
+        normalized = F.layer_norm(
+            channels_last, (self.channels,), self.gamma, self.beta, self.eps
+        )
+        return normalized.transpose(1, -1)
+class MultiHeadAttention(nn.Module):
+    """Multi-head attention over channels-first tensors, with optional
+    windowed relative-position embeddings, proximal bias and block masking."""
+    def __init__(self, channels: int, out_channels: int, n_heads: int,
+                 p_dropout: float = 0.0, window_size: Optional[int] = None,
+                 heads_share: bool = True, block_length: Optional[int] = None,
+                 proximal_bias: bool = False, proximal_init: bool = False):
+        super().__init__()
+        # Channels are split evenly across heads.
+        assert channels % n_heads == 0
+        self.channels = channels
+        self.out_channels = out_channels
+        self.n_heads = n_heads
+        self.p_dropout = p_dropout
+        self.window_size = window_size
+        self.heads_share = heads_share
+        self.block_length = block_length
+        self.proximal_bias = proximal_bias
+        self.proximal_init = proximal_init
+        # Attention probabilities of the most recent forward() call.
+        self.attn = None
+        self.k_channels = channels // n_heads
+        self.conv_q = nn.Conv1d(channels, channels, 1)
+        self.conv_k = nn.Conv1d(channels, channels, 1)
+        self.conv_v = nn.Conv1d(channels, channels, 1)
+        self.conv_o = nn.Conv1d(channels, out_channels, 1)
+        self.drop = nn.Dropout(p_dropout)
+        # Relative-position embeddings exist only when a window size is given:
+        # one table shared by all heads (heads_share) or one table per head,
+        # each covering offsets -window_size .. +window_size.
+        if window_size is not None:
+            n_heads_rel = 1 if heads_share else n_heads
+            rel_stddev = self.k_channels ** -0.5
+            self.emb_rel_k = nn.Parameter(
+                torch.randn(n_heads_rel, window_size * 2 + 1, self.k_channels) * rel_stddev
+            )
+            self.emb_rel_v = nn.Parameter(
+                torch.randn(n_heads_rel, window_size * 2 + 1, self.k_channels) * rel_stddev
+            )
+        nn.init.xavier_uniform_(self.conv_q.weight)
+        nn.init.xavier_uniform_(self.conv_k.weight)
+        nn.init.xavier_uniform_(self.conv_v.weight)
+        # proximal_init starts the key projection as a copy of the query projection.
+        if proximal_init:
+            with torch.no_grad():
+                self.conv_k.weight.copy_(self.conv_q.weight)
+                self.conv_k.bias.copy_(self.conv_q.bias)
+    def forward(self, x, c, attn_mask=None):
+        """Attend from ``x`` (queries) to ``c`` (keys and values) and project the result.
+        The attention probabilities are stored on ``self.attn`` as a side effect.
+        """
+        q = self.conv_q(x)
+        k = self.conv_k(c)
+        v = self.conv_v(c)
+        x, self.attn = self.attention(q, k, v, mask=attn_mask)
+        x = self.conv_o(x)
+        return x
+    def attention(self, query, key, value, mask=None):
+        """Scaled dot-product attention; returns (output [B, C, T_t], probabilities)."""
+        # query, key, value: [B, C, T]
+        b, d, t_s = key.size()
+        t_t = query.size(2)
+        # Split channels into heads: [B, C, T] -> [B, n_heads, T, k_channels]
+        query = query.view(b, self.n_heads, self.k_channels, t_t).transpose(2, 3)
+        key = key.view(b, self.n_heads, self.k_channels, t_s).transpose(2, 3)
+        value = value.view(b, self.n_heads, self.k_channels, t_s).transpose(2, 3)
+        scores = torch.matmul(query / math.sqrt(self.k_channels), key.transpose(-2, -1))
+        if self.window_size is not None:
+            assert t_s == t_t, "Relative attention only for self-attention"
+            # Add logits from the relative-position key embeddings.
+            key_relative_embeddings = self._get_relative_embeddings(self.emb_rel_k, t_s)
+            rel_logits = self._matmul_with_relative_keys(query / math.sqrt(self.k_channels), key_relative_embeddings)
+            scores_local = self._relative_position_to_absolute_position(rel_logits)
+            scores = scores + scores_local
+        if self.proximal_bias:
+            assert t_s == t_t, "Proximal bias only for self-attention"
+            scores = scores + self._attention_bias_proximal(t_s).to(device=scores.device, dtype=scores.dtype)
+        if mask is not None:
+            # Masked positions get a large negative score rather than -inf.
+            scores = scores.masked_fill(mask == 0, -1e4)
+            # Note: the block-length band mask is applied only when a mask was passed in.
+            if self.block_length is not None:
+                assert t_s == t_t, "Block length only for self-attention"
+                block_mask = torch.ones_like(scores).triu(-self.block_length).tril(self.block_length)
+                scores = scores.masked_fill(block_mask == 0, -1e4)
+        p_attn = F.softmax(scores, dim=-1)
+        p_attn = self.drop(p_attn)
+        output = torch.matmul(p_attn, value)
+        if self.window_size is not None:
+            # Add the contribution of the relative-position value embeddings.
+            relative_weights = self._absolute_position_to_relative_position(p_attn)
+            value_relative_embeddings = self._get_relative_embeddings(self.emb_rel_v, t_s)
+            output = output + self._matmul_with_relative_values(relative_weights, value_relative_embeddings)
+        # Merge heads back: [B, n_heads, T_t, k_channels] -> [B, C, T_t]
+        output = output.transpose(2, 3).contiguous().view(b, d, t_t)
+        return output, p_attn
+    def _matmul_with_relative_values(self, x, y):
+        """Multiply ``x`` by ``y`` after giving ``y`` a leading broadcast dimension."""
+        ret = torch.matmul(x, y.unsqueeze(0))
+        return ret
+    def _matmul_with_relative_keys(self, x, y):
+        """Multiply ``x`` by the transpose (last two dims) of ``y`` with a leading broadcast dimension."""
+        ret = torch.matmul(x, y.unsqueeze(0).transpose(-2, -1))
+        return ret
+    def _get_relative_embeddings(self, relative_embeddings, length):
+        """Pad or slice the embedding table to the 2 * length - 1 offsets a sequence of ``length`` needs."""
+        # NOTE(review): max_relative_position is computed but never used below.
+        max_relative_position = 2 * self.window_size + 1
+        # Sequences longer than the window need zero padding on both ends;
+        # shorter ones use only a centered slice of the table.
+        pad_length = max(length - (self.window_size + 1), 0)
+        slice_start_position = max((self.window_size + 1) - length, 0)
+        slice_end_position = slice_start_position + 2 * length - 1
+        if pad_length > 0:
+            padded_relative_embeddings = F.pad(
+                relative_embeddings,
+                (0, 0, pad_length, pad_length, 0, 0)
+            )
+        else:
+            padded_relative_embeddings = relative_embeddings
+        used_relative_embeddings = padded_relative_embeddings[:, slice_start_position:slice_end_position]
+        return used_relative_embeddings
+    def _relative_position_to_absolute_position(self, x):
+        """Re-index relative-offset logits as absolute positions: returns [batch, heads, length, length]."""
+        batch, heads, length, _ = x.size()
+        # Pad one column, flatten, pad again and reshape so that each row is
+        # shifted by one position relative to the previous row.
+        x = F.pad(x, (0, 1, 0, 0, 0, 0, 0, 0))
+        x_flat = x.view(batch, heads, length * 2 * length)
+        x_flat = F.pad(x_flat, (0, length - 1, 0, 0, 0, 0))
+        x_final = x_flat.view(batch, heads, length + 1, 2 * length - 1)[:, :, :length, length - 1:]
+        return x_final
+    def _absolute_position_to_relative_position(self, x):
+        """Re-index absolute-position weights by relative offset: returns [batch, heads, length, 2 * length - 1]."""
+        batch, heads, length, _ = x.size()
+        # Same pad/flatten/reshape trick as above, applied in the opposite direction.
+        x = F.pad(x, (0, length - 1, 0, 0, 0, 0, 0, 0))
+        x_flat = x.view(batch, heads, length ** 2 + length * (length - 1))
+        x_flat = F.pad(x_flat, (length, 0, 0, 0, 0, 0))
+        x_final = x_flat.view(batch, heads, length, 2 * length)[:, :, :, 1:]
+        return x_final
+    def _attention_bias_proximal(self, length):
+        """Bias of -log(1 + |i - j|) with shape [1, 1, length, length]; it decreases with distance."""
+        r = torch.arange(length, dtype=torch.float32)
+        diff = torch.unsqueeze(r, 0) - torch.unsqueeze(r, 1)
+        return torch.unsqueeze(torch.unsqueeze(-torch.log1p(torch.abs(diff)), 0), 0)
+class FFN(nn.Module):
+    """Two-layer 1-D convolutional feed-forward network with causal or "same" padding."""
+    def __init__(self, in_channels: int, out_channels: int, filter_channels: int,
+                 kernel_size: int, p_dropout: float = 0.0, activation: str = None,
+                 causal: bool = False):
+        super().__init__()
+        self.in_channels = in_channels
+        self.out_channels = out_channels
+        self.filter_channels = filter_channels
+        self.kernel_size = kernel_size
+        self.p_dropout = p_dropout
+        self.activation = activation
+        self.causal = causal
+        # The padding strategy is fixed at construction time.
+        self.padding = self._causal_padding if causal else self._same_padding
+        self.conv_1 = nn.Conv1d(in_channels, filter_channels, kernel_size)
+        self.conv_2 = nn.Conv1d(filter_channels, out_channels, kernel_size)
+        self.drop = nn.Dropout(p_dropout)
+    def forward(self, x, x_mask):
+        hidden = self.conv_1(self.padding(x))
+        if self.activation == "gelu":
+            # Sigmoid-based GELU approximation: x * sigmoid(1.702 * x)
+            hidden = hidden * torch.sigmoid(1.702 * hidden)
+        else:
+            hidden = torch.relu(hidden)
+        hidden = self.drop(hidden)
+        projected = self.conv_2(self.padding(hidden))
+        return projected * x_mask
+    def _causal_padding(self, x):
+        """Left-pad the time axis by kernel_size - 1 (no padding for kernel_size == 1)."""
+        if self.kernel_size == 1:
+            return x
+        left, right = self.kernel_size - 1, 0
+        return F.pad(x, (left, right, 0, 0, 0, 0))
+    def _same_padding(self, x):
+        """Pad both sides of the time axis so the convolution preserves its length."""
+        if self.kernel_size == 1:
+            return x
+        left = (self.kernel_size - 1) // 2
+        right = self.kernel_size // 2
+        return F.pad(x, (left, right, 0, 0, 0, 0))
+class Encoder(nn.Module):
+    """Transformer encoder: ``n_layers`` of self-attention and FFN, each followed by
+    a residual connection and layer normalization."""
+    def __init__(self, hidden_channels: int, filter_channels: int, n_heads: int,
+                 n_layers: int, kernel_size: int = 1, p_dropout: float = 0.0,
+                 window_size: int = 10):
+        super().__init__()
+        self.hidden_channels = hidden_channels
+        self.filter_channels = filter_channels
+        self.n_heads = n_heads
+        self.n_layers = n_layers
+        self.kernel_size = kernel_size
+        self.p_dropout = p_dropout
+        self.window_size = window_size
+        self.drop = nn.Dropout(p_dropout)
+        self.attn_layers = nn.ModuleList()
+        self.norm_layers_1 = nn.ModuleList()
+        self.ffn_layers = nn.ModuleList()
+        self.norm_layers_2 = nn.ModuleList()
+        for _ in range(n_layers):
+            attention = MultiHeadAttention(
+                hidden_channels, hidden_channels, n_heads,
+                p_dropout=p_dropout, window_size=window_size
+            )
+            self.attn_layers.append(attention)
+            self.norm_layers_1.append(LayerNorm(hidden_channels))
+            feed_forward = FFN(hidden_channels, hidden_channels, filter_channels,
+                               kernel_size, p_dropout=p_dropout)
+            self.ffn_layers.append(feed_forward)
+            self.norm_layers_2.append(LayerNorm(hidden_channels))
+    def forward(self, x, x_mask):
+        # Pairwise mask: the product is non-zero only where both positions are valid.
+        attn_mask = x_mask.unsqueeze(2) * x_mask.unsqueeze(-1)
+        x = x * x_mask
+        for layer in range(self.n_layers):
+            attended = self.drop(self.attn_layers[layer](x, x, attn_mask))
+            x = self.norm_layers_1[layer](x + attended)
+            transformed = self.drop(self.ffn_layers[layer](x, x_mask))
+            x = self.norm_layers_2[layer](x + transformed)
+        return x * x_mask
+class TextEncoder(nn.Module):
+    """Text encoder for RVC - encodes phone and pitch embeddings"""
+    def __init__(self, out_channels: int, hidden_channels: int, filter_channels: int,
+                 n_heads: int, n_layers: int, kernel_size: int, p_dropout: float,
+                 f0: bool = True):
+        super().__init__()
+        self.out_channels = out_channels
+        self.hidden_channels = hidden_channels
+        self.filter_channels = filter_channels
+        self.n_heads = n_heads
+        self.n_layers = n_layers
+        self.kernel_size = kernel_size
+        self.p_dropout = p_dropout
+        self.f0 = f0
+        # Phone embedding: Linear projection from 768-dim HuBERT features
+        self.emb_phone = nn.Linear(768, hidden_channels)
+        # Pitch embedding (only if f0 is enabled)
+        if f0:
+            self.emb_pitch = nn.Embedding(256, hidden_channels)
+        # Transformer encoder
+        self.encoder = Encoder(
+            hidden_channels, filter_channels, n_heads, n_layers,
+            kernel_size, p_dropout
+        )
+        # Output projection to mean and log-variance
+        self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)
+    def forward(self, phone, pitch, lengths):
+        """
+        Args:
+            phone: [B, 768, T] phone features from HuBERT (channels first)
+            pitch: [B, T] pitch indices (0-255), or None to skip the pitch embedding
+            lengths: [B] sequence lengths
+        Returns:
+            m: [B, out_channels, T] mean
+            logs: [B, out_channels, T] log-variance
+            x_mask: [B, 1, T] mask
+        """
+        import logging
+        log = logging.getLogger(__name__)
+        # The debug messages call .max()/.item() on tensors, which forces a device
+        # sync; only build them when DEBUG logging is actually enabled.
+        debug = log.isEnabledFor(logging.DEBUG)
+        if debug:
+            log.debug(f"[TextEncoder] 输入 phone: shape={phone.shape}")
+            # pitch may legitimately be None (handled below), so guard the log too.
+            if pitch is not None:
+                log.debug(f"[TextEncoder] 输入 pitch: shape={pitch.shape}, max={pitch.max().item()}, min={pitch.min().item()}")
+            log.debug(f"[TextEncoder] 输入 lengths: {lengths}")
+        # Transpose phone from [B, C, T] to [B, T, C] for linear layer
+        phone = phone.transpose(1, 2)  # [B, T, 768]
+        if debug:
+            log.debug(f"[TextEncoder] 转置后 phone: shape={phone.shape}")
+        # Create mask
+        x_mask = torch.unsqueeze(
+            self._sequence_mask(lengths, phone.size(1)), 1
+        ).to(phone.dtype)
+        if debug:
+            log.debug(f"[TextEncoder] x_mask: shape={x_mask.shape}, sum={x_mask.sum().item()}")
+        # Phone embedding
+        x = self.emb_phone(phone)  # [B, T, hidden_channels]
+        if debug:
+            log.debug(f"[TextEncoder] emb_phone 输出: shape={x.shape}, max={x.abs().max().item():.4f}, mean={x.abs().mean().item():.4f}")
+        # Add pitch embedding if enabled
+        if self.f0 and pitch is not None:
+            # Clamp pitch to the embedding table's valid index range
+            pitch_clamped = torch.clamp(pitch, 0, 255)
+            pitch_emb = self.emb_pitch(pitch_clamped)
+            if debug:
+                log.debug(f"[TextEncoder] emb_pitch 输出: shape={pitch_emb.shape}, max={pitch_emb.abs().max().item():.4f}")
+            x = x + pitch_emb
+        # Transpose for conv layers: [B, hidden_channels, T]
+        x = x.transpose(1, 2)
+        if debug:
+            log.debug(f"[TextEncoder] 转置后 x: shape={x.shape}")
+        # Apply mask
+        x = x * x_mask
+        # Transformer encoder
+        x = self.encoder(x, x_mask)
+        if debug:
+            log.debug(f"[TextEncoder] Transformer 输出: shape={x.shape}, max={x.abs().max().item():.4f}, mean={x.abs().mean().item():.4f}")
+        # Project to mean and log-variance
+        stats = self.proj(x) * x_mask
+        m, logs = torch.split(stats, self.out_channels, dim=1)
+        if debug:
+            log.debug(f"[TextEncoder] 最终输出 m: shape={m.shape}, max={m.abs().max().item():.4f}")
+            log.debug(f"[TextEncoder] 最终输出 logs: shape={logs.shape}, max={logs.max().item():.4f}, min={logs.min().item():.4f}")
+        return m, logs, x_mask
+    def _sequence_mask(self, length, max_length=None):
+        """Boolean [B, max_length] mask that is True for positions before each length."""
+        if max_length is None:
+            max_length = length.max()
+        x = torch.arange(max_length, dtype=length.dtype, device=length.device)
+        return x.unsqueeze(0) < length.unsqueeze(1)
+class ResidualCouplingBlock(nn.Module):
+    """Normalizing-flow block: ``n_flows`` coupling layers, each followed by a Flip."""
+    def __init__(self, channels: int, hidden_channels: int, kernel_size: int,
+                 dilation_rate: int, n_layers: int, n_flows: int = 4,
+                 gin_channels: int = 0):
+        super().__init__()
+        self.flows = nn.ModuleList()
+        for _ in range(n_flows):
+            self.flows.append(
+                ResidualCouplingLayer(
+                    channels, hidden_channels, kernel_size,
+                    dilation_rate, n_layers, gin_channels=gin_channels
+                )
+            )
+            self.flows.append(Flip())
+    def forward(self, x, x_mask, g=None, reverse=False):
+        """Run the flows in order, or in reversed order when ``reverse`` is True.
+        Returns only the transformed tensor in both directions.
+        """
+        flows = reversed(self.flows) if reverse else self.flows
+        for flow in flows:
+            result = flow(x, x_mask, g=g, reverse=reverse)
+            # In the forward direction coupling layers return (x, logdet) while
+            # Flip returns a bare tensor; unpacking unconditionally would fail
+            # (or silently split the batch) on the Flip output, so accept both.
+            x = result[0] if isinstance(result, tuple) else result
+        return x
+class ResidualCouplingLayer(nn.Module):
+    """Additive coupling layer: the first half of the channels predicts a shift
+    that is added to (or, in reverse, subtracted from) the second half."""
+    def __init__(self, channels: int, hidden_channels: int, kernel_size: int,
+                 dilation_rate: int, n_layers: int, mean_only: bool = True,
+                 gin_channels: int = 0):
+        super().__init__()
+        self.half_channels = channels // 2
+        # Stored for reference; forward() always uses the predicted mean only.
+        self.mean_only = mean_only
+        self.pre = nn.Conv1d(self.half_channels, hidden_channels, 1)
+        self.enc = WN(hidden_channels, kernel_size, dilation_rate, n_layers, gin_channels)
+        self.post = nn.Conv1d(hidden_channels, self.half_channels, 1)
+        # Zero-initialized output projection: the layer starts with a zero shift.
+        for tensor in (self.post.weight, self.post.bias):
+            tensor.data.zero_()
+    def forward(self, x, x_mask, g=None, reverse=False):
+        x0, x1 = torch.split(x, [self.half_channels] * 2, dim=1)
+        hidden = self.enc(self.pre(x0) * x_mask, x_mask, g=g)
+        m = self.post(hidden) * x_mask
+        if reverse:
+            # Inverse direction: remove the shift and return a bare tensor.
+            return torch.cat([x0, (x1 - m) * x_mask], dim=1)
+        # Forward direction: apply the shift; the second element (logdet) is None.
+        return torch.cat([x0, m + x1 * x_mask], dim=1), None
+class Flip(nn.Module):
+    """Reverses the order of dim 1; extra arguments and ``reverse`` are ignored."""
+    def forward(self, x, *args, reverse=False, **kwargs):
+        # The same flip is applied in both flow directions (it is its own inverse).
+        return x.flip(1)
+class WN(nn.Module):
+    """WaveNet 风格网络 (带权重归一化)"""
+    def __init__(self, hidden_channels: int, kernel_size: int,
+                 dilation_rate: int, n_layers: int, gin_channels: int = 0,
+                 p_dropout: float = 0):
+        super().__init__()
+        self.n_layers = n_layers
+        self.hidden_channels = hidden_channels
+        self.gin_channels = gin_channels
+        self.in_layers = nn.ModuleList()
+        self.res_skip_layers = nn.ModuleList()
+        self.drop = nn.Dropout(p_dropout)
+        if gin_channels > 0:
+            self.cond_layer = nn.utils.weight_norm(
+                nn.Conv1d(gin_channels, 2 * hidden_channels * n_layers, 1)
+            )
+        for i in range(n_layers):
+            dilation = dilation_rate ** i
+            padding = (kernel_size * dilation - dilation) // 2
+            self.in_layers.append(
+                nn.utils.weight_norm(
+                    nn.Conv1d(hidden_channels, 2 * hidden_channels, kernel_size,
+                              dilation=dilation, padding=padding)
+                )
+            )
+            # 前 n-1 层输出 2 * hidden_channels，最后一层输出 hidden_channels
+            if i < n_layers - 1:
+                res_skip_channels = 2 * hidden_channels
+            else:
+                res_skip_channels = hidden_channels
+            self.res_skip_layers.append(
+                nn.utils.weight_norm(
+                    nn.Conv1d(hidden_channels, res_skip_channels, 1)
+                )
+            )
+    def forward(self, x, x_mask, g=None):
+        output = torch.zeros_like(x)
+        if g is not None and self.gin_channels > 0:
+            g = self.cond_layer(g)
+        for i in range(self.n_layers):
+            x_in = self.in_layers[i](x)
+            if g is not None:
+                cond_offset = i * 2 * self.hidden_channels
+                g_l = g[:, cond_offset:cond_offset + 2 * self.hidden_channels, :]
+                x_in = x_in + g_l
+            acts = torch.tanh(x_in[:, :self.hidden_channels]) * torch.sigmoid(x_in[:, self.hidden_channels:])
+            acts = self.drop(acts)
+            res_skip = self.res_skip_layers[i](acts)
+            if i < self.n_layers - 1:
+                # 前 n-1 层：residual + skip
+                x = (x + res_skip[:, :self.hidden_channels]) * x_mask
+                output = output + res_skip[:, self.hidden_channels:]
+            else:
+                # 最后一层：只有 residual，加到 output
+                x = (x + res_skip) * x_mask
+                output = output + res_skip
+        return output * x_mask
+class PosteriorEncoder(nn.Module):
+    """后验编码器"""
+    def __init__(self, in_channels: int, out_channels: int, hidden_channels: int,
+                 kernel_size: int, dilation_rate: int, n_layers: int,
+                 gin_channels: int = 0):
+        super().__init__()
+        self.out_channels = out_channels
+        self.pre = nn.Conv1d(in_channels, hidden_channels, 1)
+        self.enc = WN(hidden_channels, kernel_size, dilation_rate, n_layers, gin_channels)
+        self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)
+    def forward(self, x, x_lengths, g=None):
+        x_mask = torch.unsqueeze(
+            self._sequence_mask(x_lengths, x.size(2)), 1
+        ).to(x.dtype)
+        x = self.pre(x) * x_mask
+        x = self.enc(x, x_mask, g=g)
+        stats = self.proj(x) * x_mask
+        m, logs = torch.split(stats, self.out_channels, dim=1)
+        z = (m + torch.randn_like(m) * torch.exp(logs)) * x_mask
+        return z, m, logs, x_mask
+    def _sequence_mask(self, length, max_length=None):
+        if max_length is None:
+            max_length = length.max()
+        x = torch.arange(max_length, dtype=length.dtype, device=length.device)
+        return x.unsqueeze(0) < length.unsqueeze(1)
+class Generator(nn.Module):
+    """NSF-HiFi-GAN 生成器 (带权重归一化)"""
+    def __init__(self, initial_channel: int, resblock_kernel_sizes: list,
+                 resblock_dilation_sizes: list, upsample_rates: list,
+                 upsample_initial_channel: int, upsample_kernel_sizes: list,
+                 gin_channels: int = 0, sr: int = 40000, is_half: bool = False):
+        super().__init__()
+        self.num_kernels = len(resblock_kernel_sizes)
+        self.num_upsamples = len(upsample_rates)
+        self.sr = sr
+        self.is_half = is_half
+        # 计算上采样因子
+        self.upp = int(np.prod(upsample_rates))
+        self.conv_pre = nn.Conv1d(initial_channel, upsample_initial_channel, 7, 1, 3)
+        # NSF 源模块
+        self.m_source = SourceModuleHnNSF(sample_rate=sr, harmonic_num=0)
+        # 噪声卷积层
+        self.noise_convs = nn.ModuleList()
+        self.ups = nn.ModuleList()
+        for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)):
+            c_cur = upsample_initial_channel // (2 ** (i + 1))
+            self.ups.append(
+                nn.utils.weight_norm(
+                    nn.ConvTranspose1d(
+                        upsample_initial_channel // (2 ** i),
+                        c_cur,
+                        k, u, (k - u) // 2
+                    )
+                )
+            )
+            # 噪声卷积
+            if i + 1 < len(upsample_rates):
+                stride_f0 = int(np.prod(upsample_rates[i + 1:]))
+                self.noise_convs.append(
+                    nn.Conv1d(1, c_cur, kernel_size=stride_f0 * 2, stride=stride_f0, padding=stride_f0 // 2)
+                )
+            else:
+                self.noise_convs.append(nn.Conv1d(1, c_cur, kernel_size=1))
+        self.resblocks = nn.ModuleList()
+        for i in range(len(self.ups)):
+            ch = upsample_initial_channel // (2 ** (i + 1))
+            for j, (k, d) in enumerate(zip(resblock_kernel_sizes, resblock_dilation_sizes)):
+                self.resblocks.append(ResBlock(ch, k, d))
+        self.conv_post = nn.Conv1d(ch, 1, 7, 1, 3, bias=False)
+        if gin_channels > 0:
+            self.cond = nn.Conv1d(gin_channels, upsample_initial_channel, 1)
+    def forward(self, x, f0, g=None):
+        import logging
+        log = logging.getLogger(__name__)
+        log.debug(f"[Generator] 输入 x: shape={x.shape}, max={x.abs().max().item():.4f}, mean={x.abs().mean().item():.4f}")
+        log.debug(f"[Generator] 输入 f0: shape={f0.shape}, max={f0.max().item():.1f}, min={f0.min().item():.1f}")
+        if g is not None:
+            log.debug(f"[Generator] 输入 g: shape={g.shape}, max={g.abs().max().item():.4f}")
+        # 生成 NSF 激励信号
+        har_source, _, _ = self.m_source(f0, self.upp)
+        har_source = har_source.transpose(1, 2)  # [B, 1, T*upp]
+        log.debug(f"[Generator] NSF har_source: shape={har_source.shape}, max={har_source.abs().max().item():.4f}")
+        x = self.conv_pre(x)
+        log.debug(f"[Generator] conv_pre 输出: shape={x.shape}, max={x.abs().max().item():.4f}")
+        if g is not None:
+            x = x + self.cond(g)
+            log.debug(f"[Generator] 加入条件后: max={x.abs().max().item():.4f}")
+        for i in range(self.num_upsamples):
+            x = F.leaky_relu(x, 0.1)
+            x = self.ups[i](x)
+            # 融合噪声
+            x_source = self.noise_convs[i](har_source)
+            x = x + x_source
+            xs = None
+            for j in range(self.num_kernels):
+                if xs is None:
+                    xs = self.resblocks[i * self.num_kernels + j](x)
+                else:
+                    xs += self.resblocks[i * self.num_kernels + j](x)
+            x = xs / self.num_kernels
+            log.debug(f"[Generator] 上采样层 {i}: shape={x.shape}, max={x.abs().max().item():.4f}")
+        x = F.leaky_relu(x)
+        x = self.conv_post(x)
+        log.debug(f"[Generator] conv_post 输出: shape={x.shape}, max={x.abs().max().item():.4f}")
+        x = torch.tanh(x)
+        log.debug(f"[Generator] tanh 输出: shape={x.shape}, max={x.abs().max().item():.4f}")
+        return x
+    def remove_weight_norm(self):
+        for l in self.ups:
+            nn.utils.remove_weight_norm(l)
+        for l in self.resblocks:
+            l.remove_weight_norm()
+class ResBlock(nn.Module):
+    """残差��� (带权重归一化)"""
+    def __init__(self, channels: int, kernel_size: int = 3, dilation: tuple = (1, 3, 5)):
+        super().__init__()
+        self.convs1 = nn.ModuleList([
+            nn.utils.weight_norm(
+                nn.Conv1d(channels, channels, kernel_size, 1,
+                          (kernel_size * d - d) // 2, dilation=d)
+            )
+            for d in dilation
+        ])
+        self.convs2 = nn.ModuleList([
+            nn.utils.weight_norm(
+                nn.Conv1d(channels, channels, kernel_size, 1,
+                          (kernel_size - 1) // 2)
+            )
+            for _ in dilation
+        ])
+    def forward(self, x):
+        for c1, c2 in zip(self.convs1, self.convs2):
+            xt = F.leaky_relu(x, 0.1)
+            xt = c1(xt)
+            xt = F.leaky_relu(xt, 0.1)
+            xt = c2(xt)
+            x = xt + x
+        return x
+    def remove_weight_norm(self):
+        for l in self.convs1:
+            nn.utils.remove_weight_norm(l)
+        for l in self.convs2:
+            nn.utils.remove_weight_norm(l)
+class SineGenerator(nn.Module):
+    """正弦波生成器 - NSF 的核心组件"""
+    def __init__(self, sample_rate: int, harmonic_num: int = 0,
+                 sine_amp: float = 0.1, noise_std: float = 0.003,
+                 voiced_threshold: float = 10):
+        super().__init__()
+        self.sample_rate = sample_rate
+        self.harmonic_num = harmonic_num
+        self.sine_amp = sine_amp
+        self.noise_std = noise_std
+        self.voiced_threshold = voiced_threshold
+        self.dim = harmonic_num + 1
+    def forward(self, f0: torch.Tensor, upp: int):
+        """
+        生成正弦波激励信号
+        Args:
+            f0: 基频张量 [B, T]
+            upp: 上采样因子
+        Returns:
+            正弦波信号 [B, T*upp, 1]
+        """
+        with torch.no_grad():
+            # 上采样 F0
+            f0 = f0.unsqueeze(1)  # [B, 1, T]
+            f0_up = F.interpolate(f0, scale_factor=upp, mode='nearest')
+            f0_up = f0_up.transpose(1, 2)  # [B, T*upp, 1]
+            # 生成正弦波
+            rad = f0_up / self.sample_rate  # 归一化频率
+            rad_acc = torch.cumsum(rad, dim=1) % 1  # 累积相位
+            sine_wave = torch.sin(2 * np.pi * rad_acc) * self.sine_amp
+            # 静音区域（F0=0）使用噪声
+            voiced_mask = (f0_up > self.voiced_threshold).float()
+            noise = torch.randn_like(sine_wave) * self.noise_std
+            sine_wave = sine_wave * voiced_mask + noise * (1 - voiced_mask)
+            return sine_wave
+class SourceModuleHnNSF(nn.Module):
+    """谐波加噪声源模块"""
+    def __init__(self, sample_rate: int, harmonic_num: int = 0,
+                 sine_amp: float = 0.1, noise_std: float = 0.003,
+                 add_noise_std: float = 0.003):
+        super().__init__()
+        self.sine_generator = SineGenerator(
+            sample_rate, harmonic_num, sine_amp, noise_std
+        )
+        self.l_linear = nn.Linear(harmonic_num + 1, 1)
+        self.l_tanh = nn.Tanh()
+    def forward(self, f0: torch.Tensor, upp: int):
+        sine = self.sine_generator(f0, upp)  # [B, T*upp, 1]
+        sine = self.l_tanh(self.l_linear(sine))
+        noise = torch.randn_like(sine) * 0.003
+        return sine, noise, None  # 返回 3 个值以匹配接口
+class SynthesizerTrnMs768NSFsid(nn.Module):
+    """RVC v2 合成器 (768 维 HuBERT + NSF + SID)"""
+    def __init__(self, spec_channels: int, segment_size: int,
+                 inter_channels: int, hidden_channels: int, filter_channels: int,
+                 n_heads: int, n_layers: int, kernel_size: int, p_dropout: float,
+                 resblock: str, resblock_kernel_sizes: list,
+                 resblock_dilation_sizes: list, upsample_rates: list,
+                 upsample_initial_channel: int, upsample_kernel_sizes: list,
+                 spk_embed_dim: int, gin_channels: int, sr: int):
+        super().__init__()
+        self.spec_channels = spec_channels
+        self.inter_channels = inter_channels
+        self.hidden_channels = hidden_channels
+        self.filter_channels = filter_channels
+        self.n_heads = n_heads
+        self.n_layers = n_layers
+        self.kernel_size = kernel_size
+        self.p_dropout = p_dropout
+        self.resblock = resblock
+        self.resblock_kernel_sizes = resblock_kernel_sizes
+        self.resblock_dilation_sizes = resblock_dilation_sizes
+        self.upsample_rates = upsample_rates
+        self.upsample_initial_channel = upsample_initial_channel
+        self.upsample_kernel_sizes = upsample_kernel_sizes
+        self.segment_size = segment_size
+        self.gin_channels = gin_channels
+        self.spk_embed_dim = spk_embed_dim
+        self.sr = sr
+        # 文本编码器 (使用 TextEncoder 替代 PosteriorEncoder)
+        self.enc_p = TextEncoder(
+            inter_channels, hidden_channels, filter_channels,
+            n_heads, n_layers, kernel_size, p_dropout, f0=True
+        )
+        # 解码器/生成器 (NSF-HiFiGAN，内部包含 m_source)
+        self.dec = Generator(
+            inter_channels, resblock_kernel_sizes, resblock_dilation_sizes,
+            upsample_rates, upsample_initial_channel, upsample_kernel_sizes,
+            gin_channels, sr=sr
+        )
+        # 流
+        self.flow = ResidualCouplingBlock(
+            inter_channels, hidden_channels, 5, 1, 3, gin_channels=gin_channels
+        )
+        # 说话人嵌入
+        self.emb_g = nn.Embedding(spk_embed_dim, gin_channels)
+    def forward(self, phone, phone_lengths, pitch, nsff0, sid, skip_head=0, return_length=0):
+        """前向传播"""
+        g = self.emb_g(sid).unsqueeze(-1)
+        # TextEncoder 返回 mean 和 log-variance
+        m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths)
+        # 在编码器外部采样
+        z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66666) * x_mask
+        # 正向 flow
+        z = self.flow(z_p, x_mask, g=g)
+        # 生成音频 (传入 f0)
+        o = self.dec(z, nsff0, g=g)
+        return o
+    def infer(self, phone, phone_lengths, pitch, nsff0, sid, rate=1.0):
+        """推理"""
+        import logging
+        log = logging.getLogger(__name__)
+        log.debug(f"[infer] 输入 phone: shape={phone.shape}, dtype={phone.dtype}")
+        log.debug(f"[infer] 输入 phone 统计: max={phone.abs().max().item():.4f}, mean={phone.abs().mean().item():.4f}")
+        log.debug(f"[infer] 输入 phone_lengths: {phone_lengths}")
+        log.debug(f"[infer] 输入 pitch: shape={pitch.shape}, max={pitch.max().item()}, min={pitch.min().item()}")
+        log.debug(f"[infer] 输入 nsff0: shape={nsff0.shape}, max={nsff0.max().item():.1f}, min={nsff0.min().item():.1f}")
+        log.debug(f"[infer] 输入 sid: {sid}")
+        g = self.emb_g(sid).unsqueeze(-1)
+        log.debug(f"[infer] 说话人嵌入 g: shape={g.shape}, max={g.abs().max().item():.4f}")
+        # TextEncoder 返回 mean 和 log-variance
+        m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths)
+        log.debug(f"[infer] TextEncoder 输出:")
+        log.debug(f"[infer]   m_p: shape={m_p.shape}, max={m_p.abs().max().item():.4f}, mean={m_p.abs().mean().item():.4f}")
+        log.debug(f"[infer]   logs_p: shape={logs_p.shape}, max={logs_p.max().item():.4f}, min={logs_p.min().item():.4f}")
+        log.debug(f"[infer]   x_mask: shape={x_mask.shape}, sum={x_mask.sum().item()}")
+        # 在编码器外部采样 (使用较小的噪声系数以获得更稳定的输出)
+        z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66666) * x_mask
+        log.debug(f"[infer] 采样后 z_p: shape={z_p.shape}, max={z_p.abs().max().item():.4f}, mean={z_p.abs().mean().item():.4f}")
+        # 反向 flow
+        z = self.flow(z_p, x_mask, g=g, reverse=True)
+        log.debug(f"[infer] Flow 输出 z: shape={z.shape}, max={z.abs().max().item():.4f}, mean={z.abs().mean().item():.4f}")
+        # 生成音频 (传入 f0，Generator 内部会生成 NSF 激励信号)
+        o = self.dec(z * x_mask, nsff0, g=g)
+        log.debug(f"[infer] Generator 输出 o: shape={o.shape}, max={o.abs().max().item():.4f}, mean={o.abs().mean().item():.4f}")
+        return o, x_mask